<a href="https://colab.research.google.com/github/matthewroche/house_prices/blob/main/House_Prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [3]:
import kaggle
!kaggle competitions download house-prices-advanced-regression-techniques

Downloading train.csv to /content
  0% 0.00/450k [00:00<?, ?B/s]
100% 450k/450k [00:00<00:00, 61.6MB/s]
Downloading data_description.txt to /content
  0% 0.00/13.1k [00:00<?, ?B/s]
100% 13.1k/13.1k [00:00<00:00, 10.8MB/s]
Downloading test.csv to /content
  0% 0.00/441k [00:00<?, ?B/s]
100% 441k/441k [00:00<00:00, 58.0MB/s]
Downloading sample_submission.csv to /content
  0% 0.00/31.2k [00:00<?, ?B/s]
100% 31.2k/31.2k [00:00<00:00, 32.5MB/s]


In [158]:
import pandas as pd

train = pd.read_csv("train.csv", na_filter=False)

print(train.shape)
print(train.columns)

(1460, 81)
Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 

In [159]:
# Drop ID column
train.drop(labels="Id", axis=1, inplace=True)

In [160]:
# Split inputs and outputs
y = train["SalePrice"]
X = train.drop(["SalePrice"], axis=1)

In [161]:
# Create a test set
from sklearn.model_selection import train_test_split
train, test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
print(train)

      MSSubClass MSZoning LotFrontage  ...  YrSold SaleType SaleCondition
615           85       RL          80  ...    2010       WD       Abnorml
613           20       RL          70  ...    2007      New       Partial
1303          20       RL          73  ...    2006       WD        Normal
486           20       RL          79  ...    2007       WD        Normal
561           20       RL          77  ...    2006       WD        Normal
...          ...      ...         ...  ...     ...      ...           ...
1095          20       RL          78  ...    2007       WD        Normal
1130          50       RL          65  ...    2009       WD        Normal
1294          20       RL          60  ...    2006       WD        Normal
860           50       RL          55  ...    2007       WD        Normal
1126         120       RL          53  ...    2009       WD        Normal

[978 rows x 79 columns]


In [162]:
# Notice lot frontage has been processed as a string as it contains "NA"s
# Replace NA's with mean

def handleColumnWithNA(df, colName, **means):
  df = df.copy()
  df[colName] = pd.to_numeric(df[colName], errors="coerce", downcast="float")
  if colName in means:
    df[colName].fillna(means[colName], inplace = True)
  else:
    df[colName].fillna(df[colName].mean(), inplace = True)
  return df

train = handleColumnWithNA(train, "LotFrontage")

means = train.mean(axis=0)

In [163]:
# Wide range of means and standard deviations - lets adjust

trainBeforeNormalisation = train

import numpy as np
from sklearn import preprocessing
from pandas.api.types import is_numeric_dtype
min_max_scaler = preprocessing.MinMaxScaler()

def performNumericScaling(df, initial):

  print(df.isnull().sum().sum())

  # First get only the columns containing numeric data
  numericCols = []
  for col in df.columns:
    if is_numeric_dtype(df.dtypes[col]):
      numericCols.append(col)

  # Build a new data frame with just these columns
  numericDf = df[numericCols]

  # # Do the scaling
  x = numericDf.values #returns a numpy array
  if initial:
    min_max_scaler.fit(x)
  x_scaled = min_max_scaler.transform(x)
  numericDf = pd.DataFrame(x_scaled, columns=numericDf.columns)

  print(numericDf.isnull().sum().sum())
  print(df.isnull().sum().sum())

  # Put the scaled columns back into our original data frame
  df.drop(numericCols, axis=1, inplace=True)
  df = df.join(numericDf) # NAs are being indroduced here somehow!!!  <<<-----------------------------------------------------------------------

  print(numericDf.isnull().sum().sum())
  print(df.isnull().sum().sum())

  return df

train = performNumericScaling(train, True)

0
0
0
0
10948


In [133]:
print(train.shape)
print(train.isnull().sum())

(978, 79)
MSZoning         0
Street           0
Alley            0
LotShape         0
LandContour      0
              ... 
ScreenPorch    322
PoolArea       322
MiscVal        322
MoSold         322
YrSold         322
Length: 79, dtype: int64


In [43]:
# Encode strings

from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore', sparse=True)

def performOneHotEncoding(df, initial):

  strCols = []
  for col in df.columns:
    if df.dtypes[col] == object:
      strCols.append(col)

  # Build a new data frame with just these columns
  stringDf = df[strCols]
  
  if initial:
    enc.fit(stringDf.values)
  fit = enc.transform(stringDf.values).toarray()

  # Not adding column headers as they don't matter
  stringDfEncoded = pd.DataFrame(fit)

  df.drop(labels=strCols, axis=1, inplace=True)
  df = df.join(stringDfEncoded)
  return df
  
train = performOneHotEncoding(train, True)

In [44]:
# Process training data too
test = handleColumnWithNA(test, "LotFrontage")
test = performNumericScaling(test, True)
test = performOneHotEncoding(test, True)

In [55]:
# Train a neural net
from sklearn.neural_network import MLPRegressor
clf = MLPRegressor(solver="adam", alpha=1e-5, hidden_layer_sizes=(100, 25, 10), random_state=1, max_iter=500, tol=0.000001, learning_rate="adaptive", verbose=True)
print(train[train.isna()])
train = train.reset_index()
y_train = y_train.reset_index()
clf.fit(train, y_train)

     index  MSSubClass  LotFrontage  LotArea  ...  616  617  618  619
0      NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
1      NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
2      NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
3      NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
4      NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
..     ...         ...          ...      ...  ...  ...  ...  ...  ...
973    NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
974    NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
975    NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
976    NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN
977    NaN         NaN          NaN      NaN  ...  NaN  NaN  NaN  NaN

[978 rows x 655 columns]


ValueError: ignored

In [398]:
# Load prediction data
predict = pd.read_csv("test.csv", na_filter=False, dtype=object)

# Save IDs
IDs = predict["Id"]
test = predict.drop(labels=["Id"], axis=1)

In [400]:
# Process Test Data

# Several of the columns that were processed as numbers before contin "NA" in the test data
# We've therefore imported all columns as strings, and will need to match the column types
for col in trainBeforeNormalisation.columns:
  if predict.dtypes[col] != trainBeforeNormalisation.dtypes[col]:
    # As we've imported all the columns as strings, if they don't match this must be because the column was an integer in the training data
    predict = handleColumnWithNA(predict, col, **means)

# Scale
predict = performNumericScaling(predict, False)
predict = performOneHotEncoding(predict, False)

print(predict.head())

   MSSubClass  LotFrontage   LotArea  OverallQual  ...  690  691  692  693
0        20.0     0.202055  0.048246     0.444444  ...  0.0  0.0  1.0  0.0
1        20.0     0.205479  0.060609     0.555556  ...  0.0  0.0  1.0  0.0
2        60.0     0.181507  0.058566     0.444444  ...  0.0  0.0  1.0  0.0
3        60.0     0.195205  0.040562     0.555556  ...  0.0  0.0  1.0  0.0
4       120.0     0.075342  0.017318     0.777778  ...  0.0  0.0  1.0  0.0

[5 rows x 728 columns]


In [413]:
predictions = clf.predict(predict)
predictions = pd.concat((IDs, pd.Series(predictions, name="Prediction")), axis=1)
print(predictions)

        Id    Prediction
0     1461   8677.988355
1     1462  35098.164442
2     1463     29.961642
3     1464     29.567193
4     1465     37.792228
...    ...           ...
1454  2915     42.264167
1455  2916     43.256083
1456  2917  42401.949573
1457  2918     32.667099
1458  2919     29.497160

[1459 rows x 2 columns]
