In [98]:
# Matplotlib for additional plotting
from matplotlib import pyplot as plt
%matplotlib notebook

In [99]:
import csv
import json
import pandas as pd # data processing, csv file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra
from scipy import stats # statistics
from datetime import datetime # date time manipulation

# Exploratory Data Analysis

In [100]:
def _read_csv_tolerant(path):
    """Read a CSV file, dropping malformed rows (with a warning) instead of raising.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0 in
    favour of ``on_bad_lines``. Older pandas releases reject the new keyword
    with a ``TypeError``, in which case the legacy spelling is used.
    """
    try:
        # "warn" matches the old error_bad_lines=False default behaviour:
        # bad rows are skipped and reported rather than silently discarded.
        return pd.read_csv(path, on_bad_lines="warn")
    except TypeError:
        return pd.read_csv(path, error_bad_lines=False)


train = _read_csv_tolerant("train.csv")
test = _read_csv_tolerant("test.csv")

In [101]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Data overview

In [102]:
# Bucket the column labels of `train` by dtype: each key is a dtype and each
# value holds the labels of the columns stored with that dtype.
groups = (
    train.columns
    .to_series()
    .groupby(train.dtypes)
    .groups
)
groups.keys()

dict_keys([dtype('int64'), dtype('float64'), dtype('O')])

### Integer keys

In [103]:
# Labels of the columns stored as 64-bit integers.
groups[np.dtype(np.int64)]

Index(['Id', 'MSSubClass', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars',
       'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

### Float keys

In [104]:
# Labels of the columns stored as 64-bit floats.
groups[np.dtype(np.float64)]

Index(['LotFrontage', 'MasVnrArea', 'GarageYrBlt'], dtype='object')

### Object keys

In [105]:
# Labels of the columns stored with the generic object dtype.
groups[np.dtype(object)]

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [106]:
# Column names bucketed by storage type. The order of every list is kept
# exactly as before; entries are only re-wrapped so related names share a line.

# Integer-valued columns. 'SalePrice' must stay last: the test-set cell drops
# it by slicing int_data[0:-1].
int_data = [
    'Id', 'MSSubClass', 'LotArea',
    'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
    'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
    '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageCars', 'GarageArea',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
    'PoolArea', 'MiscVal', 'MoSold', 'YrSold',
    'SalePrice',
]

# Float-valued columns.
float_data = [
    'LotFrontage',
    'MasVnrArea',
    'GarageYrBlt',
]

# Object (string) columns, later one-hot encoded.
categorical_data = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
    'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2',
    'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
    'Foundation',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
    'KitchenQual', 'Functional', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
    'PoolQC', 'Fence', 'MiscFeature',
    'SaleType', 'SaleCondition',
]

### Filling NaN

In [107]:
# Replace missing values in the training frame with a sentinel per column
# group: -1 for integer columns, -1.0 for float columns and the literal
# string "NaN" for categorical columns.
for _cols, _fill in (
    (int_data, -1),
    (float_data, -1.0),
    (categorical_data, "NaN"),
):
    train[_cols] = train[_cols].fillna(value=_fill)

In [111]:
# Same sentinel filling as for the training frame. The test data does not
# contain SalePrice, so the last entry of int_data is left out.
for _cols, _fill in (
    (int_data[:-1], -1),
    (float_data, -1.0),
    (categorical_data, "NaN"),
):
    test[_cols] = test[_cols].fillna(value=_fill)

### Converting object keys to dummies

In [112]:
# One-hot encode the categorical columns of both frames.
train_data = pd.get_dummies(train, columns=categorical_data)
test_data = pd.get_dummies(test, columns=categorical_data)

# Each frame is encoded on its own, so a category level that occurs in only
# one file produces a dummy column in only one frame. A model fitted on the
# training columns cannot score a frame with a different column set, so put
# the test frame on exactly the training feature columns (everything except
# the target): dummies missing from the test data are added as all-zero
# columns and dummies unseen in training are dropped.
_feature_columns = train_data.columns[train_data.columns != 'SalePrice']
test_data = test_data.reindex(columns=_feature_columns, fill_value=0)

In [113]:
# Separate the encoded training frame into predictors and target.
# y_train stays a one-column DataFrame (not a Series).
_is_target = train_data.columns == 'SalePrice'
X_train = train_data.loc[:, ~_is_target]
y_train = train_data.loc[:, _is_target]

In [114]:
# Same predictor/target separation for the encoded test frame.
# NOTE: when the test frame has no 'SalePrice' column the mask is all False,
# so y_test is a DataFrame with zero columns.
_is_target = test_data.columns == 'SalePrice'
X_test = test_data.loc[:, ~_is_target]
y_test = test_data.loc[:, _is_target]

In [117]:
# Imported here because, read top to bottom, this cell comes before the
# notebook's sklearn import cell; without it a fresh "Restart & Run All"
# fails with a NameError on train_test_split.
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; the fixed random_state
# keeps the split reproducible.
# NOTE: X_train / y_train are rebound to the remaining 80%, so re-running this
# cell without first re-running the cell that builds them splits the already
# reduced data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# Basic Linear Regression Model

In [144]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import mean_squared_error

In [132]:
# Preview the first five rows of the validation predictors.
X_val.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
529,530,20,-1.0,32668,6,3,1957,1975,-1.0,1219,...,0,0,0,1,0,0,1,0,0,0
491,492,50,79.0,9490,6,7,1941,1950,0.0,403,...,0,0,0,1,0,0,0,0,1,0
459,460,50,-1.0,7015,5,4,1950,1950,161.0,185,...,0,0,0,1,0,0,0,0,1,0
279,280,60,83.0,10005,7,5,1977,1977,299.0,392,...,0,0,0,1,0,0,0,0,1,0
655,656,160,21.0,1680,6,5,1971,1971,381.0,0,...,0,0,0,1,0,0,0,1,0,0


In [116]:
# Ordinary least-squares model with default settings, fitted on the training
# split. Run this after the train/validation split cell so the model is not
# fitted on the rows held out in X_val.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [118]:
# Predict the target for the held-out validation rows.
y_pred = regressor.predict(X=X_val)

In [128]:
# Side-by-side table of true and predicted values for the validation rows;
# both inputs are flattened to one dimension first.
df = pd.DataFrame(
    data=dict(
        Actual=y_val.to_numpy().flatten(),
        Predicted=y_pred.flatten(),
    )
)
df.head(n=5)

Unnamed: 0,Actual,Predicted
0,200624,224514.73473
1,133000,145796.351705
2,110000,98249.822271
3,192000,204811.630978
4,88000,97724.866239


In [142]:
# idxs = np.argsort(X_val['Id'].to_numpy().flatten())

# X_axis = X_val['Id'].to_numpy().flatten()[idxs]
# y_axis = y_val.to_numpy().flatten()[idxs]
# y_axis_pred = y_pred.flatten()[idxs]

In [150]:
# Mean squared error between the true and predicted validation targets.
mean_squared_error(
    y_true=y_val.to_numpy().flatten(),
    y_pred=y_pred.flatten(),
)  # Can I do better?

487355636.8805294