## Importing packages

In [None]:
%matplotlib inline
import numpy as np
import scipy.stats as stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
# Apply matplotlib's built-in 'ggplot' style sheet to subsequent plots.
plt.style.use('ggplot')
# Load test.csv into test_; the cells below apply the same column edits to it as to ames.
test_ = pd.read_csv('test.csv')

In [42]:
# Load train.csv into ames and display its (rows, columns) shape.
ames = pd.read_csv('train.csv')
ames.shape

(1460, 81)

In [None]:
def _tidy_column_name(name):
    """Return *name* with '.' turned into '_' and the digits 1-3 prefixed by '_'.

    Triple and then double underscores produced by those substitutions are
    collapsed back to a single underscore.
    """
    substitutions = (
        (".", "_"),
        ("1", "_1"),
        ("2", "_2"),
        ("3", "_3"),
        ("___", "_"),
        ("__", "_"),
    )
    for old, new in substitutions:
        name = name.replace(old, new)
    return name


# Apply the same renaming to both frames so their column names stay aligned.
ames.columns = [_tidy_column_name(col) for col in ames.columns]
test_.columns = [_tidy_column_name(col) for col in test_.columns]

## Dropping irrelevant columns

In [None]:
# Remove the Id column from both the training and the test frame.
for frame in (ames, test_):
    frame.drop(columns='Id', inplace=True)

In [None]:
# Drop MSSubClass: its information is already carried by three other columns
# (BldgType, YearBuilt and HouseStyle).
for frame in (ames, test_):
    frame.drop(columns='MSSubClass', inplace=True)

In [None]:
# Drop Street: almost every value is paved and only 6 are gravel, so we
# concluded that Street does not affect SalePrice.
for frame in (ames, test_):
    frame.drop(columns='Street', inplace=True)

In [None]:
# Drop Utilities: a single row is NoSeWa and all the rest are AllPub, so we
# concluded that it is not a factor in determining SalePrice.
for frame in (ames, test_):
    frame.drop(columns='Utilities', inplace=True)

In [None]:
# Keep roof style & material as they are, since the mean SalePrice differs
# between roof materials.
# Select SalePrice *before* aggregating: calling .mean() on the whole grouped
# frame averages every column and raises TypeError on the non-numeric ones in
# pandas >= 2.0 (numeric_only now defaults to False).
ames.groupby('RoofMatl')['SalePrice'].mean().sort_values(ascending=False)

In [None]:
# Drop the ExterQual and ExterCond columns from the training frame.
ames.drop(columns=['ExterQual', 'ExterCond'], inplace=True)

In [None]:
# Drop the ExterQual and ExterCond columns from the test frame.
test_.drop(columns=['ExterQual', 'ExterCond'], inplace=True)

In [None]:
# Drop the BsmtQual and BsmtCond columns from the training frame.
ames.drop(columns=['BsmtQual', 'BsmtCond'], inplace=True)

In [None]:
# Drop the BsmtQual and BsmtCond columns from the test frame.
test_.drop(columns=['BsmtQual', 'BsmtCond'], inplace=True)

In [None]:
# Drop the GarageQual and GarageCond columns from the training frame.
ames.drop(columns=['GarageQual', 'GarageCond'], inplace=True)

In [None]:
# Drop the GarageQual and GarageCond columns from the test frame.
test_.drop(columns=['GarageQual', 'GarageCond'], inplace=True)

In [None]:
# Drop PoolQC from both frames.
for frame in (ames, test_):
    frame.drop(columns='PoolQC', inplace=True)

In [None]:
# Drop LotFrontage from both frames.
for frame in (ames, test_):
    frame.drop(columns='LotFrontage', inplace=True)

In [None]:
# Drop FireplaceQu from both frames.
for frame in (ames, test_):
    frame.drop(columns='FireplaceQu', inplace=True)

In [None]:
# Drop KitchenQual from both frames.
for frame in (ames, test_):
    frame.drop(columns='KitchenQual', inplace=True)

In [None]:
# We dropped most of the individual Quality and Condition columns because we assumed that OverallQuality and OverallCondition, which describe the entire house, already capture the information in those per-feature ratings.

In [None]:
# These three basement areas add up to TotalBsmtSF, which is kept, so remove
# them from the training frame.
ames.drop(columns=['BsmtFinSF_1', 'BsmtFinSF_2', 'BsmtUnfSF'], inplace=True)

In [None]:
# Remove the same three basement area columns from the test frame.
test_.drop(columns=['BsmtFinSF_1', 'BsmtFinSF_2', 'BsmtUnfSF'], inplace=True)

In [None]:
# These three floor areas add up to GrLivArea, which is kept, so remove them
# from the training frame.
ames.drop(columns=['_1stFlrSF', '_2ndFlrSF', 'LowQualFinSF'], inplace=True)

In [None]:
# Remove the same three floor area columns from the test frame.
test_.drop(columns=['_1stFlrSF', '_2ndFlrSF', 'LowQualFinSF'], inplace=True)

In [None]:
# GarageYrBlt is removed on the assumption that it matters little for SalePrice.
for frame in (ames, test_):
    frame.drop(columns='GarageYrBlt', inplace=True)

In [None]:
# GarageArea is heavily correlated with GarageCars but less correlated with
# SalePrice than GarageCars is, so only GarageCars is kept.
for frame in (ames, test_):
    frame.drop(columns='GarageArea', inplace=True)

In [None]:
# Drop MiscVal from both frames.
for frame in (ames, test_):
    frame.drop(columns='MiscVal', inplace=True)

In [None]:
# Treat YrSold as a categorical label rather than a number, so that the
# magnitude of the year value does not skew the results.
for frame in (ames, test_):
    frame['YrSold'] = frame['YrSold'].astype('category')

In [None]:
# Treat MoSold as a categorical label as well, in both frames.
for frame in (ames, test_):
    frame['MoSold'] = frame['MoSold'].astype('category')

## Feature Engineering

In [None]:
# LandSlope grows with the steepness of the slope, so we treat it as ordinal
# and encode it as integers (Gtl -> 1, Mod -> 2, Sev -> 3) for use in the model.
land_slope_codes = (('Gtl', '1'), ('Mod', '2'), ('Sev', '3'))
for frame in (ames, test_):
    for label, code in land_slope_codes:
        frame['LandSlope'] = frame['LandSlope'].str.replace(label, code)
    # Every label is now a digit string, so the column can be cast to int.
    frame['LandSlope'] = frame['LandSlope'].astype(int)

In [None]:
# New column YearSinceRemod replaces YearRemodAdd: the number of years between
# the remodel year (YearRemodAdd) and the sale year (YrSold). Per the original
# note, only a few YearRemodAdd values differ from YearBuilt.
#
# YrSold is converted to a categorical dtype in an earlier cell, and
# categorical Series do not support arithmetic, so cast it back to integers
# before subtracting.
YearSinceRemod = ames['YrSold'].astype(int) - ames['YearRemodAdd']

# Append the feature under its final name directly instead of concatenating an
# unnamed Series and then renaming every column by position.
ames = ames.assign(YearSinceRemod=YearSinceRemod)

ames.drop('YearRemodAdd', axis=1, inplace=True)
# NOTE(review): test_ is not given YearSinceRemod in this cell — confirm the
# test frame receives the same feature elsewhere before modelling.

In [None]:
# Merge the four bathroom counts into a single TotBathrooms column, counting
# each half bath as 0.5. Most real-estate sites list total bathrooms rather
# than separate half baths and basement baths.
Bathrooms = (
    ames['BsmtFullBath']
    + ames['FullBath']
    + .5 * (ames['BsmtHalfBath'] + ames['HalfBath'])
)

# Append the new column at the end of the frame and record the resulting
# column order.
ames = ames.assign(TotBathrooms=Bathrooms)
new_columns = ames.columns.tolist()

# The individual counts are now redundant.
ames.drop(
    columns=['BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath'],
    inplace=True,
)



## Replacing NA's that have meaning 

In [None]:
# Followed the description provided by Kaggle for imputation: a missing
# MasVnrArea is filled with 0.0.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# frame['MasVnrArea'] selection: the in-place form operates on an intermediate
# object, which triggers a chained-assignment warning in pandas 2.x and leaves
# the frame unchanged under Copy-on-Write.
ames['MasVnrArea'] = ames['MasVnrArea'].fillna(0.0)
test_['MasVnrArea'] = test_['MasVnrArea'].fillna(0.0)

In [None]:
# In these training columns a missing value is replaced by an explicit label.
# Electrical may be a genuinely missing value, but only one row lacks it, so
# it is filled with 'SBrkr'.
train_fill_labels = {
    'Alley': 'No_Alley',
    'MasVnrType': 'None',
    'BsmtExposure': 'No_basement',
    'BsmtFinType_1': 'No_basement',
    'BsmtFinType_2': 'No_basement',
    'Electrical': 'SBrkr',
}
for column, label in train_fill_labels.items():
    ames[column] = ames[column].fillna(label)

In [None]:
# Apply the same explicit labels to the test frame.
# Electrical may be a genuinely missing value; it is filled with 'SBrkr' here
# exactly as in the training frame.
test_fill_labels = {
    'Alley': 'No_Alley',
    'MasVnrType': 'None',
    'BsmtExposure': 'No_basement',
    'BsmtFinType_1': 'No_basement',
    'BsmtFinType_2': 'No_basement',
    'Electrical': 'SBrkr',
}
for column, label in test_fill_labels.items():
    test_[column] = test_[column].fillna(label)

In [None]:
# Replace missing garage, fence and misc-feature values in the training frame
# with explicit labels.
train_absent_labels = {
    'GarageType': 'No_Garage',
    'GarageFinish': 'No_Garage',
    'Fence': 'No_Fence',
    'MiscFeature': 'None',
}
for column, label in train_absent_labels.items():
    ames[column] = ames[column].fillna(label)

In [None]:
# Replace missing garage, fence and misc-feature values in the test frame with
# the same explicit labels.
test_absent_labels = {
    'GarageType': 'No_Garage',
    'GarageFinish': 'No_Garage',
    'Fence': 'No_Fence',
    'MiscFeature': 'None',
}
for column, label in test_absent_labels.items():
    test_[column] = test_[column].fillna(label)

In [None]:
# Encode PavedDrive as an integer so it can be used for regression:
# 'N' -> 1, 'P' -> 2, and every other value -> 3.
paved_drive_codes = {'N': 1, 'P': 2}
ames['PavedDrive'] = [paved_drive_codes.get(value, 3) for value in ames['PavedDrive']]
test_['PavedDrive'] = [paved_drive_codes.get(value, 3) for value in test_['PavedDrive']]

In [None]:
# Ordinal encoding, as there is a progression in LotShape:
# Reg -> 4, IR1 -> 3, IR2 -> 2, IR3 -> 1.
for label, code in (('Reg', '4'), ('IR1', '3'), ('IR2', '2'), ('IR3', '1')):
    ames['LotShape'] = ames['LotShape'].str.replace(label, code)
# Every label is now a digit string, so the column can be cast to int.
ames['LotShape'] = ames['LotShape'].astype(int)

In [None]:
# Same ordinal encoding of LotShape for the test frame:
# Reg -> 4, IR1 -> 3, IR2 -> 2, IR3 -> 1.
for label, code in (('Reg', '4'), ('IR1', '3'), ('IR2', '2'), ('IR3', '1')):
    test_['LotShape'] = test_['LotShape'].str.replace(label, code)
# Every label is now a digit string, so the column can be cast to int.
test_['LotShape'] = test_['LotShape'].astype(int)

In [None]:
# Ordinal encoding, as there is a progression in GarageFinish; one shared
# mapping is applied to both frames.
garage_finish_codes = {'No_Garage':1, 'Unf':2,'RFn':3,'Fin':4}
ames['GarageFinish'] = ames['GarageFinish'].replace(garage_finish_codes)
test_['GarageFinish'] = test_['GarageFinish'].replace(garage_finish_codes)

In [None]:
# Following the data documentation for imputation: replace missing GrLivArea
# values in the training frame with 0.0.
# NOTE(review): only ames is filled in this cell — confirm test_ is handled elsewhere.
ames['GrLivArea'] = ames['GrLivArea'].fillna(value = 0.0)
