In [1]:
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

In [2]:
data = df_train = pd.read_csv('/Users/mugilrajark/Downloads/house-prices-advanced-regression-techniques/train.csv')
print(" Data shape -->", data.shape)
data.head()


 Data shape --> (1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [8]:
cols_not_useful = ['Id', 'MoSold', 'YrSold', 'GarageYrBlt','MasVnrArea', 'LotFrontage']
categorical_cols = [ 'MSSubClass']

In [9]:
def get_numerical_features(df):
    Numerical_columns = list(df.select_dtypes("int").columns)
    Numerical_columns_float = list(df.select_dtypes("float").columns)
    filtered_numerical_columns = list(Numerical_columns) + list(Numerical_columns_float) 
    filtered_numerical_columns =  set(filtered_numerical_columns) - set(cols_not_useful)
    filtered_numerical_columns =  filtered_numerical_columns - set(categorical_cols)
    return list(filtered_numerical_columns)

In [11]:
def get_categorical_features(df):
    categorical_features = list(df.select_dtypes("object").columns)
    categorical_features = list(set(categorical_cols + categorical_features))
    return categorical_features

numerical_cols =  get_numerical_features(data)
categorical_features =  get_categorical_features(data)

In [12]:
categorical_features

['ExterCond',
 'Condition1',
 'PoolQC',
 'RoofStyle',
 'ExterQual',
 'RoofMatl',
 'SaleCondition',
 'LotShape',
 'GarageCond',
 'LandSlope',
 'BldgType',
 'Heating',
 'BsmtFinType1',
 'CentralAir',
 'Foundation',
 'KitchenQual',
 'Street',
 'LandContour',
 'Electrical',
 'FireplaceQu',
 'MiscFeature',
 'MasVnrType',
 'MSZoning',
 'LotConfig',
 'BsmtFinType2',
 'GarageFinish',
 'SaleType',
 'Fence',
 'BsmtQual',
 'MSSubClass',
 'BsmtCond',
 'Exterior1st',
 'Neighborhood',
 'HouseStyle',
 'PavedDrive',
 'Condition2',
 'Utilities',
 'Exterior2nd',
 'GarageType',
 'Functional',
 'GarageQual',
 'BsmtExposure',
 'Alley',
 'HeatingQC']

In [13]:
na_fill_dict = {'Alley' : 'No alley access',
                'BsmtQual' : 'No Basement',
                'BsmtCond' : 'No Basement',
                'BsmtExposure' : 'No Basement',
                'BsmtFinType1' : 'No Basement',
                'BsmtFinType2' : 'No Basement',
                'FireplaceQu' : 'No Fireplace',
                'GarageType': 'No Garage',
                'GarageFinish': 'No Garage',
                'GarageQual': 'No Garage',
                'GarageType': 'No Garage',
                'GarageCond': 'No Garage',
                'PoolQC': 'No Pool',
                'Fence': 'No Fence',
                'MiscFeature': 'None'
}

def fill_na_categorical(df, na_fill_dict):
    for col in na_fill_dict:
         df[col].fillna(value=na_fill_dict[col], inplace=True)
    return df

data = fill_na_categorical(data, na_fill_dict)

In [14]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,4,2010,WD,Normal,142125


In [15]:
data[numerical_cols].isna().sum()

BedroomAbvGr     0
HalfBath         0
BsmtFinSF1       0
FullBath         0
1stFlrSF         0
3SsnPorch        0
YearRemodAdd     0
OverallCond      0
EnclosedPorch    0
KitchenAbvGr     0
OverallQual      0
SalePrice        0
LowQualFinSF     0
GrLivArea        0
MiscVal          0
TotalBsmtSF      0
Fireplaces       0
TotRmsAbvGrd     0
YearBuilt        0
GarageArea       0
BsmtHalfBath     0
PoolArea         0
OpenPorchSF      0
BsmtFinSF2       0
GarageCars       0
BsmtUnfSF        0
LotArea          0
BsmtFullBath     0
2ndFlrSF         0
WoodDeckSF       0
ScreenPorch      0
dtype: int64

In [16]:
for col in list(categorical_features):
    print(f'{col}   -- >',sum(data[col].isna()))

ExterCond   -- > 0
Condition1   -- > 0
PoolQC   -- > 0
RoofStyle   -- > 0
ExterQual   -- > 0
RoofMatl   -- > 0
SaleCondition   -- > 0
LotShape   -- > 0
GarageCond   -- > 0
LandSlope   -- > 0
BldgType   -- > 0
Heating   -- > 0
BsmtFinType1   -- > 0
CentralAir   -- > 0
Foundation   -- > 0
KitchenQual   -- > 0
Street   -- > 0
LandContour   -- > 0
Electrical   -- > 1
FireplaceQu   -- > 0
MiscFeature   -- > 0
MasVnrType   -- > 8
MSZoning   -- > 0
LotConfig   -- > 0
BsmtFinType2   -- > 0
GarageFinish   -- > 0
SaleType   -- > 0
Fence   -- > 0
BsmtQual   -- > 0
MSSubClass   -- > 0
BsmtCond   -- > 0
Exterior1st   -- > 0
Neighborhood   -- > 0
HouseStyle   -- > 0
PavedDrive   -- > 0
Condition2   -- > 0
Utilities   -- > 0
Exterior2nd   -- > 0
GarageType   -- > 0
Functional   -- > 0
GarageQual   -- > 0
BsmtExposure   -- > 0
Alley   -- > 0
HeatingQC   -- > 0


In [17]:
pd.options.mode.chained_assignment = None

In [18]:
data[categorical_features].dropna(inplace=True)

In [19]:
data

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,No alley access,IR1,Lvl,AllPub,...,0,No Pool,No Fence,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,No alley access,Reg,Lvl,AllPub,...,0,No Pool,No Fence,,0,4,2010,WD,Normal,142125


In [20]:
data[numerical_cols].corr()[['SalePrice']].sort_values("SalePrice", ascending=False)

Unnamed: 0,SalePrice
SalePrice,1.0
OverallQual,0.790982
GrLivArea,0.708624
GarageCars,0.640409
GarageArea,0.623431
TotalBsmtSF,0.613581
1stFlrSF,0.605852
FullBath,0.560664
TotRmsAbvGrd,0.533723
YearBuilt,0.522897


In [21]:
non_correlated_cols = ['PoolArea', 'KitchenAbvGr', 'EnclosedPorch', 'BsmtFinSF2', 'MiscVal', 'BsmtHalfBath',
                       'LowQualFinSF','3SsnPorch','ScreenPorch','OverallCond']

In [22]:
final_numerical_cols = list(set(numerical_cols) - set(non_correlated_cols))
final_categorical_cols = list(set(categorical_features) - set(get_categorical_features(data)))


In [23]:
preprocessed_df = data[list(final_categorical_cols) + list(final_numerical_cols) ]
preprocessed_df = pd.get_dummies(data=preprocessed_df, columns=final_categorical_cols)


In [24]:
train, test = train_test_split(preprocessed_df, test_size=0.2)
y = train['SalePrice']
X = train.drop(['SalePrice'], axis=1)


In [25]:
from sklearn.model_selection import GridSearchCV

In [29]:
parameters = {'n_estimators' : [10,30, 50, 70, 100, 200], 'min_samples_leaf':[2,5,10]}
model = RandomForestRegressor()
reg = GridSearchCV(model, parameters, verbose=3,cv=10)
reg.fit(X, y)


Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV 1/10] END min_samples_leaf=2, n_estimators=10;, score=0.825 total time=   0.1s
[CV 2/10] END min_samples_leaf=2, n_estimators=10;, score=0.881 total time=   0.1s
[CV 3/10] END min_samples_leaf=2, n_estimators=10;, score=0.830 total time=   0.1s
[CV 4/10] END min_samples_leaf=2, n_estimators=10;, score=0.880 total time=   0.1s
[CV 5/10] END min_samples_leaf=2, n_estimators=10;, score=0.873 total time=   0.1s
[CV 6/10] END min_samples_leaf=2, n_estimators=10;, score=0.668 total time=   0.1s
[CV 7/10] END min_samples_leaf=2, n_estimators=10;, score=0.844 total time=   0.1s
[CV 8/10] END min_samples_leaf=2, n_estimators=10;, score=0.801 total time=   0.1s
[CV 9/10] END min_samples_leaf=2, n_estimators=10;, score=0.725 total time=   0.1s
[CV 10/10] END min_samples_leaf=2, n_estimators=10;, score=0.861 total time=   0.1s
[CV 1/10] END min_samples_leaf=2, n_estimators=30;, score=0.870 total time=   0.2s
[CV 2/10] END min_sampl

[CV 9/10] END min_samples_leaf=5, n_estimators=70;, score=0.737 total time=   0.4s
[CV 10/10] END min_samples_leaf=5, n_estimators=70;, score=0.897 total time=   0.4s
[CV 1/10] END min_samples_leaf=5, n_estimators=100;, score=0.841 total time=   0.6s
[CV 2/10] END min_samples_leaf=5, n_estimators=100;, score=0.891 total time=   0.6s
[CV 3/10] END min_samples_leaf=5, n_estimators=100;, score=0.835 total time=   0.6s
[CV 4/10] END min_samples_leaf=5, n_estimators=100;, score=0.876 total time=   0.6s
[CV 5/10] END min_samples_leaf=5, n_estimators=100;, score=0.876 total time=   0.6s
[CV 6/10] END min_samples_leaf=5, n_estimators=100;, score=0.717 total time=   0.6s
[CV 7/10] END min_samples_leaf=5, n_estimators=100;, score=0.884 total time=   0.6s
[CV 8/10] END min_samples_leaf=5, n_estimators=100;, score=0.807 total time=   0.6s
[CV 9/10] END min_samples_leaf=5, n_estimators=100;, score=0.733 total time=   0.6s
[CV 10/10] END min_samples_leaf=5, n_estimators=100;, score=0.888 total time=

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'min_samples_leaf': [2, 5, 10],
                         'n_estimators': [10, 30, 50, 70, 100, 200]},
             verbose=3)

In [30]:
reg.best_estimator_

RandomForestRegressor(min_samples_leaf=2)

In [31]:
reg.best_score_

0.8429373673324033

In [32]:
y_test = test['SalePrice']
X_test = test.drop(['SalePrice'], axis=1)

predictions = reg.predict(X_test)

In [33]:
from sklearn.metrics import r2_score

In [34]:
r2_score(y_test.values, predictions)

0.893864606161802

In [35]:
mean_squared_error(y_test.values, predictions)

636102588.8660995

In [36]:
for act, pred in zip(y_test.values[:5], predictions[:5]):
    print(f'y_ground_truth - {act}    y_prediction - {pred}')

y_ground_truth - 173900    y_prediction - 173295.4460952381
y_ground_truth - 120000    y_prediction - 121887.19090476191
y_ground_truth - 219210    y_prediction - 227557.45193650795
y_ground_truth - 132500    y_prediction - 116961.18642857141
y_ground_truth - 109500    y_prediction - 98171.10321428569


In [37]:
new_train = train[(np.abs(stats.zscore(train[final_numerical_cols])) < 5).all(axis=1)]


In [38]:
y = new_train['SalePrice']
X = new_train.drop(['SalePrice'], axis=1)
parameters = {'n_estimators' : [10,30, 50, 70, 100, 200], 'min_samples_leaf':[2,5,10]}
model = RandomForestRegressor()
reg = GridSearchCV(model, parameters, verbose=3,cv=10)
reg.fit(X, y)

Fitting 10 folds for each of 18 candidates, totalling 180 fits
[CV 1/10] END min_samples_leaf=2, n_estimators=10;, score=0.833 total time=   0.1s
[CV 2/10] END min_samples_leaf=2, n_estimators=10;, score=0.873 total time=   0.1s
[CV 3/10] END min_samples_leaf=2, n_estimators=10;, score=0.856 total time=   0.1s
[CV 4/10] END min_samples_leaf=2, n_estimators=10;, score=0.868 total time=   0.1s
[CV 5/10] END min_samples_leaf=2, n_estimators=10;, score=0.891 total time=   0.1s
[CV 6/10] END min_samples_leaf=2, n_estimators=10;, score=0.873 total time=   0.1s
[CV 7/10] END min_samples_leaf=2, n_estimators=10;, score=0.863 total time=   0.1s
[CV 8/10] END min_samples_leaf=2, n_estimators=10;, score=0.776 total time=   0.1s
[CV 9/10] END min_samples_leaf=2, n_estimators=10;, score=0.857 total time=   0.1s
[CV 10/10] END min_samples_leaf=2, n_estimators=10;, score=0.889 total time=   0.1s
[CV 1/10] END min_samples_leaf=2, n_estimators=30;, score=0.849 total time=   0.2s
[CV 2/10] END min_sampl

[CV 9/10] END min_samples_leaf=5, n_estimators=70;, score=0.885 total time=   0.4s
[CV 10/10] END min_samples_leaf=5, n_estimators=70;, score=0.900 total time=   0.4s
[CV 1/10] END min_samples_leaf=5, n_estimators=100;, score=0.844 total time=   0.6s
[CV 2/10] END min_samples_leaf=5, n_estimators=100;, score=0.888 total time=   0.6s
[CV 3/10] END min_samples_leaf=5, n_estimators=100;, score=0.862 total time=   0.6s
[CV 4/10] END min_samples_leaf=5, n_estimators=100;, score=0.885 total time=   0.6s
[CV 5/10] END min_samples_leaf=5, n_estimators=100;, score=0.894 total time=   0.6s
[CV 6/10] END min_samples_leaf=5, n_estimators=100;, score=0.867 total time=   0.6s
[CV 7/10] END min_samples_leaf=5, n_estimators=100;, score=0.871 total time=   0.6s
[CV 8/10] END min_samples_leaf=5, n_estimators=100;, score=0.792 total time=   0.6s
[CV 9/10] END min_samples_leaf=5, n_estimators=100;, score=0.892 total time=   0.6s
[CV 10/10] END min_samples_leaf=5, n_estimators=100;, score=0.896 total time=

GridSearchCV(cv=10, estimator=RandomForestRegressor(),
             param_grid={'min_samples_leaf': [2, 5, 10],
                         'n_estimators': [10, 30, 50, 70, 100, 200]},
             verbose=3)

In [39]:
y_test = test['SalePrice']
X_test = test.drop(['SalePrice'], axis=1)

predictions = reg.predict(X_test)
r2_score(y_test.values, predictions)

0.8662816782049596