In [173]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import dill

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [134]:
data = pd.read_csv('dataset/train.csv',index_col=0)

In [106]:
print(data.shape)
data.info()

(1460, 80)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   object 
 2   LotFrontage    1201 non-null   float64
 3   LotArea        1460 non-null   int64  
 4   Street         1460 non-null   object 
 5   Alley          91 non-null     object 
 6   LotShape       1460 non-null   object 
 7   LandContour    1460 non-null   object 
 8   Utilities      1460 non-null   object 
 9   LotConfig      1460 non-null   object 
 10  LandSlope      1460 non-null   object 
 11  Neighborhood   1460 non-null   object 
 12  Condition1     1460 non-null   object 
 13  Condition2     1460 non-null   object 
 14  BldgType       1460 non-null   object 
 15  HouseStyle     1460 non-null   object 
 16  OverallQual    1460 non-null   int64  
 17  OverallCond    1460 non-null   int64  
 1

In [107]:
#### Check if there are any NULL values in Train Data
null_col_train = data.columns[data.isnull().sum() != 0]

print(f'Total Train Features with NaN Values = {null_col_train.size}')
print(f'Features with NaN => {list(null_col_train)}')
data[null_col_train].isnull().sum().sort_values(ascending = False)

Total Train Features with NaN Values = 19
Features with NaN => ['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
FireplaceQu      690
LotFrontage      259
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
BsmtExposure      38
BsmtFinType2      38
BsmtFinType1      37
BsmtCond          37
BsmtQual          37
MasVnrArea         8
MasVnrType         8
Electrical         1
dtype: int64

In [117]:
num_col = data.select_dtypes(include=[np.number]).columns
cat_col = data.select_dtypes(exclude=[np.number]).columns

print(f'Numerical Columns -> {list(num_col)}')
print(f'Categorical Columns -> {list(cat_col)}')

Numerical Columns -> ['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold', 'SalePrice']
Categorical Columns -> ['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu

In [135]:
print(data.select_dtypes(exclude=[np.number]).shape)
data.head()

(1460, 43)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [136]:
# Process Numerical NaN's
for nc in num_col:
    data[nc].fillna(data[nc].mean(), inplace=True)

# Process Categorical Nan's
le = preprocessing.LabelEncoder()
for cc in cat_col:
    encoded_cc = le.fit_transform(data[cc])
    data[cc] = encoded_cc
#data = pd.get_dummies(data, columns=cat_col)

In [137]:
# Check wether no categorical features left
print(data.select_dtypes(exclude=[np.number]).shape)
data.head()

(1460, 0)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,3,65.0,8450,1,2,3,3,0,4,...,0,3,4,4,0,2,2008,8,4,208500
2,20,3,80.0,9600,1,2,3,3,0,2,...,0,3,4,4,0,5,2007,8,4,181500
3,60,3,68.0,11250,1,2,0,3,0,4,...,0,3,4,4,0,9,2008,8,4,223500
4,70,3,60.0,9550,1,2,0,3,0,0,...,0,3,4,4,0,2,2006,8,0,140000
5,60,3,84.0,14260,1,2,0,3,0,2,...,0,3,4,4,0,12,2008,8,4,250000


In [147]:
# Transform data
y = np.log1p(data.SalePrice.to_numpy())
X = data.drop(['SalePrice'], axis=1).to_numpy()

print(y.shape)
print(X.shape)

(1460,)
(1460, 79)


In [150]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=98987)

In [184]:
def print_rmsel(pred, y):
    print(f'RMSEL: {np.sqrt(mean_squared_error(pred, y))}')

# sklearn

In [169]:
params = {
    'n_estimators': (100, 1000),
    'criterion': ('squared_error',), 
    'max_depth': (None,100,1000),
    'max_features':(1/3,'auto'),

}

rf_clf = GridSearchCV(RandomForestRegressor(), param_grid=params, verbose=3, cv=3)

rf_clf.fit(X_train, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=100;, score=0.871 total time=   0.6s
[CV 2/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=100;, score=0.894 total time=   0.6s
[CV 3/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=100;, score=0.866 total time=   0.7s
[CV 1/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=1000;, score=0.875 total time=   6.5s
[CV 2/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=1000;, score=0.899 total time=   5.8s
[CV 3/3] END criterion=squared_error, max_depth=None, max_features=0.3333333333333333, n_estimators=1000;, score=0.870 total time=   5.6s
[CV 1/3] END criterion=squared_error, max_depth=None, max_features=auto, n_estimators=100;, score=0.873 total time

GridSearchCV(cv=3, estimator=RandomForestRegressor(),
             param_grid={'criterion': ('squared_error',),
                         'max_depth': (None, 100, 1000),
                         'max_features': (0.3333333333333333, 'auto'),
                         'n_estimators': (100, 1000)},
             verbose=3)

In [185]:
print(rf_clf.best_params_)
print_rmsel(rf_clf.predict(X_test), y_test)
rf_clf.score(X_test, y_test)

{'criterion': 'squared_error', 'max_depth': None, 'max_features': 0.3333333333333333, 'n_estimators': 1000}
RMSEL: 0.15659507782748036


0.8472830032007098

# XGBoost

In [175]:
xgb_params = {
    "max_depth": (4, 5),
    "n_estimators": (100, 1000),
    "learning_rate": (0.01, 0.3)
}

xgb_clf = GridSearchCV(XGBRegressor(), param_grid=xgb_params, verbose=3, cv=3)

xgb_clf.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV 1/3] END learning_rate=0.01, max_depth=4, n_estimators=100;, score=-103.791 total time=   3.4s
[CV 2/3] END learning_rate=0.01, max_depth=4, n_estimators=100;, score=-122.704 total time=   0.1s
[CV 3/3] END learning_rate=0.01, max_depth=4, n_estimators=100;, score=-111.314 total time=   0.2s
[CV 1/3] END learning_rate=0.01, max_depth=4, n_estimators=1000;, score=0.885 total time=   3.7s
[CV 2/3] END learning_rate=0.01, max_depth=4, n_estimators=1000;, score=0.908 total time=   3.1s
[CV 3/3] END learning_rate=0.01, max_depth=4, n_estimators=1000;, score=0.877 total time=   3.1s
[CV 1/3] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=-103.791 total time=   0.2s
[CV 2/3] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=-122.704 total time=   0.1s
[CV 3/3] END learning_rate=0.01, max_depth=5, n_estimators=100;, score=-111.314 total time=   0.1s
[CV 1/3] END learning_rate=0.01, max_depth=5, n_estimat

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
       

In [186]:
print(xgb_clf.best_params_)
print_rmsel(xgb_clf.predict(X_test), y_test)
xgb_clf.score(X_test, y_test)

{'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 1000}
RMSEL: 0.14377246858261866


0.8712691548892362

# LightGBM

In [None]:
lgbm_params = {
    "max_depth": (4, 5, 10),
    "n_estimators": (100, 1000),
    "learning_rate": (0.01, 0.1, 1)
}

lgbm_clf = GridSearchCV(LGBMRegressor(), param_grid=xgb_params, verbose=3, cv=3)

lgbm_clf.fit(X_train, y_train)

In [None]:
dill.dump_session('task_4_notebook_env.db')