**Import libraries**

In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import cross_val_score, StratifiedKFold

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

import optuna

import joblib

**Read the data**

In [4]:
# read into DataFrame
df_train = pd.read_csv('input/train.csv')
df_test= pd.read_csv('input/test.csv')

In [5]:
# enable to see all columns in a notebook
pd.set_option('display.max_columns', None)

# enable to see all rows in a notebook
pd.set_option('display.max_rows', None)

In [6]:
df_train.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [7]:
df_train.tail(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125
1459,1460,20,RL,75.0,9937,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Edwards,Norm,Norm,1Fam,1Story,5,6,1965,1965,Gable,CompShg,HdBoard,HdBoard,,0.0,Gd,TA,CBlock,TA,TA,No,BLQ,830,LwQ,290,136,1256,GasA,Gd,Y,SBrkr,1256,0,0,1256,1,0,1,1,3,1,TA,6,Typ,0,,Attchd,1965.0,Fin,1,276,TA,TA,Y,736,68,0,0,0,0,,,,0,6,2008,WD,Normal,147500


**Check for NaN**

In [8]:
train_nan=df_train.isnull().sum().sort_values(ascending = False)[0:20]
train_nan

PoolQC          1453
MiscFeature     1406
Alley           1369
Fence           1179
MasVnrType       872
FireplaceQu      690
LotFrontage      259
GarageYrBlt       81
GarageCond        81
GarageType        81
GarageFinish      81
GarageQual        81
BsmtFinType2      38
BsmtExposure      38
BsmtQual          37
BsmtCond          37
BsmtFinType1      37
MasVnrArea         8
Electrical         1
Id                 0
dtype: int64

In [9]:
test_nan=df_test.isnull().sum().sort_values(ascending = False)[0:20]
test_nan

PoolQC          1456
MiscFeature     1408
Alley           1352
Fence           1169
MasVnrType       894
FireplaceQu      730
LotFrontage      227
GarageYrBlt       78
GarageQual        78
GarageFinish      78
GarageCond        78
GarageType        76
BsmtCond          45
BsmtQual          44
BsmtExposure      44
BsmtFinType1      42
BsmtFinType2      42
MasVnrArea        15
MSZoning           4
BsmtHalfBath       2
dtype: int64

**Dropping values depending on percentage of NaN**

In [10]:
# dropping columns with certain percentage of NaN
perc = 80 #percentage of NaN


min_count =  int(((100-perc)/100)*df_train.shape[0] + 1)
df_train = df_train.dropna( axis=1, thresh=min_count)


min_count =  int(((100-perc)/100)*df_test.shape[0] + 1)
df_test = df_test.dropna( axis=1, thresh=min_count)

In [11]:
# Check for NaN
train_nan=df_train.isnull().sum().sort_values(ascending = False)[0:20]
train_nan




MasVnrType      872
FireplaceQu     690
LotFrontage     259
GarageCond       81
GarageYrBlt      81
GarageFinish     81
GarageQual       81
GarageType       81
BsmtFinType2     38
BsmtExposure     38
BsmtFinType1     37
BsmtCond         37
BsmtQual         37
MasVnrArea        8
Electrical        1
BsmtFullBath      0
Functional        0
TotRmsAbvGrd      0
GrLivArea         0
HalfBath          0
dtype: int64

In [12]:
test_nan=df_test.isnull().sum().sort_values(ascending = False)[0:20]
test_nan

MasVnrType      894
FireplaceQu     730
LotFrontage     227
GarageYrBlt      78
GarageFinish     78
GarageQual       78
GarageCond       78
GarageType       76
BsmtCond         45
BsmtQual         44
BsmtExposure     44
BsmtFinType1     42
BsmtFinType2     42
MasVnrArea       15
MSZoning          4
BsmtHalfBath      2
Utilities         2
Functional        2
BsmtFullBath      2
BsmtFinSF1        1
dtype: int64

**Replacing string, Filling-in NaN by mean, Droping columns with NaN**

In [13]:
# Replacing string by NaN
for col in df_train.columns:
    df_train[col] = pd.to_numeric(df_train[col], errors='coerce')

for col in df_test.columns:
    df_test[col] = pd.to_numeric(df_test[col], errors='coerce')
    
# fill in the NaN with the mean in all columns
df_train=df_train.fillna(df_train.mean())
df_test=df_test.fillna(df_test.mean())


# delete all columns containing NAN - in case of colums full of string become NaN-impossible to do a mean from string
df_train=df_train.dropna(axis=1, how='all')
df_test=df_test.dropna(axis=1, how='all')


**Final check for NaN**

In [14]:
train_nan=df_train.isnull().sum().sort_values(ascending = False)[0:20]
test_nan=df_test.isnull().sum().sort_values(ascending = False)[0:20]

x = pd.DataFrame(train_nan, columns=['train'])
x['test']=test_nan

# Using Dataframe.apply() and lambda function
x["train [%]"] = x["train"].apply(lambda x: round((x/df_train.shape[0])*100))

print(x)

print("Number of features:", df_train.shape[1])
print("Number of features:", df_test.shape[1])

               train  test  train [%]
Id                 0   0.0          0
WoodDeckSF         0   0.0          0
BedroomAbvGr       0   0.0          0
KitchenAbvGr       0   0.0          0
TotRmsAbvGrd       0   0.0          0
Fireplaces         0   0.0          0
GarageYrBlt        0   0.0          0
GarageCars         0   0.0          0
GarageArea         0   0.0          0
OpenPorchSF        0   0.0          0
MSSubClass         0   0.0          0
EnclosedPorch      0   0.0          0
3SsnPorch          0   0.0          0
ScreenPorch        0   0.0          0
PoolArea           0   0.0          0
MiscVal            0   0.0          0
MoSold             0   0.0          0
YrSold             0   NaN          0
HalfBath           0   0.0          0
FullBath           0   0.0          0
Number of features: 38
Number of features: 37


# Processing

In [15]:
# dropping ID and other columns
df_train=df_train.drop(['Id'], axis=1)

df_test=df_test.drop(['Id'], axis=1)

In [16]:
# defining input=X and output=y for models
input = df_train.drop(['SalePrice'], axis=1) #dropping our Y value
output = df_train['SalePrice']

# independant
X = input

# ependant
y=output

In [17]:
# Split the training dataset

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [18]:
model = [
    LinearRegression()
    ,DecisionTreeRegressor()
    ,RandomForestRegressor()
    ,GradientBoostingRegressor()
    ,SVR()
    ,XGBRegressor()
    ,CatBoostRegressor(silent=True)
]

In [19]:
# Preperation 

table = pd.DataFrame()
idx = 0

# Train and score algorithms
for a in model:
    
    a.fit(X_train, y_train)

    # Metrics
    cv = cross_val_score(a, X_train, y_train).mean()
    accu_train = a.score(X_train, y_train)
    accu_test = a.score(X_test, y_test)
    rmse = root_mean_squared_error(y_test, a.predict(X_test))


    # Write into the table
    Alg = a.__class__.__name__

    table.loc[idx, 'Algorithm'] = Alg
    table.loc[idx, 'CV Score'] = round(cv * 100, 2)
    table.loc[idx, 'Accuracy [train]'] = round(accu_train * 100, 2)
    table.loc[idx, 'Accuracy [test]'] = round(accu_test * 100, 2)
    table.loc[idx, 'RMSE'] = round(rmse, 2)

    idx+=1

In [20]:
# printing the scores
table.sort_values(by='CV Score', ascending=False)

Unnamed: 0,Algorithm,CV Score,Accuracy [train],Accuracy [test],RMSE
6,CatBoostRegressor,87.8,99.4,87.21,29062.0
3,GradientBoostingRegressor,85.45,96.56,85.58,30853.64
2,RandomForestRegressor,85.37,97.81,85.16,31297.74
5,XGBRegressor,85.27,99.98,81.8,34662.69
0,LinearRegression,82.83,84.48,68.07,45913.45
1,DecisionTreeRegressor,70.79,100.0,73.03,42200.24
4,SVR,-5.91,-5.32,-5.43,83429.46


# Hyperparameter tuning with Grid Search

In [21]:
def tuning(model):
    
    model=model()

    param_grid={
                    'max_depth': [2, 3, 4, 5, 6],
                    'n_estimators':[100, 150, 200, 250, 300]
                }

    # definding the grid
    grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=3)
      
    # fit all combinations 
    grid.fit(X_train, y_train)

    # get the best model
    best_params=grid.best_params_
    best_estimator=grid.best_estimator_

    cv_score = cross_val_score(best_estimator, X_train, y_train).mean()
    train_score = best_estimator.score(X_train, y_train)
    test_score = best_estimator.score(X_test, y_test)


    print('*********************************************')
    print('Model: ', model.__class__.__name__)   
    print('Best parametrs: ', best_params)
    print('CV score: ', round(cv_score*100,2), '%')
    print('Train score: ', round(train_score*100,2),'%')
    print('Test score: ', round(test_score*100,2),'%')
    print('*********************************************')
    
    return best_estimator

In [22]:
# Executing GridSearch with the best model
results=tuning(GradientBoostingRegressor)
results

Fitting 5 folds for each of 25 candidates, totalling 125 fits
*********************************************
Model:  GradientBoostingRegressor
Best parametrs:  {'max_depth': 3, 'n_estimators': 300}
CV score:  85.65 %
Train score:  98.69 %
Test score:  86.69 %
*********************************************


# Hyperparameter tuning with Optuna

In [23]:
# Creating the function which will be optimized by Optuna
def objective(trial):
    
    # define the ranges:
    n_estimators=trial.suggest_int('n_estimators', 100, 600, step=100) #range, step 
    max_depth=trial.suggest_int('max_depth', 2, 6, step=1) #range, step


    model=GradientBoostingRegressor
    reg=model(n_estimators=n_estimators,max_depth=max_depth)
    reg.fit(X_train, y_train)


    cv_score = cross_val_score(reg, X_train, y_train).mean()

    return cv_score

In [24]:
# Silent the iterations
optuna.logging.set_verbosity(optuna.logging.WARNING) 

# Run optimization
study = optuna.create_study(direction = 'maximize')

study.optimize(func=objective, n_trials=10)

In [25]:
# Show the best score and parameters
print(f'The best CV_score is: {round(study.best_trial.value*100,2)} %')
print('The best parameters are:', study.best_params)

The best CV_score is: 85.28 %
The best parameters are: {'n_estimators': 500, 'max_depth': 3}


In [26]:
# Objective value x Trials
optuna.visualization.plot_optimization_history(study)

**Run with the best Optuna trial**

In [27]:
def detailed_objective(trial):
    # Use same code objective to reproduce the best model
    
    n_estimators=trial.suggest_int('n_estimators',100, 600, step=100) #range, step 
    max_depth=trial.suggest_int('max_depth',2, 6, step=1) #range, step

    model=GradientBoostingRegressor
    reg=model(n_estimators=n_estimators,max_depth=max_depth)
    reg.fit(X_train, y_train)

    cv_score = cross_val_score(reg, X_train, y_train).mean()
    train_score = reg.score(X_train, y_train)
    test_score = reg.score(X_test, y_test)

    print('*********************************************')
    print('CV score: ', round(cv_score*100,2), '%')
    print('Train score: ', round(train_score*100,2),'%')
    print('Test score: ', round(test_score*100,2),'%')
    print('*********************************************')

    return cv_score, train_score, test_score

In [28]:
# run the function with the best parameters
detailed_objective(study.best_trial)

*********************************************
CV score:  85.53 %
Train score:  99.36 %
Test score:  88.64 %
*********************************************


(0.8553055274086419, 0.9936341147439806, 0.8864057737107196)

# Predict on test data (Kaggle)

In [29]:
# Loading X values from the test data for submision 
X_real_test_Id= pd.read_csv('input/test.csv', usecols=["Id"])
X_real_test = df_test


In [30]:
# Optuna version
best_params_optuna=study.best_params
results=GradientBoostingRegressor(**best_params_optuna)

In [31]:
# fit the data on training data with the best parameters from the hyperparametr tunning
m=results

m.fit(X_train,y_train)

# predict data from the test data
y_pred = m.predict(X_real_test)

# craeate dataframe for submission
df_newsub = X_real_test_Id.copy()
df_newsub['SalePrice'] = y_pred


In [32]:
# write data to sumbmission csv
df_newsub.to_csv("output/kaggle-submission.csv", index = False)

# Export model using joblib

In [35]:
# Save model
joblib.dump(m, filename='output/GradientBoosting.pkl')

['output/GradientBoosting.pkl']