This notebook wants to make use of the evaluation techniques previously developed to select the best algorithms for this problem.

In [1]:
import pandas as pd
import numpy as np

import tubesml as tml

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sys
sys.path.append("..")
from source.clean import general_cleaner
from source.transf_category import recode_cat, make_ordinal
from source.transf_numeric import tr_numeric
import source.transf_univ as dfp
import source.utility as ut
import source.report as rp

import warnings

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

pd.set_option('max_columns', 500)

# Data preparation

Get the data ready to flow into the pipeline

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

df_train['Target'] = np.log1p(df_train.SalePrice)

df_train = df_train[df_train.GrLivArea < 4500].copy().reset_index()

del df_train['SalePrice']

train_set, test_set = ut.make_test(df_train, 
                                test_size=0.2, random_state=654, 
                                strat_feat='Neighborhood')

y = train_set['Target'].copy()
del train_set['Target']

y_test = test_set['Target']
del test_set['Target']

## Building the pipeline

This was introduced in another notebook and imported above

In [3]:
numeric_pipe = Pipeline([('fs', tml.DtypeSel(dtype='numeric')),
                         ('imputer', tml.DfImputer(strategy='median')),
                         ('transf', tr_numeric())])


cat_pipe = Pipeline([('fs', tml.DtypeSel(dtype='category')),
                     ('imputer', tml.DfImputer(strategy='most_frequent')), 
                     ('ord', make_ordinal(['BsmtQual', 'KitchenQual',
                                           'ExterQual', 'HeatingQC'])), 
                     ('recode', recode_cat()), 
                     ('dummies', tml.Dummify(drop_first=True))])


processing_pipe = tml.FeatureUnionDf(transformer_list=[('cat_pipe', cat_pipe),
                                                    ('num_pipe', numeric_pipe)])


## Evaluation method

We have seen how it works in the previous notebook, we have thus imported the necessary functions above.

In [4]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('forest', RandomForestRegressor(n_estimators=200)), ('xtree', ExtraTreesRegressor(n_estimators=200)), 
          ('svr', SVR()), 
          ('kneig', KNeighborsRegressor()),
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror')), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200))]

In [5]:
mod_name = []
rmse_train = []
rmse_test = []
mae_train = []
mae_test = []

folds = KFold(5, shuffle=True, random_state=541)

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('gen_cl', general_cleaner()),
            ('processing', processing_pipe),
            ('scl', dfp.df_scaler())] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = tml.cv_score(data=train, target=y, cv=folds, estimator=model_pipe)
    
    model_pipe.fit(train, y)
    
    preds = model_pipe.predict(test)
    
    rp.plot_predictions(test, y_test, preds, savename=model[0]+'_preds.png')
    rp.plot_predictions(train, y, inf_preds, savename=model[0]+'_inf_preds.png')
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    mae_train.append(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)))
    mae_test.append(mean_absolute_error(np.expm1(y_test), np.expm1(preds)))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)), 2)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y_test), np.expm1(preds)), 2)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test, 
                        'mae_train': mae_train, 'mae_test': mae_test})

results

lasso
	Train set RMSE: 0.1217
	Train set MAE: 15576.36
	Test set RMSE: 0.1332
	Train set MAE: 15983.01
________________________________________


ridge
	Train set RMSE: 0.1181
	Train set MAE: 14713.42
	Test set RMSE: 0.1347
	Train set MAE: 15546.02
________________________________________


sgd
	Train set RMSE: 0.1234
	Train set MAE: 15626.24
	Test set RMSE: 0.1396
	Train set MAE: 16580.93
________________________________________


forest
	Train set RMSE: 0.141
	Train set MAE: 17817.3
	Test set RMSE: 0.1539
	Train set MAE: 18149.16
________________________________________


xtree
	Train set RMSE: 0.1351
	Train set MAE: 17392.39
	Test set RMSE: 0.1511
	Train set MAE: 17459.41
________________________________________


svr
	Train set RMSE: 0.1535
	Train set MAE: 19051.97
	Test set RMSE: 0.1556
	Train set MAE: 17500.13
________________________________________


kneig
	Train set RMSE: 0.1781
	Train set MAE: 23218.1
	Test set RMSE: 0.1765
	Train set MAE: 21696.33
___________________________

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
0,lasso,0.014819,0.017741,15576.361327,15983.005735
1,ridge,0.013941,0.018134,14713.419185,15546.015968
2,sgd,0.01522,0.019474,15626.235006,16580.925867
3,forest,0.019891,0.023694,17817.300843,18149.158599
4,xtree,0.018242,0.022825,17392.394237,17459.407641
5,svr,0.023564,0.024217,19051.967698,17500.131132
6,kneig,0.031736,0.031156,23218.10299,21696.329432
7,xgb,0.019235,0.023072,17368.846217,17741.960322
8,lgb,0.016961,0.02075,16642.750104,15966.022833


In [6]:
results.sort_values(by='rmse_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013941,0.018134,14713.419185,15546.015968
0,lasso,0.014819,0.017741,15576.361327,15983.005735


In [7]:
results.sort_values(by='rmse_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
0,lasso,0.014819,0.017741,15576.361327,15983.005735
1,ridge,0.013941,0.018134,14713.419185,15546.015968


In [8]:
results.sort_values(by='mae_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013941,0.018134,14713.419185,15546.015968
0,lasso,0.014819,0.017741,15576.361327,15983.005735


In [9]:
results.sort_values(by='mae_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013941,0.018134,14713.419185,15546.015968
8,lgb,0.016961,0.02075,16642.750104,15966.022833
