This notebook wants to make use of the evaluation techniques previously developed to select the best algorithms for this problem.

In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sys
sys.path.append("..")
from source.clean import general_cleaner
from source.transf_category import recode_cat, make_ordinal
from source.transf_numeric import tr_numeric
import source.transf_univ as dfp
import source.utility as ut
import source.report as rp

import warnings

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

pd.set_option('max_columns', 500)

# Data preparation

Get the data ready to flow into the pipeline

In [3]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

df_train['Target'] = np.log1p(df_train.SalePrice)

df_train = df_train[df_train.GrLivArea < 4500].copy().reset_index()

del df_train['SalePrice']

train_set, test_set = ut.make_test(df_train, 
                                test_size=0.2, random_state=654, 
                                strat_feat='Neighborhood')

y = train_set['Target'].copy()
del train_set['Target']

y_test = test_set['Target']
del test_set['Target']

## Building the pipeline

This was introduced in another notebook and imported above

In [8]:
numeric_pipe = Pipeline([('fs', dfp.feat_sel('numeric')),
                         ('imputer', dfp.df_imputer(strategy='median')),
                         ('transf', tr_numeric())])


cat_pipe = Pipeline([('fs', dfp.feat_sel('category')),
                     ('imputer', dfp.df_imputer(strategy='most_frequent')), 
                     ('ord', make_ordinal(['BsmtQual', 'KitchenQual',
                                           'ExterQual', 'HeatingQC'])), 
                     ('recode', recode_cat()), 
                     ('dummies', dfp.dummify(drop_first=True))])


processing_pipe = dfp.FeatureUnion_df(transformer_list=[('cat_pipe', cat_pipe),
                                                    ('num_pipe', numeric_pipe)])

## Evaluation method

We have seen how it works in the previous notebook, we have thus imported the necessary functions above.

In [9]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('forest', RandomForestRegressor(n_estimators=200)), ('xtree', ExtraTreesRegressor(n_estimators=200)), 
          ('svr', SVR()), 
          ('kneig', KNeighborsRegressor()),
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror')), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200))]

In [12]:
mod_name = []
rmse_train = []
rmse_test = []
mae_train = []
mae_test = []

folds = KFold(5, shuffle=True, random_state=541)

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('gen_cl', general_cleaner()),
            ('processing', processing_pipe),
            ('scl', dfp.df_scaler())] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = ut.cv_score(train, y, folds, model_pipe)
    
    model_pipe.fit(train, y)
    
    preds = model_pipe.predict(test)
    
    rp.plot_predictions(test, y_test, preds, savename=model[0]+'_preds.png')
    rp.plot_predictions(train, y, inf_preds, savename=model[0]+'_inf_preds.png')
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    mae_train.append(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)))
    mae_test.append(mean_absolute_error(np.expm1(y_test), np.expm1(preds)))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)), 2)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y_test), np.expm1(preds)), 2)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test, 
                        'mae_train': mae_train, 'mae_test': mae_test})

results

lasso
	Train set RMSE: 0.1336
	Train set MAE: 17848.47
	Test set RMSE: 0.1486
	Train set MAE: 18862.58
________________________________________


ridge
	Train set RMSE: 0.1365
	Train set MAE: 18056.94
	Test set RMSE: 0.1395
	Train set MAE: 17031.92
________________________________________


sgd
	Train set RMSE: 0.1402
	Train set MAE: 18405.15
	Test set RMSE: 0.1451
	Train set MAE: 18936.79
________________________________________


forest
	Train set RMSE: 0.1426
	Train set MAE: 18108.48
	Test set RMSE: 0.152
	Train set MAE: 17984.85
________________________________________


xtree
	Train set RMSE: 0.1516
	Train set MAE: 19842.3
	Test set RMSE: 0.1848
	Train set MAE: 22549.56
________________________________________


svr




	Train set RMSE: 0.2486
	Train set MAE: 31975.35
	Test set RMSE: 0.3872
	Train set MAE: 51587.75
________________________________________


kneig
	Train set RMSE: 0.2255
	Train set MAE: 29547.2
	Test set RMSE: 0.3535
	Train set MAE: 50983.76
________________________________________


xgb


  if getattr(data, 'base', None) is not None and \


	Train set RMSE: 0.134
	Train set MAE: 17317.55
	Test set RMSE: 0.146
	Train set MAE: 17293.24
________________________________________


lgb
	Train set RMSE: 0.1366
	Train set MAE: 17476.95
	Test set RMSE: 0.1481
	Train set MAE: 17416.42
________________________________________




Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
0,lasso,0.017856,0.022078,17848.466775,18862.581602
1,ridge,0.018629,0.019465,18056.935842,17031.919135
2,sgd,0.019645,0.021061,18405.150669,18936.794872
3,forest,0.020336,0.0231,18108.482849,17984.851967
4,xtree,0.022981,0.034139,19842.302895,22549.55799
5,svr,0.06179,0.149952,31975.345059,51587.751807
6,kneig,0.050835,0.124984,29547.200616,50983.763905
7,xgb,0.017948,0.021306,17317.54566,17293.241626
8,lgb,0.018653,0.021939,17476.945197,17416.417507


In [13]:
results.sort_values(by='rmse_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
0,lasso,0.017856,0.022078,17848.466775,18862.581602
7,xgb,0.017948,0.021306,17317.54566,17293.241626


In [14]:
results.sort_values(by='rmse_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.018629,0.019465,18056.935842,17031.919135
2,sgd,0.019645,0.021061,18405.150669,18936.794872


In [15]:
results.sort_values(by='mae_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
7,xgb,0.017948,0.021306,17317.54566,17293.241626
8,lgb,0.018653,0.021939,17476.945197,17416.417507


In [16]:
results.sort_values(by='mae_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.018629,0.019465,18056.935842,17031.919135
7,xgb,0.017948,0.021306,17317.54566,17293.241626
