This notebook wants to make use of the evaluation techniques previously developed to select the best algorithms for this problem.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from sklearn.pipeline import Pipeline

from sklearn.linear_model import Lasso, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb

from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import sys
sys.path.append("..")
from source.clean import general_cleaner
from source.transf_category import recode_cat, make_ordinal
from source.transf_numeric import tr_numeric
import source.transf_univ as dfp
import source.utility as ut
import source.report as rp

import warnings

warnings.filterwarnings("ignore", 
                        message="The dummies in this set do not match the ones in the train set, we corrected the issue.")

pd.set_option('max_columns', 500)

# Data preparation

Get the data ready to flow into the pipeline

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

df_train['Target'] = np.log1p(df_train.SalePrice)

df_train = df_train[df_train.GrLivArea < 4500].copy().reset_index()

del df_train['SalePrice']

train_set, test_set = ut.make_test(df_train, 
                                test_size=0.2, random_state=654, 
                                strat_feat='Neighborhood')

y = train_set['Target'].copy()
del train_set['Target']

y_test = test_set['Target']
del test_set['Target']

## Building the pipeline

This was introduced in another notebook and imported above

In [3]:
numeric_pipe = Pipeline([('fs', dfp.feat_sel('numeric')),
                         ('imputer', dfp.df_imputer(strategy='median')),
                         ('transf', tr_numeric())])


cat_pipe = Pipeline([('fs', dfp.feat_sel('category')),
                     ('imputer', dfp.df_imputer(strategy='most_frequent')), 
                     ('ord', make_ordinal(['BsmtQual', 'KitchenQual',
                                           'ExterQual', 'HeatingQC'])), 
                     ('recode', recode_cat()), 
                     ('dummies', dfp.dummify(drop_first=True))])


processing_pipe = dfp.FeatureUnion_df(transformer_list=[('cat_pipe', cat_pipe),
                                                    ('num_pipe', numeric_pipe)])

## Evaluation method

We have seen how it works in the previous notebook, we have thus imported the necessary functions above.

In [4]:
models = [('lasso', Lasso(alpha=0.01)), ('ridge', Ridge()), ('sgd', SGDRegressor()), 
          ('forest', RandomForestRegressor(n_estimators=200)), ('xtree', ExtraTreesRegressor(n_estimators=200)), 
          ('svr', SVR()), 
          ('kneig', KNeighborsRegressor()),
          ('xgb', xgb.XGBRegressor(n_estimators=200, objective='reg:squarederror')), 
          ('lgb', lgb.LGBMRegressor(n_estimators=200))]

In [5]:
mod_name = []
rmse_train = []
rmse_test = []
mae_train = []
mae_test = []

folds = KFold(5, shuffle=True, random_state=541)

for model in models:
    
    train = train_set.copy()
    test = test_set.copy()
    print(model[0])
    mod_name.append(model[0])
    
    pipe = [('gen_cl', general_cleaner()),
            ('processing', processing_pipe),
            ('scl', dfp.df_scaler())] + [model]
    
    model_pipe = Pipeline(pipe)
            
    inf_preds = ut.cv_score(train, y, folds, model_pipe)
    
    model_pipe.fit(train, y)
    
    preds = model_pipe.predict(test)
    
    rp.plot_predictions(test, y_test, preds, savename=model[0]+'_preds.png')
    rp.plot_predictions(train, y, inf_preds, savename=model[0]+'_inf_preds.png')
    
    rmse_train.append(mean_squared_error(y, inf_preds))
    rmse_test.append(mean_squared_error(y_test, preds))
    mae_train.append(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)))
    mae_test.append(mean_absolute_error(np.expm1(y_test), np.expm1(preds)))
    
    print(f'\tTrain set RMSE: {round(np.sqrt(mean_squared_error(y, inf_preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y), np.expm1(inf_preds)), 2)}')
    print(f'\tTest set RMSE: {round(np.sqrt(mean_squared_error(y_test, preds)), 4)}')
    print(f'\tTrain set MAE: {round(mean_absolute_error(np.expm1(y_test), np.expm1(preds)), 2)}')
    
    print('_'*40)
    print('\n')
    
results = pd.DataFrame({'model_name': mod_name, 
                        'rmse_train': rmse_train, 'rmse_test': rmse_test, 
                        'mae_train': mae_train, 'mae_test': mae_test})

results

lasso
	Train set RMSE: 0.1217
	Train set MAE: 15576.36
	Test set RMSE: 0.1332
	Train set MAE: 15983.01
________________________________________


ridge
	Train set RMSE: 0.118
	Train set MAE: 14714.69
	Test set RMSE: 0.1275
	Train set MAE: 14594.55
________________________________________


sgd
	Train set RMSE: 0.1235
	Train set MAE: 15684.98
	Test set RMSE: 0.1313
	Train set MAE: 15696.13
________________________________________


forest
	Train set RMSE: 0.1404
	Train set MAE: 17822.89
	Test set RMSE: 0.1479
	Train set MAE: 17571.47
________________________________________


xtree
	Train set RMSE: 0.1344
	Train set MAE: 17312.14
	Test set RMSE: 0.1468
	Train set MAE: 16822.37
________________________________________


svr
	Train set RMSE: 0.1524
	Train set MAE: 18989.06
	Test set RMSE: 0.1627
	Train set MAE: 18146.5
________________________________________


kneig
	Train set RMSE: 0.1785
	Train set MAE: 23188.76
	Test set RMSE: 0.1882
	Train set MAE: 22429.05
__________________________

  if getattr(data, 'base', None) is not None and \


	Train set RMSE: 0.1259
	Train set MAE: 15930.22
	Test set RMSE: 0.1294
	Train set MAE: 14661.1
________________________________________


lgb
	Train set RMSE: 0.1302
	Train set MAE: 16642.75
	Test set RMSE: 0.1366
	Train set MAE: 15841.19
________________________________________




Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
0,lasso,0.014819,0.017741,15576.361327,15983.005735
1,ridge,0.013935,0.016258,14714.688585,14594.551585
2,sgd,0.015262,0.017232,15684.984451,15696.125372
3,forest,0.019704,0.02186,17822.889674,17571.468681
4,xtree,0.018072,0.021543,17312.14411,16822.372219
5,svr,0.023236,0.026469,18989.058222,18146.504954
6,kneig,0.031859,0.035436,23188.761124,22429.050975
7,xgb,0.015857,0.016739,15930.218084,14661.098499
8,lgb,0.016961,0.018655,16642.750104,15841.189221


In [6]:
results.sort_values(by='rmse_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013935,0.016258,14714.688585,14594.551585
0,lasso,0.014819,0.017741,15576.361327,15983.005735


In [7]:
results.sort_values(by='rmse_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013935,0.016258,14714.688585,14594.551585
7,xgb,0.015857,0.016739,15930.218084,14661.098499


In [8]:
results.sort_values(by='mae_train').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013935,0.016258,14714.688585,14594.551585
0,lasso,0.014819,0.017741,15576.361327,15983.005735


In [9]:
results.sort_values(by='mae_test').head(2)

Unnamed: 0,model_name,rmse_train,rmse_test,mae_train,mae_test
1,ridge,0.013935,0.016258,14714.688585,14594.551585
7,xgb,0.015857,0.016739,15930.218084,14661.098499
