This notebook tests 6 models to analyse the most promising

    - LinearRegression
    - RandomForestRegressor
    - KNeighborsRegressor
    - GradientBoostingRegressor
    - XGBRegressor
    - KernelRidge

 * The models are tested using RepeatedKFold with an integrated pipeline
 * The pipeline is introduced to integrate the data normalization into the
RepeatedKFold process. This cannot be done before the RepeatedKFold since it would 
cause data leakage.

Saves the results to a log file: select_models.log

In [None]:
linmodel=create_models.linmodel(n_jobs=4)
random_forest=create_models.random_forest(n_jobs=4)
knn=create_models.knn(n_jobs=4)
ridge=create_models.ridge(alpha=0.68083180840980384, coef0=3.9929790032748049, degree=2,kernel='polynomial')
gradboost=create_models.gradboost()
xgboost=create_models.xgboost(n_jobs=4)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import create_models
import pickle
from sklearn.pipeline import Pipeline
import logging

pd.set_option('display.float_format', lambda x: '%.9f' % x)

#scaler: 'minmax', 'robust'
scaler='minmax'

#use selected features ('yes) vs all features 'no'
sfeat='yes'

if sfeat=='no':
   read_features='None'
else:
   read_features='features_2906'

#test and train files
train_file='train_data_2906_ro'
test_file='test_data_2906_ro'

logging.basicConfig(level=logging.DEBUG, filename="select_models.log", filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s")

logging.info("\n---------------- NEW RUN ----------------\n"+
'* selected features: '+sfeat+'\n'+
'*features file: '+read_features+'\n'+
'*train file: '+train_file+'\n'+
'*outliers: removed\n'+
'*scaler: '+scaler+'\n'+
'*normalized data: yes'+'\n\n')

In [47]:
#define models

linmodel=create_models.linmodel(n_jobs=4)
random_forest=create_models.random_forest(n_jobs=4)
knn=create_models.knn(n_jobs=4)
ridge=create_models.ridge(alpha=0.68083180840980384, coef0=3.9929790032748049, degree=2,kernel='polynomial')
gradboost=create_models.gradboost()
xgboost=create_models.xgboost(n_jobs=4)

#create dictionary with models:
m=[linmodel,random_forest,knn,ridge,gradboost,xgboost]
labels=['linmodel','random_forest','knn','lasso','gradboost','xgboost']
model_dict=dict(zip(labels, m))

In [48]:

df=pd.read_pickle(train_file+".pkl")
df_test=pd.read_pickle(test_file+".pkl")
X_test=df_test

y=np.array(df.SalePrice)

if sfeat=='yes':
    with open(read_features, "rb") as fp:   # Unpickling
        features = pickle.load(fp)
    df=df[features]
    X_test=X_test[features]
else:
    df=df.drop('SalePrice', axis=1)
    X_test=df_test.drop('Id' ,axis=1)
#--------------------------------------------
X=np.array(df)
X_test=np.array(X_test)

#transform y
y=np.log(y)


In [49]:
#check shapes
logging.info('\nX_shape: '+str(X.shape)+'\n'+
            'Xtest_shape: '+str(X_test.shape))

print('X_shape', X.shape)
print('Xtest_shape', X_test.shape)

X_shape (1347, 58)
Xtest_shape (1459, 58)


In [50]:
#do k-fold validation to select the best two models
lscore=[]
avg_r2=[]

for i,model in enumerate(model_dict.values()):
    # define the pipeline
    steps = list()
    if scaler=='minmax':
        steps.append(('scaler', MinMaxScaler()))
    elif scaler=='robust':
        steps.append(('scaler', RobustScaler()))
    steps.append(('model', model))
    pipeline = Pipeline(steps=steps)
    
    # define the evaluation procedure
    cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=1)
    
    # evaluate the model using cross-validation
    scores = cross_val_score(pipeline, X, y, scoring='r2', cv=cv, n_jobs=4)
    

    avg=sum(scores)/len(scores)
    lscore.append(scores)
    avg_r2.append(avg)
    



In [51]:
avg_score=pd.DataFrame(avg_r2, index=model_dict.keys(),columns=['avg_scores'])
avg_score.sort_values(by=['avg_scores'], ascending=False,inplace=True)

logging.info('\nOverall mean accuracies\n____________\n'+
avg_score.index[0]+': '+str(avg_score.avg_scores[0])+'\n'+
avg_score.index[1]+': '+str(avg_score.avg_scores[1])+'\n'+
avg_score.index[2]+': '+str(avg_score.avg_scores[2])+'\n'+
avg_score.index[3]+': '+str(avg_score.avg_scores[3])+'\n'+
avg_score.index[4]+': '+str(avg_score.avg_scores[4])+'\n____________\n')

avg_score

Unnamed: 0,avg_scores
linmodel,0.905658688
gradboost,0.90551584
xgboost,0.902890928
lasso,0.902543203
random_forest,0.881327348
knn,0.77732734
