This notebook tunes two of the most promising models, and creates the predictions of the test set for the best tunned model

- A validation set (10%) is created to evaluate the final models
- RandomizedSearchCV is performed with the remaining training set (90%). A pipeline is introduced so that the RandomizedSearchCV is performed with KFold and provides robuster results
- XGboost and the KernelRidge were chosen to be tuned
- The best estimators were then retrained for 90% of the training set and evaluated with the validation set. 
- XGBoost showed the best result and was chosen to make the predictions of the test set and create the submission file

* The results are saved to a log file: tune_and_predict.log

Note: We tried GridSearchCV with worst results. However, this could be used for finer tunning

In [None]:
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold
import create_models
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np
import pickle
from datetime import datetime
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline


logging.basicConfig(level=logging.DEBUG, filename="run_gridsearch.log", filemode="a+",
                        format="%(asctime)-15s %(levelname)-8s %(message)s",force=True)



# Define a function to calculate RMSE
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true-y_pred)**2))

# Define a function to calculate negative RMSE (as a score)
def nrmse(y_true, y_pred):
    return -1.0*rmse(y_true, y_pred)

neg_rmse = make_scorer(nrmse)

In [2]:
#use selected features ('yes) vs all features 'no'
sfeat='yes'

if sfeat=='no':
   read_features='None'
else:
   read_features='features_2906'

#test and train files
train_file='train_data_2906_ro'
test_file='test_data_2906_ro'

logging.info("\n---------------- NEW RUN ----------------\n"+
'* selected features: '+sfeat+'\n'+
'*features file '+read_features+'\n'+
'*outliers: removed\n'
'*normalized data: yes'+'\n\n')

#read data

df=pd.read_pickle(train_file+".pkl")
df_test=pd.read_pickle(test_file+".pkl")
X_test=df_test

y=np.array(df.SalePrice)

y=np.log(y)

if sfeat=='yes':
    with open(read_features, "rb") as fp:   # Unpickling
        features = pickle.load(fp)
    df=df[features]
    X_test=X_test[features]
else:
    df=df.drop('SalePrice', axis=1)
    X_test=df_test.drop('Id')
#--------------------------------------------
X=np.array(df)
X_test=np.array(X_test)

xgboost=create_models.xgboost(n_jobs=4)
ridge=create_models.ridge()

In [3]:
X_train,X_val,y_train,y_val=train_test_split(X,y,test_size=0.1,random_state=0)

In [4]:
logging.info('___________\nRandomizeSearchCV: XG Boost\n____________\n')

import warnings
warnings.filterwarnings("ignore")

# define the pipeline
steps = list()

steps.append(('scaler', MinMaxScaler()))
steps.append(('model', xgboost))
pipeline = Pipeline(steps=steps)
    
# define the evaluation procedure
cv = KFold(n_splits=5, random_state=1,shuffle=True)


param_grid_xgb = {
    "model__n_estimators":list(range(50, 500, 10)),
    'model__max_depth': list(range(2, 20)),
    "model__learning_rate": list(np.arange(0.05, 0.55,0.01)),
    "model__gamma": list(np.arange(0, 1.5, 0.1)),
    "model__reg_alpha": list(np.arange(0, 1.5, 0.05)),
    "model__reg_lambda": list(np.arange(0, 5, 0.5)),
    "model__scale_pos_weight": list(np.arange(0.1, 1.5, 0.1)),
    "model__subsample": list(np.arange(0.1, 2.0, 0.1)),
    "model__min_child_weight":list(np.arange(1, 7,1))
}
    
# tune model
rs_xgb=RandomizedSearchCV(pipeline, param_distributions=param_grid_xgb, n_iter = 2000,scoring =neg_rmse, cv=cv, n_jobs=10)

rs_xgb.fit(X_train,y_train)

# print best parameter after tuning 
print(rs_xgb.best_params_)
print(rs_xgb.best_score_)
logging.info('\nXGBoost Best parameters:\n'+str(rs_xgb.best_params_))
logging.info('\nXGBoost Best score:\n'+str(rs_xgb.best_score_))

{'model__subsample': 0.6, 'model__scale_pos_weight': 0.2, 'model__reg_lambda': 1.0, 'model__reg_alpha': 1.25, 'model__n_estimators': 470, 'model__min_child_weight': 4, 'model__max_depth': 2, 'model__learning_rate': 0.13, 'model__gamma': 0.0}
-0.1135583875606003


In [5]:
logging.info('___________\nRandomizedSearchCV: Ridge\n____________\n')

import warnings
warnings.filterwarnings("ignore")
from scipy.stats import uniform
ridge=create_models.ridge()

# define the pipeline
steps = list()

steps.append(('scaler', MinMaxScaler()))
steps.append(('model', ridge))
pipeline = Pipeline(steps=steps)
    
# define the evaluation procedure
cv = KFold(n_splits=5, random_state=1,shuffle=True)


param_grid_ridge = {
    'model__alpha': uniform(0.05, 1.0), 
    'model__kernel': ['polynomial'], 
    'model__degree': [2], 
    'model__coef0':uniform(0.5, 3.5)}
    
# tune model
rs_ridge=RandomizedSearchCV(pipeline, param_distributions=param_grid_ridge,n_iter = 2000, scoring =neg_rmse, cv=cv, n_jobs=10)

rs_ridge.fit(X_train,y_train)

# print best parameter after tuning 
print(rs_ridge.best_params_)
print(rs_ridge.best_score_)
logging.info('\nRidge Best parameters:\n'+str(rs_ridge.best_params_))
logging.info('\nRidge Best score:\n'+str(rs_ridge.best_score_))

{'model__alpha': 0.05014627465656736, 'model__coef0': 2.696019523063539, 'model__degree': 2, 'model__kernel': 'polynomial'}
-0.11263980386958301


In [7]:
best_models=[rs_xgb.best_estimator_, rs_ridge.best_estimator_]
labels=['xgboost', 'ridge']


for i,model in enumerate(best_models):
    scaler=MinMaxScaler()
    X_train_scaled=scaler.fit_transform(X_train)
    X_val_scaled=scaler.transform(X_val)

    model.fit(X_train_scaled,y_train)

    y_pred=model.predict(X_val_scaled)

    rmse_score=rmse(y_val,y_pred)

    print(labels[i]+': '+str(rmse_score))
    logging.info('\n'+labels[i]+': '+str(rmse_score)+'\n')

xgboost: 0.1313217520354582
ridge: 0.13333515854437233


In [8]:
model=rs_xgb.best_estimator_

#Predict Test Set
scaler=MinMaxScaler()
X_scaled=scaler.fit_transform(X)

X_test_scaled=scaler.transform(X_test)

model.fit(X_scaled,y)

# make predictions for test data
y_pred = model.predict(X_test_scaled)

#return predictions from log
y_pred=np.exp(y_pred)

In [9]:
#save predictions to .csv file
dateTimeObj=datetime.now()
submission=pd.DataFrame(columns=('Id','SalePrice'))
submission['Id']=df_test['Id']
submission['Id'] = submission['Id'].astype(int)
submission['SalePrice']=y_pred
submission.to_csv('submission'+str(dateTimeObj.year)+str(dateTimeObj.month)+
str(dateTimeObj.day)+'_'+str(dateTimeObj.hour)+str(dateTimeObj.minute)+str(dateTimeObj.second)+'.csv',index=False)