# Results

Generate and cache predictions for best models and collate test and validation metrics.

In [22]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data import train, validation, test
from data import X, y, categorical, numerical
from utils import StandardizedGridSearchCV

# Gather best models

In [23]:
root = Path('models')

searches = dict()
for path in root.glob('*.p'):
    searches[path.stem] = StandardizedGridSearchCV.load(path)

searches = pd.Series(searches)



In [24]:
intermediate_models = [
    'RF',
    'RF3',
    'RF4',
    'RF6',
    'RFe50',
    'Boosting9',
    'DT2',
    'lasso',
    'ridge',
    'elasticnet',
]

searches = searches.drop(intermediate_models)

In [38]:
from sklearn.base import clone

candidates = []
for model in searches.index:
    search = searches[model]
    best_params = search.results.head(1)
    best_params = best_params.index.to_frame()
    best_params = best_params.to_dict(orient='records')
    for params in best_params:
        candidates.append({
            'model': model,
            'params': {**params, 'random_state': 0},
            'estimator': clone(search.best_estimator_.set_params(**params))
        })
candidates = pd.DataFrame.from_records(candidates)
candidates.head()

Unnamed: 0,model,params,estimator
0,SVR_Radial,"{'C': 1300, 'epsilon': 3.0, 'gamma': 0.05, 'ra...","SVR(C=1300, epsilon=3.0, gamma=0.05, max_iter=..."
1,Boosting10,"{'ccp_alpha': 1.0, 'learning_rate': 0.09030612...","GradientBoostingRegressor(ccp_alpha=1.0, learn..."
2,RF5,"{'ccp_alpha': 0.01, 'criterion': 'mse', 'max_d...","RandomForestRegressor(ccp_alpha=0.01, max_dept..."
3,regression,{'dim_reduction__method': RFECV(estimator=Line...,"((passthrough, ColumnTransformer(transformers=..."
4,DT1,"{'ccp_alpha': 0.01, 'max_depth': 9, 'max_featu...","DecisionTreeRegressor(ccp_alpha=0.01, max_dept..."


In [39]:
# Some parameters are wrongly stored as np.nan instead of None likely due to versioning differences
errors = candidates.model.isin([
    'RF4',
    'RF5',
    'Boosting9',
    'Boosting10',
])

# Manually correct these parameters
candidates.loc[errors, 'estimator'] = candidates.loc[errors, 'estimator'].apply(lambda x: x.set_params(max_features=None, random_state=0))

# Cache predictions

In [40]:
# Fit to train
def safe_fit(model, train, test):
    try:
        return model.fit(train, test)
    except Exception as e:
        print(model)
        print(e)
        return model

candidates['estimator'].apply(lambda x: safe_fit(x, train[X], train[y]))

def safe_predict(model, data):
    try:
        return model.predict(data)
    except:
        print(model)
        return np.nan

# Predict on different datasets
candidates['train_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, train[X]))
candidates['val_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, validation[X]))
candidates['test_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, test[X]))

# candidates = candidates.dropna()
candidates



Unnamed: 0,model,params,estimator,train_pred,val_pred,test_pred
0,SVR_Radial,"{'C': 1300, 'epsilon': 3.0, 'gamma': 0.05, 'ra...","SVR(C=1300, epsilon=3.0, gamma=0.05, max_iter=...","[262.2640454388101, 5.717017775514172, 18.5886...","[37.596976310750534, 17.021324345994188, 20.09...","[41.99600361765639, 0.8111412286762416, 1.7551..."
1,Boosting10,"{'ccp_alpha': 1.0, 'learning_rate': 0.09030612...","([DecisionTreeRegressor(ccp_alpha=1.0, criteri...","[254.93746553030667, 7.072357431953551, 15.690...","[28.92613873835542, 7.44763895331337, 14.57596...","[72.1128821846632, 22.333590692642403, 15.9416..."
2,RF5,"{'ccp_alpha': 0.01, 'criterion': 'mse', 'max_d...","(DecisionTreeRegressor(ccp_alpha=0.01, max_dep...","[278.3164337980836, 5.554892701417249, 15.9720...","[31.794718862832116, 9.20324932411681, 12.4810...","[60.689190020167565, 21.617186831067308, 16.91..."
3,regression,{'dim_reduction__method': RFECV(estimator=Line...,"((passthrough, ColumnTransformer(transformers=...","[224.18158336011084, -4.723311143137925, 23.33...","[29.84125258058939, 32.183968055660415, 5.3289...","[79.56372534891187, 38.03625655839451, 11.2811..."
4,DT1,"{'ccp_alpha': 0.01, 'max_depth': 9, 'max_featu...","DecisionTreeRegressor(ccp_alpha=0.01, max_dept...","[284.05263157894734, 8.365384615384615, 20.297...","[29.074074074074073, 2.6666666666666665, 19.0,...","[84.15942028985508, 25.170212765957448, 20.297..."
5,SVR_Poly,"{'C': 1, 'degree': 2, 'epsilon': 0.8, 'gamma':...","SVR(C=1, degree=2, epsilon=0.8, gamma=1, kerne...","[327.8377845898885, 0.9494100921964872, 9.9556...","[58.99952136424865, 0.8744695891600003, 9.7634...","[18.941577495608655, 2.337662046202169, 46.100..."
6,SVR_Linear,"{'C': 0.8, 'epsilon': 0.0025, 'random_state': 0}","SVR(C=0.8, epsilon=0.0025, kernel='linear', ma...","[273.2778301236363, 6.00525971296417, 15.00272...","[21.729703999817552, 20.897531282810114, 13.75...","[55.588547019229104, 1.089859508971637, 16.175..."


# Generate Metrics

In [41]:
from sklearn.metrics import mean_squared_error

candidates['train_mse'] = candidates['train_pred'].apply(lambda x: mean_squared_error(train[y], x))
candidates['val_mse'] = candidates['val_pred'].apply(lambda x: mean_squared_error(validation[y], x))

candidates.sort_values('val_mse')[['model', 'train_mse', 'val_mse']].reset_index(drop=True).head(30)

Unnamed: 0,model,train_mse,val_mse
0,SVR_Radial,836.609672,1265.690847
1,RF5,356.13409,1603.256527
2,regression,2228.412243,1649.95301
3,Boosting10,96.06843,1685.044105
4,SVR_Poly,2327.551109,2014.32566
5,SVR_Linear,3524.85369,2149.461351
6,DT1,249.478363,2676.234752


# Save predictions by final candidates

In [42]:
# Filter by MSE
candidates = candidates[candidates['val_mse'] <= 2000]
candidates.sort_values('val_mse')[['model', 'train_mse', 'val_mse']].reset_index(drop=True).head(30)

Unnamed: 0,model,train_mse,val_mse
0,SVR_Radial,836.609672,1265.690847
1,RF5,356.13409,1603.256527
2,regression,2228.412243,1649.95301
3,Boosting10,96.06843,1685.044105


In [43]:
# Save predictions
train_pred = candidates['train_pred'].apply(pd.Series).T
val_pred = candidates['val_pred'].apply(pd.Series).T
test_pred = candidates['test_pred'].apply(pd.Series).T

train_pred.columns = candidates['model'].values
val_pred.columns = candidates['model'].values
test_pred.columns = candidates['model'].values

train_pred.to_csv('predictions/train.csv')
val_pred.to_csv('predictions/validation.csv')
test_pred.to_csv('predictions/test.csv')