# Results

Generate and cache predictions for best models and collate test and validation metrics.

In [8]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from data import train, validation, test
from data import X, y, categorical, numerical
from utils import StandardizedGridSearchCV

# Gather best models

In [9]:
root = Path('models')

searches = dict()
for path in root.glob('*.p'):
    searches[path.stem] = StandardizedGridSearchCV.load(path)

searches = pd.Series(searches)



In [42]:
from sklearn.base import clone

candidates = []
for model in searches.index:
    search = searches[model]
    best_params = search.results.head(1)
    best_params = best_params.index.to_frame()
    best_params = best_params.to_dict(orient='records')
    for params in best_params:
        candidates.append({
            'model': model,
            'params': {**params, 'random_state': 123456789},
            'estimator': clone(search.best_estimator_.set_params(**params))
        })
candidates = pd.DataFrame.from_records(candidates)
candidates.head()

Unnamed: 0,model,params,estimator
0,SVR_Radial,"{'C': 1300, 'epsilon': 3.0, 'gamma': 0.05, 'ra...","SVR(C=1300, epsilon=3.0, gamma=0.05, max_iter=..."
1,elasticnet,"{'model_fitting__method__l1_ratio': 1e-08, 'ra...","((passthrough, ColumnTransformer(transformers=..."
2,lasso,"{'model_fitting__method__alpha': 0.0, 'random_...","((passthrough, ColumnTransformer(transformers=..."
3,RF6,"{'ccp_alpha': 0.13, 'criterion': 'mse', 'max_d...","RandomForestRegressor(ccp_alpha=0.13, max_dept..."
4,RFe50,"{'ccp_alpha': 1.0, 'criterion': 'mse', 'max_de...","RandomForestRegressor(ccp_alpha=1.0, max_depth..."


# Cache predictions

In [43]:
# Fit to train
def safe_fit(model, train, test):
    try:
        return model.fit(train, test)
    except:
        return model

candidates['estimator'].apply(lambda x: safe_fit(x, train[X], train[y]))

def safe_predict(model, data):
    try:
        return model.predict(data)
    except:
        return np.nan

# Predict on different datasets
candidates['train_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, train[X]))
candidates['val_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, validation[X]))
candidates['test_pred'] = candidates['estimator'].apply(lambda x: safe_predict(x, test[X]))

candidates = candidates.dropna()

  self._final_estimator.fit(Xt, y, **fit_params_last_step)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


# Generate Metrics

In [44]:
from sklearn.metrics import mean_squared_error

candidates['train_mse'] = candidates['train_pred'].apply(lambda x: mean_squared_error(train[y], x))
candidates['val_mse'] = candidates['val_pred'].apply(lambda x: mean_squared_error(validation[y], x))

candidates.sort_values('val_mse')[['model', 'params', 'train_mse', 'val_mse']].reset_index(drop=True).head(30)

Unnamed: 0,model,params,train_mse,val_mse
0,SVR_Radial,"{'C': 1300, 'epsilon': 3.0, 'gamma': 0.05, 'ra...",836.609672,1265.690847
1,RF3,"{'ccp_alpha': 0.4, 'criterion': 'mse', 'max_de...",357.719938,1509.779416
2,RF6,"{'ccp_alpha': 0.13, 'criterion': 'mse', 'max_d...",739.795473,1523.013369
3,RFe50,"{'ccp_alpha': 1.0, 'criterion': 'mse', 'max_de...",386.136015,1534.041902
4,ridge,{'model_fitting__method__alpha': 133.073073073...,2303.883259,1647.867402
5,regression,{'dim_reduction__method': RFECV(estimator=Line...,2228.412243,1649.95301
6,lasso,"{'model_fitting__method__alpha': 0.0, 'random_...",2228.412243,1649.95301
7,elasticnet,"{'model_fitting__method__l1_ratio': 1e-08, 'ra...",2324.04099,1661.578498
8,RF,"{'criterion': 'mse', 'max_depth': 9, 'max_feat...",849.784284,1905.309757
9,SVR_Poly,"{'C': 1, 'degree': 2, 'epsilon': 0.8, 'gamma':...",2327.551109,2014.32566


# Save predictions by final candidates

In [45]:
candidates = candidates[candidates['val_mse'] <= 2000]

# Save predictions
candidates['train_pred'].apply(pd.Series).T.to_csv('predictions/train.csv')
candidates['val_pred'].apply(pd.Series).T.to_csv('predictions/validation.csv')
candidates['test_pred'].apply(pd.Series).T.to_csv('predictions/test.csv')