# Import libraries

In [15]:
import os, pickle, json

import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import  AdaBoostRegressor, BaggingRegressor

## Show floats in pandas DataFrames in scientific notation with two decimals
pd.options.display.float_format = '{:.2e}'.format

In [8]:
# Flag for the selected-features variant of the data.
# NOTE(review): not read by any code visible in this notebook; the test file
# path below is hard-coded to the "selected_features" CSV regardless -- confirm
# whether the flag is still needed.
use_selected_features = True

# CSV holding the processed test set; read with pd.read_csv in the load cell.
test_file_dir = '../dataset/cleaned/test_processed_selected_features.csv'
# Root directory of the saved models; load_model() looks for
# <model_dir>/<model_name>/model.pkl and best_params.json under it.
model_dir = '../model/'

# Load test set

In [3]:
# Read the processed test set from disk.
df = pd.read_csv(test_file_dir)

print('load data finished')
print('-' * 30)

# reset_index() materialises the row index as an 'index' column, which is
# then excluded from the feature matrix together with the target column.
df = df.reset_index()

target_col = 'completion-time-in-minutes'
y = df[target_col]
x = df.drop(columns=[target_col, 'index'])

print('prepare data finished')
print('-' * 30)
print('total test set:', len(x))

load data finished
------------------------------
prepare data finished
------------------------------
total test set: 2501632


# Evaluation

Each model is evaluated on the test set with three metrics: mean squared error (MSE), mean absolute error (MAE), and R-squared (R2).

In [9]:
def load_model(model_name, base_dir=None):
    """Load a pickled model and print its tuned hyper-parameters, if any.

    The model is read from ``<base_dir>/<model_name>/model.pkl``. If a
    ``best_params.json`` file sits next to it, its key/value pairs are
    printed; a missing file is expected (e.g. plain linear regression has
    no tuned parameters) and is skipped silently.

    Parameters
    ----------
    model_name : str
        Name of the sub-directory that holds the saved model.
    base_dir : str, optional
        Directory containing the model sub-directories. Defaults to the
        notebook-level ``model_dir``.

    Returns
    -------
    object
        The unpickled model.

    Raises
    ------
    OSError
        If ``model.pkl`` cannot be opened (e.g. ``FileNotFoundError``).
    """
    print('loading model', model_name)

    if base_dir is None:
        base_dir = model_dir

    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # that come from a trusted source.
    with open(os.path.join(base_dir, model_name, 'model.pkl'), 'rb') as f:
        model = pickle.load(f)

    best_params = None
    try:
        with open(os.path.join(base_dir, model_name, 'best_params.json'), 'r') as f:
            best_params = json.load(f)
    except FileNotFoundError:
        # No tuned parameters were saved for this model -- nothing to show.
        pass
    except (OSError, ValueError) as exc:
        # Unreadable file or invalid JSON (json.JSONDecodeError is a
        # ValueError): report it, but still return the model.
        print('warning: could not read best params: {}'.format(exc))

    if isinstance(best_params, dict):
        print('best params')

        for k, v in best_params.items():
            print('  {}:{}'.format(k, v))

    print('*'*30)

    return model

def evaluate(model_name, pred):
    """Score predictions against the notebook-level test target ``y``.

    Returns a dict with the model label (``'Regressor'`` removed from
    ``model_name``) followed by the MSE, MAE and R2 of ``pred`` versus ``y``.
    """
    metric_fns = (
        ('MSE', mean_squared_error),
        ('MAE', mean_absolute_error),
        ('R2', r2_score),
    )

    scores = {'model': model_name.replace('Regressor', '')}
    for label, metric in metric_fns:
        scores[label] = metric(y, pred)

    return scores

In [10]:
# Stand-alone linear models first, then each of them wrapped in AdaBoost
# and in Bagging.
base_names = ['LinearRegression', 'Lasso', 'ElasticNet', 'Ridge']
ensemble_prefixes = ['AdaBoostRegressor', 'BaggingRegressor']
model_names = base_names + [
    '{}_{}'.format(prefix, base)
    for prefix in ensemble_prefixes
    for base in base_names
]

# Models whose raw predictions are kept for the averaging step.
averaged_models = ('Lasso', 'ElasticNet', 'Ridge')

result_list, raw_prediction_list = [], []

for model_name in model_names:
    model = load_model(model_name)
    pred = model.predict(x)
    res = evaluate(model_name, pred)
    result_list.append(res)

    if model_name in averaged_models:
        # Stored as a column so the predictions can be joined side by side.
        raw_prediction_list.append(pred.reshape(-1, 1))

loading model LinearRegression
******************************
loading model Lasso
best params
  alpha:1
  max_iter:100
******************************
loading model ElasticNet
best params
  alpha:1
  l1_ratio:0.3
  max_iter:100
******************************
loading model Ridge
best params
  alpha:5
  max_iter:100
******************************
loading model AdaBoostRegressor_LinearRegression
best params
  learning_rate:0.5
  loss:linear
  n_estimators:50
******************************
loading model AdaBoostRegressor_Lasso
best params
  learning_rate:0.1
  loss:linear
  n_estimators:10
******************************
loading model AdaBoostRegressor_ElasticNet
best params
  learning_rate:0.1
  loss:linear
  n_estimators:10
******************************
loading model AdaBoostRegressor_Ridge
best params
  learning_rate:5.0
  loss:square
  n_estimators:10
******************************
loading model BaggingRegressor_LinearRegression
best params
  n_estimators:100
***************************

In [11]:
### Average the raw predictions of Ridge, Lasso and ElasticNet

# Each stored prediction is a single column; join them side by side and take
# the per-row mean across the models.
all_result = np.concatenate(raw_prediction_list, axis=1)
pred = all_result.mean(axis=1)

res = evaluate('Average', pred)

# Remove any 'Average' row left by an earlier run of this cell so that
# re-running it does not add duplicate rows to the results.
result_list[:] = [r for r in result_list if r['model'] != 'Average']
result_list.append(res)

In [16]:
def get_base_model_type(s):
    """Return the base linear model that the name ``s`` ends with.

    Names that end with none of the known base models are returned unchanged.
    """
    for base in ('LinearRegression', 'Lasso', 'ElasticNet', 'Ridge'):
        if s.endswith(base):
            return base
    return s

# Build the results table: tag each row with its base model, group rows of
# the same base model together, and put the columns in reading order.
result_df = (
    pd.DataFrame(result_list)
    .assign(base_model_type=lambda frame: frame['model'].apply(get_base_model_type))
    .sort_values(by='base_model_type')
    .loc[:, ['base_model_type', 'model', 'R2', 'MSE', 'MAE']]
)
result_df

Unnamed: 0,base_model_type,model,R2,MSE,MAE
12,Average,Average,0.19,1.75,1.03
2,ElasticNet,ElasticNet,0.0976,1.96,1.09
6,ElasticNet,AdaBoost_ElasticNet,0.102,1.95,1.1
10,ElasticNet,Bagging_ElasticNet,0.0976,1.96,1.09
1,Lasso,Lasso,-0.000287,2.17,1.16
5,Lasso,AdaBoost_Lasso,-0.00544,2.18,1.18
9,Lasso,Bagging_Lasso,-0.000287,2.17,1.16
0,LinearRegression,LinearRegression,-7.34e+17,1.59e+18,798000.0
4,LinearRegression,AdaBoost_LinearRegression,0.236,1.66,1.02
8,LinearRegression,Bagging_LinearRegression,-409000000000000.0,885000000000000.0,18800.0
