Authors:
    <br>Alejandro Alvarez (axa)
    <br>Brenda Palma (bpalmagu)

# <center>ML-Jokes: Model ensemble</center>

In [1]:
# Path to ml-jokes folder
import os
if os.getcwd().split('/')[-2] == 'ml-jokes': os.chdir('..')
print(f'Current directory: {os.getcwd()}')
assert set(['data', 'mljokes', 'environment.yml', 'nbs']) <= set(os.listdir()), \
    'Wrong path; go to ./heinz-95729-project/api/ml-jokes'

Current directory: /home/alejandroxag/my_files/heinz-95729-project/api/ml-jokes


In [45]:
import optuna
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [4]:
with open('./results/model_inputs_nov28.pkl', 'rb') as f: model_inputs = pickle.load(f)
with open('./results/predictions_nov28.pkl', 'rb') as f: predictions = pickle.load(f)
predictions.rename(columns={'joke:id': 'joke_id'}, inplace=True)

In [11]:
predictions.rename(columns={'rating_pred': 'cb_rating'}, inplace=True)
# COLLABORATIVE FILTERING PREDICTIONS
predictions.insert(3, 'cf_rating', predictions.cb_rating + np.random.rand(len(predictions.cb_rating)))

In [78]:
full_df = pd.concat([model_inputs['user_joke_info'].loc[:, ['user_id', 'joke_id', 'rating']],
                     predictions.loc[:, ['cb_rating', 'cf_rating']]],
                     axis=1)

In [35]:
train_idxs = model_inputs['user_joke_info'].index \
                [model_inputs['user_joke_info'].training_example + \
                    model_inputs['user_joke_info'].val_example == 1]
test_idxs = model_inputs['user_joke_info'].index \
                [model_inputs['user_joke_info'].test_example == 1]

In [70]:
def tune(objective, n_trials=10):
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f'Best score: {best_score}\n')
    print(f'Optimized parameters: {params}\n')
    return params

def lm_objective(trial):
    _alpha = trial.suggest_loguniform('alpha', 1e-4, 10)
    _random_state = trial.suggest_int('random_state', 0, 1000)

    lm = Ridge(alpha=_alpha, random_state=_random_state, fit_intercept=False)
    scores = cross_val_score(lm, 
                             full_df.iloc[train_idxs, 3:].values, 
                             full_df.iloc[train_idxs, 2].values, 
                             cv=[(slice(None), slice(None))],
                             n_jobs=-1,
                             verbose=4,
                             scoring='neg_mean_absolute_error')
    return scores.mean()

In [71]:
lm_params = tune(lm_objective, n_trials=100)

[32m[I 2021-11-28 19:23:48,162][0m A new study created in memory with name: no-name-d0eac7ab-887c-4c5a-9baf-f8fd233bbeaa[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished
[32m[I 2021-11-28 19:23:48,666][0m Trial 0 finished with value: -3.2884380808757965 and parameters: {'alpha': 0.39973999543891725, 'random_state': 683}. Best is trial 0 with value: -3.2884380808757965.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
[32m[I 2021-11-28 19:23:49,136][0m Trial 1 finished with value: -3.2884380770360404 and parameters: {'alpha': 0.7335008748833022, 'random_state': 820}. Best is trial 1 with value: -3.2884380770360404.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.2s finished
[32m[I 2021-11-2

Best score: -3.2884379856343307

Optimized parameters: {'alpha': 8.679629030110046, 'random_state': 350}



In [72]:
lm = Ridge(**lm_params)
lm.set_params(**{'fit_intercept': False})
print(f'Ridge params: {lm.get_params()}')
lm.fit(full_df.iloc[train_idxs, 3:].values, full_df.iloc[train_idxs, 2].values)
rating = full_df.iloc[test_idxs, 2].values
rating_pred = lm.predict(full_df.iloc[test_idxs, 3:].values)
print(f'MAE: {mean_absolute_error(rating, rating_pred):0.2f}')

Ridge params: {'alpha': 8.679629030110046, 'copy_X': True, 'fit_intercept': False, 'max_iter': None, 'normalize': 'deprecated', 'positive': False, 'random_state': 350, 'solver': 'auto', 'tol': 0.001}
MAE: 3.50


In [80]:
predictions_ensemble = full_df.copy()
predictions_ensemble.insert(len(full_df.columns),
                                                   'ensemble_rating', 
                                                   lm.predict(full_df.iloc[:, 3:].values))

display(predictions_ensemble.head())
with open('./results/ensemble_nov28.pkl', 'wb') as f: pickle.dump(predictions_ensemble, f)

Unnamed: 0,user_id,joke_id,rating,cb_rating,cf_rating,ensemble_rating
0,0,1,99.0,-0.162202,0.668646,-0.447613
1,0,2,99.0,-0.974697,-0.259347,-1.168026
2,0,3,99.0,-0.172788,0.25615,-0.314548
3,0,4,99.0,-2.501286,-1.645284,-2.648846
4,0,5,-1.65,1.915733,2.616649,1.546138
