Authors:
    <br>Alejandro Alvarez (axa)
    <br>Brenda Palma (bpalmagu)

# <center>ML-Jokes: Model ensemble</center>

## Setup

In [1]:
# Path to ml-jokes folder
import os
if os.getcwd().split('/')[-2] == 'ml-jokes': os.chdir('..')

print(f'Current directory: {os.getcwd()}')
assert set(['data', 'mljokes', 'environment.yml', 'nbs']) <= set(os.listdir()), \
    'Wrong path; go to ./heinz-95729-project/api/ml-jokes'

Current directory: /home/brendapalmag/eCommerce/heinz-95729-project/api/ml-jokes


In [47]:
import optuna
import pickle
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from mljokes.data import read_ratings, read_jokes   

## Data

In [4]:
# Load cb results (ordered by user, and joke asc)
with open('./results/predictions_nov28.pkl', 'rb') as f: predictions = pickle.load(f)
predictions.rename(columns={'joke:id': 'joke_id', 'rating_pred': 'cb_rating'}, inplace=True)

# Load cf results (ordered by user, and joke asc)
pred_cf = pd.read_csv('./results/cf_results.csv')

# Merge results
predictions['cf_rating'] = pred_cf['pred_cf']
del pred_cf

#with open('./results/model_inputs_nov28.pkl', 'rb') as f: model_inputs = pickle.load(f)

  mask |= (ar1 == a)


In [9]:
# Load real ratings (ordered by user, and joke asc)
ratings = read_ratings()

# Merge with 
predictions['real_rating'] = ratings['rating']

predictions.head()

Unnamed: 0,user_id,joke_id,cb_rating,cf_rating,real_rating
0,0,1,-0.162202,-0.760591,99.0
1,0,2,-0.974697,-1.241289,99.0
2,0,3,-0.172788,-2.495778,99.0
3,0,4,-2.501286,-3.28885,99.0
4,0,5,1.915733,-0.267254,-1.65


In [12]:
# Train and test split
all_idxs = predictions.index[predictions['real_rating']!=99.]
test_size = int(len(all_idxs)*0.3)

train_idxs = np.random.choice(all_idxs, size = len(all_idxs) - test_size, replace=False)
test_idxs = set(all_idxs) - set(train_idxs)

In [13]:
train_idxs, test_idxs = list(train_idxs), list(test_idxs)
print(f'Validate indexes: {len(all_idxs)==(len(train_idxs)+len(test_idxs))}')

Validate indexes: True


In [16]:
predictions.head()

Unnamed: 0,user_id,joke_id,cb_rating,cf_rating,real_rating
0,0,1,-0.162202,-0.760591,99.0
1,0,2,-0.974697,-1.241289,99.0
2,0,3,-0.172788,-2.495778,99.0
3,0,4,-2.501286,-3.28885,99.0
4,0,5,1.915733,-0.267254,-1.65


In [24]:
def tune(objective, n_trials=10):
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f'Best score: {best_score}\n')
    print(f'Optimized parameters: {params}\n')
    return params

def lm_objective(trial):
    _alpha = trial.suggest_loguniform('alpha', 1e-4, 10)
    _random_state = trial.suggest_int('random_state', 0, 1000)

    lm = Ridge(alpha=_alpha, random_state=_random_state, fit_intercept=False)
    scores = cross_val_score(lm, 
                             predictions.loc[train_idxs, ['cb_rating', 'cf_rating']].values, 
                             predictions.loc[train_idxs, 'real_rating'].values, 
                             cv=[(slice(None), slice(None))],
                             n_jobs=-1,
                             verbose=4,
                             scoring='neg_mean_absolute_error')
    return scores.mean()

In [25]:
lm_params = tune(lm_objective, n_trials=100)

[32m[I 2021-11-29 13:17:01,865][0m A new study created in memory with name: no-name-6abecf8e-71fd-4951-8836-0e44799342db[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.5s finished
[32m[I 2021-11-29 13:17:10,200][0m Trial 0 finished with value: -2.9208921412559166 and parameters: {'alpha': 0.025404477191855374, 'random_state': 875}. Best is trial 0 with value: -2.9208921412559166.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[32m[I 2021-11-29 13:17:11,884][0m Trial 1 finished with value: -2.920892143361492 and parameters: {'alpha': 0.35430499092974116, 'random_state': 426}. Best is trial 0 with value: -2.9208921412559166.[0m
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   27.1s finished
[32m[I 2021-11-

Best score: -2.920892141093922

Optimized parameters: {'alpha': 0.00010007734840328384, 'random_state': 61}



In [29]:
lm = Ridge(**lm_params)
lm.set_params(**{'fit_intercept': False})
print(f'Ridge params: {lm.get_params()}')
lm.fit(predictions.loc[train_idxs, ['cb_rating', 'cf_rating']].values, predictions.loc[train_idxs, 'real_rating'].values)
rating = predictions.loc[test_idxs, ['real_rating']].values
rating_pred = lm.predict(predictions.loc[test_idxs, ['cb_rating', 'cf_rating']].values)
print(f'MAE: {mean_absolute_error(rating, rating_pred):0.2f}')

Ridge params: {'alpha': 0.00010007734840328384, 'copy_X': True, 'fit_intercept': False, 'max_iter': None, 'normalize': 'deprecated', 'positive': False, 'random_state': 61, 'solver': 'auto', 'tol': 0.001}
MAE: 2.92


In [28]:
predictions_ensemble = predictions.copy()
predictions_ensemble.insert(len(predictions.columns),
                                                   'ensemble_rating', 
                                                   lm.predict(predictions.loc[:, ['cb_rating', 'cf_rating']].values))

display(predictions_ensemble.head())
with open('./results/ensemble_nov28.pkl', 'wb') as f: pickle.dump(predictions_ensemble, f)

Unnamed: 0,user_id,joke_id,cb_rating,cf_rating,real_rating,ensemble_rating
0,0,1,-0.162202,-0.760591,99.0,-0.67343
1,0,2,-0.974697,-1.241289,99.0,-1.296617
2,0,3,-0.172788,-2.495778,99.0,-2.109742
3,0,4,-2.501286,-3.28885,99.0,-3.412844
4,0,5,1.915733,-0.267254,-1.65,0.312347


In [34]:
# Example
user_id = 4493

x_user = predictions.loc[(predictions['user_id']==user_id) & (predictions['real_rating']==99.), ['cb_rating', 'cf_rating']].values
rating_pred_user = lm.predict(x_user)
sorted_idx = np.argsort(rating_pred_user)[::-1]
sorted_ratings = rating_pred_user[sorted_idx]

In [46]:
# Load jokes
jokes = read_jokes()

In [55]:
# Display top k jokes
k = 4

for i in sorted_idx[:k]:
    print(jokes['text'][i], end='\n\n')

A guy goes into confession and says to the priest, "Father, I'm 80 years old, widower, with 11 grandchildren. Last night I met two beautiful flight attendants. They took me home and I made love to both of them. Twice." The priest said: "Well, my son, when was the last time you were in confession?" "Never Father, I'm Jewish." "So then, why are you telling me?" "I'm telling everybody."

If pro- is the opposite of con- then congress must be the opposite of progress.

Q: What is the difference between Mechanical Engineers and Civil Engineers? A: Mechanical Engineers build weapons, Civil Engineers build targets.

A lawyer opened the door of his BMW, when suddenly a car came along and hit the door, ripping it off completely. When the police arrived at the scene, the lawyer was complaining bitterly about the damage to his precious BMW. "Officer, look what they've done to my Beeeeemer!!!", he whined. "You lawyers are so materialistic, you make me sick!!!" retorted the officer. "You're so worri