Authors:
    <br>Alejandro Alvarez (axa)
    <br>Brenda Palma (bpalmagu)

# <center>ML-Jokes: Content-Based Filtering</center>

In [1]:
# Path to ml-jokes folder
import os
if os.getcwd().split('/')[-2] == 'ml-jokes': os.chdir('..')
print(f'Current directory: {os.getcwd()}')
assert set(['data', 'mljokes', 'environment.yml', 'nbs']) <= set(os.listdir()), \
    'Wrong path; go to ./heinz-95729-project/api/ml-jokes'

Current directory: /home/alejandroxag/my_files/heinz-95729-project/api/ml-jokes


In [98]:
# imports
import time
import tqdm
import pickle
import numpy as np
import pandas as pd
from mljokes.topics import get_lda_topics
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from mljokes.data import read_jokes, read_ratings
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor

# Regression model inputs

In [3]:
def get_reg_model_inputs(n_topics=5, random_state=0):

    # Load jokes and ratings
    jokes_df = read_jokes()
    ratings_df = read_ratings()

    # Topics
    topics = pd.DataFrame(get_lda_topics(text=jokes_df.clean_text, 
                                         n_topics=n_topics, 
                                         random_state=random_state)) \
               .reset_index().rename(columns={'index': 'joke_id'})
    topics.loc[:, 'joke_id'] = topics.joke_id + 1

    # User-joke information
    user_joke_info = ratings_df.copy()
    user_joke_info.reset_index(inplace=True)
    user_joke_info.rename(columns={'index': 'user_id'}, inplace=True)
    user_joke_info.drop(columns='count_rated', inplace=True)
    user_joke_info = pd.melt(user_joke_info, 
                                id_vars=['user_id'],
                                var_name='joke_id',
                                value_vars=user_joke_info.columns[1:],
                                value_name='rating')
    user_joke_info = user_joke_info.loc[user_joke_info.rating != 99.0]
    user_joke_info.sort_values(by=['user_id', 'joke_id'], inplace=True)
    user_joke_info = user_joke_info.merge(topics, how='left', on='joke_id')
    user_joke_info.rename(columns=dict(zip(user_joke_info.columns[3:],
                                    ['topic_percentage_' + c  for c in np.array(user_joke_info.columns[3:], dtype=str)])), 
                        inplace=True)
    user_joke_info.reset_index(drop=True, inplace=True)

    del jokes_df, ratings_df

    # User profiles
    user_profiles = user_joke_info.iloc[:, :2]
    user_profiles.loc[:, list(range(len(user_joke_info.columns[3:])))] = \
        (user_joke_info.rating / 10).values.reshape((-1, 1)) * user_joke_info.iloc[:, 3:].values

    user_profiles = user_profiles.groupby(['user_id'])[user_profiles.columns[2:]].agg('mean').reset_index()
    user_profiles.rename(columns=dict(zip(user_profiles.columns[1:],
                                    ['topic_user_rating_' + c for c in np.array(user_profiles.columns[1:], dtype=str)])), 
                        inplace=True)

    # User-joke inputs
    user_joke_inputs = user_joke_info.merge(user_profiles, how='left', on='user_id')

    user_joke_idxs = user_joke_inputs.loc[:, ['user_id', 'joke_id']]
    y = user_joke_inputs.rating.values
    X = user_joke_inputs.iloc[:, 3:].values

    del user_joke_info, user_profiles, user_joke_inputs

    return X, y, user_joke_idxs, topics
    

# Regression model (HGB)

### Train-test split

In [4]:
# Data inputs
X, y, user_joke_idxs, topics = get_reg_model_inputs(n_topics=10)

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)

./data/ratings/jester-data-3.xls
./data/ratings/jester-data-2.xls
./data/ratings/jester-data-1.xls


### Light hypertuning

In [5]:
subsample = np.random.randint(low=0, high=len(y_train), size=round(0.10 * len(y_train)))
X_gs, y_gs = X_train[subsample], y_train[subsample]

fixed_params = {
                'loss': 'absolute_error',
                'max_iter': 100_000,
                'early_stopping': True,
                'scoring': 'loss',
                'validation_fraction': 0.20,
                'tol': 1e-5,
                'n_iter_no_change': 10,
                'verbose': 4,
                'random_state': 0
               }

param_grid = {
              'learning_rate': [0.01, 0.1],
              'max_depth': [5, 7, 13, None],
              'l2_regularization': [0.01, 0.01, 0.1, 1]
             }

hgb_gs = HistGradientBoostingRegressor().set_params(**fixed_params)

grid_search = GridSearchCV(hgb_gs, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           refit=False, 
                           cv=[(slice(None), slice(None))],
                           verbose=4)

start_time = time.time()
grid_search.fit(X_gs, y_gs)
elapsed_time = time.time() - start_time
print(f'\nElapsed time: {elapsed_time:0.2f} seconds.\n')

with open('./results/hgb_26nov1100.pkl', 'wb') as f: 
  pickle.dump({'gs_object': grid_search, 'elapsed_time': elapsed_time}, f)

Fitting 1 folds for each of 12 candidates, totalling 12 fits

Elapsed time: 228.87 seconds.



### Model fitting

In [6]:
# GS parameters loading
with open('./results/hgb_26nov1100.pkl', 'rb') as f: 
  grid_search = pickle.load(f)['gs_object']

# Data split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, shuffle=True)

# Train
hgb = HistGradientBoostingRegressor(**{**fixed_params, **grid_search.best_params_})
hgb.fit(X_train, y_train)

# Test
y_true = y_test
y_pred = hgb.predict(X_test)
print(f'MAE: {mean_absolute_error(y_true, y_pred)}')
predictions_df = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred})
predictions_df.insert(2, 'error', np.abs(predictions_df.y_true - predictions_df.y_pred))
display(predictions_df)

Binning 0.424 GB of training data: 2.138 s
Binning 0.106 GB of validation data: 0.418 s
Fitting gradient boosted rounds:
[1/1000] 1 tree, 31 leaves, max depth = 6, train loss: 4.28791, val loss: 4.28472, in 0.540s
[2/1000] 1 tree, 31 leaves, max depth = 7, train loss: 4.18044, val loss: 4.17728, in 0.510s
[3/1000] 1 tree, 31 leaves, max depth = 7, train loss: 4.08895, val loss: 4.08581, in 0.596s
[4/1000] 1 tree, 31 leaves, max depth = 7, train loss: 4.01022, val loss: 4.00693, in 0.503s
[5/1000] 1 tree, 31 leaves, max depth = 7, train loss: 3.93995, val loss: 3.93637, in 0.566s
[6/1000] 1 tree, 31 leaves, max depth = 7, train loss: 3.87935, val loss: 3.87567, in 0.476s
[7/1000] 1 tree, 31 leaves, max depth = 7, train loss: 3.82671, val loss: 3.82297, in 0.504s
[8/1000] 1 tree, 31 leaves, max depth = 7, train loss: 3.77991, val loss: 3.77609, in 0.483s
[9/1000] 1 tree, 31 leaves, max depth = 7, train loss: 3.73799, val loss: 3.73402, in 0.488s
[10/1000] 1 tree, 31 leaves, max depth = 7

Unnamed: 0,y_true,y_pred,error
0,-2.18,0.769478,2.949478
1,1.02,0.194918,0.825082
2,-6.46,0.399634,6.859634
3,7.77,1.625906,6.144094
4,2.18,5.216578,3.036578
...,...,...,...
827267,1.17,-0.027579,1.197579
827268,6.12,3.908489,2.211511
827269,8.74,5.796850,2.943150
827270,-0.44,-0.707504,0.267504


# Recommendations to the users

### Unseen jokes per user

In [109]:
def unseen_jokes_predict_ratings(user_id, estimator, user_joke_idxs, topics):
    full_combinations = user_joke_idxs.loc[:, ['user_id']].drop_duplicates() \
                                    .merge(pd.DataFrame({'joke_id': np.arange(1, 101)}), how='cross')
    
    unseen_jokes_idxs = full_combinations.merge(user_joke_idxs, how='outer', indicator=True)
    unseen_jokes_idxs = unseen_jokes_idxs[~(unseen_jokes_idxs._merge == 'both')].drop('_merge', axis=1)
    unseen_jokes_user_id = unseen_jokes_idxs[unseen_jokes_idxs.user_id == user_id]

    user_id_laugh_dist_per_topic = \
        np.unique(X[user_joke_idxs[user_joke_idxs.user_id == user_id].index][:, X.shape[1] // 2:], axis=0) 
    user_id_laugh_dist_per_topic = pd.DataFrame(user_id_laugh_dist_per_topic).reset_index()
    user_id_laugh_dist_per_topic.rename(columns={'index': 'user_id'}, inplace=True)

    unseen_jokes_user_id = unseen_jokes_user_id.merge(topics, how='left', on='joke_id')
    unseen_jokes_user_id.rename(columns=dict(zip(list(range(len(topics.columns[1:]))),
                                            [f'jt_{c}' for c in topics.columns[1:]])),
                                            inplace=True)
    
    unseen_jokes_user_id = unseen_jokes_user_id[unseen_jokes_user_id.user_id == user_id] \
                            .merge(user_id_laugh_dist_per_topic, how='left', on='user_id')
    unseen_jokes_user_id.rename(columns=dict(zip(list(range(len(topics.columns[1:]))),
                                            [f'ujr_{c}' for c in topics.columns[1:]])),
                                            inplace=True)

    unseen_jokes_rating_pred = \
        pd.concat([unseen_jokes_user_id.loc[:, ['user_id', 'joke_id']], 
                   pd.DataFrame({'rating_pred': estimator.predict(unseen_jokes_user_id.iloc[:, 2:].values)})],
                  axis=1)
    unseen_jokes_rating_pred.sort_values(by='rating_pred', ascending=False, inplace=True)

    return unseen_jokes_rating_pred

In [112]:
users_list = list(range(3))
unseen_jokes_predictions = pd.concat([unseen_jokes_predict_ratings(user_id=user_id, 
                                                 estimator=hgb,
                                                 user_joke_idxs=user_joke_idxs, 
                                                 topics=topics)\
                                      for user_id in tqdm.tqdm(users_list)],
                                     axis=0)

display(unseen_jokes_predictions)



[A[A

[A[A

[A[A

100%|██████████| 3/3 [01:03<00:00, 21.16s/it]


Unnamed: 0,user_id,joke_id,rating_pred
43,0,69,3.788371
7,0,11,2.300245
16,0,28,2.184911
61,0,88,2.166303
56,0,83,1.622193
...,...,...,...
13,2,24,-2.716903
32,2,44,-2.802243
45,2,57,-2.813451
59,2,74,-3.572958
