In [192]:
import cornac
import pandas as pd

from cornac.data.text import TextModality
from cornac.eval_methods import RatioSplit
from cornac.models import BPR, PMF
from cornac.models import VAECF
from cornac.metrics import Precision, Recall, RMSE, MAE
from cornac.data import Reader
from cornac.eval_methods import RatioSplit



# Reading Data

In [194]:
ratings = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

items = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.item', sep='|', header=None, encoding='latin1',
            names=['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

users = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.user', sep='|', header=None, encoding='latin1',
            names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
#First Digit of the US zip code represents the a region group            
users['zipcode_reduced'] = users['zip_code'].str[:1]

In order to include items and user features, we need to create what's called a "modality" in Cornac.

The modality uses a bag-of-words representation of the text data.

In [195]:
# The movie data is on the one-hot encoding format, therefore we need to concatenate it into a single column, and create a modality

genre_cols = ['Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

def genres_to_text(row):
    return ' '.join(genre for genre in genre_cols if row[genre] == 1)

items['genre_concat']=items.apply(genres_to_text, axis=1)

item_modality = TextModality(corpus=items['genre_concat'], ids=items.item_id)

# The user modality is the occupation in this case
user_modality = TextModality(corpus=users.occupation, ids=users.user_id)

Definiting the dataset, Cornac offers a nice utility that handles the split between train and test.

Because the data has an explicity rating, the rating_threshold is used for calculating some of the metrics such as Precision_k


In [307]:
reader = Reader()
data = ratings[['user_id', 'item_id', 'rating']].values.tolist()

rs = RatioSplit(
    data=data,
    test_size=0.2,
    rating_threshold=4, 
    user_text=user_modality,
    item_text=item_modality,
    exclude_unknowns=False,
    verbose=True,
    seed=123
)

rating_threshold = 4.0
exclude_unknowns = False
---
Training data:
Number of users = 943
Number of items = 1656
Number of ratings = 80000
Max rating = 5.0
Min rating = 1.0
Global mean = 3.5
---
Test data:
Number of users = 943
Number of items = 1682
Number of ratings = 20000
Number of unknown users = 0
Number of unknown items = 26
---
Total users = 943
Total items = 1682


## Model training and evaluation

I am testing  different models

BPR - Bayesian Personalized Ranking.

PMF - Probabilistic Matrix Factorization.

VAECF - Variational Autoencoder for Collaborative Filtering. Considers the user and item features.

In [308]:
models = [BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123),
            PMF(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.001, seed=123),
            VAECF(k=10, use_gpu=False)]

# Define metrics to evaluate the models
metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

# Put it together in an experiment, voilà!
cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True).run()


[BPR] Training started!

[BPR] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:00<00:00, 171614.26it/s]
Ranking: 100%|██████████| 942/942 [00:03<00:00, 260.24it/s]



[PMF] Training started!

[PMF] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:00<00:00, 167706.42it/s]
Ranking: 100%|██████████| 942/942 [00:00<00:00, 6101.37it/s]



[VAECF] Training started!

[VAECF] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:03<00:00, 5607.65it/s]
Ranking: 100%|██████████| 942/942 [00:00<00:00, 2827.61it/s]


TEST:
...
      |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------------ + --------- + --------- + --------
BPR   | 2.0144 | 2.2268 |       0.1129 |    0.1199 |    1.3421 |   3.8465
PMF   | 0.7538 | 0.9143 |       0.0813 |    0.0639 |    2.1912 |   0.3678
VAECF | 2.5754 | 2.7651 |       0.1533 |    0.1714 |    8.8306 |   3.9634






VAECF offered the best results, followed by BPR and PMF.

## Grid search to improve BPR and VAECF model

- using precision@10 to optimize - I noticed that the precision@10 can vary drastically depending on the test, validation size, so keeping it at 20% to be comparable to the lightFM model split

In [305]:
# Redefine the split, adding validation set
rs = RatioSplit(
    data=data,
    test_size=0.2,
    val_size=0.2,
    rating_threshold=4.0, 
    user_text=user_modality,
    item_text=item_modality,
    exclude_unknowns=True,
    verbose=False,
    seed=123
)

#rs = RatioSplit(data=data, test_size=0.1, val_size=0.1, rating_threshold=4.0, seed=123)

# Instantiate the models
bpr = BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123)
vaecf = VAECF(k=10, learning_rate=0.001, seed=123)


# Defining the Grid Search parameters and metric to optimize, Precision@10
gs_bpr = GridSearch(
    model=bpr,
    space=[
        Discrete(name="k", values=[5, 10]),
        Discrete(name="learning_rate", values=[0.001, 0.05])
    ],
    metric=Precision(k=10),
    eval_method=rs,
)

# Grid Search
gs_vaecf = GridSearch(
    model=vaecf,
    space=[
        Discrete(name="k", values=[5, 10]),
        Discrete(name="learning_rate", values=[0.001, 0.05]),
        Discrete(name="autoencoder_structure", values=[[20],[40]])        
    ],
    metric=Precision(k=10),
    eval_method=rs,
)

In [306]:
# Define the experiment
cornac.Experiment(eval_method=rs, models=[gs_vaecf, gs_bpr], metrics=[RMSE(), MAE(), Precision(k=10), Recall(k=10)], user_based=True).run()


VALIDATION:
...
                 |    MAE |   RMSE | Precision@10 | Recall@10 | Time (s)
---------------- + ------ + ------ + ------------ + --------- + --------
GridSearch_VAECF | 2.5832 | 2.7713 |       0.1407 |    0.1693 |   4.5174
GridSearch_BPR   | 1.5082 | 1.7492 |       0.1280 |    0.1503 |   3.7652

TEST:
...
                 |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
---------------- + ------ + ------ + ------------ + --------- + --------- + --------
GridSearch_VAECF | 2.5763 | 2.7657 |       0.1683 |    0.1874 |  102.0480 |   4.6161
GridSearch_BPR   | 1.5013 | 1.7427 |       0.1467 |    0.1605 |   18.4331 |   3.7826



In [298]:
# Obtain the best params
print('BRP best parameters for Precision@10')
print(gs_bpr.best_params)
print('VAECF best parameters for Precision@10')
print(gs_vaecf.best_params)

BRP best parameters for Precision@10
{'k': 10, 'learning_rate': 0.05}
VAECF best parameters for Precision@10
{'autoencoder_structure': [40], 'k': 5, 'learning_rate': 0.001}


## Getting a recommendation for a user based on user_id

Upon testing for some sample users, it was noticeable that the models differ significantly in terms of recommendation.

On a production system they could potentially be used in combination in order to retrieve candidates, adding variety, and then score those candidates with another model such as DNN.

Note: For the final model, you would usually also retrain based on the full data, after the model is chosen

In [302]:
def get_recommendation(user_id, model):
    #Function to get recommendations for a user
    recommendations = model.recommend(user_id, train_set=rs.train_set, remove_seen=True, k=10)

    df = pd.DataFrame([(item, items[items.item_id==item].movie_title.values[0]) for item in recommendations], columns=['item_id', 'movie_title'])

    return df

user_id=1
# Get recommendation for each of the best model configurations
for model in [gs_bpr.best_model, gs_vaecf.best_model]:
    print(f"Recommendations for user {user_id} using {model.name}:")
    print(get_recommendation(user_id=user_id, model=model).head(5))


Recommendations for user 1 using BPR:
   item_id                       movie_title
0       98  Silence of the Lambs, The (1991)
1      173        Princess Bride, The (1987)
2      588       Beauty and the Beast (1991)
3      433                   Heathers (1989)
4      176                     Aliens (1986)
Recommendations for user 1 using VAECF:
   item_id                       movie_title
0       98  Silence of the Lambs, The (1991)
1      176                     Aliens (1986)
2      234                       Jaws (1975)
3      195            Terminator, The (1984)
4      182                 GoodFellas (1990)


### Retrieving the scores as well

This method is being used just for checking, it returns the items based on the internal encoding and it does not exclude already watched items

In [303]:
# Get the ranked items and their scores for the user, does not remove seen items, returned items are on item_idx, need to convert to item_id
(ranked_items, item_scores) =  gs_vaecf.best_model.rank(user_idx=rs.train_set.uid_map[user_id], k=10)

print(ranked_items, item_scores)


[  83   16  114 ... 1654 1655  274] [2.6922766e-03 2.1611936e-04 3.0348392e-04 ... 6.9536118e-06 9.7517805e-06
 4.0015479e-06]


### Conversion example for reference

The cornac library converts the original ids into what they call user_idx and item_idx, this can be a little confusing when looking at the prediction results, the follow dictionarys can be used to obtain the mapping

In [304]:
# Converts user_id to user_idx
rs.train_set.uid_map[943]

# Converts item_id to item_idx
rs.train_set.iid_map[568]


341