In [1]:
import cornac

# Load a sample dataset (e.g., MovieLens)
ml_100k = cornac.datasets.movielens.load_feedback()

  from .autonotebook import tqdm as notebook_tqdm


In [166]:
from cornac.eval_methods import RatioSplit

# Split the data into training and testing sets
rs = RatioSplit(data=ml_100k, test_size=0.2, rating_threshold=4.0, seed=123)

In [167]:
import cornac
from cornac.eval_methods import RatioSplit
from cornac.models import BPR, PMF
from cornac.models import VAECF
from cornac.metrics import Precision, Recall, RMSE, MAE


# Instantiate a matrix factorization model (e.g., BPR)
models = [
    BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123),
    PMF(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.001, seed=123),
    VAECF(k=10, use_gpu=False)
]

# Define metrics to evaluate the models
metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

# Put it together in an experiment, voilà!
cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True).run()


TEST:
...
      |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------------ + --------- + --------- + --------
BPR   | 2.0143 | 2.2267 |       0.1110 |    0.1195 |    1.3603 |   3.6942
PMF   | 0.7534 | 0.9138 |       0.0813 |    0.0639 |    2.1885 |   0.3320
VAECF | 2.5756 | 2.7652 |       0.1534 |    0.1686 |    9.3684 |   3.9545



# Using User and Item Attributes

In [168]:
import pandas as pd
ratings = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
ratings['timestamp_converted'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.sort_values(by=['user_id', 'timestamp'], inplace=True)
ratings['seconds_since_previous_rating'] = ratings['timestamp'] - ratings.groupby(['user_id'])['timestamp'].shift(1)


items = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.item', sep='|', header=None, encoding='latin1',
            names=['item_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])
items['release_date_converted'] = pd.to_datetime(items['release_date'])
#Calculates the release year relative to 1998 - date of the dataset
items['release_year_rltv'] = 1998-items['release_date_converted'].dt.year 

users = pd.read_csv('.surprise_data\\ml-100k\\ml-100k\\u.user', sep='|', header=None, encoding='latin1',
            names=['user_id', 'age', 'gender', 'occupation', 'zip_code'])
#First Digit of the US zip code represents the a region group            
users['zipcode_reduced'] = users['zip_code'].str[:1]

In [179]:
from cornac.data.text import TextModality

# Genre columns only (exclude 'item_id')
genre_cols = ['Action', 'Adventure', 'Animation', 'Childrens', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

# Extract genres for each row
def genres_to_text(row):
    return ' '.join(genre for genre in genre_cols if row[genre] == 1)

items['genre_concat']=items.apply(genres_to_text, axis=1)


# Item side info
item_modality = TextModality(corpus=items['genre_concat'], ids=items.item_id)


# User side info
user_modality = TextModality(corpus=users.occupation, ids=users.user_id)

In [180]:
from cornac.data import Reader
from cornac.eval_methods import BaseMethod
from cornac.eval_methods import RatioSplit

reader = Reader()
data = ratings[['user_id', 'item_id', 'rating']].values.tolist()

rs = RatioSplit(
    data=data,
    test_size=0.2,
    rating_threshold=4, 
    user_text=user_modality,
    item_text=item_modality,
    exclude_unknowns=False,
    verbose=True,
    seed=123
)

models = [BPR(k=10, max_iter=200, learning_rate=0.001, lambda_reg=0.01, seed=123),
            PMF(k=10, max_iter=100, learning_rate=0.001, lambda_reg=0.001, seed=123),
            VAECF(k=10, use_gpu=False)]

# Define metrics to evaluate the models
metrics = [RMSE(), MAE(), Precision(k=10), Recall(k=10)]

# Put it together in an experiment, voilà!
cornac.Experiment(eval_method=rs, models=models, metrics=metrics, user_based=True).run()

rating_threshold = 4.0
exclude_unknowns = False
---
Training data:
Number of users = 943
Number of items = 1657
Number of ratings = 80000
Max rating = 5.0
Min rating = 1.0
Global mean = 3.5
---
Test data:
Number of users = 943
Number of items = 1682
Number of ratings = 20000
Number of unknown users = 0
Number of unknown items = 25
---
Total users = 943
Total items = 1682

[BPR] Training started!

[BPR] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:00<00:00, 169320.77it/s]
Ranking: 100%|██████████| 941/941 [00:03<00:00, 259.30it/s]



[PMF] Training started!

[PMF] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:00<00:00, 131739.14it/s]
Ranking: 100%|██████████| 941/941 [00:00<00:00, 6530.88it/s]



[VAECF] Training started!

[VAECF] Evaluation started!


Rating: 100%|██████████| 20000/20000 [00:03<00:00, 5510.40it/s]
Ranking: 100%|██████████| 941/941 [00:00<00:00, 2867.96it/s]


TEST:
...
      |    MAE |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
----- + ------ + ------ + ------------ + --------- + --------- + --------
BPR   | 2.0267 | 2.2380 |       0.1136 |    0.1164 |    1.3550 |   3.8298
PMF   | 0.7542 | 0.9173 |       0.0775 |    0.0662 |    2.1656 |   0.3823
VAECF | 2.5978 | 2.7903 |       0.1518 |    0.1681 |    9.8058 |   4.0414






In [181]:
# Get recommendations for user id 4
user_id = 943
recommended_items = models[0].recommend(user_id)

# Print the recommended items
print(recommended_items)

[50, 100, 181, 258, 174, 1, 121, 294, 288, 286, 56, 98, 127, 172, 7, 204, 237, 300, 117, 222, 79, 405, 210, 69, 151, 168, 173, 22, 118, 313, 216, 234, 96, 423, 15, 176, 257, 25, 183, 195, 318, 9, 276, 89, 202, 28, 97, 269, 748, 328, 191, 135, 302, 186, 64, 111, 82, 12, 70, 742, 238, 153, 357, 228, 268, 194, 546, 275, 655, 197, 11, 289, 196, 125, 265, 132, 483, 143, 568, 144, 245, 333, 475, 323, 385, 282, 185, 301, 182, 435, 508, 471, 71, 603, 95, 215, 4, 161, 187, 88, 235, 678, 427, 180, 273, 496, 175, 403, 179, 597, 271, 322, 211, 527, 284, 8, 474, 250, 479, 200, 230, 124, 393, 209, 588, 23, 515, 208, 134, 274, 566, 298, 732, 272, 203, 99, 591, 480, 58, 13, 433, 751, 147, 283, 340, 226, 307, 24, 511, 83, 133, 651, 419, 628, 154, 137, 582, 367, 188, 14, 692, 319, 229, 410, 255, 451, 845, 326, 739, 514, 472, 218, 432, 402, 259, 411, 498, 327, 523, 199, 443, 252, 685, 321, 66, 1028, 248, 164, 227, 31, 476, 684, 193, 315, 285, 55, 239, 42, 240, 205, 654, 1016, 77, 550, 293, 86, 763, 484, 

In [None]:
# Converts user_id to user_idx
rs.train_set.uid_map[943]

# Converts item_id to item_idx
rs.train_set.iid_map[568]


208

In [163]:
print(ranked_items, item_scores)

[ 208  173  558 ... 1665 1666 1079] [2.10545328e-03 6.10579213e-04 1.19763112e-03 ... 1.87070018e-05
 2.30123223e-05 6.90273555e-06]


In [182]:
user_id = 943

selected_model = models[0]

# Get the ranked items and their scores for the user, does not remove seen items, returned items are on item_idx, need to convert to item_id
(ranked_items, item_scores) = selected_model.rank(user_idx=rs.train_set.uid_map[user_id], k=10)

#(ranked_items, item_scores) = selected_model.rank(user_id, k=10)

recommendations = selected_model.recommend(user_id, train_set=rs.train_set, remove_seen=True, k=10)

pd.DataFrame([(item, items[items.item_id==item].movie_title.values[0]) for item in recommendations], columns=['item_id', 'movie_title'])

Unnamed: 0,item_id,movie_title
0,181,Return of the Jedi (1983)
1,258,Contact (1997)
2,1,Toy Story (1995)
3,294,Liar Liar (1997)
4,288,Scream (1996)
5,286,"English Patient, The (1996)"
6,56,Pulp Fiction (1994)
7,7,Twelve Monkeys (1995)
8,204,Back to the Future (1985)
9,237,Jerry Maguire (1996)
