In [26]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [27]:
from typing import Dict, List, Tuple

In [62]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

In [29]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k, reciprocal_rank

In [30]:
from recsys_training.data import Dataset, genres, preprocess_users, preprocess_items, get_user_profiles
from recsys_training.evaluation import retrieval_score, compute_mae, get_relevant_items, reciprocal_rank
from recsys_training.utils import df_to_coo

In [31]:
ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../data/raw/ml-100k/u.user'

## Load Data

In [32]:
data = Dataset(ml100k_ratings_filepath)
data.rating_split(seed=42)

In [33]:
items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                    names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                    engine='python')

In [34]:
users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                    names=['user', 'age', 'gender', 'occupation', 'zip'])

## Preprocessing

In [35]:
min_rating = 4
train_ratings = data.train_ratings[data.train_ratings.rating >= min_rating]
test_ratings = data.test_ratings[data.test_ratings.rating >= min_rating]

In [36]:
items = preprocess_items(items)

In [37]:
# requires feature 'release_month' in items DataFrame
profiles = get_user_profiles(train_ratings, items)

In [38]:
# one-hot-encode release month
release_month_df = pd.get_dummies(items['release_month'], prefix='release_month')
items = pd.concat([items, release_month_df], axis=1)
items.drop('release_month', axis=1, inplace=True)

In [39]:
items.index = items['item'].values
items.drop('item', axis=1, inplace=True)

In [40]:
users = preprocess_users(users)

In [41]:
users = users.merge(profiles, on='user', how='left')

In [42]:
users.index = users['user'].values

In [43]:
occupation_1H = pd.get_dummies(users['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(users['zip'], prefix='zip')

In [44]:
users.drop(['occupation', 'zip', 'user'], axis=1, inplace=True)
users = pd.concat([users, occupation_1H, zip_1H], axis=1)

In [45]:
users.fillna(0, inplace=True)

### Convert Rating Data to Sparse Matrices

In [50]:
train_coo = df_to_coo(train_ratings, n_users=data.n_users, n_items=data.n_items)
test_coo = df_to_coo(test_ratings, n_users=data.n_users, n_items=data.n_items)

### Convert Feature Data to Sparse Matrices

In [64]:
user_features = coo_matrix(users.values)
item_features = coo_matrix(items.values)

# Hybrid Recommender Systems with LightFM

[Maciej Kula: Metadata Embeddings for User and Item Cold-start Recommendations](https://arxiv.org/pdf/1507.08439.pdf)

[GitHub: LightFM](https://github.com/lyst/lightfm)

from the LightFM documentation:

BPR: Bayesian Personalised Ranking [1] pairwise loss. Maximises the prediction difference between a positive example and a randomly chosen negative example. Useful when only positive interactions are present and optimising ROC AUC is desired.

## Model Training

In [54]:
k = 16
N = 10
random_state = 42
learning_rate = 0.05

### CF

In [55]:
model_cf = LightFM(no_components=k, k=N, loss='bpr', learning_rate=learning_rate, random_state=random_state)

In [56]:
model_cf.fit(train_coo, epochs=10, verbose=True)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x1210e51d0>

In [59]:
precision_at_k(model_cf, test_coo, train_coo, k=N).mean()

0.17261904

In [61]:
precision_at_k(model_cf, train_coo, k=N).mean()

0.48046705

### Hybrid

In [66]:
model_hybrid = LightFM(no_components=k, k=N, loss='bpr',
                       learning_rate=learning_rate, random_state=random_state)

In [67]:
model_hybrid.fit(train_coo, epochs=10, verbose=True,
                 user_features=user_features,
                 item_features=item_features)

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9


<lightfm.lightfm.LightFM at 0x121503cd0>

In [69]:
precision_at_k(model_hybrid, test_coo, train_coo, k=N,
               user_features=user_features,
               item_features=item_features).mean()

0.031709954

In [70]:
precision_at_k(model_hybrid, train_coo, k=N,
               user_features=user_features,
               item_features=item_features).mean()

0.1512739