In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from typing import Dict, List, Tuple

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [4]:
from pyfm import pylibfm

In [5]:
from recsys_training.data import Dataset, genres, preprocess_users, preprocess_items, get_user_profiles
from recsys_training.evaluation import retrieval_score, compute_mae, get_relevant_items, reciprocal_rank
from recsys_training.utils import get_sparsity, one_hot_encode_ids

In [6]:
ml100k_ratings_filepath = '../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../data/raw/ml-100k/u.user'

## Load Data

In [7]:
data = Dataset(ml100k_ratings_filepath)
data.rating_split(seed=42)

In [8]:
items = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                    names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                    engine='python')

In [9]:
users = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                    names=['user', 'age', 'gender', 'occupation', 'zip'])

## Preprocessing

In [10]:
min_rating = 4
ratings = data.train_ratings[data.train_ratings.rating >= min_rating]

In [11]:
items = preprocess_items(items)

In [12]:
# requires feature 'release_month' in items DataFrame
profiles = get_user_profiles(ratings, items)

In [13]:
# one-hot-encode release month
release_month_df = pd.get_dummies(items['release_month'], prefix='release_month')
items = pd.concat([items, release_month_df], axis=1)
items.drop('release_month', axis=1, inplace=True)

In [14]:
items.index = items['item'].values
items.drop('item', axis=1, inplace=True)

In [15]:
users = preprocess_users(users)

In [16]:
users = users.merge(profiles, on='user', how='left')

In [17]:
users.index = users['user'].values

In [18]:
occupation_1H = pd.get_dummies(users['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(users['zip'], prefix='zip')

In [19]:
users.drop(['occupation', 'zip', 'user'], axis=1, inplace=True)
users = pd.concat([users, occupation_1H, zip_1H], axis=1)

In [20]:
users.fillna(0, inplace=True)

## Check

In [21]:
items.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,release_month_3,release_month_4,release_month_5,release_month_6,release_month_7,release_month_8,release_month_9,release_month_10,release_month_11,release_month_12
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
items.shape

(1682, 32)

In [23]:
users.head()

Unnamed: 0,age,gender,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,zip_90,zip_91,zip_92,zip_93,zip_94,zip_95,zip_96,zip_97,zip_98,zip_99
1,0.257576,1,0.00813,0.235772,0.097561,0.04065,0.04065,0.308943,0.081301,0.04065,...,0,0,0,0,0,0,0,0,0,0
2,0.69697,0,0.0,0.193548,0.064516,0.032258,0.032258,0.258065,0.193548,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.242424,1,0.0,0.25,0.166667,0.0,0.0,0.25,0.25,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.257576,1,0.0,0.285714,0.142857,0.0,0.0,0.142857,0.214286,0.071429,...,0,0,0,0,0,0,0,0,0,0
5,0.393939,0,0.021277,0.382979,0.234043,0.12766,0.06383,0.595745,0.148936,0.0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
users.shape

(943, 141)

In [25]:
users.gender.value_counts()

1    670
0    273
Name: gender, dtype: int64

In [26]:
users.iloc[:, 0].describe()

count    943.000000
mean       0.409878
std        0.184738
min        0.000000
25%        0.272727
50%        0.363636
75%        0.545455
max        1.000000
Name: age, dtype: float64

## Factorization Maschines to Leverage Content Information

[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)

### Content-based Filtering for Rating Prediction 

Sparsity of user/item content information

In [27]:
(users==0).sum().sum()/users.size

0.861337364529982

In [28]:
(items==0).sum().sum()/users.size

0.35775366079285215

#### Create Feature Matrices

In [29]:
# fetch content information for all observed user-item rating combinations
user_cb_feat_train = users.loc[data.train_ratings.user.values].values
user_cb_feat_test = users.loc[data.test_ratings.user.values].values
item_cb_feat_train = items.loc[data.train_ratings.item.values].values
item_cb_feat_test = items.loc[data.test_ratings.item.values].values

In [30]:
# concatenate user and item content information to form design matrices
# and convert to sparse matrix in Compressed Sparse Row (CSR) format
X_train = np.concatenate((user_cb_feat_train, item_cb_feat_train), axis=1)
X_train = csr_matrix(X_train)
X_test = np.concatenate((user_cb_feat_test, item_cb_feat_test), axis=1)
X_test = csr_matrix(X_test)

In [31]:
X_train

<80000x173 sparse matrix of type '<class 'numpy.float64'>'
	with 2075351 stored elements in Compressed Sparse Row format>

In [32]:
# Sparsity of Training Data
get_sparsity(X_train)

0.8500468930635838

In [33]:
X_test

<20000x173 sparse matrix of type '<class 'numpy.float64'>'
	with 518098 stored elements in Compressed Sparse Row format>

In [34]:
# Sparsity of Test Data
get_sparsity(X_test)

0.8502606936416185

#### Create Target Matrices for Rating Predictions

In [35]:
y_train = data.train_ratings.rating.values.astype(float)
y_test = data.test_ratings.rating.values

#### Train Factorization Machine for Rating Prediction as Regressor using pyFM

In [36]:
n_epochs = 50  # number of full stochastic passes through the training data
k = 16
random_seed = 28

In [37]:
fm_cb = pylibfm.FM(num_factors=k,
                num_iter=n_epochs,
                verbose=True,
                task="regression",
                initial_learning_rate=0.001,
                learning_rate_schedule="optimal",
                seed=random_seed)
fm_cb.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.57679
-- Epoch 2
Training MSE: 0.53779
-- Epoch 3
Training MSE: 0.52432
-- Epoch 4
Training MSE: 0.51722
-- Epoch 5
Training MSE: 0.51355
-- Epoch 6
Training MSE: 0.50994
-- Epoch 7
Training MSE: 0.50888
-- Epoch 8
Training MSE: 0.50753
-- Epoch 9
Training MSE: 0.50713
-- Epoch 10
Training MSE: 0.50617
-- Epoch 11
Training MSE: 0.50588
-- Epoch 12
Training MSE: 0.50531
-- Epoch 13
Training MSE: 0.50444
-- Epoch 14
Training MSE: 0.50426
-- Epoch 15
Training MSE: 0.50423
-- Epoch 16
Training MSE: 0.50332
-- Epoch 17
Training MSE: 0.50355
-- Epoch 18
Training MSE: 0.50423
-- Epoch 19
Training MSE: 0.50323
-- Epoch 20
Training MSE: 0.50316
-- Epoch 21
Training MSE: 0.50299
-- Epoch 22
Training MSE: 0.50365
-- Epoch 23
Training MSE: 0.50350
-- Epoch 24
Training MSE: 0.50306
-- Epoch 25
Training MSE: 0.50298
-- Epoch 26
Training MSE: 0.50266
-- Epoch 27
Training MSE: 0.50304
-- Epoch 28
Tra

In [38]:
y_pred = fm_cb.predict(X_test)

$MSE$

In [39]:
mean_squared_error(y_test, y_pred)

1.041548679956183

$MAE$

In [40]:
mean_absolute_error(y_test, y_pred)

0.8263579521053876

### Hybrid: Content-based and Collaborative Filtering for Rating Prediction

This is our first hybrid recommender algorithm as it combines both, collaborative, i.e. behavioral information, with content information, i.e. user/item features.

We implement this by augmenting the design matrices $X_{train}$ and $X_{test}$ with one-hot encoded representations of user-item ids.

In [41]:
# Subtract 1 to turn 1-base-indexed into 0-base-indexed IDs for 0-base-indexed array
user_cf_feat_train = one_hot_encode_ids(data.train_ratings.user.values-1, data.n_users)
user_cf_feat_test = one_hot_encode_ids(data.test_ratings.user.values-1, data.n_users)
item_cf_feat_train = one_hot_encode_ids(data.train_ratings.item.values-1, data.n_items)
item_cf_feat_test = one_hot_encode_ids(data.test_ratings.item.values-1, data.n_items)

In [42]:
# concatenate user and item content information
# and user and item collaborative information to form design matrices
# and convert to sparse matrix in Compressed Sparse Row (CSR) format
X_train = np.concatenate((user_cb_feat_train, item_cb_feat_train,
                          user_cf_feat_train, item_cf_feat_train), axis=1)
X_train = csr_matrix(X_train)
X_test = np.concatenate((user_cb_feat_test, item_cb_feat_test,
                         user_cf_feat_test, item_cf_feat_test), axis=1)
X_test = csr_matrix(X_test)

In [48]:
n_epochs = 50  # number of full stochastic passes through the training data
k = 16

In [49]:
fm_hybrid = pylibfm.FM(num_factors=k,
                num_iter=n_epochs,
                verbose=True,
                task="regression",
                initial_learning_rate=0.001,
                learning_rate_schedule="optimal")
fm_hybrid.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.54297
-- Epoch 2
Training MSE: 0.47172
-- Epoch 3
Training MSE: 0.44934
-- Epoch 4
Training MSE: 0.43755
-- Epoch 5
Training MSE: 0.42993
-- Epoch 6
Training MSE: 0.42259
-- Epoch 7
Training MSE: 0.41720
-- Epoch 8
Training MSE: 0.41216
-- Epoch 9
Training MSE: 0.40788
-- Epoch 10
Training MSE: 0.40411
-- Epoch 11
Training MSE: 0.39952
-- Epoch 12
Training MSE: 0.39591
-- Epoch 13
Training MSE: 0.39292
-- Epoch 14
Training MSE: 0.38963
-- Epoch 15
Training MSE: 0.38677
-- Epoch 16
Training MSE: 0.38350
-- Epoch 17
Training MSE: 0.38109
-- Epoch 18
Training MSE: 0.37831
-- Epoch 19
Training MSE: 0.37577
-- Epoch 20
Training MSE: 0.37345
-- Epoch 21
Training MSE: 0.37100
-- Epoch 22
Training MSE: 0.36897
-- Epoch 23
Training MSE: 0.36668
-- Epoch 24
Training MSE: 0.36484
-- Epoch 25
Training MSE: 0.36295
-- Epoch 26
Training MSE: 0.36084
-- Epoch 27
Training MSE: 0.35894
-- Epoch 28
Tra

In [45]:
y_pred = fm_hybrid.predict(X_test)

$MSE$

In [46]:
mean_squared_error(y_test, y_pred)

0.8622595008465794

$MAE$

In [47]:
mean_absolute_error(y_test, y_pred)

0.7267606055164346

### Ranking Evaluation

In [120]:
N = 10

* evaluate against ranking criteria (MRR, MAP) using the rating predictions to impose a ranking on items for a given user (see kNN rating prediction)

In [92]:
user_train_ratings = {}

grouped = data.train_ratings[['user', 'item', 'rating']].groupby('user')
for user in data.users:
    vals = grouped.get_group(user)[['item', 'rating']].values
    user_train_ratings[user] = dict(zip(vals[:, 0].astype(int),
                                        vals[:, 1].astype(float)))

In [106]:
def get_prediction(fm: object, user: int, users: pd.DataFrame, items: pd.DataFrame) -> Dict[int, Dict[str, float]]:
    n_items = len(items)
    n_users = len(users)
    
    single_user_cb_feat = users.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)
    all_items_cb_feat = items.values
    single_user_cf_feat = np.zeros((n_items, n_users))
    single_user_cf_feat[:, user-1] = 1
    all_items_cf_feat = np.eye(n_items)
    
    input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat,
                                 single_user_cf_feat, all_items_cf_feat), axis=1)
    input_data = csr_matrix(input_data)
    
    preds = fm.predict(input_data)
    sorting = np.argsort(pred)[::-1]
    
    preds = {item: {'pred': pred} for item, pred in
                 zip(items.index.values[sorting], preds[sorting])}
    
    return preds

In [113]:
def get_recommendations(fm: object, user: int, user_train_ratings: dict, N: int, users: pd.DataFrame, items: pd.DataFrame):
    known_items = list(user_train_ratings[user].keys())
    predictions = get_prediction(fm, user, users, items)
    recommendations = []
    for item, pred in predictions.items():
        if item not in known_items:
            add_item = (item, pred)
            recommendations.append(add_item)
        if len(recommendations) == N:
            break
            
    return recommendations

In [116]:
preds = get_prediction(fm_hybrid, 1, users, items)

In [114]:
recs = get_recommendations(fm_hybrid, 1, user_train_ratings, N=N, users=users, items=items)

In [121]:
test_ratings = data.test_ratings

#### Evaluation of Ranking Metrics for Hybrid Recommender

In [None]:
recommender = fm_hybrid

In [122]:
metric = 'map'
user_scores = []
relevant_items = get_relevant_items(test_ratings)

for user in data.users:
    if user in relevant_items.keys():
        predicted_items = get_recommendations(recommender, user, user_train_ratings, N, users, items)
        predicted_items = [item for item, _ in predicted_items]
        if metric == 'map':
            true_positives = np.intersect1d(relevant_items[user],
                                            predicted_items)
            score = len(true_positives) / N
        elif metric == 'mrr':
            score = np.mean([reciprocal_rank(item, predicted_items)
                             for item in relevant_items[user]])
        else:
            raise ValueError(f"Unknown value {metric} for Argument `metric`")

        user_scores.append(score)

np.mean(user_scores)

0.03904255319148937

In [123]:
metric = 'mrr'
user_scores = []
relevant_items = get_relevant_items(test_ratings)

for user in data.users:
    if user in relevant_items.keys():
        predicted_items = get_recommendations(recommender, user, user_train_ratings, N, users, items)
        predicted_items = [item for item, _ in predicted_items]
        if metric == 'map':
            true_positives = np.intersect1d(relevant_items[user],
                                            predicted_items)
            score = len(true_positives) / N
        elif metric == 'mrr':
            score = np.mean([reciprocal_rank(item, predicted_items)
                             for item in relevant_items[user]])
        else:
            raise ValueError(f"Unknown value {metric} for Argument `metric`")

        user_scores.append(score)

np.mean(user_scores)

0.00586257796559588

#### Evaluation of Ranking Metrics for Content-based Recommender

In [126]:
recommender = fm_cb

In [127]:
metric = 'map'
user_scores = []
relevant_items = get_relevant_items(test_ratings)

for user in data.users:
    if user in relevant_items.keys():
        predicted_items = get_recommendations(recommender, user, user_train_ratings, N, users, items)
        predicted_items = [item for item, _ in predicted_items]
        if metric == 'map':
            true_positives = np.intersect1d(relevant_items[user],
                                            predicted_items)
            score = len(true_positives) / N
        elif metric == 'mrr':
            score = np.mean([reciprocal_rank(item, predicted_items)
                             for item in relevant_items[user]])
        else:
            raise ValueError(f"Unknown value {metric} for Argument `metric`")

        user_scores.append(score)

np.mean(user_scores)

0.03904255319148937

In [128]:
metric = 'mrr'
user_scores = []
relevant_items = get_relevant_items(test_ratings)

for user in data.users:
    if user in relevant_items.keys():
        predicted_items = get_recommendations(recommender, user, user_train_ratings, N, users, items)
        predicted_items = [item for item, _ in predicted_items]
        if metric == 'map':
            true_positives = np.intersect1d(relevant_items[user],
                                            predicted_items)
            score = len(true_positives) / N
        elif metric == 'mrr':
            score = np.mean([reciprocal_rank(item, predicted_items)
                             for item in relevant_items[user]])
        else:
            raise ValueError(f"Unknown value {metric} for Argument `metric`")

        user_scores.append(score)

np.mean(user_scores)

0.00586257796559588

* wrap into Recommender class

* perform and compare for CB and hybrid cases
* draw chart