# Unit 7: Content-based Filtering for Rating Prediction using a Factorization Machine

In this section, we switch from collaborative to content-based filtering. Where collaborative filtering exploits similarities among interactions, content-based filtering exploits similarities between user and/or item features. It finds combinations of user-item features that help to predict ratings or rankings.

Although we discussed the superiority of the ranking approach before, we do rating prediction again here for simplicity. The rating predictions are then used to impose an ordering on the items, and the top-ranked items are recommended to the user.

The model we use for the relationship between features and ratings is a factorization machine, which is similar to matrix factorization but offers more flexibility in modeling.

In [7]:
from collections import OrderedDict
import itertools
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyfm import pylibfm
from scipy import sparse
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [8]:
from recsys_training.data import Dataset, genres, get_user_profiles
from recsys_training.evaluation import get_relevant_items

In [9]:
ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../../data/raw/ml-100k/u.user'

## Load Data

In [10]:
# Load the MovieLens 100k ratings and split them into train and test ratings
# with a fixed seed so that the split is reproducible.
data = Dataset(ml100k_ratings_filepath)
data.rating_split(seed=42)
# Per-user ratings; used further below as `user_ratings[user].keys()` to look up
# the items a user has already rated (presumably a user -> {item: rating}
# mapping -- TODO confirm against `Dataset.get_user_ratings`).
user_ratings = data.get_user_ratings()

In [11]:
# Item content: pipe-separated file without a header row. The first five columns
# are item metadata, the remaining ones are the binary genre indicators listed
# in `genres`.
# NOTE(review): no explicit `encoding` is passed; `engine='python'` is presumably
# a workaround for parsing this file -- confirm.
item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                        names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                        engine='python')

In [12]:
# User content: pipe-separated file without a header row holding one row per
# user with demographic information (age, gender, occupation, zip code).
user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                        names=['user', 'age', 'gender', 'occupation', 'zip'])

## User and Item Content (Features)

### Exploration

In [13]:
item_feat.head()

Unnamed: 0,item,title,release,video_release,imdb_url,unknown,Action,Adventure,Animation,Children,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
user_feat.head()

Unnamed: 0,user,age,gender,occupation,zip
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


### Preprocessing

#### Items

We keep the following information for items:
* release year
* genres

In [15]:
def min_max_scale(val, bounds):
    """Linearly rescale ``val`` relative to the interval described by ``bounds``.

    Args:
        val: value to rescale (a scalar or anything supporting ``-`` and ``/``)
        bounds: mapping with the keys ``'min'`` and ``'max'``

    Returns:
        ``(val - min) / (max - min)``, i.e. values within the bounds are mapped
        to [0, 1]. If ``max == min`` the range is treated as 1, so the result
        is ``val - min`` (0 for in-bounds values) instead of a division by zero.
    """
    lower = bounds['min']
    min_max_range = bounds['max'] - lower
    if min_max_range == 0:
        # Degenerate (constant) feature: avoid ZeroDivisionError / inf / NaN
        min_max_range = 1
    return (val - lower) / min_max_range

In [16]:
# Infer the release year
# Release dates are strings such as '01-Jan-1995'; splitting on '-' yields
# [day, month, year], so element 2 is the year. Only rows with a release date
# are processed here, the remaining rows are imputed below.
idxs = item_feat[item_feat['release'].notnull()].index
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release'].str.split('-')
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)

# Impute median release year value for the items with missing release year
# NOTE: despite its name, `top_year` is the median ('50%' quantile reported by
# `describe()`), not the most frequent year.
top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']
idx = item_feat[item_feat['release'].isnull()].index
item_feat.loc[idx, 'release_year'] = top_year

# Min-max scale the release year
# The bounds are computed after imputation and kept in `item_year_bounds`.
item_year_bounds = {'min': item_feat['release_year'].min(),
                    'max': item_feat['release_year'].max()}
item_feat['release_year'] = item_feat['release_year'].apply(
    lambda year: min_max_scale(year, item_year_bounds))

# Drop other columns
# Only the item id, the genre indicators and the scaled release year remain.
item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)

In [17]:
item_feat.head()

Unnamed: 0,item,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0.960526
1,2,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.960526
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0.960526
3,4,0,1,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.960526
4,5,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,1,0,0,0.960526


#### Users

We keep the following information for users:
* age
* gender
* occupation
* zip-code

In [18]:
# Min-max scale the age
user_age_bounds = {'min': user_feat['age'].min(),
                   'max': user_feat['age'].max()}
user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))

# Transform gender characters to numerical values (categories)
# Codes are assigned in sorted order of the distinct values (0, 1, ...).
genders = sorted(user_feat['gender'].unique())
user_gender_map = dict(zip(genders, range(len(genders))))
user_feat['gender'] = user_feat['gender'].map(user_gender_map)

# Transform occupation strings to numerical values (categories)
# Same scheme as for gender: integer codes in sorted order of the distinct values.
occupations = sorted(user_feat['occupation'].unique())
user_occupation_map = dict(zip(occupations, range(len(occupations))))
user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)

# Transform the zip codes to categories and impute for non-numeric values:
# non-numeric zip codes are replaced by '00000', then the last
# `zip_digits_to_cut` (= 3) digits are cut off by integer division, so for a
# five-digit zip code only the first two digits remain (e.g. '85711' -> 85).
# NOTE(review): imputed zip codes end up in category 0, together with any
# genuine zip code below 01000 -- confirm this is intended.
idxs = user_feat[~user_feat['zip'].str.isnumeric()].index
user_feat.loc[idxs, 'zip'] = '00000'
zip_digits_to_cut = 3
user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)

![](../Parrot.png)

**TODO:** Complete `get_user_profiles` to infer user profiles combining their ratings with the item features the users liked

In addition, we infer profiles by combining item information with rating data for each user to get features that represent the users' preferred genres and film age.

In [19]:
def user_profiler(group):
    """Summarize the item content of one user's rated items.

    Args:
        group: rows of a single user; must provide one column per entry of
            `genres` and a `release_year` column

    Returns:
        pd.Series holding the mean of every genre column followed by the
        `mean`, `std` and `50%` (median) statistics of `release_year`
    """
    # `describe()` labels the median as '50%'
    year_stats = group['release_year'].describe().loc[['mean', 'std', '50%']]
    genre_shares = group[genres].mean()

    return pd.concat([genre_shares, year_stats], axis=0)

In [20]:
def get_user_profiles(ratings: pd.DataFrame,
                      item_feat: pd.DataFrame,
                      min_rating: float = 4.0) -> pd.DataFrame:
    """Build one content profile per user from the items the user rated highly.

    Args:
        ratings: rating data with the columns `user`, `item` and `rating`
        item_feat: item content with an `item` column to join on
        min_rating: only ratings greater than or equal to this value count
            as "liked" and contribute to the profile

    Returns:
        DataFrame with one row per user that has at least one such rating:
        the `user` column followed by the statistics computed by
        `user_profiler`, with the `50%` column renamed to `median`
    """
    # Keep only the (user, item) pairs of sufficiently high ratings
    liked = ratings.loc[ratings.rating >= min_rating, ['user', 'item']]

    # Attach the content of every liked item; the item id itself is not a feature
    liked_content = liked.merge(item_feat, on='item', how='left').drop(columns=['item'])

    profiles = liked_content.groupby('user').apply(user_profiler).reset_index()

    return profiles.rename(columns={'50%': 'median'})

Finally, we join the original user information with their profiles' information and one-hot-encode categorical information

In [21]:
# Profiles are inferred from the training ratings only and attached to the
# demographic user information (left join keeps users without a profile)
profiles = get_user_profiles(data.train_ratings, item_feat)
user_feat = pd.merge(user_feat, profiles, how='left', on='user')

# One-hot-encode the categorical columns and swap them in for the originals
occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')
user_feat = pd.concat(
    [user_feat.drop(columns=['occupation', 'zip']), occupation_1H, zip_1H],
    axis=1)

# Users without a profile (and undefined statistics) get zeros
user_feat = user_feat.fillna(0)

We remove the user/item id columns and replace the current dataframe indices with their values

In [22]:
# Move the id columns into the (unnamed) index so that rows can be looked up
# by user/item id via `.loc`
user_feat.index = user_feat.pop('user').values
item_feat.index = item_feat.pop('item').values

### Final Check

In [23]:
item_feat.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,release_year
1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.960526
2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0.960526
4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0.960526
5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0.960526


In [24]:
user_feat.head()

Unnamed: 0,age,gender,unknown,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,zip_90,zip_91,zip_92,zip_93,zip_94,zip_95,zip_96,zip_97,zip_98,zip_99
1,0.257576,1,0.00813,0.235772,0.097561,0.04065,0.04065,0.308943,0.081301,0.04065,...,0,0,0,0,0,0,0,0,0,0
2,0.69697,0,0.0,0.193548,0.064516,0.032258,0.032258,0.258065,0.193548,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0.242424,1,0.0,0.25,0.166667,0.0,0.0,0.25,0.25,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.257576,1,0.0,0.285714,0.142857,0.0,0.0,0.142857,0.214286,0.071429,...,0,0,0,0,0,0,0,0,0,0
5,0.393939,0,0.021277,0.382979,0.234043,0.12766,0.06383,0.595745,0.148936,0.0,...,0,0,0,0,0,0,0,0,0,0


Sparsity of user/item content information

In [25]:
(user_feat==0).sum().sum()/user_feat.size

0.861337364529982

In [26]:
(item_feat==0).sum().sum()/item_feat.size

0.8640309155766944

## Factorization Machine for a Content-based Recommender

![FM](../fm.png)

[Steffen Rendle: Factorization Machines](https://www.csie.ntu.edu.tw/~b97053/paper/Rendle2010FM.pdf)

[pyFM - Factorization Machines in Python](https://github.com/coreylynch/pyFM)

#### Create Feature Matrices

In [27]:
# fetch content information for all observed user-item rating combinations
# Look up the content rows for every observed (user, item) rating pair;
# row i of each array belongs to rating i of the respective split
train_pairs, test_pairs = data.train_ratings, data.test_ratings
user_cb_feat_train = user_feat.loc[train_pairs['user'].values].values
item_cb_feat_train = item_feat.loc[train_pairs['item'].values].values
user_cb_feat_test = user_feat.loc[test_pairs['user'].values].values
item_cb_feat_test = item_feat.loc[test_pairs['item'].values].values

In [28]:
# concatenate user and item content information to form design matrices
# and convert to sparse matrix in Compressed Sparse Row (CSR) format
# Design matrices: user features followed by item features, column-wise,
# stored in Compressed Sparse Row (CSR) format
X_train = sparse.csr_matrix(np.hstack((user_cb_feat_train, item_cb_feat_train)))
X_test = sparse.csr_matrix(np.hstack((user_cb_feat_test, item_cb_feat_test)))

In [29]:
def get_sparsity(sparse_arr) -> float:
    """Return the share of entries of a 2D sparse matrix that are not stored.

    Args:
        sparse_arr: matrix exposing `shape` (rows, columns) and `nnz`
            (number of stored elements)

    Returns:
        1 minus the density, where density is `nnz` over the total number
        of entries
    """
    n_rows, n_cols = sparse_arr.shape[0], sparse_arr.shape[1]
    return 1 - sparse_arr.nnz / (n_rows * n_cols)

In [30]:
X_train

<80000x161 sparse matrix of type '<class 'numpy.float64'>'
	with 1995351 stored elements in Compressed Sparse Row format>

In [31]:
# Sparsity of Training Data
get_sparsity(X_train)

0.8450814440993789

In [32]:
X_test

<20000x161 sparse matrix of type '<class 'numpy.float64'>'
	with 498098 stored elements in Compressed Sparse Row format>

In [33]:
# Sparsity of Test Data
get_sparsity(X_test)

0.8453111801242236

#### Create Target Matrices for Rating Predictions

In [34]:
# Regression targets: the observed ratings, in the same row order as the
# rating pairs that X_train / X_test were built from.
# y_train is cast to float for fitting; y_test keeps its original dtype and is
# only used to compute the error metrics below.
y_train = data.train_ratings.rating.values.astype(float)
y_test = data.test_ratings.rating.values

#### Train Factorization Machine for Rating Prediction as Regressor using pyFM

In [35]:
n_epochs = 10  # number of full stochastic passes through the training data
k = 16  # passed to the model as `num_factors`
random_seed = 28  # passed to the model as `seed`

In [36]:
# Factorization machine set up for regression (rating prediction) on the
# content-based design matrix.
# As reported in the training log below, pyFM holds out 1% of the training
# data as a validation set for adaptive regularization.
fm_cb = pylibfm.FM(num_factors=k,
                   num_iter=n_epochs,
                   verbose=True,
                   task="regression",
                   initial_learning_rate=0.001,
                   learning_rate_schedule="optimal",
                   seed=random_seed)
fm_cb.fit(X_train, y_train)

Creating validation dataset of 0.01 of training for adaptive regularization
-- Epoch 1
Training MSE: 0.58234
-- Epoch 2
Training MSE: 0.54275
-- Epoch 3
Training MSE: 0.52975
-- Epoch 4
Training MSE: 0.52318
-- Epoch 5
Training MSE: 0.52044
-- Epoch 6
Training MSE: 0.51692
-- Epoch 7
Training MSE: 0.51528
-- Epoch 8
Training MSE: 0.51396
-- Epoch 9
Training MSE: 0.51294
-- Epoch 10
Training MSE: 0.51197


### Evaluation on Test Set

In [37]:
y_pred = fm_cb.predict(X_test)

$MSE$

In [38]:
mean_squared_error(y_test, y_pred)

1.044349402408133

$MAE$

In [39]:
mean_absolute_error(y_test, y_pred)

0.8222544325267082

In [40]:
def get_prediction(fm: object, user: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,
                   items: np.ndarray = None, remove_known_pos: bool = True) -> Dict[int, Dict[str, float]]:
    """Predict ratings of one user for a set of items, best items first.

    Args:
        fm: fitted model exposing `predict(X)` for a sparse design matrix whose
            columns are the user features followed by the item features
        user: user id; must be part of the index of `user_feat`
        user_feat: user content, indexed by user id
        item_feat: item content, indexed by item id
        items: item id(s) to score - a single integer, a list or an array.
            If None, the candidates are taken from the notebook-level `data`
            (see `remove_known_pos`)
        remove_known_pos: only used if `items` is None; if True, all items
            found in the notebook-level `user_ratings[user]` are excluded
            from `data.items`, otherwise all of `data.items` are scored

    Returns:
        Dict mapping item id to `{'pred': <predicted rating>}`, ordered by
        descending prediction; empty if there is no item to score
    """
    if items is None:
        if remove_known_pos:
            # Predict from unobserved items
            known_items = np.array(list(user_ratings[user].keys()))
            items = np.setdiff1d(data.items, known_items)
        else:
            items = np.array(data.items)
    # Accept a single id (Python or NumPy integer) as well as lists and arrays
    items = np.atleast_1d(np.asarray(items))

    n_items = len(items)
    if n_items == 0:
        return {}

    # Repeat the user's feature row once per candidate item
    single_user_cb_feat = user_feat.loc[user].values.reshape(1, -1).repeat(n_items, axis=0)
    all_items_cb_feat = item_feat.loc[items].values

    input_data = np.concatenate((single_user_cb_feat, all_items_cb_feat), axis=1)
    # Cast to float: feature frames with mixed column dtypes can yield an
    # object array, which cannot be converted to a CSR matrix
    input_data = sparse.csr_matrix(input_data.astype(float))

    preds = np.asarray(fm.predict(input_data))
    sorting = np.argsort(preds)[::-1]

    return {item: {'pred': pred} for item, pred in
            zip(items[sorting], preds[sorting])}

In [41]:
predictions = get_prediction(fm_cb, 1, user_feat, item_feat)
list(predictions.items())[:20]

[(656, {'pred': 4.8035651984148116}),
 (675, {'pred': 4.800327024891117}),
 (1122, {'pred': 4.735948099109543}),
 (1542, {'pred': 4.685597876365361}),
 (615, {'pred': 4.640353965197814}),
 (429, {'pred': 4.6296556721542235}),
 (617, {'pred': 4.6243379942996246}),
 (484, {'pred': 4.613840448074965}),
 (1453, {'pred': 4.603821320164888}),
 (836, {'pred': 4.577358902326652}),
 (835, {'pred': 4.572232971320044}),
 (525, {'pred': 4.56803112914736}),
 (1397, {'pred': 4.563078112233887}),
 (1198, {'pred': 4.562231550809198}),
 (1020, {'pred': 4.559356412016999}),
 (607, {'pred': 4.558273993012009}),
 (478, {'pred': 4.555062382849775}),
 (1203, {'pred': 4.549314282204439}),
 (659, {'pred': 4.538371630059824}),
 (1451, {'pred': 4.533448720243973})]

In [42]:
def get_recommendations(fm_cb: object, user: int, N: int, user_feat: pd.DataFrame, item_feat: pd.DataFrame,
                        remove_known_pos: bool = True) -> List[Tuple[int, Dict[str, float]]]:
    """Return the top-N items for a user according to the predicted ratings.

    Args:
        fm_cb: fitted model, passed on to `get_prediction`
        user: user id to recommend for
        N: maximum number of recommendations; values <= 0 yield an empty list
        user_feat: user content, indexed by user id
        item_feat: item content, indexed by item id
        remove_known_pos: passed on to `get_prediction`

    Returns:
        List of at most N `(item, {'pred': <predicted rating>})` tuples in the
        order returned by `get_prediction` (descending prediction)
    """
    predictions = get_prediction(fm_cb, user, user_feat, item_feat,
                                 remove_known_pos=remove_known_pos)
    # The predictions are already ordered, so the first N entries are the top N.
    # `max(N, 0)` keeps N <= 0 from returning every item (as the former
    # append-and-break loop did) or raising inside `islice`.
    return list(itertools.islice(predictions.items(), max(N, 0)))

In [43]:
get_recommendations(fm_cb, 1, N=10, user_feat=user_feat, item_feat=item_feat)

[(656, {'pred': 4.8035651984148116}),
 (675, {'pred': 4.800327024891117}),
 (1122, {'pred': 4.735948099109543}),
 (1542, {'pred': 4.685597876365361}),
 (615, {'pred': 4.640353965197814}),
 (429, {'pred': 4.6296556721542235}),
 (617, {'pred': 4.6243379942996246}),
 (484, {'pred': 4.613840448074965}),
 (1453, {'pred': 4.603821320164888}),
 (836, {'pred': 4.577358902326652})]

## Evaluation

In [44]:
N = 10

In [45]:
relevant_items = get_relevant_items(data.test_ratings)

In [46]:
# Precision@N per user: share of the N recommended items that are relevant in
# the test set. Users without relevant test items keep the value None.
users = relevant_items.keys()
prec_at_N = dict.fromkeys(data.users)

for user in users:
    top_n = get_recommendations(fm_cb, user, N, user_feat=user_feat,
                                item_feat=item_feat, remove_known_pos=True)
    # keep the item ids only, the predicted ratings are not needed here
    recommendations = [item for item, _ in top_n]
    hits = np.intersect1d(recommendations, relevant_items[user])
    prec_at_N[user] = hits.size/N

In [47]:
recommendations

[1122, 498, 1453, 1542, 1198, 617, 1397, 967, 1299, 612]

In [48]:
np.mean([val for val in prec_at_N.values() if val is not None])

0.01893617021276596