# Unit 9: LightFM

You almost made it - this is the final lesson and it is also going to be the easiest one.

As you might expect, there are many recommender packages available for Python. In this lesson we look at LightFM - an easy-to-use, lightweight implementation of several approaches and algorithms (FM, BPR, WARP, ...) for building collaborative filtering (CF), content-based filtering (CBF) and hybrid recommenders.

Within a few lines of code we set up, train and use a recommender to generate recommendations.

* [LightFM on GitHub](https://github.com/lyst/lightfm)
* [LightFM documentation](https://making.lyst.com/lightfm/docs/home.html)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import coo_matrix

from recsys_training.data import Dataset, genres

In [None]:
from lightfm.datasets import fetch_movielens
from lightfm.evaluation import precision_at_k
from lightfm import LightFM

In [None]:
# Relative paths to the raw MovieLens 100k files used in this notebook:
# ratings (u.data), item metadata (u.item) and user metadata (u.user).
ml100k_ratings_filepath = '../../data/raw/ml-100k/u.data'
ml100k_item_filepath = '../../data/raw/ml-100k/u.item'
ml100k_user_filepath = '../../data/raw/ml-100k/u.user'

## Load Data

### You may easily load Movielens Data ...

In [None]:
# Load MovieLens through LightFM's built-in helper, requesting a minimum
# rating of 4.0 and genre-based item features.
data = fetch_movielens(min_rating=4.0, genre_features=True)

In [None]:
# Display the object returned by `fetch_movielens`.
data

### But, we want to use the exact same data and split that we used in the lessons before

In [None]:
# Rebind `data` to the course's own `Dataset` wrapper around the raw ratings
# file, then filter and split it.
# NOTE(review): presumably `filter(min_rating=4.0)` keeps only ratings >= 4.0
# and `rating_split(seed=42)` creates a reproducible train/test split exposed
# as `train_ratings` / `test_ratings` - confirm in recsys_training.data.
data = Dataset(ml100k_ratings_filepath)
data.filter(min_rating=4.0)
data.rating_split(seed=42)

#### Transform our training and testing data into sparse matrices

In [None]:
# Both interaction matrices share the full (n_users x n_items) shape so that
# row/column positions refer to the same users/items in train and test.
interaction_shape = (data.n_users, data.n_items)

# --- Training ratings -> sparse COO matrix ---
# User and item ids are shifted by one to obtain 0-based matrix positions.
rows = data.train_ratings["user"].values - 1
cols = data.train_ratings["item"].values - 1
ratings = data.train_ratings["rating"].values
train_positions = (rows, cols)

train_mat = coo_matrix((ratings, train_positions), shape=interaction_shape)


# --- Test ratings -> sparse COO matrix ---
# Same 0-based shift as for the training data.
rows = data.test_ratings["user"].values - 1
cols = data.test_ratings["item"].values - 1
ratings = data.test_ratings["rating"].values
test_positions = (rows, cols)

test_mat = coo_matrix((ratings, test_positions), shape=interaction_shape)

In [None]:
# Display the sparse training interaction matrix.
train_mat

In [None]:
# Display the sparse test interaction matrix.
test_mat

## Collaborative Filtering

In [None]:
# Cutoff used for evaluation: precision is computed on the top-N items
N = 10

# Number of training epochs passed to `fit`
epochs = 10

# Keyword arguments for the LightFM constructor (collaborative filtering run)
params = {
    "no_components": 10,     # size of the latent embeddings
    "loss": "bpr",           # Bayesian Personalised Ranking loss
    "learning_rate": 0.07,
    "random_state": 42,      # fixed seed for reproducible runs
    "user_alpha": 0.0002,    # L2 penalty on user features
    "item_alpha": 0.0002,    # L2 penalty on item features
}

In [None]:
# Instantiate the pure collaborative filtering model with the hyperparameters
# collected in `params`.
cf_model = LightFM(**params)

In [None]:
# Train on the training interactions only (no user/item features are passed),
# running `epochs` epochs with verbose output enabled.
cf_model.fit(train_mat, epochs=epochs, verbose=True)

### Evaluate the mean `Precision@10` on test data

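If we also pass the training interactions to the evaluation function, items a user already interacted with during training (known positives) are excluded from that user's ranked recommendations before the metric is computed.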

In [None]:
# Per-user precision@N on the held-out test interactions; the training
# interactions are passed explicitly as `train_interactions`.
prec_at_N = precision_at_k(
    cf_model,
    test_interactions=test_mat,
    train_interactions=train_mat,
    k=N,
)

In [None]:
# Average the per-user precision values into a single score.
prec_at_N.mean()

### Evaluate the mean `Precision@10` on train data

In [None]:
# Per-user precision@N measured on the training interactions themselves
# (no interactions are passed for exclusion here).
prec_at_N = precision_at_k(
    cf_model,
    test_interactions=train_mat,
    k=N,
)

In [None]:
# Average the per-user training precision values into a single score.
prec_at_N.mean()

Try tuning the regularization to improve the recommendation relevancy - adjust the `user_alpha` and `item_alpha` values in the `params` dictionary and look for values that reduce the gap between train and test precision.

## Hybrid (CF + CBF)

### Load user and item features

In [None]:
def min_max_scale(val, bounds):
    """Linearly rescale `val` so that `bounds['min']` maps to 0 and `bounds['max']` to 1.

    Args:
        val: numeric value to scale.
        bounds: mapping with the keys 'min' and 'max' holding the lower and
            upper reference values.

    Returns:
        `(val - min) / (max - min)`. If both bounds are equal, the range is
        treated as 1, so a value equal to the bounds maps to 0.
    """
    lower = bounds['min']
    min_max_range = bounds['max'] - lower
    if min_max_range == 0:
        # Degenerate bounds (min == max): avoid dividing by zero by falling
        # back to a unit range, as common min-max scalers do.
        min_max_range = 1
    return (val - lower) / min_max_range


def user_profiler(group):
    """Summarise one user's rated items into a single profile Series.

    The profile holds the mean of every genre column followed by the mean,
    standard deviation and median ('50%') of the `release_year` column.
    """
    year_summary = group['release_year'].describe()
    year_dist = year_summary[['mean', 'std', '50%']]
    genre_dist = group[genres].mean()

    profile_parts = (genre_dist, year_dist)
    return pd.concat(profile_parts, axis=0)


def get_user_profiles(ratings: pd.DataFrame,
                      item_feat: pd.DataFrame,
                      min_rating: float = 4.0) -> pd.DataFrame:
    """Build one feature profile per user from the items they rated highly.

    Args:
        ratings: frame with at least the columns 'user', 'item' and 'rating'.
        item_feat: item feature frame with an 'item' column to join on.
        min_rating: only ratings greater than or equal to this value are used.

    Returns:
        Frame with a 'user' column plus the statistics computed by
        `user_profiler`; the '50%' column is renamed to 'median'.
    """
    is_relevant = ratings.rating >= min_rating
    liked = ratings[is_relevant][['user', 'item']]

    # Attach the item features, then drop the item id so that only feature
    # columns (and the user id used for grouping) remain.
    liked_with_feat = liked.merge(item_feat, on='item', how='left').drop(['item'], axis=1)

    per_user = liked_with_feat.groupby('user').apply(user_profiler).reset_index()
    return per_user.rename(columns={'50%': 'median'})


# Read the raw item and user tables ('|'-separated, no header row) and turn
# them into purely numeric feature frames indexed by item id / user id.
item_feat = pd.read_csv(ml100k_item_filepath, sep='|', header=None,
                        names=['item', 'title', 'release', 'video_release', 'imdb_url']+genres,
                        engine='python')

user_feat = pd.read_csv(ml100k_user_filepath, sep='|', header=None,
                        names=['user', 'age', 'gender', 'occupation', 'zip'])

# Infer the release year: split the release date on '-' and keep the third
# part as an integer (only for rows that have a release date)
idxs = item_feat[item_feat['release'].notnull()].index
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release'].str.split('-')
item_feat.loc[idxs, 'release_year'] = item_feat.loc[idxs, 'release_year'].apply(lambda val: val[2]).astype(int)

# Impute median release year value for the items with missing release year
# (`top_year` is the '50%' entry of `describe()`, i.e. the median)
top_year = item_feat.loc[idxs, 'release_year'].astype(int).describe()['50%']
idx = item_feat[item_feat['release'].isnull()].index
item_feat.loc[idx, 'release_year'] = top_year

# Min-max scale the release year using the observed minimum and maximum
item_year_bounds = {'min': item_feat['release_year'].min(),
                    'max': item_feat['release_year'].max()}
item_feat['release_year'] = item_feat['release_year'].apply(
    lambda year: min_max_scale(year, item_year_bounds))

# Drop the remaining non-feature columns
item_feat.drop(['title', 'release', 'video_release', 'imdb_url'], axis=1, inplace=True)

# Min-max scale the age using the observed minimum and maximum
user_age_bounds = {'min': user_feat['age'].min(),
                   'max': user_feat['age'].max()}
user_feat['age'] = user_feat['age'].apply(lambda age: min_max_scale(age, user_age_bounds))

# Transform gender characters to numerical values (categories):
# sorted unique values are numbered starting at 0
genders = sorted(user_feat['gender'].unique())
user_gender_map = dict(zip(genders, range(len(genders))))
user_feat['gender'] = user_feat['gender'].map(user_gender_map)

# Transform occupation strings to numerical values (categories):
# sorted unique values are numbered starting at 0
occupations = sorted(user_feat['occupation'].unique())
user_occupation_map = dict(zip(occupations, range(len(occupations))))
user_feat['occupation'] = user_feat['occupation'].map(user_occupation_map)

# Transform the zip codes to categories: non-numeric codes are replaced by
# '00000', then the last `zip_digits_to_cut` (= 3) digits are dropped via
# integer division, which leaves the leading two digits of a five-digit code
idxs = user_feat[~user_feat['zip'].str.isnumeric()].index
user_feat.loc[idxs, 'zip'] = '00000'
zip_digits_to_cut = 3
user_feat['zip'] = user_feat['zip'].apply(lambda val: int(val) // 10 ** zip_digits_to_cut)


# Add per-user taste profiles computed from the training ratings only; the
# left merge keeps every user, including those without a profile
profiles = get_user_profiles(data.train_ratings, item_feat)
user_feat = user_feat.merge(profiles, on='user', how='left')

# One-hot encode the occupation and zip categories and swap them in for the
# original categorical columns
occupation_1H = pd.get_dummies(user_feat['occupation'], prefix='occupation')
zip_1H = pd.get_dummies(user_feat['zip'], prefix='zip')

user_feat.drop(['occupation', 'zip', ], axis=1, inplace=True)
user_feat = pd.concat([user_feat, occupation_1H, zip_1H], axis=1)

# Missing values (e.g. profile columns of users without a profile) become 0
user_feat.fillna(0, inplace=True)


# Use the user id / item id as index and drop the id columns so that only
# feature columns remain
user_feat.index = user_feat['user'].values
user_feat.drop('user', axis=1, inplace=True)

item_feat.index = item_feat['item'].values
item_feat.drop('item', axis=1, inplace=True)

In [None]:
# Share of entries in the user feature table that are exactly zero.
(user_feat==0).sum().sum()/user_feat.size

In [None]:
# Share of entries in the item feature table that are exactly zero.
(item_feat==0).sum().sum()/item_feat.size

In [None]:
# Create User Feature COO Matrix: the metadata columns followed by one
# indicator column per user (identity block).
# Indicator-only alternative without any metadata:
# user_feat_mat = coo_matrix(np.eye(data.n_users))
# The explicit float cast matters: recent pandas versions return boolean
# one-hot columns from `get_dummies`, and a frame mixing bool and float
# columns yields an object array that scipy cannot convert to a sparse matrix.
user_feat_values = user_feat.values.astype(float)
user_feat_mat = coo_matrix(np.concatenate((user_feat_values, np.eye(data.n_users)), axis=1))

# Create Item Feature COO Matrix: the metadata columns followed by one
# indicator column per item (identity block).
# Indicator-only alternative without any metadata:
# item_feat_mat = coo_matrix(np.eye(data.n_items))
# Cast to float for the same reason and for consistent dtypes.
item_feat_values = item_feat.values.astype(float)
item_feat_mat = coo_matrix(np.concatenate((item_feat_values, np.eye(data.n_items)), axis=1))

In [None]:
# Display the sparse user feature matrix.
user_feat_mat

In [None]:
# Display the sparse item feature matrix.
item_feat_mat

### Model Training

![](Parrot.png)

**Task:** Check the [LightFM API](https://making.lyst.com/lightfm/docs/home.html) to see how you can incorporate the user and item feature matrices - can you tweak the algorithm to beat pure Collaborative Filtering?

In [None]:
# Cutoff used for evaluation: precision is computed on the top-N items
N = 10

# Number of training epochs for the hybrid model
epochs = 10

# Keyword arguments for the LightFM constructor (hybrid run)
params = {
    "no_components": 10,     # size of the latent embeddings
    "loss": "warp",          # Weighted Approximate-Rank Pairwise loss
    "learning_rate": 0.03,
    "random_state": 42,      # fixed seed for reproducible runs
    "user_alpha": 0.0001,    # L2 penalty on user features
    "item_alpha": 0.0001,    # L2 penalty on item features
}

In [None]:
# Exercise placeholder: replace `None` with a trained hybrid LightFM model.
# NOTE(review): while `hybrid_model` is still None, the evaluation cell that
# follows cannot run successfully.
hybrid_model = None

#
# Up to you ;)
# Hint: build the model from `params` and fit it on `train_mat`, this time
# also passing `user_feat_mat` / `item_feat_mat` as user and item features.
#

In [None]:
# Per-user precision@N of the hybrid model on the test interactions. The
# training interactions are passed explicitly as `train_interactions`, and
# the feature matrices are supplied for scoring.
hybrid_eval_kwargs = {
    "test_interactions": test_mat,
    "train_interactions": train_mat,
    "k": N,
    "user_features": user_feat_mat,
    "item_features": item_feat_mat,
}
prec_at_N = precision_at_k(hybrid_model, **hybrid_eval_kwargs)

In [None]:
# Average the per-user precision values of the hybrid model into one score.
prec_at_N.mean()