## Collaborative Filtering and Hybrid Filtering
### `LightFM` 

In [1]:
import pickle
import re
import time
import math
import itertools as it
import random
from datetime import datetime, timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from lightfm import LightFM
from lightfm.evaluation import auc_score
from scipy import sparse
from scipy.sparse import coo_matrix, csr_matrix
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import sys

sys.path.append('./modules')

np.random.seed(12)



## 1. Collaborative Filtering Model

In [2]:
data = pd.read_csv('cleaned.csv')

In [3]:
data.isna().sum()

uid         0
page        0
pageURL     0
datetime    0
interest    0
category    0
dtype: int64

In [4]:
page_df = pd.read_csv('./../databases/csv-data/cleaned.csv', encoding='utf-8')[['pageURL', 'page', 'category']]

In [5]:
# Load the data
ratings = pd.read_csv('./../databases/csv-data/cleaned.csv', parse_dates=['datetime'])
ratings = ratings[['uid', 'page', 'pageURL', 'interest', 'datetime']]
# Overwrite 'interest' with 1 on every row: each logged pageview counts as one
# implicit positive interaction, whatever value the CSV carried.
ratings.loc[:, 'interest'] = 1
# Filter out users with less than 2 pageviews
ratings = ratings.groupby(['uid']).filter(lambda x: len(x) > 1).reset_index(drop=True)
# Rank each user's rows by recency: rank 1 is the latest 'datetime';
# method='first' breaks timestamp ties by row order so ranks are unique per user.
ratings['rank_latest'] = ratings.groupby(['uid'])['datetime'].rank(method='first', ascending=False)

In [6]:
# Leave-one-out split: each user's most recent row (rank_latest == 1) is held
# out as the test interaction; every earlier row is used for training.
train_ratings = ratings[ratings['rank_latest'] != 1][['uid', 'pageURL', 'interest']]
test_ratings = ratings[ratings['rank_latest'] == 1]

In [7]:
def create_interaction_matrix(df, user_col, item_col, rating_col, norm=True):
    """Pivot a long interaction log into a user x item matrix.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format log with one row per interaction.
    user_col, item_col, rating_col : str
        Column names for the row key, the column key and the value to sum.
    norm : bool, default True
        When True, every cell whose summed rating is > 0 becomes 1 and all
        other cells become 0. When False the summed ratings are returned.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``user_col`` with one column per distinct ``item_col``
        value; user/item pairs missing from ``df`` are 0.
    """
    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )
    if norm:
        # Vectorised binarisation; replaces a per-cell ``applymap`` lambda,
        # which is deprecated in recent pandas and slow on wide matrices.
        interactions = (interactions > 0).astype('int64')
    return interactions

In [8]:
interactions = create_interaction_matrix(ratings, 'uid', 'pageURL', 'interest')
interactions.shape

(4415, 6605)

In [9]:
def create_subset_interactions(interactions, subset_ratings):
    """Build a 0/1 interaction matrix for a subset, aligned to ``interactions``.

    Parameters
    ----------
    interactions : pandas.DataFrame
        Full user x item matrix; only its index and columns are used, as the
        row/column layout of the result.
    subset_ratings : pandas.DataFrame
        Long-format rows with at least the columns 'uid', 'pageURL' and
        'interest'. Any other columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Same index and columns as ``interactions``. A cell is 1 when the
        subset's summed 'interest' for that (uid, pageURL) pair is > 0 and 0
        otherwise. Pairs whose uid or pageURL is not in ``interactions`` are
        dropped.
    """
    # Pivot only the rows that exist in the subset instead of melting the
    # whole (users x items) grid and left-merging onto it: same result, but
    # without materialising one row per matrix cell.
    totals = (
        subset_ratings.groupby(['uid', 'pageURL'])['interest']
        .sum()
        .unstack(fill_value=0)
    )
    # Align to the full matrix layout; pairs missing from the subset become 0.
    aligned = totals.reindex(index=interactions.index,
                             columns=interactions.columns,
                             fill_value=0)
    return (aligned > 0).astype('int64')

In [10]:
train_interactions = create_subset_interactions(interactions, train_ratings)
train_data = sparse.coo_matrix(train_interactions)

In [11]:
sum(sum(train_interactions.values))

26703

In [12]:
test_interactions = create_subset_interactions(interactions, test_ratings)
test_data = sparse.coo_matrix(test_interactions)

In [13]:
sum(sum(test_interactions.values))

4415

In [14]:
# Sparsity = share of user/page cells that hold no interaction.
non_zero = np.count_nonzero(interactions)
# np.prod: the np.product alias was removed in NumPy 2.0.
total_value = np.prod(interactions.shape)
sparsity = (total_value - non_zero) / total_value
sparsity

0.9989328925631171

### Train model

In [15]:
import json
import operator
import pickle as pkl
import random
import time
from collections import Counter
from itertools import product

import numpy as np
import pandas as pd
import scipy
from lightfm import LightFM
from lightfm.evaluation import (auc_score, precision_at_k, recall_at_k, reciprocal_rank)
from pandas.io.json import json_normalize
from scipy.sparse import csr_matrix
from sklearn.model_selection import StratifiedKFold, train_test_split

#### collaborative filtering - bpr

In [16]:
# model_bpr = LightFM(
#     no_components=30,
#     learning_schedule='adagrad',
#     loss='bpr',
#     learning_rate=0.05,
#     rho=0.95,
#     epsilon=1e-05,
#     item_alpha=0.001,
#     user_alpha=0.001,
#     random_state=12)

# start_time = time.time()
# model_bpr.fit(train_data, epochs=15, num_threads=1)
# end_time = time.time()
# print('Time taken for model train: {} secs'.format(round((end_time - start_time), 2)))

In [17]:
def run_model(model, train, test, item_feat=None, user_feat=None):
    """Print the mean AUC of ``model`` on the train and test interactions.

    ``auc_score`` is called twice with the same optional feature matrices:
    once on ``train`` alone, and once on ``test`` with ``train`` supplied as
    the third positional argument. Each per-user score array is averaged with
    ``.mean()`` and printed to three decimals. Nothing is returned.
    """
    feature_kwargs = {'item_features': item_feat, 'user_features': user_feat}

    mean_train_auc = auc_score(model, train, **feature_kwargs).mean()
    print('AUC: train %.3f.' % (mean_train_auc))

    # ``train`` is passed positionally here, after ``test``.
    mean_test_auc = auc_score(model, test, train, **feature_kwargs).mean()
    print('AUC: test %.3f.' % (mean_test_auc))

In [18]:
# run_model(model_bpr, train_data, test_data)

#### collaborative filtering - warp

In [19]:
# model_warp = LightFM(
#     no_components=30,
#     learning_schedule='adagrad',
#     loss='warp',
#     random_state=12)

# start_time = time.time()
# model_warp.fit(train_data, epochs=15, num_threads=1)
# end_time = time.time()
# print('Time taken for model train: {} secs'.format(round((end_time - start_time), 2)))

In [20]:
# run_model(model_warp, train_data, test_data)

In [21]:
# train_interactions = pd.DataFrame(train_val_data.todense(), index=uid, columns=iid)
# train_interactions

In [22]:
def create_combinations(parameters):
    """Expand a ``{name: [candidate values]}`` grid into a list of dicts.

    Each returned dict maps every parameter name to one candidate value; the
    list covers the full Cartesian product, in ``itertools.product`` order.
    """
    names = list(parameters.keys())
    grid = []
    for chosen_values in it.product(*parameters.values()):
        grid.append(dict(zip(names, chosen_values)))
    return grid

In [23]:
# possible_parameters =    {
#                         'no_components': [2,4,6,8,10,20,30,40,50,60,70,80],
#                         'learning_schedule': ['adagrad','adadelta'],
#                         'loss': ['bpr','warp'],
#                         'learning_rate': [0.05,0.01,0.005,0.001],
#                         'rho': [0.99,0.97,0.95,0.92,0.90,0.87,0.85,0.82,0.80],
#                         'epsilon': [1e-3,1e-04,1e-05,1e-06,1e-07],
#                         'item_alpha': [0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],
#                         'user_alpha': [0.1,0.05,0.01,0.005,0.001,0.0005,0.0001],
#                         'random_state': [12]
#                         }
# parameter_combinations = create_combinations(possible_parameters)

In [24]:
def hyperparameter_search(train_data, val_data, parameter_combinations, 
                                n_iter=2, epochs=15,  num_threads=1, verbose=False):
    """Randomly sample parameter sets, fit LightFM on each and rank by AUC.

    Parameters
    ----------
    train_data, val_data
        Interaction matrices passed to ``LightFM.fit`` and ``auc_score``;
        ``train_data`` is also given to ``auc_score`` as its third argument.
    parameter_combinations : list of dict
        Candidate keyword-argument dicts for ``LightFM(**parameters)``.
    n_iter : int, default 2
        Number of candidates to try. Capped at the number of available
        combinations (``random.sample`` would otherwise raise ValueError).
    epochs : int, default 15
        Training epochs for every candidate.
    num_threads : int, default 1
        Forwarded to ``fit`` and ``auc_score``.
    verbose : bool, default False
        Print the wall-clock time spent on each candidate.

    Returns
    -------
    pandas.DataFrame
        One row per candidate with the model's ``get_params()`` values plus
        'auc' and 'epochs', sorted by 'auc' descending with a fresh
        0..n-1 index. Empty when no candidate was evaluated.
    """
    n_trials = min(n_iter, len(parameter_combinations))
    rows = []
    for parameters in random.sample(parameter_combinations, n_trials):
        start_time = time.time()

        model = LightFM(**parameters)
        model.fit(train_data,
                  epochs=epochs,
                  num_threads=num_threads)
        auc = auc_score(model, val_data, train_data, num_threads=num_threads).mean()

        # Read the parameters back from the model that was actually fitted
        # rather than constructing a second throwaway instance.
        params = model.get_params()
        params['auc'] = auc
        params['epochs'] = epochs
        # Record the seed this candidate was really built with. The previous
        # code always stored 5, so re-creating the "best" model from the table
        # used a different seed than the one that produced its score.
        # NOTE(review): the override is kept because get_params() presumably
        # reports LightFM's internal RandomState object, not the seed - confirm.
        params['random_state'] = parameters.get('random_state')
        rows.append(params)

        end_time = time.time()
        if verbose:
            print('Time for training one set of parameters: {} secs'.format(round((end_time - start_time), 2)))

    if not rows:
        return pd.DataFrame(columns=['auc', 'epochs'])
    return pd.DataFrame(rows).sort_values(['auc'], ascending=False).reset_index(drop=True)

In [25]:
# table = hyperparameter_search(train_data, test_data, parameter_combinations, n_iter=10, 
#                           epochs=15, num_threads=1, verbose=True)

In [26]:
# table

In [27]:
def find_best_params(table):
    """Return the first row of ``table`` as a dict without 'auc' and 'epochs'.

    The table is expected to be sorted best-first already; a KeyError is
    raised if either bookkeeping column is missing.
    """
    top_row = table.iloc[0].to_dict()
    for bookkeeping_key in ('auc', 'epochs'):
        del top_row[bookkeeping_key]
    return top_row

In [28]:
# best_params = find_best_params(table)

In [29]:
# best_params

In [30]:
# start_time = time.time()

# model_cf = LightFM(**best_params)
# model_cf.fit(train_data,
#             epochs=15,
#             num_threads=1)

# end_time = time.time()
# print('Time taken for model train: {} secs'.format(round((end_time - start_time), 2)))

### Hyperparameter Searching II

In [31]:
def sample_hyperparameters():
    """Yield an endless stream of randomly drawn LightFM settings.

    Every yielded dict is freshly built and holds, in this order:
    'no_components', 'learning_schedule', 'loss', 'learning_rate',
    'max_sampled' and 'num_epochs'. All draws come from the global
    ``np.random`` state, so seeding it makes the stream repeatable.
    """
    rng = np.random
    while True:
        candidate = {}
        # The draw order below is part of the behaviour: it fixes which
        # random numbers each field receives for a given seed.
        candidate["no_components"] = rng.randint(2, 64)
        candidate["learning_schedule"] = rng.choice(["adagrad", "adadelta"])
        candidate["loss"] = rng.choice(["bpr", "warp"])
        candidate["learning_rate"] = rng.exponential(0.05)
        # item_alpha / user_alpha are intentionally not sampled here.
        candidate["max_sampled"] = rng.randint(1, 20)
        candidate["num_epochs"] = rng.randint(5, 50)
        yield candidate


def random_search(train, test, num_samples=10, num_threads=1):
    """Fit ``num_samples`` randomly configured LightFM models and rank them.

    Each candidate from ``sample_hyperparameters()`` is trained on ``train``
    for its own 'num_epochs' and scored with the mean ``auc_score`` on
    ``test`` (``train`` is passed as ``train_interactions``).

    Returns a DataFrame with one row per candidate - its hyperparameters,
    'num_epochs' and 'score' - sorted by 'score' descending with a reset index.
    """
    trials = []
    for candidate in it.islice(sample_hyperparameters(), num_samples):
        # 'num_epochs' is a fit() argument, not a LightFM constructor argument.
        epochs = candidate.pop("num_epochs")

        fitted = LightFM(**candidate)
        fitted.fit(train, epochs=epochs, num_threads=num_threads)
        mean_auc = auc_score(fitted, test, train_interactions=train, num_threads=num_threads).mean()

        trials.append({**candidate, "num_epochs": epochs, 'score': mean_auc})

    leaderboard = pd.DataFrame(trials)
    return leaderboard.sort_values(['score'], ascending=False).reset_index(drop=True)

In [32]:
table = random_search(train_data, test_data, num_threads=1)
table

Unnamed: 0,no_components,learning_schedule,loss,learning_rate,max_sampled,num_epochs,score
0,55,adadelta,warp,0.026098,11,11,0.926851
1,35,adagrad,warp,0.033216,19,27,0.924608
2,45,adagrad,warp,0.012272,5,16,0.910729
3,27,adagrad,warp,0.007379,1,45,0.910063
4,13,adadelta,bpr,0.103331,3,8,0.67175
5,56,adadelta,bpr,0.060638,5,6,0.659042
6,5,adagrad,bpr,0.125509,6,18,0.622992
7,52,adagrad,bpr,0.041407,14,14,0.61756
8,14,adadelta,bpr,0.095776,5,40,0.616489
9,47,adagrad,bpr,0.018546,16,30,0.594675


In [33]:
def get_best_params(table):
    """Split the first row of ``table`` into model parameters and epoch count.

    Returns ``(params, num_epochs)`` where ``params`` is the row as a dict
    with the 'score' and 'num_epochs' entries removed.
    """
    top_row = table.iloc[0].to_dict()
    top_row.pop('score')
    epochs = top_row.pop("num_epochs")
    return top_row, epochs

In [34]:
# Get best parameters and number of epochs
best_params, num_epochs = get_best_params(table)
num_epochs

11

In [35]:
# Copy instead of aliasing: a plain assignment would make params_warp the same
# dict object as best_params, so any later edit of best_params (e.g. switching
# the loss for another model) would silently change params_warp too.
params_warp = {**best_params, 'loss': 'warp'}

In [36]:
params_warp

{'no_components': 55,
 'learning_schedule': 'adadelta',
 'loss': 'warp',
 'learning_rate': 0.026097724509700826,
 'max_sampled': 11}

In [37]:
# Get model with best parameters
model_warp = LightFM(**params_warp, random_state=12)

# Time model fitting
start_time = time.time()
model_warp.fit(train_data,
                item_features=None,
                user_features=None,
                epochs=num_epochs,
                num_threads=1)

end_time = time.time()
print('Time taken for model train: {} secs'.format(round((end_time - start_time), 2)))

Time taken for model train: 4.46 secs


In [38]:
run_model(model_warp, train_data, test_data)

AUC: train 0.988.
AUC: test 0.930.


In [39]:
# Copy instead of aliasing: mutating best_params in place would also flip the
# 'loss' entry of every other name bound to that same dict.
params_bpr = {**best_params, 'loss': 'bpr'}

In [40]:
params_bpr

{'no_components': 55,
 'learning_schedule': 'adadelta',
 'loss': 'bpr',
 'learning_rate': 0.026097724509700826,
 'max_sampled': 11}

In [41]:
# Get model with best parameters
model_bpr = LightFM(**params_bpr, random_state=12)

# Time model fitting
start_time = time.time()
model_bpr.fit(train_data,
                item_features=None,
                user_features=None,
                epochs=num_epochs,
                num_threads=1)

end_time = time.time()
print('Time taken for model train: {} secs'.format(round((end_time - start_time), 2)))

Time taken for model train: 8.36 secs


In [42]:
run_model(model_bpr, train_data, test_data)

AUC: train 0.935.
AUC: test 0.628.


### Make Recommendations

In [43]:
def user_item_dicts(interactions, item_df):
    """Build the lookup tables used when producing recommendations.

    Parameters
    ----------
    interactions : pandas.DataFrame
        User x item matrix; only its index is used.
    item_df : pandas.DataFrame
        Must contain the columns 'pageURL' and 'page'.

    Returns
    -------
    tuple of (dict, dict)
        ``user_dict`` maps each index label of ``interactions`` to its row
        position; ``item_dict`` maps 'pageURL' to 'page'. When a URL occurs
        more than once, the last row wins.
    """
    user_dict = {user_id: position
                 for position, user_id in enumerate(interactions.index)}

    # zip over the columns works for any index; the former ``.loc[i, ...]``
    # over ``range(n)`` raised KeyError unless item_df had a 0..n-1 index.
    item_dict = dict(zip(item_df['pageURL'], item_df['page']))

    return user_dict, item_dict

In [44]:
user_dict, page_dict = user_item_dicts(interactions, page_df)

In [45]:
def items_for_user(model, train_interactions, user_id:str, user_dict, 
                               item_dict, num_rec_items = 10):
    """Print a user's known pages and the top recommended unseen pages.

    Parameters
    ----------
    model
        Object exposing ``predict(user_index, item_indices)`` that returns one
        score per item index.
    train_interactions : pandas.DataFrame
        User x item matrix. Its columns define the item order handed to
        ``predict`` and its row for ``user_id`` defines the known pages
        (cells > 0).
    user_id : str
        Index label of the user in ``train_interactions``.
    user_dict : dict
        Maps ``user_id`` to the integer user index expected by the model.
    item_dict : dict
        Maps an item label (column of ``train_interactions``) to the title
        that is printed.
    num_rec_items : int, default 10
        Number of recommendations to print.

    Returns
    -------
    None
        Output is printed; at most 100 known pages are listed.
    """
    # Use the matrix that was passed in rather than a module-level global, so
    # the item order always matches the data the caller supplied.
    n_items = train_interactions.shape[1]
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=train_interactions.columns)
    ranked_items = list(scores.sort_values(ascending=False).index)

    user_row = train_interactions.loc[user_id, :]
    known_items = list(pd.Series(user_row[user_row > 0].index).sort_values(ascending=False))
    # Set membership keeps the filter below linear in the number of items.
    known_lookup = set(known_items)

    recommended = [item for item in ranked_items if item not in known_lookup]
    recommended = recommended[0: num_rec_items]

    known_titles = [item_dict[item] for item in known_items]
    recommended_titles = [item_dict[item] for item in recommended]

    print("Pages user previously viewed:")
    for position, title in enumerate(known_titles[: 100], start=1):
        print(str(position) + '- ' + title)

    print("\nRecommended pages:")
    for position, title in enumerate(recommended_titles, start=1):
        print(str(position) + '- ' + title)

In [46]:
items_for_user(model_warp, train_interactions, "012a363175ab8c0dbbbd51b6201389ba", user_dict, 
                               page_dict, num_rec_items = 10)

Pages user previously viewed:
1- Coke and your nose - The Mix
2- Cocaine - The Mix

Recommended pages:
1- The Mix - Essential support for under 25s
2- Speak to Our Team - The Mix
3- Get Support - The Mix
4- Mental Health - The Mix
5- The Mix Counselling Service - The Mix
6- Crisis Messenger - The Mix
7- Apps and Tools - The Mix
8- About Us - The Mix
9- Email us - The Mix
10- Meet our team - The Mix


In [47]:
pd.set_option('max_colwidth', 400)

from IPython.display import display_html

def display_side_by_side(*args):
    """Render the given DataFrames next to each other in the notebook.

    The frames' ``to_html()`` outputs are concatenated, every occurrence of
    the substring 'table' is replaced by 'table style="display:inline"', and
    the result is passed to ``display_html`` as raw HTML.
    """
    combined_html = ''.join(frame.to_html() for frame in args)
    inline_html = combined_html.replace('table', 'table style="display:inline"')
    display_html(inline_html, raw=True)

In [180]:
def recommendation_for_user(model, user_id):
    """Display a user's top-10 candidate pages, history and held-out page.

    Reads the notebook-level ``test_ratings``, ``interactions``, ``user_dict``
    and ``ratings``. The user's held-out page (first 'pageURL' row in
    ``test_ratings``) is ranked against 99 pages drawn from those the user has
    a 0 for in ``interactions``. Three tables are shown via
    ``display_side_by_side``: the ten best-scoring candidates, the pages the
    user viewed before the held-out one, and the held-out page with a 'Hit'
    flag (1 if it is among the ten displayed candidates, else 0).

    Parameters
    ----------
    model
        Object exposing ``predict(user_index, item_indices)``.
    user_id
        Index label of the user in ``interactions``.
    """
    target_pageURL = test_ratings[test_ratings.uid==user_id]['pageURL'].values[0]
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)

    unseen_mask = (interactions[interactions.index==user_id]==0).values[0]
    # A private generator seeded with 123 draws the same sample as reseeding
    # the global RNG did, without resetting np.random for later code
    # (e.g. the sampling inside the evaluation metrics).
    rng = np.random.RandomState(123)
    sampled = list(rng.choice(interactions.columns[unseen_mask], 99)) + [target_pageURL]
    # The draw is with replacement, so drop repeats (keeping order). This
    # makes the displayed list and the hit flag use one shared ranking.
    candidates = list(dict.fromkeys(sampled))

    ranked = scores.reindex(candidates).sort_values(ascending=False)
    top10_pageURLs = ranked.index[:10]
    hit_status = int(target_pageURL in top10_pageURLs)

    # Translate each URL into the first page title recorded for it.
    page_list = [ratings[ratings['pageURL'] == url].page.unique()[0]
                 for url in top10_pageURLs]

    recom_page_df = pd.DataFrame(page_list, columns=[ 'Top 10 Recommended Pages'])

    previous_page_df = pd.DataFrame(ratings[(ratings['uid'] ==user_id) & (ratings['rank_latest']!=1)]['page'])\
    .rename(columns={'page':'Pages Previously Viewed'})

    test_page_df = pd.DataFrame({'Test Page':ratings[ratings['pageURL'] == target_pageURL].page.unique()[0],
                     'Hit': hit_status}, index=[0])

    display_side_by_side(recom_page_df, previous_page_df, test_page_df)

In [181]:
user_id = '8a126af7234b61961af8b6dd693423ae'
print(f"UserID: {user_id}")
recommendation_for_user(model_warp, user_id)

UserID: 8a126af7234b61961af8b6dd693423ae


Unnamed: 0,Top 10 Recommended Pages
0,Sex & Relationships - The Mix
1,Will my caution affect my job application? - The Mix
2,Buying prescription drugs online - The Mix
3,chatroom - The Mix
4,Hello 🦄 💖 ✨ — The Mix Support Community
5,Work & Study - Page 2 of 24 - The Mix
6,How to break up with someone - TheMix.org.uk
7,Cheapest fags you can buy in the UK? — The Mix Support Community
8,Talking to your partner about your mental health - The Mix
9,Recovering after an abortion - TheMix.org.uk

Unnamed: 0,Pages Previously Viewed
16823,Do I need to apply for settled or pre-settled status? - The Mix

Unnamed: 0,Test Page,Hit
0,Buying prescription drugs online - The Mix,1


In [182]:
user_id = 'b1201219c1fb71f9af289258c586f56f'
print(f"UserID: {user_id}")
recommendation_for_user(model_warp, user_id)

UserID: b1201219c1fb71f9af289258c586f56f


Unnamed: 0,Top 10 Recommended Pages
0,chatroom - The Mix
1,Expert Q&A - Sexplain and Jennifer Niven 28th Sept — The Mix Support Community
2,Paranoia - The Mix
3,Movin’ on from The Mix — The Mix Support Community
4,Genital warts - TheMix.org.uk
5,Sex in club toilets - The Mix
6,Favourite Cake - I will bake the winner [I am a hilariously bad baker] — The Mix Support Community
7,I'm a 14 year old girl and i cant make myself come — The Mix Support Community
8,Sex & Relationships - Page 2 of 35 - The Mix
9,Badges: Preferences

Unnamed: 0,Pages Previously Viewed
21820,Anything Goes — The Mix Support Community

Unnamed: 0,Test Page,Hit
0,Favourite Cake - I will bake the winner [I am a hilariously bad baker] — The Mix Support Community,1


In [183]:
user_id = '0d33d641e363ded62e8fe5618661d4fe'
print(f"UserID: {user_id}")
recommendation_for_user(model_warp, user_id)

UserID: 0d33d641e363ded62e8fe5618661d4fe


Unnamed: 0,Top 10 Recommended Pages
0,Pro rata pay - The Mix
1,What is polyamory? - The Mix
2,The Mix are always here to support you - The Mix
3,"Understanding money, with MyBnk: Buying a house or a car and reading your bank statements - The Mix"
4,Family Life - The Mix
5,Dealing with divorce - The Mix
6,5 tips for choosing the right student house for you - The Mix
7,Ambassador voices: Breaking the self-harm stigma - The Mix
8,Auto-links (full list) — The Mix Support Community
9,New Poll — The Mix Support Community

Unnamed: 0,Pages Previously Viewed
1370,The Mix Community
1371,Health & Wellbeing — The Mix Support Community
1372,Cautions and warnings - The Mix
1373,SHOUT — The Mix Support Community
1374,How body confident are you? — The Mix Support Community
1375,relatively happy/good news thread - Page 201 — The Mix Support Community
1376,Get Support - The Mix
1377,Speak to Our Team - The Mix
1378,The Mix Counselling Service - The Mix
1379,Money Works: Being on benefits - The Mix

Unnamed: 0,Test Page,Hit
0,Pro rata pay - The Mix,1


In [218]:
user_id = '07bb523a073fb805aa26452a69801966'
print(f"UserID: {user_id}")
recommendation_for_user(model_warp, user_id)

UserID: 07bb523a073fb805aa26452a69801966


Unnamed: 0,Top 10 Recommended Pages
0,chatroom - The Mix
1,Connect four! - Page 9 — The Mix Support Community
2,Body Image and Self-Esteem - The Mix
3,Introducing the Body & Soul Club - The Mix
4,General chit chat (OP GreenTea) - Page 542 — The Mix Support Community
5,The Mix Support Community
6,how can I deal with my diagnosis? — The Mix Support Community
7,Change The World — The Mix Support Community
8,The story of a Mix volunteer who set up her own mental health campaign - The Mix
9,I’m ugly — The Mix Support Community

Unnamed: 0,Pages Previously Viewed
808,The Mix - Essential support for under 25s
809,About Us - The Mix
810,Meet our team - The Mix

Unnamed: 0,Test Page,Hit
0,Body Image and Self-Esteem - The Mix,1


In [215]:
ratings.uid.unique()[120:130]

array(['07338e01542c9284eb4bf330f4607c2b',
       '0742dbfc648ca86546a36528b3b79eac',
       '077ccc04b2515394cae9c48f135f7e7d',
       '07826918c5b1e435cf81a3118db10163',
       '0785d26b4dab9c248441f0c9bd7fe83f',
       '07939c1831a30a88b1c344e355a4f934',
       '07bb0d724ad2a3f055871fec87e425bb',
       '07bb523a073fb805aa26452a69801966',
       '07bfa7c4b91eafca2a90d630aaf3c655',
       '07c0846d71a72ac68227b3c4923d7d14'], dtype=object)

### Evaluation Metrics

#### HR

In [54]:
def user_hit_status(model, user_id):
    """Return 1 if the user's held-out page makes the top 10, else 0.

    Reads the notebook-level ``test_ratings``, ``interactions`` and
    ``user_dict``. The held-out page is scored together with 99 pages drawn
    via ``np.random.choice`` from the columns where the user's row in
    ``interactions`` is 0; the ten highest-scoring candidates are the hits.
    """
    held_out_url = test_ratings[test_ratings.uid==user_id]['pageURL'].values[0]
    item_count = interactions.shape[1]
    all_scores = pd.Series(model.predict(user_dict[user_id], np.arange(item_count)))
    all_scores.index = interactions.columns

    unseen = (interactions[interactions.index==user_id]==0).values[0]
    # NOTE: np.random.choice is called without ``replace=``, i.e. with
    # replacement, so the 99 draws may contain repeated pages.
    candidates = list(np.random.choice(interactions.columns[unseen], 99)) + [held_out_url]

    candidate_scores = all_scores[all_scores.index.isin(candidates)]
    top_ten = candidate_scores.sort_values(ascending=False).index[:10]

    return 1 if held_out_url in top_ten else 0

In [55]:
def hit_ratio(model):
    """Average ``user_hit_status`` over every distinct uid in ``ratings``.

    Returns the mean hit flag rounded to three decimals.
    """
    per_user_hits = [user_hit_status(model, uid) for uid in ratings.uid.unique()]
    return round(np.mean(per_user_hits), 3)

In [56]:
# Evaluate once and reuse the value: a bare call followed by a second call
# inside the f-string ran the whole per-user loop twice and threw the first
# result away.
warp_hit_ratio = hit_ratio(model_warp)

print(f"Hit Ratio of lightFM pure collaborative model: {warp_hit_ratio}")

Hit Ratio of lightFM pure collaborative model: 0.849


#### NDCG 

In [57]:
def user_ndcg_score(model, user_id):
    """Return log(2) / log(1 + rank) for the user's held-out page.

    Reads the notebook-level ``test_ratings``, ``interactions`` and
    ``user_dict``. All columns of ``interactions`` are scored for the user and
    ranked by descending score (``method='first'``, so ranks are unique and
    start at 1); ``rank`` is the position of the held-out page in that list.
    The result is rounded to three decimals.
    """
    test_pageURL = test_ratings[test_ratings.uid==user_id]['pageURL'].values[0]
    n_items = interactions.shape[1]
    user_x = user_dict[user_id]
    scores = pd.Series(model.predict(user_x, np.arange(n_items)),
                       index=interactions.columns)
    score_rank = scores.rank(method='first', ascending=False)

    # .iloc[0] takes the first match by position. Plain ``[0]`` on a Series
    # with a non-integer index relies on a positional fallback that pandas
    # has deprecated.
    test_rank = score_rank[score_rank.index == test_pageURL].iloc[0]
    ndcg = math.log(2) / math.log(1 + test_rank)

    return round(ndcg, 3)

In [58]:
def ndcg_ratio(model):
    """Average ``user_ndcg_score`` over every distinct uid in ``ratings``.

    Returns the mean score rounded to three decimals.
    """
    per_user_ndcg = [user_ndcg_score(model, uid) for uid in ratings.uid.unique()]
    return round(np.mean(per_user_ndcg), 3)

In [59]:
# Evaluate once and reuse the value instead of running the per-user loop twice.
warp_ndcg = ndcg_ratio(model_warp)

print(f"NDCG Ratio of lightFM pure collaborative model: {warp_ndcg}")

NDCG Ratio of lightFM pure collaborative model: 0.368


In [60]:
# Evaluate once and reuse the value instead of running the per-user loop twice.
bpr_hit_ratio = hit_ratio(model_bpr)

print(f"Hit Ratio of lightFM pure collaborative model: {bpr_hit_ratio}")

Hit Ratio of lightFM pure collaborative model: 0.525


In [61]:
# Evaluate once and reuse the value instead of running the per-user loop twice.
bpr_ndcg = ndcg_ratio(model_bpr)

print(f"NDCG Ratio of lightFM pure collaborative model: {bpr_ndcg}")

NDCG Ratio of lightFM pure collaborative model: 0.204
