<a href="https://colab.research.google.com/github/mheidari98/Movie-Recommender-Systems/blob/main/LightFM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

from google.colab import drive
drive.mount('/content/gdrive')

!pip install lightfm

In [32]:
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k, reciprocal_rank
from lightfm.data import Dataset
from lightfm.datasets import fetch_movielens
from sklearn.model_selection import train_test_split
import time
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd


In [33]:
Dataset_path = "./DataSets/"
Movies_metadata = pd.read_csv(Dataset_path+"MoviesInfo.csv")
ratings = pd.read_csv( Dataset_path + "ml-latest/ratings.csv")

### Ratings

In [34]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 847.0 MB


### just keep rating year between 1995, 2001

In [35]:
from datetime import datetime

# strftime('%Y-%m-%d %H:%M:%S')
ratings['year_rated'] = ratings['timestamp'].apply(lambda x: int( datetime.fromtimestamp( x ).strftime('%Y') ) )

ratings = ratings[ ratings['year_rated'].between(1995, 2001)].copy()

ratings.reset_index(drop=True, inplace=True)

In [36]:
print( f"number of Ratings : { ratings.shape[0] }")
print( f"number of movies : { ratings.groupby('movieId').count().shape[0] }")
print( f"number of users : { ratings.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( ratings.groupby('rating').count().index )}, {np.max( ratings.groupby('rating').count().index )})  ")

number of Ratings : 7329482
number of movies : 4937
number of users : 103827
range of rating : ( 1.0, 5.0)  


### just keep rating upper than median of user ratings

In [37]:
ratingGroupByUserId = ratings.groupby(['userId'])

# ratingThreshHold = ratingGroupByUserId.apply(lambda grp: grp.rating.median() )  #=> MemoryError: Unable to allocate 98.2 GiB for an array with shape (283228, 46554) and data type float64
ratingThreshHold = ratingGroupByUserId.apply(lambda grp: np.percentile(grp.rating, 75) )

ratingThreshHold = ratingThreshHold.to_frame().reset_index()

tmpp = pd.merge( ratings, ratingThreshHold , how='inner' )

CleanedRatings = tmpp[ tmpp['rating']>=tmpp[0] ].reset_index()[ ['userId', 'movieId', 'rating', 'timestamp']]

In [38]:
print( f"number of Ratings : { CleanedRatings.shape[0] }")
print( f"number of movies : { CleanedRatings.groupby('movieId').count().shape[0] }")
print( f"number of users : { CleanedRatings.groupby('userId').count().shape[0] }")
print( f"range of rating : ( { np.min( CleanedRatings.groupby('rating').count().index )}, {np.max( CleanedRatings.groupby('rating').count().index )})  ")

number of Ratings : 3363219
number of movies : 4839
number of users : 103827
range of rating : ( 1.0, 5.0)  


### LightFM 

In [41]:
#del ratings['year_rated']
#del ratings['timestamp']


In [42]:
data = Dataset()
data.fit(ratings.userId.unique(), ratings.movieId.unique(),ratings.rating.unique())
interactions, weights = data.build_interactions([tuple(i) for i in ratings.values])
print(repr(interactions))

<103827x4937 sparse matrix of type '<class 'numpy.int32'>'
	with 7329482 stored elements in COOrdinate format>


In [43]:
interactions.shape

(103827, 4937)

In [44]:
train,test = random_train_test_split(interactions,test_percentage=0.2)

### Warp

In [45]:
model = LightFM(loss='warp',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

In [58]:
model = model.fit(train,
                  epochs=10,
                  num_threads=16, verbose=True)

Epoch: 100%|██████████| 10/10 [43:53<00:00, 263.36s/it]


In [59]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))


Precision: train 0.00, test 0.00.


In [60]:
train_recall = recall_at_k(model,train, k=10).mean()
test_recall = recall_at_k(model,test, k=10).mean()
print('recall: train %.2f, test %.2f.' % (train_recall, test_recall))

recall: train 0.00, test 0.00.


In [61]:
train_aucscore = auc_score(model, train).mean()
test_aucscore = auc_score(model, test).mean()
print('auc_score: train %.2f, test %.2f.' % (train_aucscore, test_aucscore))

auc_score: train 0.52, test 0.50.


In [62]:
train_reciprocal = reciprocal_rank(model, train).mean()
test_reciprocal = reciprocal_rank(model,test).mean()
print('reciprocal_rank: train %.2f, test %.2f.' % (train_reciprocal, test_reciprocal))

reciprocal_rank: train 0.02, test 0.01.


### Bpr

In [66]:
model = LightFM(loss='bpr',
                random_state=2016,
                learning_rate=0.90,
                no_components=150,
                user_alpha=0.000005)

In [67]:
model = model.fit(train,
                  epochs=10,
                  num_threads=16, verbose=True)

Epoch: 100%|██████████| 10/10 [37:45<00:00, 226.51s/it]


In [70]:
train_precision = precision_at_k(model, train, k=10).mean()
test_precision = precision_at_k(model, test, k=10).mean()
print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))

Precision: train 0.01, test 0.00.


In [71]:
train_recall = recall_at_k(model,train, k=10).mean()
test_recall = recall_at_k(model,test, k=10).mean()
print('recall: train %.2f, test %.2f.' % (train_recall, test_recall))

recall: train 0.01, test 0.00.


In [72]:
train_aucscore = auc_score(model, train).mean()
test_aucscore = auc_score(model, test).mean()
print('auc_score: train %.2f, test %.2f.' % (train_aucscore, test_aucscore))

auc_score: train 0.93, test 0.89.


In [73]:
train_reciprocal = reciprocal_rank(model, train).mean()
test_reciprocal = reciprocal_rank(model,test).mean()
print('reciprocal_rank: train %.2f, test %.2f.' % (train_reciprocal, test_reciprocal))

reciprocal_rank: train 0.05, test 0.01.
