# Installations and Imports

In [None]:
import sys
!{sys.executable} -m pip install lightfm

In [None]:
from google.colab import drive
import os
import warnings
import json
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

# Mount Google Drive inside the Colab runtime, then move into the project's
# Datasets/ folder so that the relative CSV paths used below resolve there.
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/BT4222 Project')
os.chdir('Datasets/')

In [None]:
# LightFM imports
from lightfm import LightFM # import model class to fit model
from lightfm.data import Dataset # make dataset in right representation for lightfm
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k, recall_at_k

In [None]:
# Let pandas show up to 100 rows and 500 columns before truncating a frame.
for option_name, limit in (('display.max_rows', 100), ('display.max_columns', 500)):
    pd.set_option(option_name, limit)

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

# Dataset Preparations

* Combine train.csv and test.csv from feature-engineering.ipynb, which contain all the engineered features.
* Pick out columns for movie_features matrix and user_movie matrix
* Use LightFM package to create dataset of type lightfm.data.Dataset, and use LightFM package cross-validation to perform train-test split.

In [None]:
# Read the train and test splits from CSV files in the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train_reduced.csv', 'test_reduced.csv')
)

In [None]:
# Check train data dimensions as (rows, columns)
train_df.shape

(621955, 48)

In [None]:
# Check test data dimensions as (rows, columns)
test_df.shape

(187768, 48)

In [None]:
# Merge both df together to create full dataset.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so the
# frames are stacked with pd.concat instead. Row labels from each frame are kept
# as they are (no ignore_index), which is what append did by default.
full_df = pd.concat([train_df, test_df])
full_df.shape

(809723, 48)

In [None]:
# Preview the first row of the combined frame to see every available column
full_df.head(1)

Unnamed: 0,userId,rating,timestamp_rated,tag,ml_title,tmdbId,cast,crew,keywords,original_title,overview,production_companies,release_date,runtime,status,tagline,movie_id,overview_embedding,production_country_Germany,production_country_United Kingdom,production_country_United States of America,genre_Action,genre_Adventure,genre_Animation,genre_Children,genre_Comedy,genre_Crime,genre_Drama,genre_Family,genre_Fantasy,genre_Horror,genre_Mystery,genre_Romance,genre_Sci-Fi,genre_ScienceFiction,genre_Thriller,genre_War,spoken_language_Deutsch,spoken_language_Español,spoken_language_Français,spoken_language_Italiano,has_homepage,gross_profit,roi,profit_margin,tagline_sentiment_score_average,overview_sentiment_score_average,release_year
0,25,4.0,2010-07-01 05:54:30,,Rain Man (1988),380,"Dustin Hoffman, Tom Cruise, Valeria Golino, Ge...","Hans Zimmer, Linda DeScenna, Louis DiGiaimo, W...","individual, mentally disabled, autism, loss of...",Rain Man,Selfish yuppie Charlie Babbitt's father left a...,"United Artists, Star Partners II Ltd.",1988-12-11,133.0,Released,A journey through understanding and fellowship.,380,[-1.02400000e-02 -2.06307143e-02 8.01640714e-...,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,387800000.0,15.512,0.939438,0.0,-0.36335,1988


## Selecting columns for user-item matrix and item_feature matrix

The user-item matrix includes only `userId`, `tmdbId` (which identifies the movie), and `rating`, which serves as the weight of each interaction.

In [None]:
# User-movie interactions: which user rated which movie, and the rating given.
interaction_columns = ['userId', 'tmdbId', 'rating']
review_df = full_df[interaction_columns].copy()

# Movie features: every column that remains once the interaction fields, free-text
# fields, dates and financial columns listed here are removed.
non_feature_columns = [
    'userId', 'rating', 'timestamp_rated', 'tag', 'ml_title', 'cast', 'crew',
    'keywords', 'original_title', 'overview', 'overview_embedding',
    'production_companies', 'release_date', 'status', 'tagline',
    'gross_profit', 'roi', 'profit_margin',
]
features_df = full_df.drop(columns=non_feature_columns)

In [None]:
# Dimensions of the interactions frame as (rows, columns)
review_df.shape

(809723, 3)

In [None]:
# Dimensions of the movie-features frame as (rows, columns)
features_df.shape

(809723, 30)

In [None]:
# Serialise both frames to JSON text, one JSON object per row.
review_str, features_str = (
    frame.to_json(orient='records') for frame in (review_df, features_df)
)

In [None]:
# Parse the JSON text back into Python lists holding one dict per row.
review_dict, features_dict = map(json.loads, (review_str, features_str))

In [None]:
# Inspect the first interaction record
review_dict[0]

{'userId': 25, 'tmdbId': 380, 'rating': 4.0}

In [None]:
# Inspect the first movie-feature record
features_dict[0]

{'tmdbId': 380,
 'runtime': 133.0,
 'movie_id': 380,
 'production_country_Germany': 0,
 'production_country_United Kingdom': 0,
 'production_country_United States of America': 1,
 'genre_Action': 0,
 'genre_Adventure': 0,
 'genre_Animation': 0,
 'genre_Children': 0,
 'genre_Comedy': 0,
 'genre_Crime': 0,
 'genre_Drama': 1,
 'genre_Family': 0,
 'genre_Fantasy': 0,
 'genre_Horror': 0,
 'genre_Mystery': 0,
 'genre_Romance': 0,
 'genre_Sci-Fi': 0,
 'genre_ScienceFiction': 0,
 'genre_Thriller': 0,
 'genre_War': 0,
 'spoken_language_Deutsch': 0,
 'spoken_language_Español': 0,
 'spoken_language_Français': 0,
 'spoken_language_Italiano': 1,
 'has_homepage': 0,
 'tagline_sentiment_score_average': 0.0,
 'overview_sentiment_score_average': -0.36335,
 'release_year': 1988}

## Create dataset for LightFM model input

In [None]:
# Register every user id, movie id and feature name with a LightFM Dataset.
# Feature names are the keys of the first feature record, leaving out 'tmdbId'.
user_ids = (record['userId'] for record in review_dict)
movie_ids = (record['tmdbId'] for record in review_dict)
feature_names = (name for name in features_dict[0] if name != 'tmdbId')

dataset = Dataset()
dataset.fit(user_ids, movie_ids, item_features=feature_names)

# Shape of the interaction matrix, used to check that all ids were registered
num_users, num_movies = dataset.interactions_shape()

print(f"num_users = {num_users}, num_movies = {num_movies}")

num_users = 8656, num_movies = 3970


In [None]:
# Build item_features.
# features_dict holds one record per *rating*, so a movie appears once for every
# rating it received. Feeding all of them to build_item_features repeats the same
# (movie, feature) entry many times, and repeated entries are accumulated when the
# sparse matrix is assembled, so a movie's feature weights end up scaled by how
# often it was rated. Keep only the first record seen for each movie instead.
movie_features = {}
for record in features_dict:
    tmdb_id = record['tmdbId']
    if tmdb_id not in movie_features:
        # Exclude the id by name (same rule as in dataset.fit) rather than by
        # relying on it being the first key of the record.
        movie_features[tmdb_id] = {
            name: value for name, value in record.items() if name != 'tmdbId'
        }

# Each element is a (tmdbId, {feature name: weight}) pair.
item_features = dataset.build_item_features(movie_features.items())

print(repr(item_features))

<3970x3999 sparse matrix of type '<class 'numpy.float32'>'
	with 119100 stored elements in Compressed Sparse Row format>


In [None]:
# Turn every rating record into a (user, movie, rating) triple and build the
# interaction matrix together with the matching weight matrix.
rating_triples = (
    (record['userId'], record['tmdbId'], record['rating']) for record in review_dict
)
interactions, weights = dataset.build_interactions(rating_triples)

for matrix in (interactions, weights):
    print(repr(matrix))

<8656x3970 sparse matrix of type '<class 'numpy.int32'>'
	with 809723 stored elements in COOrdinate format>
<8656x3970 sparse matrix of type '<class 'numpy.float32'>'
	with 809723 stored elements in COOrdinate format>


## Split into train and test interactions
For LightFM, train and test sets are expected to have the same dimensions. Use cross validation split method in package to split into 2 disjoint sets of train and test.
Train-test Split = 75:25 (as per other models in evaluation)

In [None]:
# Random 75:25 split of the interactions, seeded so the split is repeatable.
split_rng = np.random.RandomState(42)
train_interactions, test_interactions = cross_validation.random_train_test_split(
    interactions, test_percentage=0.25, random_state=split_rng
)

In [None]:
# Both splits should report the same matrix dimensions
for split_matrix in (train_interactions, test_interactions):
    print(repr(split_matrix))

<8656x3970 sparse matrix of type '<class 'numpy.int32'>'
	with 607292 stored elements in COOrdinate format>
<8656x3970 sparse matrix of type '<class 'numpy.int32'>'
	with 202431 stored elements in COOrdinate format>


# Create Model

## Train model

### Loss function for training: Weighted Approximate-Rank Pairwise (WARP)
This function is designed for learning-to-rank problems in collaborative filtering settings.

It samples negative items that are likely to have a higher rank than the positive item. This sampling is done to allow for faster convergence during training, especially when dealing with large datasets. WARP loss aims to approximate the ranking order of items by adjusting the model parameters to minimize the ranking violation.
It tends to yield higher Precision@k and is used mainly for top-k recommendations.

In [None]:
# Hyperparameters of the WARP model, gathered in one place
warp_params = dict(
    loss='warp',
    no_components=40,
    learning_rate=0.48,
    item_alpha=1e-7,
    random_state=np.random.RandomState(42),
)
warp_model = LightFM(**warp_params)

# Train on the train interactions together with the item-feature matrix
warp_model.fit(
    train_interactions,
    item_features=item_features,
    epochs=55,
    verbose=True,
)

Epoch:   0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 100%|██████████| 55/55 [06:22<00:00,  6.96s/it]


<lightfm.lightfm.LightFM at 0x16abe3cd0>

## Model evaluation on test set

### Precision @ K = 10

In [None]:
# Precision@10 of the WARP model on the test interactions, averaged over the
# values returned by precision_at_k
precision_values = precision_at_k(
    warp_model,
    test_interactions,
    train_interactions,
    k=10,
    item_features=item_features,
)
precision_warp = precision_values.mean()
precision_warp

0.0819752

### Recall @ K = 10

In [None]:
# Recall@10 of the WARP model on the test interactions, averaged over the
# values returned by recall_at_k
recall_values = recall_at_k(
    warp_model,
    test_interactions,
    train_interactions,
    k=10,
    item_features=item_features,
)
recall_warp = recall_values.mean()
recall_warp

0.0349526322469863

### F1 Score

In [None]:
# F1 = 2 * Precision * Recall / (Precision + Recall), computed from the averaged
# precision and recall values above
f1_numerator = 2 * precision_warp * recall_warp
f1_denominator = precision_warp + recall_warp
f1_warp = f1_numerator / f1_denominator
f1_warp

0.0490088451382704

In [None]:
# Print the three metrics side by side, six decimal places each
metric_summary = (
    ('Precision', precision_warp),
    ('Recall', recall_warp),
    ('F1 Score', f1_warp),
)
for metric_name, metric_value in metric_summary:
    print(f"Average {metric_name} @ K = {metric_value:.6f}")

Average Precision @ K = 0.081975
Average Recall @ K = 0.034953
Average F1 Score @ K = 0.049009


# Making predictions with model

## Top 10 Recommended Movies for Sample User


In [None]:
def top_ten_movies(model, user_id, num_movies, lfm_dataset=None, features=None, movies=None):
    """Return the ten highest-scoring movies for one user.

    Parameters
    ----------
    model
        Fitted model exposing ``predict(user_index, item_indices, item_features=...)``.
    user_id
        Raw user id as it appears in the ratings data. It is translated to the
        model's internal row index through the dataset's id mapping.
    num_movies
        Number of items to score; internal item indices ``0 .. num_movies - 1``
        are ranked.
    lfm_dataset
        Object whose ``mapping()`` returns ``(user_id_map, user_feature_map,
        item_id_map, item_feature_map)``. Defaults to the notebook-level ``dataset``.
    features
        Item-feature matrix handed to ``predict``. Defaults to the notebook-level
        ``item_features`` so scoring uses the same features as training.
    movies
        DataFrame with ``tmdbId`` and ``original_title`` columns used to look up
        titles. Defaults to the notebook-level ``full_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tmdbId`` and ``original_title``, best match first, indexed from 1.
        Contains fewer than ten rows when fewer than ten items are scored.

    Raises
    ------
    KeyError
        If ``user_id`` is not present in the dataset's user mapping.
    """
    lfm_dataset = dataset if lfm_dataset is None else lfm_dataset
    features = item_features if features is None else features
    movies = full_df if movies is None else movies

    user_id_map, _, item_id_map, _ = lfm_dataset.mapping()
    if user_id not in user_id_map:
        raise KeyError(f"Unknown user_id: {user_id!r}")
    # Internal indices follow the order in which ids were registered, so
    # `user_id - 1` is not a valid way to obtain the user's row.
    user_index = user_id_map[user_id]

    # Score with the same item features the model was trained on.
    scores = model.predict(user_index, np.arange(num_movies), item_features=features)

    # Translate internal item indices back to tmdbIds through the dataset mapping
    # rather than through row positions of a per-rating DataFrame.
    index_to_tmdb = {index: tmdb_id for tmdb_id, index in item_id_map.items()}
    top_indices = np.argsort(-scores)[:10]
    top_tmdb_ids = [index_to_tmdb[int(index)] for index in top_indices]

    # One title per movie, looked up in a single pass instead of appending in a loop.
    titles = (
        movies.loc[movies['tmdbId'].isin(top_tmdb_ids), ['tmdbId', 'original_title']]
        .drop_duplicates(subset='tmdbId')
        .set_index('tmdbId')['original_title']
    )
    user_top_10 = pd.DataFrame({
        'tmdbId': top_tmdb_ids,
        'original_title': titles.reindex(top_tmdb_ids).to_numpy(),
    })
    user_top_10.index = range(1, len(user_top_10) + 1)

    print(f"For User {user_id}: ")
    return user_top_10

For sample user with user_id = 10:

In [None]:
# Recommendations for the sample user (user_id = 10) from the WARP model
top_ten_movies(warp_model, 10, num_movies)

For User 10: 


Unnamed: 0,tmdbId,original_title
1,78,Blade Runner
2,12207,醉拳二
3,8840,DragonHeart
4,9451,Election
5,3597,I Know What You Did Last Summer
6,4584,Sense and Sensibility
7,251,Ghost
8,9378,Thir13en Ghosts
9,3049,Ace Ventura: Pet Detective
10,500,Reservoir Dogs


## Getting related movies to movie

In [None]:
def movie_top_10_movies(model, itemID, lfm_dataset=None, features=None, movies=None):
    """Return the ten movies whose embeddings are most similar to one movie.

    Similarity is the cosine similarity between the item representations
    returned by ``model.get_item_representations``.

    Parameters
    ----------
    model
        Fitted model exposing ``get_item_representations(features=...)``.
    itemID
        Internal item index of the query movie, i.e. its row in the item
        representation matrix (not a tmdbId).
    lfm_dataset
        Object whose ``mapping()`` returns ``(user_id_map, user_feature_map,
        item_id_map, item_feature_map)``. Defaults to the notebook-level ``dataset``.
    features
        Item-feature matrix used to compute the representations. Defaults to the
        notebook-level ``item_features``.
    movies
        DataFrame with ``tmdbId`` and ``original_title`` columns used to look up
        titles. Defaults to the notebook-level ``full_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``tmdbId`` and ``original_title``, most similar first, indexed
        from 1. The query movie itself is never included.

    Raises
    ------
    IndexError
        If ``itemID`` is outside the range of available items.
    """
    lfm_dataset = dataset if lfm_dataset is None else lfm_dataset
    features = item_features if features is None else features
    movies = full_df if movies is None else movies

    # get latent representations (embeddings) for items; biases are not needed
    _, movie_repre = model.get_item_representations(features=features)
    n_items = movie_repre.shape[0]
    if not 0 <= itemID < n_items:
        raise IndexError(f"itemID {itemID} is out of range for {n_items} items")

    # cosine similarity = dot product divided by the product of both norms
    movie_norms = np.linalg.norm(movie_repre, axis=1)
    dot_products = movie_repre @ movie_repre[itemID]
    norm_products = movie_norms * movie_norms[itemID]
    with np.errstate(divide='ignore', invalid='ignore'):
        cos_sim = dot_products / norm_products
    # Items with a zero-length embedding have no defined similarity: rank them last.
    cos_sim = np.where(norm_products > 0, cos_sim, -np.inf)

    # Rank all items, then drop the query explicitly instead of assuming it is first.
    ranked = np.argsort(-cos_sim, kind='stable')
    top_sim_movies_index = ranked[ranked != itemID][:10]

    # Translate internal item indices back to tmdbIds through the dataset mapping
    # rather than through row positions of a per-rating DataFrame.
    _, _, item_id_map, _ = lfm_dataset.mapping()
    index_to_tmdb = {index: tmdb_id for tmdb_id, index in item_id_map.items()}
    query_tmdb_id = index_to_tmdb[int(itemID)]
    tmdb_lst = [index_to_tmdb[int(index)] for index in top_sim_movies_index]

    # One title per movie, looked up in a single pass instead of appending in a loop.
    wanted_ids = tmdb_lst + [query_tmdb_id]
    titles = (
        movies.loc[movies['tmdbId'].isin(wanted_ids), ['tmdbId', 'original_title']]
        .drop_duplicates(subset='tmdbId')
        .set_index('tmdbId')['original_title']
    )

    # create dataframe to return as result
    result = pd.DataFrame({
        'tmdbId': tmdb_lst,
        'original_title': titles.reindex(tmdb_lst).to_numpy(),
    })
    result.index = range(1, len(result) + 1)

    title = titles.get(query_tmdb_id)

    print(f"For itemId: {itemID}, tmdbId: {query_tmdb_id:.0f}, title: {title}")
    return result

In [None]:
# Movies most similar to the item passed as itemID = 241, using the WARP model
movie_top_10_movies(warp_model, 241)

For itemId: 241, tmdbId: 629, title: The Usual Suspects


Unnamed: 0,tmdbId,original_title
1,9372,Super Size Me
2,10218,Swingers
3,197,Braveheart
4,1271,300
5,745,The Sixth Sense
6,68924,The Ice Storm
7,37233,The Firm
8,2074,Flirting with Disaster
9,4518,Elizabeth
10,414,Batman Forever
