# Ranking evaluation with offline metrics: 
### implemented precision@k, recall@k, ...

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import time

# import gradio as gr
import pickle

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

import line_profiler
%load_ext line_profiler

In [5]:
# load in tuned model and transformed document-topic matrix
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load artifacts that were produced by this project.
# The model file is opened in a `with` block so the handle is closed right
# after loading (the other pickled artifacts below are read the same way).
with open('..\\recsys_content_based\\model_building_out\\model_2023_08_16.sav', 'rb') as f:
    lda_main = pickle.load(f)

# pickled word key written by the preprocessing step
with open("..\\recsys_content_based\\data_preprocessing_out\\word_key.txt", "rb") as f:
    word_key = pickle.load(f)

# read in movie database
df = pd.read_csv("..\\database\\dataset_spaces_upload.csv", index_col=[0])

# read in scipy sparse matrix
X = sparse.load_npz("..\\recsys_content_based\\data_preprocessing_out\\X.npz")
# pickled transformed matrix (despite the .txt extension it is read as binary)
with open("..\\recsys_content_based\\model_building_out\\Xtran.txt", "rb") as f:
    Xtran_main = pickle.load(f)


In [6]:
# Load the cleaned script catalogue and discard the script text and link
# columns, leaving only title, year and the imdb/tmdb identifiers.
df_orig = pd.read_csv(
    '..\\database\\dataset_film_scripts\\springfield_movie_scripts_2023_01_13_clean.csv',
    index_col=[0],
).drop(columns=['script_text', 'springfield_link', 'tmdb_poster_link', 'imdb_link'])
df_orig.head()

Unnamed: 0,movie_title,movie_year,imdb_id,tmdb_id
0,A 2nd Hand Lover,2015,tt10919164,472886
1,A Aa,2016,tt5684466,372399
2,A Baby at Any Cost,2022,tt15331880,938971
3,A Bad Idea Gone Wrong,2017,tt5212918,438424
4,A Bad Moms Christmas,2017,tt6359956,431530


In [8]:
# MovieLens links table: one row per MovieLens movieId with the matching
# imdbId and tmdbId, used below to connect the script catalogue to the ratings.
df_movielens = pd.read_csv('..\\database\\dataset_movieLens\\links.csv')
# df_movielens['movielens_id'] = df_movielens.index
# Column / null-count summary; in the recorded output tmdbId is the only
# column with missing values (and is therefore stored as float64).
df_movielens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   imdbId   58098 non-null  int64  
 2   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.3 MB


In [9]:
# Left-join the script catalogue onto the MovieLens links via the TMDB id.
# Links rows with any missing value are dropped before they become the lookup.
# NOTE(review): the recorded output has more rows (35530) than the index range
# spans (0..35515), so some tmdb_id values likely match more than one links row.
df_joined = df_orig.join(
    df_movielens.dropna().set_index('tmdbId'),
    on='tmdb_id',
    how='left',
)
df_joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35530 entries, 0 to 35515
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_title  35530 non-null  object 
 1   movie_year   35530 non-null  int64  
 2   imdb_id      33712 non-null  object 
 3   tmdb_id      35530 non-null  int64  
 4   movieId      20381 non-null  float64
 5   imdbId       20381 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 1.9+ MB


In [10]:
df_joined.head()

Unnamed: 0,movie_title,movie_year,imdb_id,tmdb_id,movieId,imdbId
0,A 2nd Hand Lover,2015,tt10919164,472886,,
1,A Aa,2016,tt5684466,372399,,
2,A Baby at Any Cost,2022,tt15331880,938971,,
3,A Bad Idea Gone Wrong,2017,tt5212918,438424,181135.0,5212918.0
4,A Bad Moms Christmas,2017,tt6359956,431530,179953.0,6359956.0


In [11]:
# Alternative: join on imdbId instead of tmdbId -- both options yield the same result (~20,300 non-null matches)
# df_joined = df_orig.join(df_movielens.set_index('imdbId'), how='left', on='imdb_id')
# df_joined.head()
# df_joined.info()

In [13]:
# MovieLens ratings table; per the recorded output its columns are
# userId, movieId, rating and timestamp.
df_movielens_ratings = pd.read_csv('..\\database\\dataset_movieLens\\ratings.csv')
# Show the first rows as a quick sanity check of the load.
df_movielens_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [14]:
# filter out movies from ratings matrix that are not in script database
# Collect the MovieLens ids that matched a script. Unmatched scripts carry NaN
# in movieId; drop those explicitly instead of assuming NaN is the first unique
# value (that only holds when the very first row happens to be unmatched).
unique_movielens_ids = df_joined['movieId'].dropna().unique()
# movieId is float after the left join; convert back to integers and sort.
unique_movielens_ids = np.sort(unique_movielens_ids.astype(int))
# movieId of every rating row, as a plain array for the membership test below.
movieId = np.array(df_movielens_ratings['movieId'])

In [15]:
# True for every rating whose movie is present in the script database.
# np.isin does the membership test in one vectorised pass; a Python-level
# `j in array` per rating would rescan the id array for each of the rating rows.
bool_mask = np.isin(movieId, unique_movielens_ids)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]

In [16]:
# Count the remaining ratings per user once; the index holds the user ids and
# the values hold the counts, both ordered by descending count.
user_rating_counts = df_movielens_ratings['userId'].value_counts()
unique_movielens_users = np.array(user_rating_counts.index)
num_ratings_per_user = np.array(user_rating_counts)
# userId of every remaining rating row, as a plain array.
userId = np.array(df_movielens_ratings['userId'])

In [17]:
# drop all users from ratings matrix that rated fewer than 6 films
# (unique_movielens_users and num_ratings_per_user are aligned element-wise)
users_drop = unique_movielens_users[num_ratings_per_user <= 5]
# Keep a rating only if its user is NOT in users_drop; vectorised membership
# test instead of a per-row Python `in` scan over the users_drop array.
bool_mask = np.isin(userId, users_drop, invert=True)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]

In [18]:
# drop movies from Xtran_main and df that are not in movie lens database
# First collapse the joined table to one row per tmdb_id, keeping the first
# occurrence of each id.
df_joined = df_joined.drop_duplicates(subset=['tmdb_id'], keep='first')

In [19]:
# Keep only the scripts that found a MovieLens match (non-missing movieId).
df_joined = df_joined.dropna(subset=['movieId'])

In [20]:
# Restrict the transformed document-topic matrix and the movie table to the
# rows listed in df_joined's index, then renumber the movie table from 0.
# NOTE(review): this assumes df_joined's index labels are valid row positions
# in Xtran_main and valid row labels in df (i.e. all three share the original
# script ordering) -- confirm against the preprocessing step.
# NOTE(review): the cell overwrites its own inputs, so running it a second time
# would index the already-reduced objects; re-run from the loading cell instead.
Xtran_main = Xtran_main[df_joined.index,:]
df = df.loc[df_joined.index].reset_index(drop=True)

In [21]:
# Attach the script-catalogue columns of df_joined to every remaining rating,
# matching on the MovieLens movieId.
df_final = df_movielens_ratings.join(df_joined.set_index("movieId"), how="left", on="movieId")

# Ratings per user, ordered by ascending userId (computed once, used twice).
ratings_per_user = df_final['userId'].value_counts().sort_index()
unique_users = np.array(ratings_per_user.index)
num_ratings_per_user = np.array(ratings_per_user)
# diff[j] = total number of ratings of all users that precede user j in
# unique_users (0 for the first user), i.e. an exclusive running total.
# cumsum minus the counts themselves gives this in a single linear pass and
# also yields an empty list when there are no users.
diff = list(np.cumsum(num_ratings_per_user) - num_ratings_per_user)

# Independent copy of the movie table, taken before its columns are renamed.
df_imdb_sort = df.copy()
# df_imdb_sort['script_id'] = df_imdb_sort.index
# df_imdb_sort = df_imdb_sort.set_index('imdb_id')

In [22]:
# Display df_final (ratings left-joined with the script-catalogue columns).
# NOTE(review): presumably the frame from which the training and testing
# dataframes are formed in later cells -- not visible in this part of the notebook.
df_final

Unnamed: 0,userId,movieId,rating,timestamp,movie_title,movie_year,imdb_id,tmdb_id,imdbId
0,1,307,3.5,1256677221,Trois couleurs: Bleu (Three Colors: Blue),1993.0,tt0108394,108.0,108394.0
1,1,481,3.5,1256677456,Kalifornia,1993.0,tt0107302,10909.0,107302.0
2,1,1091,1.5,1256677471,Weekend at Bernie's,1989.0,tt0098627,8491.0,98627.0
3,1,1257,4.5,1256677460,Better Off Dead...,1985.0,tt0088794,13667.0,88794.0
4,1,1449,4.5,1256677264,Waiting For Guffman,1996.0,tt0118111,16448.0,118111.0
...,...,...,...,...,...,...,...,...,...
27753439,283228,8542,4.5,1379882795,A Day at the Races,1937.0,tt0028772,11939.0,28772.0
27753440,283228,8712,4.5,1379882751,My Favorite Wife,1940.0,tt0029284,41463.0,29284.0
27753441,283228,34405,4.5,1379882889,Serenity,2005.0,tt0379786,16320.0,379786.0
27753442,283228,44761,4.5,1354159524,Brick,2005.0,tt0393109,9270.0,393109.0


In [23]:
# imdb ids as a plain Python list, in the row order of df_imdb_sort.
imdb_list_sort = [imdb_id for imdb_id in df_imdb_sort['imdb_id']]


In [24]:
# Relabel the movie table's nine columns positionally (order matters: the
# i-th label replaces the i-th existing column name).
df.columns = [
    "Title",
    "Year",
    "Genres",
    "IMDb Rating",
    "num_votes",
    "is_adult",
    "imdb_id",
    "imdb_link",
    "tmdb_poster_link",
]

In [25]:
# Keep only the columns meant for display, in presentation order.
display_columns = ["Title", "Year", "IMDb Rating", "Genres"]
df = df[display_columns]

In [26]:
# Display the trimmed movie table (Title, Year, IMDb Rating, Genres).
df

Unnamed: 0,Title,Year,IMDb Rating,Genres
0,A Bad Idea Gone Wrong,2017,5.8,Comedy
1,A Bad Moms Christmas,2017,5.6,Comedy
2,A Bag of Hammers,2011,6.6,"Comedy,Drama"
3,A Ballerina's Tale,2015,6.4,Documentary
4,A Band Called Death,2012,7.5,"Biography,Documentary,Music"
...,...,...,...,...
20137,90 minutter,2012,6.1,Drama
20138,96 Minutes,2011,5.7,"Drama,Thriller"
20139,99 Homes,2014,7.1,"Crime,Drama"
20140,[REC],2007,7.4,"Horror,Mystery,Thriller"
