# Ranking evaluation with offline metrics: 
### Implemented metrics: precision@k, recall@k, ...

In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
import time

# import gradio as gr
import pickle

from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

import line_profiler
%load_ext line_profiler

In [2]:
# Load the tuned model and the transformed document-topic matrix, together with
# the preprocessing artifacts they were built from.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that were produced by this project.
# Every file is opened in a `with` block so the handle is closed deterministically.
with open('model_building\\model_2023_08_16.sav', 'rb') as f:
    lda_main = pickle.load(f)

with open("data_preprocessing_eda_out\\word_key.txt", "rb") as f:
    word_key = pickle.load(f)

# read in movie database
df = pd.read_csv("data_preprocessing_eda_out\\df_spaces_upload.csv", index_col=[0])

# read in scipy sparse matrix
X = sparse.load_npz("data_preprocessing_eda_out\\X.npz")
with open("model_building\\Xtran.txt", "rb") as f:
    Xtran_main = pickle.load(f)


In [3]:
# Cleaned script catalogue, without the script text and the link columns.
df_orig = (
    pd.read_csv(
        'data_cleaning_and_synthesis_out\\springfield_movie_scripts_2023_01_13_clean.csv',
        index_col=[0],
    )
    .drop(columns=['script_text', 'springfield_link', 'tmdb_poster_link', 'imdb_link'])
)
df_orig.head()

Unnamed: 0,movie_title,movie_year,imdb_id,tmdb_id
0,A 2nd Hand Lover,2015,tt10919164,472886
1,A Aa,2016,tt5684466,372399
2,A Baby at Any Cost,2022,tt15331880,938971
3,A Bad Idea Gone Wrong,2017,tt5212918,438424
4,A Bad Moms Christmas,2017,tt6359956,431530


In [4]:
# MovieLens link table (columns: movieId, imdbId, tmdbId).
df_movielens = pd.read_csv(
    '..\\database_movieLens\\links.csv',
)
df_movielens.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  58098 non-null  int64  
 1   imdbId   58098 non-null  int64  
 2   tmdbId   57917 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 1.3 MB


In [5]:
# Left-join the MovieLens ids onto the script catalogue via the TMDB id;
# link rows with any missing value are discarded before the join.
df_joined = df_orig.join(
    df_movielens.dropna().set_index('tmdbId'),
    on='tmdb_id',
    how='left',
)
df_joined.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35530 entries, 0 to 35515
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_title  35530 non-null  object 
 1   movie_year   35530 non-null  int64  
 2   imdb_id      33712 non-null  object 
 3   tmdb_id      35530 non-null  int64  
 4   movieId      20381 non-null  float64
 5   imdbId       20381 non-null  float64
dtypes: float64(2), int64(2), object(2)
memory usage: 1.9+ MB


In [6]:
# Preview the joined table; because this is a left join, scripts without a
# MovieLens match keep NaN in movieId / imdbId.
df_joined.head()

Unnamed: 0,movie_title,movie_year,imdb_id,tmdb_id,movieId,imdbId
0,A 2nd Hand Lover,2015,tt10919164,472886,,
1,A Aa,2016,tt5684466,372399,,
2,A Baby at Any Cost,2022,tt15331880,938971,,
3,A Bad Idea Gone Wrong,2017,tt5212918,438424,181135.0,5212918.0
4,A Bad Moms Christmas,2017,tt6359956,431530,179953.0,6359956.0


In [7]:
# second option is to join on imdbId -- both options yield the same result ~ 20,300 non-null matches
# df_joined = df_orig.join(df_movielens.set_index('imdbId'), how='left', on='imdb_id')
# df_joined.head()
# df_joined.info()

In [8]:
# MovieLens ratings table (columns: userId, movieId, rating, timestamp).
df_movielens_ratings = pd.read_csv(
    '..\\database_movieLens\\ratings.csv',
)
df_movielens_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [9]:
# filter out movies from ratings matrix that are not in script database
# Collect the MovieLens ids that matched a script. dropna() removes the
# unmatched (NaN) entries explicitly instead of assuming NaN happens to be the
# first element returned by unique(); if it were not, a valid id would be
# discarded and a NaN would be cast to int.
unique_movielens_ids = df_joined['movieId'].dropna().unique()
unique_movielens_ids = np.sort(unique_movielens_ids.astype(int))
movieId = np.array(df_movielens_ratings['movieId'])

In [10]:
# Keep only the ratings whose movieId is present in the script database.
# np.isin performs the membership test in one vectorised pass; a Python-level
# `in` check against a NumPy array rescans the id array for every rating row.
bool_mask = np.isin(movieId, unique_movielens_ids)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]

In [11]:
# User ids and their rating counts, both in value_counts() order, plus the
# per-rating userId column as a plain array.
unique_movielens_users = df_movielens_ratings['userId'].value_counts().index.to_numpy()
num_ratings_per_user = df_movielens_ratings['userId'].value_counts().to_numpy()
userId = df_movielens_ratings['userId'].to_numpy()

In [12]:
# drop all users from ratings matrix that rated fewer than 6 films
users_drop = unique_movielens_users[num_ratings_per_user <= 5]
# Vectorised "not in users_drop" test; avoids rescanning users_drop once per
# rating row in a Python loop.
bool_mask = ~np.isin(userId, users_drop)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]

In [13]:
# Keep one row per TMDB id (the first occurrence); the left join above can
# repeat a script when several link rows share the same tmdbId.
df_joined = df_joined.drop_duplicates(subset=['tmdb_id'], keep='first')

In [14]:
# Discard scripts that have no MovieLens id.
df_joined = df_joined.dropna(subset=['movieId'])

In [15]:
# Restrict the topic matrix and the movie table to the rows kept in df_joined.
# NOTE(review): df_joined.index holds row labels; using them to index
# Xtran_main assumes the labels coincide with Xtran_main's row positions —
# TODO confirm.
# NOTE(review): this cell rebinds Xtran_main and df, so running it a second
# time without reloading them would select from the already-filtered objects.
Xtran_main = Xtran_main[df_joined.index,:]
df = df.loc[df_joined.index].reset_index(drop=True)

In [16]:
# Attach the script-database metadata to every remaining rating.
df_final = df_movielens_ratings.join(df_joined.set_index("movieId"), how="left", on="movieId")

# Rating counts per user, ordered by ascending userId (computed once).
user_rating_counts = df_final['userId'].value_counts().sort_index()
unique_users = np.array(user_rating_counts.index)
num_ratings_per_user = np.array(user_rating_counts)

# diff[j] = number of ratings that belong to the users before user j
# (exclusive prefix sum). cumsum gives this in one pass instead of re-summing
# a growing slice for every user.
# NOTE(review): the evaluation loop slices df_final by position with these
# offsets, which assumes df_final rows are grouped by user in ascending userId
# order — TODO confirm.
diff = np.cumsum(num_ratings_per_user) - num_ratings_per_user

# Snapshot of the movie table taken before its columns are renamed / trimmed below.
df_imdb_sort = df.copy()

In [17]:
# Display the ratings joined with the script metadata (one row per rating).
# NOTE(review): no training / testing split is formed in this cell; it only
# shows df_final.
df_final

Unnamed: 0,userId,movieId,rating,timestamp,movie_title,movie_year,imdb_id,tmdb_id,imdbId
0,1,307,3.5,1256677221,Trois couleurs: Bleu (Three Colors: Blue),1993.0,tt0108394,108.0,108394.0
1,1,481,3.5,1256677456,Kalifornia,1993.0,tt0107302,10909.0,107302.0
2,1,1091,1.5,1256677471,Weekend at Bernie's,1989.0,tt0098627,8491.0,98627.0
3,1,1257,4.5,1256677460,Better Off Dead...,1985.0,tt0088794,13667.0,88794.0
4,1,1449,4.5,1256677264,Waiting For Guffman,1996.0,tt0118111,16448.0,118111.0
...,...,...,...,...,...,...,...,...,...
27753439,283228,8542,4.5,1379882795,A Day at the Races,1937.0,tt0028772,11939.0,28772.0
27753440,283228,8712,4.5,1379882751,My Favorite Wife,1940.0,tt0029284,41463.0,29284.0
27753441,283228,34405,4.5,1379882889,Serenity,2005.0,tt0379786,16320.0,379786.0
27753442,283228,44761,4.5,1354159524,Brick,2005.0,tt0393109,9270.0,393109.0


In [18]:
# IMDb ids in the same row order as the movie table; used to map an id to its row position.
imdb_list_sort = df_imdb_sort['imdb_id'].tolist()


In [19]:
# Rename the columns positionally to display-friendly labels.
df.columns = [
    "Title",
    "Year",
    "Genres",
    "IMDb Rating",
    'num_votes',
    'is_adult',
    'imdb_id',
    'imdb_link',
    'tmdb_poster_link',
]

In [20]:
# Keep only the four columns shown to the reader, in display order.
df = df.loc[:, ["Title", "Year", "IMDb Rating", "Genres"]]

In [21]:
# Display the trimmed movie table (Title, Year, IMDb Rating, Genres).
df

Unnamed: 0,Title,Year,IMDb Rating,Genres
0,A Bad Idea Gone Wrong,2017,5.8,Comedy
1,A Bad Moms Christmas,2017,5.6,Comedy
2,A Bag of Hammers,2011,6.6,"Comedy,Drama"
3,A Ballerina's Tale,2015,6.4,Documentary
4,A Band Called Death,2012,7.5,"Biography,Documentary,Music"
...,...,...,...,...
20137,90 minutter,2012,6.1,Drama
20138,96 Minutes,2011,5.7,"Drama,Thriller"
20139,99 Homes,2014,7.1,"Crime,Drama"
20140,[REC],2007,7.4,"Horror,Mystery,Thriller"


In [25]:
# Back-of-envelope size estimate: number of users x 20 x 8, divided by 1e9.
# NOTE(review): presumably 20 values per user stored as 8-byte floats, giving
# a size in gigabytes — confirm.
len(unique_users)*20*8/1e9

0.0409664

In [23]:
def user_average_topic(imdb_ids=None, Xtran=None, imdb_list=None):
    """Average the topic rows of the given movies into a single profile row.

    Parameters
    ----------
    imdb_ids : sequence, optional
        IMDb ids of the movies to average. If omitted, the notebook-level
        name ``imdb_ids`` is used (the function originally took no arguments
        and read that name directly).
    Xtran : 2-D array, optional
        Matrix with one row per movie. If omitted, the notebook-level name
        ``Xtran`` is used.
    imdb_list : list, optional
        Ids in the same order as the rows of ``Xtran``; defaults to the
        notebook-level ``imdb_list_sort``.

    Returns
    -------
    Array of shape ``(1, Xtran.shape[1])`` holding the mean of the selected rows.

    Raises
    ------
    ValueError
        If ``imdb_ids`` is empty, or an id is not present in ``imdb_list``.
    """
    # Fall back to the module-level names the original body referenced.
    if imdb_ids is None:
        imdb_ids = globals()["imdb_ids"]
    if Xtran is None:
        Xtran = globals()["Xtran"]
    if imdb_list is None:
        imdb_list = imdb_list_sort

    if len(imdb_ids) == 0:
        raise ValueError("imdb_ids must contain at least one id")

    # Row position of each requested movie.
    jcols = [imdb_list.index(j) for j in imdb_ids]
    sim_in = np.sum(Xtran[jcols, :], axis=0).reshape(1, Xtran.shape[1]) / len(imdb_ids)
    # The original computed this value but never returned it.
    return sim_in

In [24]:
def movie_rec_users(imdb_ids, rating_min, is_adult, Xtran, num_rec=5):
    """Recommend the movies most similar to the average of the input movies.

    The rows of ``Xtran`` belonging to ``imdb_ids`` are averaged into one
    profile row, the cosine similarity between that profile and every row of
    ``Xtran`` is computed, and the ``num_rec`` most similar rows of the
    notebook-level ``df`` are returned.

    Parameters
    ----------
    imdb_ids : sequence
        IMDb ids of the seed movies; each must be present in the
        notebook-level ``imdb_list_sort``.
    rating_min, is_adult :
        Accepted for interface compatibility; the corresponding filters are
        currently disabled, so these values are not used.
    Xtran : 2-D array
        One row per movie, in the same row order as ``df``.
    num_rec : int, default 5
        Number of recommendations; clamped to the number of movies.

    Returns
    -------
    DataFrame
        Rows of ``df`` for the recommended movies, most similar first.

    Raises
    ------
    ValueError
        If ``imdb_ids`` is empty, an id is unknown, or ``Xtran`` and ``df``
        do not have the same number of rows.

    Notes
    -----
    The seed movies themselves are not excluded from the candidates.
    """
    if len(imdb_ids) == 0:
        raise ValueError("imdb_ids must contain at least one id")

    # Average the rows of the seed movies into a single (1, n_columns) profile.
    jcols = [imdb_list_sort.index(j) for j in imdb_ids]
    sim_in = np.sum(Xtran[jcols, :], axis=0).reshape(1, Xtran.shape[1])/len(imdb_ids)

    # Similarity between the profile and every movie; the reshape also checks
    # that Xtran and df have the same number of rows.
    sim_movie = cosine_similarity(sim_in, Xtran).reshape((len(df),))

    # Clamp so argpartition never receives an out-of-range kth, and handle 0
    # explicitly because a slice of [-0:] would select every row.
    num_rec = min(int(num_rec), sim_movie.shape[0])
    if num_rec <= 0:
        return df.iloc[[]]

    # argpartition returns the row positions of the num_rec largest
    # similarities in arbitrary order. Sort only those candidates, descending,
    # and map the result back through `top_unordered` so the final values are
    # row positions in df rather than positions inside the candidate subset.
    top_unordered = np.argpartition(sim_movie, -num_rec)[-num_rec:]
    top_ordered = top_unordered[np.argsort(sim_movie[top_unordered])[::-1]]

    # construct output dataframe
    df_out = df.iloc[top_ordered]

    return df_out

In [28]:
# Offline evaluation: for each user, seed the recommender with 3 randomly
# chosen movies the user rated and score the top-num_rec recommendations
# against all titles the user rated.
# NOTE(review): hits are matched on title strings, so distinct movies that
# share a title are indistinguishable here.
num_rec = 20
rng = np.random.default_rng(0)  # fixed seed so the sampled seed movies are reproducible
user_metrics = []  # one record per evaluated user, aggregated after the loop
kk = 0
for j, user_id in enumerate(unique_users[0:2001]):

    # Rows of df_final belonging to user j (positional slice via the prefix offsets).
    input_user_ratings = df_final.iloc[diff[j]:diff[j]+num_ratings_per_user[j]]
    imdb_ids = list(rng.permutation(input_user_ratings['imdb_id'].to_numpy())[0:3])
    df_out = movie_rec_users(imdb_ids, rating_min=0, is_adult=False, Xtran=Xtran_main, num_rec=num_rec)

    # compute precision@k, recall@k, R-precision@k
    rec_titles = list(df_out["Title"])
    watched_titles = list(input_user_ratings["movie_title"])

    num_hits = len(np.intersect1d(rec_titles, watched_titles))
    precision_at_k = num_hits / num_rec
    recall_at_k = num_hits / len(watched_titles)
    s = np.min([len(watched_titles), num_rec])
    r_precision_at_k = num_hits/s

    # Keep every user's scores; the scalar names above are overwritten on each
    # iteration and would otherwise only reflect the last user.
    user_metrics.append({
        "user_id": user_id,
        "precision_at_k": precision_at_k,
        "recall_at_k": recall_at_k,
        "r_precision_at_k": r_precision_at_k,
    })

    kk = kk  + 1
    if(kk%1000 == 0):
        print(kk)

# Mean of each metric over all evaluated users.
df_metrics = pd.DataFrame(user_metrics)
df_metrics[["precision_at_k", "recall_at_k", "r_precision_at_k"]].mean()

1000
2000


In [146]:
# R-precision of the most recently evaluated user only: the loop reassigns
# this scalar on every iteration.
r_precision_at_k

0.0

In [250]:
# Line-profile the evaluation loop with line_profiler.
# NOTE(review): `test` is not defined in any visible cell; judging by the
# profiler listing it wraps the evaluation loop over all users, and it must be
# defined before this cell can run on a fresh kernel.
%lprun -f test test()

1000
2000
3000
4000
5000
6000
7000
8000
9000
*** KeyboardInterrupt exception caught in code being profiled.

Timer unit: 1e-07 s

Total time: 50.3709 s
File: C:\Users\Nick\AppData\Local\Temp\ipykernel_9876\1162995794.py
Function: test at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def test():
     2         1         12.0     12.0      0.0      num_rec = 20
     3         1          5.0      5.0      0.0      kk = 0
     4      9883     117569.0     11.9      0.0      for j, user_id in enumerate(unique_users):
     5                                           
     6      9883    8113299.0    820.9      1.6          input_user_ratings = df_final.iloc[diff[j]:diff[j]+num_ratings_per_user[j]]
     7      9883   11030129.0   1116.1      2.2          imdb_ids = list(np.random.permutation(input_user_ratings['imdb_id'])[0:3])
     8      9882  429013305.0  43413.6     85.2          df_out = movie_rec_users(imdb_ids, rating_min=0, is_adult=False, Xtran=Xtran_main, num_rec=num_rec)
     9                                       