# Build a CTM

In [None]:
import pandas as pd
import numpy as np
from scipy import sparse
import time

# import gradio as gr
import pickle
import matplotlib.pyplot as plt

# import functions from surprise library

from surprise import SVD, CTM
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate, train_test_split
from surprise.accuracy import rmse

import line_profiler
%load_ext line_profiler



In [None]:
# Load the tuned LDA model, the vocabulary key, the movie database, the sparse
# document-term matrix X and the transformed document-topic matrix Xtran_main.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artefacts produced by this project.
# Every file is opened in a `with` block so the handle is closed right after
# loading instead of being left open for the lifetime of the kernel.
with open('..\\recsys_content_based\\model_building_out\\model_2023_08_16.sav', 'rb') as f:
    lda_main = pickle.load(f)

with open("..\\recsys_content_based\\data_preprocessing_out\\word_key.txt", "rb") as f:
    word_key = pickle.load(f)

# read in movie database (first CSV column becomes the index)
df = pd.read_csv("..\\database\\dataset_spaces_upload.csv", index_col=[0])

# read in scipy sparse matrix
X = sparse.load_npz("..\\recsys_content_based\\data_preprocessing_out\\X.npz")
with open("..\\recsys_content_based\\model_building_out\\Xtran.txt", "rb") as f:
    Xtran_main = pickle.load(f)


In [None]:
# Cleaned movie-scripts table, minus the bulky text/link columns that this
# notebook never reads.
df_orig = pd.read_csv(
    '..\\database\\dataset_film_scripts\\springfield_movie_scripts_2023_01_13_clean.csv',
    index_col=[0],
).drop(columns=['script_text', 'springfield_link', 'tmdb_poster_link', 'imdb_link'])

# Keep the row label as an explicit column so it survives the joins below.
df_orig['recsys_id'] = df_orig.index
print(df_orig.info())
df_orig.head()

In [None]:
# Load the MovieLens links table. It is used below to map MovieLens movieId
# values onto the tmdbId (or, alternatively, imdbId) of the scripts dataset.
df_movielens = pd.read_csv('..\\database\\dataset_movieLens\\links.csv')
df_movielens.head()

In [None]:
# Load the MovieLens ratings table; the userId, movieId and rating columns
# are the ones consumed by the cells below.
df_movielens_ratings = pd.read_csv('..\\database\\dataset_movieLens\\ratings.csv')
df_movielens_ratings.head()

In [None]:
# Left-join the scripts table onto the MovieLens links via the TMDB id, then
# keep one row per tmdb_id and only the rows that actually found a movieId.
# Links rows with any missing id are discarded before the join.
df_joined = (
    df_orig
    .join(df_movielens.dropna().set_index('tmdbId'), on='tmdb_id', how='left')
    .drop_duplicates(subset='tmdb_id')
    .dropna(subset='movieId')
)
print(df_joined.info())
df_joined.head()


In [None]:
# second option is to join on imdbId -- both options yield the same result ~ 20,300 non-null matches
# df_joined = df_orig.join(df_movielens.set_index('imdbId'), how='left', on='imdb_id')
# df_joined.head()
# df_joined.info()

In [None]:
# Filter the ratings table down to (a) movies present in the script database
# and (b) users with at least 6 remaining ratings.
#
# Membership is tested with np.isin (vectorised) rather than a Python-level
# `j in array` per rating, which scanned the whole id array for every row.

# All movieIds that matched a script. NaNs are dropped explicitly instead of
# slicing off the first unique value: df_joined may already be NaN-free, in
# which case `[1:]` would silently discard a valid movie.
unique_movielens_ids = df_joined['movieId'].dropna().unique()
unique_movielens_ids = np.sort(unique_movielens_ids.astype(int))
movieId = np.array(df_movielens_ratings['movieId'])

bool_mask = np.isin(movieId, unique_movielens_ids)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]

# drop all users from ratings matrix that rated less than 6 films
# (counted after the movie filter above, so only script-matched films count)
ratings_per_user = df_movielens_ratings['userId'].value_counts()
unique_movielens_users = np.array(ratings_per_user.index)
num_ratings_per_user = np.array(ratings_per_user)
userId = np.array(df_movielens_ratings['userId'])

users_drop = unique_movielens_users[num_ratings_per_user <= 5]
bool_mask = ~np.isin(userId, users_drop)
df_movielens_ratings = df_movielens_ratings.loc[bool_mask]


df_movielens_ratings.head()

In [None]:
# Attach the script-database columns to every rating (matched on movieId),
# discard rows with any missing value, and store recsys_id as an integer.
df_final = (
    df_movielens_ratings
    .join(df_joined.set_index("movieId"), on="movieId", how="left")
    .dropna()
    .astype({'recsys_id': 'int'})
)
# unique_users = np.array(df_final['userId'].value_counts().sort_index().index)
# num_ratings_per_user = np.array(df_final['userId'].value_counts().sort_index())
# diff = [np.sum(num_ratings_per_user[:j])  if j > 0 else 0 for j in range(len(unique_users))]

df_final.head()

In [None]:
# Restrict df and df_orig to the movies that still have ratings.
# Xtran_main is deliberately left un-subsetted (the line below stays disabled);
# it is later indexed directly with recsys_id values.
# np.unique returns the distinct ids already sorted in ascending order.
jkeep = np.unique(df_final['recsys_id'].astype('int')).tolist()
# Xtran_main = Xtran_main[jkeep,:]
df = df.loc[jkeep].reset_index(drop=True)
df_orig = df_orig.loc[jkeep]
df_orig.head()

In [None]:
# form dataset for SVD algorithm
df_ratings_matrix = df_final[['userId', 'recsys_id', 'rating']].copy()

# work on the first 100,000 ratings only
df_ratings_matrix = df_ratings_matrix.iloc[0:100_000].copy()


# map half ratings to integer ratings using a 50/50 split to nearest whole number
# 0.5 has no lower neighbour on the 1-5 scale, so it always becomes 1.
# Only the 'rating' column is assigned: a bare `df[mask] = 1` would also
# overwrite userId and recsys_id of those rows with 1.
df_ratings_matrix.loc[df_ratings_matrix['rating'] == 0.5, 'rating'] = 1
for jrating in [1.5, 2.5, 3.5, 4.5]:
    # index labels of every row currently holding this half rating
    a = np.array(df_ratings_matrix.index[df_ratings_matrix['rating'] == jrating], dtype=int)
    np.random.shuffle(a)  # shuffle in-place, returns none
    num_ratings = len(a)
    half = int(np.ceil(num_ratings / 2))
    # first (larger-or-equal) half is rounded down, the remainder rounded up
    df_ratings_matrix.loc[list(a[:half]), 'rating'] = jrating - .5
    df_ratings_matrix.loc[list(a[half:]), 'rating'] = jrating + .5


reader = Reader(rating_scale=(1.0, 5.0))
data = Dataset.load_from_df(df_ratings_matrix[["userId", "recsys_id", "rating"]], reader)

# hold out 10% of the ratings for evaluation
trainset, testset = train_test_split(data, test_size=0.10)

In [None]:
# Raw item ids (recsys_id values) listed in the trainset's inner-id order.
iids_in_train_set = list(map(trainset.to_raw_iid, range(trainset.n_items)))
# Select the matching rows of the document-topic matrix.
# NOTE(review): assumes recsys_id values are valid positional row indices into
# Xtran_main - confirm.
theta = Xtran_main[iids_in_train_set, :]

In [None]:
# Multipliers applied to theta before it is handed to CTM. With the single
# value 0 the matrix passed in is all zeros.
# NOTE(review): presumably this disables the topic contribution as a
# baseline run - confirm against the CTM implementation.
hp = [0]
for k in hp:
    # same n_factors / n_epochs as the SVD baseline fitted further down
    algoC = CTM(n_factors=20, n_epochs=20, theta=theta*k)

    algoC.fit(trainset=trainset)
    # RMSE on the held-out test set; the returned value is not stored
    rmse(algoC.test(testset))
# After the loop, algoC refers to the model fitted for the last value in hp.




In [None]:
# Baseline: plain SVD with the same n_factors / n_epochs as the CTM run,
# fitted and scored on the same train/test split.
algo = SVD(n_factors=20, n_epochs=20, verbose=True)
algo.fit(trainset=trainset)
# RMSE on the held-out test set; the returned value is not stored
rmse(algo.test(testset))

# evaluate precision@k and recall@k metrics

In [None]:
from collections import defaultdict
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute precision@k and recall@k separately for every user.

    Args:
        predictions: iterable of 5-tuples
            ``(uid, iid, true_rating, estimate, details)``; only the uid,
            the true rating and the estimate are used.
        k: number of top-ranked items (ranked by estimate, descending)
            considered for each user.
        threshold: an item is "relevant" when its true rating is
            >= threshold and "recommended" when its estimate is >= threshold.

    Returns:
        A ``(precisions, recalls)`` pair of dicts keyed by uid. A ratio whose
        denominator is zero is reported as 0.
    """
    # Group (estimate, true rating) pairs by user.
    per_user = defaultdict(list)
    for uid, _iid, actual, estimate, _details in predictions:
        per_user[uid].append((estimate, actual))

    precisions = {}
    recalls = {}
    for uid, pairs in per_user.items():
        # Highest estimates first; ties keep their original order.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items are counted over ALL of the user's ratings ...
        relevant_total = sum(1 for _, actual in ranked if actual >= threshold)
        # ... whereas recommended items and hits only look at the top k.
        recommended = sum(1 for estimate, _ in top_k if estimate >= threshold)
        hits = sum(
            1
            for estimate, actual in top_k
            if actual >= threshold and estimate >= threshold
        )

        # Precision@k: share of recommended items that are relevant.
        precisions[uid] = hits / recommended if recommended else 0
        # Recall@k: share of relevant items that were recommended.
        recalls[uid] = hits / relevant_total if relevant_total else 0

    return precisions, recalls


In [None]:
# Precision@10 / recall@10 of the CTM model on the held-out test set.
predictions = algoC.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3.5)

# Average the per-user values over all users (precision first, then recall).
print(sum(precisions.values()) / len(precisions))
print(sum(recalls.values()) / len(recalls))

In [None]:
# Precision@10 / recall@10 of the SVD baseline on the held-out test set.
predictions = algo.test(testset)
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=3.5)

# Average the per-user values over all users (precision first, then recall).
print(sum(precisions.values()) / len(precisions))
print(sum(recalls.values()) / len(recalls))