## 05_data_preprocessing_CTM.ipynb

Description: 
- Build the dataset for the collaborative topic model (CTM) by combining the MovieLens ratings with the movie scripts dataset.

External Dependencies:
- MovieLens links.csv from "..\\database\\dataset_movieLens\\links.csv"
- MovieLens ratings.csv from "..\\database\\dataset_movieLens\\ratings.csv"
- springfield_movie_scripts.csv from "..\\database\\dataset_film_scripts\\springfield_movie_scripts.csv"

Returns:
- pd.DataFrame with columns userId, recsys_id and rating, saved to "data_preprocessing_out\\df_processed_CTM.csv"

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Movie scripts table: read the cleaned scripts dataset (first CSV column is
# the index) and discard the script text and link columns.
df_script = pd.read_csv(
    "..\\database\\dataset_film_scripts\\springfield_movie_scripts.csv", index_col=[0]
).drop(columns=["script_text", "springfield_link", "tmdb_poster_link", "imdb_link"])
# expose the row index as an explicit id column
df_script = df_script.assign(recsys_id=df_script.index)
print(df_script.head())

# MovieLens links table (movieId / imdbId / tmdbId)
df_movielens = pd.read_csv("..\\database\\dataset_movieLens\\links.csv")
print(df_movielens.head())

# MovieLens ratings table (userId / movieId / rating / timestamp)
df_movielens_ratings = pd.read_csv("..\\database\\dataset_movieLens\\ratings.csv")
print(df_movielens_ratings.head())

             movie_title  movie_year     imdb_id  tmdb_id  recsys_id
0       A 2nd Hand Lover        2015  tt10919164   472886          0
1                   A Aa        2016   tt5684466   372399          1
2     A Baby at Any Cost        2022  tt15331880   938971          2
3  A Bad Idea Gone Wrong        2017   tt5212918   438424          3
4   A Bad Moms Christmas        2017   tt6359956   431530          4
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0
   userId  movieId  rating   timestamp
0       1      307     3.5  1256677221
1       1      481     3.5  1256677456
2       1     1091     1.5  1256677471
3       1     1257     4.5  1256677460
4       1     1449     4.5  1256677264


In [4]:
# MovieLens links keyed by TMDB id; link rows with any missing value are
# discarded before indexing.
links_by_tmdb = df_movielens.dropna().set_index("tmdbId")

# Left-join the links onto the scripts table via tmdb_id, then keep a single
# row per tmdb_id and only the rows that received a movieId.
df_joined = (
    df_script.join(links_by_tmdb, on="tmdb_id", how="left")
    .drop_duplicates(subset="tmdb_id")
    .dropna(subset="movieId")
)

df_joined.head()

Unnamed: 0,movie_title,movie_year,imdb_id,tmdb_id,recsys_id,movieId,imdbId
3,A Bad Idea Gone Wrong,2017,tt5212918,438424,3,181135.0,5212918.0
4,A Bad Moms Christmas,2017,tt6359956,431530,4,179953.0,6359956.0
5,A Bag of Hammers,2011,tt1509787,59441,5,101597.0,1509787.0
6,A Ballerina's Tale,2015,tt4504040,334682,6,144622.0,4504040.0
7,A Band Called Death,2012,tt2064713,137563,7,102062.0,2064713.0


In [5]:
# Filter the ratings table down to (a) movies that exist in the script
# database and (b) users with at least n_films remaining ratings.
#
# Both filters use vectorised Series.isin instead of a per-row Python
# `in` test against a numpy array, which scanned the whole id array for
# every rating.

# MovieLens ids of all movies that have a script. Missing ids are removed
# explicitly with dropna() rather than by slicing off the first unique value:
# df_joined has already had its NaN movieIds dropped, so `[1:]` would silently
# discard a real movie (and all of its ratings).
unique_movielens_ids = np.sort(df_joined["movieId"].dropna().unique().astype(int))

has_script = df_movielens_ratings["movieId"].isin(unique_movielens_ids)
df_movielens_ratings = df_movielens_ratings.loc[has_script]

# drop all users from ratings matrix that rated less than n_films
n_films = 2
# one value_counts() call: index = userId, value = number of ratings left
num_ratings_per_user = df_movielens_ratings["userId"].value_counts()
users_drop = num_ratings_per_user.index[num_ratings_per_user < n_films]

keep_user = ~df_movielens_ratings["userId"].isin(users_drop)
df_movielens_ratings = df_movielens_ratings.loc[keep_user]


df_movielens_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [6]:
# Attach the script-side columns to every rating, matching on movieId.
script_info_by_movie = df_joined.set_index("movieId")

# Left join, then discard rows containing any missing value and store
# recsys_id as an integer column.
df_final = (
    df_movielens_ratings.join(script_info_by_movie, on="movieId", how="left")
    .dropna()
    .astype({"recsys_id": "int"})
)

df_final.head()

Unnamed: 0,userId,movieId,rating,timestamp,movie_title,movie_year,imdb_id,tmdb_id,recsys_id,imdbId
0,1,307,3.5,1256677221,Trois couleurs: Bleu (Three Colors: Blue),1993,tt0108394,108,32255,108394.0
1,1,481,3.5,1256677456,Kalifornia,1993,tt0107302,10909,15129,107302.0
2,1,1091,1.5,1256677471,Weekend at Bernie's,1989,tt0098627,8491,33636,98627.0
3,1,1257,4.5,1256677460,Better Off Dead...,1985,tt0088794,13667,3458,88794.0
4,1,1449,4.5,1256677264,Waiting For Guffman,1996,tt0118111,16448,33327,118111.0


In [10]:
# Write only the user id, script-database movie id and rating columns to disk.
ctm_columns = ["userId", "recsys_id", "rating"]
df_final[ctm_columns].to_csv("data_preprocessing_out\\df_processed_CTM.csv")