In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Ratings: one JSON record per line; the Timestamp column is dropped right away.
df_r = pd.read_json('data/ratings.jsonl', lines=True)
df_r = df_r.drop(columns='Timestamp')

# Item content metadata, also stored as JSON lines.
df_c = pd.read_json('data/content.jsonl', lines=True)

In [3]:
# Keep only the item id plus the descriptive columns used further down.
df_c = df_c.loc[:, ['ItemId', 'Genre', 'Director', 'Actors', 'Plot']]

In [4]:
# Preview the first five content rows.
df_c.head(5)

Unnamed: 0,ItemId,Genre,Director,Actors,Plot
0,c9f0f895fb,"Documentary, Short",William K.L. Dickson,Fred Ott,A man (Edison's assistant) takes a pinch of sn...
1,d3d9446802,"Documentary, Short",Louis Lumière,,A man opens the big gates to the Lumière facto...
2,c20ad4d76f,"Documentary, Short","Auguste Lumière, Louis Lumière","Madeleine Koehler, Marcel Koehler, Mrs. August...",A group of people are standing in a straight l...
3,8e296a067a,"Short, News, Sport",Birt Acres,,Although the content of this film is primitive...
4,54229abfcf,"Short, Horror",Georges Méliès,"Jehanne d'Alcy, Jules-Eugène Legris, Georges M...",A bat flies into an ancient castle and transfo...


In [5]:
def process_actors(actors_str):
    """Split a comma-separated actor string into a list of clean names.

    Parameters
    ----------
    actors_str : str or missing value
        Actor names separated by commas, e.g. ``"Jane Doe, John Roe"``.
        Anything that is not a string (``None``, ``NaN``) is treated as
        "no actors listed".

    Returns
    -------
    list of str
        The names in their original order with surrounding whitespace
        removed. Blank entries are dropped, so an empty or missing value
        yields ``[]`` rather than ``['']``.
    """
    # Missing values are not strings and have no .split().
    if not isinstance(actors_str, str):
        return []

    stripped = (part.strip() for part in actors_str.split(','))
    # ''.split(',') is [''], so filter out empty names explicitly.
    return [name for name in stripped if name]

# ItemId -> list of actor names parsed from the 'Actors' column.
# If an ItemId occurs more than once, the last occurrence wins.
lookup_table_item_actors = dict(
    zip(df_c['ItemId'], map(process_actors, df_c['Actors']))
)

In [6]:
# Sanity check: actors of the item stored under index label 2.
lookup_table_item_actors[df_c.loc[2, 'ItemId']]

['Madeleine Koehler', 'Marcel Koehler', 'Mrs. Auguste Lumiere']

### Plot text: TF-IDF features and item similarity

In [7]:
# TF-IDF over the plot text: unigrams and bigrams, capped at 10,000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Replace missing plots with '' so the vectorizer only ever sees strings
# (no-op when every item has a plot).
tfidf_matrix = tfidf_vectorizer.fit_transform(df_c['Plot'].fillna(''))

# Dense frame with one row per content item. Reusing df_c's index keeps the
# row labels aligned with df_c, which label-based lookups (.loc) rely on.
# NOTE: .toarray() materialises the full (n_items x n_features) matrix.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df_c.index,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [8]:
def _content_row_label(item_id):
    """Return the index label of the first ``df_c`` row whose ItemId is ``item_id``."""
    matches = df_c.index[(df_c['ItemId'] == item_id).to_numpy()]
    if len(matches) == 0:
        # Same exception type a bare ``.index[0]`` raises on no match,
        # but with a message that names the offending item.
        raise IndexError(f"ItemId {item_id!r} not found in df_c")
    return matches[0]


def calc_similarity(tfidf_df, itemId1, itemId2):
    """Cosine similarity between the TF-IDF rows of two items.

    Parameters
    ----------
    tfidf_df : pandas.DataFrame
        Feature matrix whose row labels match the index of the global
        ``df_c`` (rows are selected by label).
    itemId1, itemId2 :
        ``ItemId`` values to compare. If an id occurs several times in
        ``df_c``, its first row is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, 1)`` holding the similarity; read the scalar
        with ``result[0][0]``.

    Raises
    ------
    IndexError
        If either item id is not present in ``df_c``.
    """
    # Selecting with a one-element list keeps each row two-dimensional,
    # which is the shape cosine_similarity expects.
    vec1 = tfidf_df.loc[[_content_row_label(itemId1)]].to_numpy()
    vec2 = tfidf_df.loc[[_content_row_label(itemId2)]].to_numpy()
    return cosine_similarity(vec1, vec2)

In [19]:
def calc_sim_user_item(userId, itemId):
    """Rating-weighted plot similarity between a user's rated items and one item.

    For every row of the global ``df_r`` belonging to ``userId``, the cosine
    similarity between that rated item's TF-IDF row and the TF-IDF row of
    ``itemId`` is multiplied by the rating; the products are summed.

    Parameters
    ----------
    userId :
        Value matched against ``df_r['UserId']``.
    itemId :
        ``ItemId`` of the candidate item; must be present in ``df_c``.

    Returns
    -------
    number
        The weighted sum, or ``0`` when the user has no ratings.

    Raises
    ------
    IndexError
        If ``itemId`` or any item rated by the user is absent from ``df_c``.
    """
    user_ratings = df_r[df_r['UserId'] == userId]
    if user_ratings.empty:
        return 0

    # Build the ItemId -> df_c row-label map once instead of scanning df_c
    # twice per rated item. For duplicated ids the first row is kept.
    content_ids = df_c['ItemId']
    row_of = pd.Series(content_ids.index, index=content_ids.to_numpy())
    row_of = row_of[~row_of.index.duplicated(keep='first')]

    if itemId not in row_of.index:
        raise IndexError(f"ItemId {itemId!r} not found in df_c")
    rated_ids = user_ratings['ItemId']
    unknown = rated_ids[~rated_ids.isin(row_of.index)]
    if not unknown.empty:
        raise IndexError(
            f"rated ItemId(s) not found in df_c: {unknown.unique().tolist()}"
        )

    rated_rows = row_of.loc[rated_ids.to_numpy()].to_numpy()
    rated_vecs = tfidf_df.loc[rated_rows].to_numpy()
    target_vec = tfidf_df.loc[[row_of.loc[itemId]]].to_numpy()

    # One call yields an (n_rated, 1) column of similarities to the target.
    sims = cosine_similarity(rated_vecs, target_vec)[:, 0]
    return np.dot(sims, user_ratings['Rating'].to_numpy())