In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

In [4]:
# Ratings come from a JSON-lines file; the 'Timestamp' column is discarded right after loading.
df_r = (
    pd.read_json('data/ratings.jsonl', lines=True)
    .drop(columns='Timestamp')
)
# Target rows to predict for (the prediction loop reads their 'UserId' and 'ItemId' columns).
df_t = pd.read_csv('data/targets.csv')

KeyboardInterrupt: 

In [None]:
def load_content(content_file):
    """Load item metadata from a JSON-lines file and build numeric content features.

    Parameters
    ----------
    content_file : str or path-like
        JSON-lines file. Each record is expected to provide 'ItemId',
        'Metascore', 'imdbRating', 'imdbVotes', 'Awards', 'Plot' and 'Ratings'
        (a list of dicts with 'Source' and 'Value' keys).

    Returns
    -------
    pandas.DataFrame
        One row per input record with the columns 'ItemId', 'Metascore',
        'imdbRating', 'imdbVotes', 'rtRating', 'Awards' and 'Plot'.

        * 'Awards' is 0 when the raw value is 'N/A' or missing, otherwise 1.
        * 'Metascore', 'imdbRating', 'imdbVotes' and 'rtRating' have missing
          values replaced by the column median and are then min-max scaled to
          the range [0, 10]. A column with no spread (or no usable values at
          all) is set to 0.0 instead of becoming NaN through a 0/0 division.
        * 'Plot' is copied unchanged from the input.
    """
    df_content = pd.read_json(content_file, lines=True)

    # Extract the Rotten Tomatoes score (e.g. '85%') from each item's ratings list.
    rt_raw = []
    for ratings_list in df_content['Ratings']:
        # A missing 'Ratings' entry is not a list; treat it as "no ratings".
        entries = ratings_list if isinstance(ratings_list, list) else []
        rt_value = next(
            (item.get('Value') for item in entries if item.get('Source') == 'Rotten Tomatoes'),
            None,
        )
        rt_raw.append(rt_value.rstrip('%') if isinstance(rt_value, str) else rt_value)
    # Unparseable or absent scores become NaN and are median-filled further down.
    df_content['rtRating'] = pd.to_numeric(
        pd.Series(rt_raw, index=df_content.index, dtype='object'), errors='coerce'
    ).astype('float64')

    # Keep only the columns used as features.
    data_content = df_content[
        ['ItemId', 'Metascore', 'imdbRating', 'imdbVotes', 'rtRating', 'Awards']
    ].copy()

    # 'Awards' becomes a binary flag: has any award text or not.
    data_content['Awards'] = data_content['Awards'].apply(
        lambda x: 0 if (not isinstance(x, str) and pd.isna(x)) or x == 'N/A' else 1
    )

    # The string 'N/A' marks a missing value everywhere else.
    data_content = data_content.replace('N/A', np.nan)

    # Drop thousands separators; non-string cells (NaN, already-numeric) pass through.
    data_content['imdbVotes'] = data_content['imdbVotes'].map(
        lambda v: v.replace(',', '') if isinstance(v, str) else v
    )

    # Convert to numeric data; anything that still cannot be parsed becomes NaN.
    for col in ('Metascore', 'imdbRating', 'imdbVotes'):
        data_content[col] = pd.to_numeric(data_content[col], errors='coerce').astype('float32')

    # Substitute NaN with the column median (the 0.5 quantile).
    medians = data_content.quantile(0.5, numeric_only=True)
    data_content = data_content.fillna(medians)

    # Min-max scale every score column to [0, 10]; 'ItemId' and 'Awards' are left alone.
    for col in data_content.columns:
        if col in ('ItemId', 'Awards'):
            continue
        col_min = data_content[col].min()
        col_max = data_content[col].max()
        col_range = col_max - col_min
        if pd.isna(col_range) or col_range == 0:
            # Constant or empty column: scaling would divide by zero, so use a flat 0.
            data_content[col] = 0.0
        else:
            data_content[col] = (data_content[col] - col_min) * 10 / col_range

    data_content['Plot'] = df_content['Plot'].copy()

    return data_content

In [None]:
# Item content features (scaled score columns, awards flag, plot text) built by load_content.
df_c = load_content(content_file='data/content.jsonl')

### PLOT

In [None]:
# TF-IDF over unigrams and bigrams of the plot text, vocabulary capped at 1000 features.
# Row i of tfidf_matrix is the vector for row i of df_c.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(raw_documents=df_c['Plot'])

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def compute_cosine_similarity_in_batches(matrix, batch_size=1000):
#     n_rows = matrix.shape[0]
#     similarity_matrix = np.zeros((n_rows, n_rows))

#     for start_row in range(0, n_rows, batch_size):
#         end_row = min(start_row + batch_size, n_rows)
#         batch_similarity = cosine_similarity(matrix[start_row:end_row], matrix)
#         similarity_matrix[start_row:end_row] = batch_similarity

#     return similarity_matrix

# # Compute the cosine similarity in batches
# similarity_matrix = compute_cosine_similarity_in_batches(tfidf_matrix)

### CALC USER MATRICES AND SAVE (TAKES TOO LONG)

In [None]:
# # Assuming df_r and df_c are pandas DataFrames and tfidf_matrix is a 2D array or a DataFrame

# # Group by UserId
# grouped = df_r.groupby('UserId') 

# # For each user, get the list of items they have consumed, then calculate the column-wise mean of those items from tfidf_matrix
# user_representations = grouped['ItemId'].apply(lambda items: tfidf_matrix[[df_c.index[df_c['ItemId'] == item].tolist()[0] for item in items]].mean(axis=0))

# # Get user mean ratings
# user_mean_ratings = grouped['Rating'].mean().tolist()

# # Convert user representations to a list of arrays (if necessary)
# user_representations = user_representations.tolist()

# # Ensure each element in user_representations is a numpy array
# user_representations = [np.array(representation) for representation in user_representations]

In [None]:
# user_representations_matrix = np.array(user_representations)
# np.save('matrices/user_representations_matrix.npy', user_representations_matrix)

In [None]:
# user_mean_ratings_array = np.array(user_mean_ratings)
# np.save('matrices/user_mean_ratings_array.npy', user_mean_ratings_array)

### PREDS

In [None]:
# Load the precomputed per-user arrays from disk.
# NOTE(review): presumably one entry per user, in the order used when the files were
# written; confirm against the cell that saved them.
user_repr, user_mean_rating = map(
    np.load,
    (
        'matrices/user_representations_matrix.npy',
        'matrices/user_mean_ratings_array.npy',
    ),
)

In [None]:
# Map each UserId to its row in the saved user arrays.
# The (commented-out) cell that produced those arrays iterates df_r.groupby('UserId'),
# and groupby orders its groups by sorted key. unique() returns ids in order of first
# appearance, so it must be sorted here or the indices point at other users' rows.
# NOTE(review): assumes the .npy files were generated by that groupby-based cell; confirm.
users = np.sort(df_r['UserId'].unique())
lookup_table_user = {user_id: idx for idx, user_id in enumerate(users)}

In [None]:
# Map each ItemId to the position of its row in df_c, which is also its row in
# tfidf_matrix (the matrix is fitted on df_c['Plot'] in row order).
# Enumerating unique() only equals the row position while ItemId has no duplicates;
# a repeated id would shift every later index. Using the first row position of each id
# gives the same result for unique ids and stays correct otherwise.
items = df_c['ItemId'].unique()
_first_rows = df_c['ItemId'].reset_index(drop=True).drop_duplicates()
lookup_table_item = dict(zip(_first_rows.tolist(), _first_rows.index.tolist()))

In [None]:
# Predict one score per target (user, item) pair:
# cosine similarity between the user's stored profile vector and the item's TF-IDF row,
# multiplied by that user's stored mean rating.
ratings = []
# Iterate the two columns directly instead of df_t.loc[i]: no per-row Series is built
# and the loop no longer depends on df_t having a 0..n-1 index.
target_pairs = zip(df_t['UserId'], df_t['ItemId'])
for user_id, item_id in tqdm(target_pairs, total=len(df_t)):
    idx_user = lookup_table_user[user_id]
    idx_item = lookup_table_item[item_id]

    # cosine_similarity only accepts 2-D inputs; force the user profile into a single
    # row so both a (n_users, n_features) and a (n_users, 1, n_features) array work.
    array_user = np.asarray(user_repr[idx_user]).reshape(1, -1)
    array_item = tfidf_matrix[idx_item]
    cos_sim = cosine_similarity(array_user, array_item)[0][0]
    ratings.append(cos_sim * user_mean_rating[idx_user])


31471 21530
0.2817339642606825
