In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Spring\'23/CSCE670/Animendations/
!ls

/content/drive/MyDrive/Spring'23/CSCE670/Animendations
'Animendations: Project Preview.gslides'
 data
 Gojo
 preprocessed_data
'Project Proposal: Animendations.gdoc'
 synopsis_embeddings
 user_profile
 visual_embeddings


In [3]:
from scipy.sparse import coo_matrix
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import matplotlib.pyplot as plt
import ast

In [9]:
animes_data_path = "./preprocessed_data/new_preprocessed_animes.csv"
profiles_data_path = "./preprocessed_data/preprocessed_profiles.csv"
reviews_data_path = "./preprocessed_data/new_preprocessed_reviews.csv"
animes_data = pd.read_csv(animes_data_path)
profiles_data = pd.read_csv(profiles_data_path)
reviews_data = pd.read_csv(reviews_data_path, engine='python', sep=',', error_bad_lines=False)

import pickle 
item_feature_dict = {}

with open("./visual_embeddings/dict_1000_vit.pickle", "rb") as f:
  visual_embeddings_dict = pickle.load(f)
with open("./synopsis_embeddings/sentence_embeddings_mpnet.pickle", "rb") as f:
  synopsis_embeddings_dict = pickle.load(f)

q1 = reviews_data['anime_uid'].unique().tolist()
q2 = [x for x in visual_embeddings_dict.keys()]
q3 = [x for x in synopsis_embeddings_dict.keys()]

s1 = set(q1)
s2 = set(q2)
s3 = set(q3)

# s3 = s1-s2 
s4 = s1 - s2.intersection(s3)
l4 = list(s4)

animes = animes_data.copy()
profiles = profiles_data.copy()
reviews = reviews_data.copy()
for x in l4:
  reviews.drop(reviews[reviews['anime_uid'] == x].index, inplace = True)



  reviews_data = pd.read_csv(reviews_data_path, engine='python', sep=',', error_bad_lines=False)


In [10]:
reviews.shape

(83918, 7)

In [11]:
np.random.seed(0)
unique_AnimeID = reviews['anime_uid'].unique()
unique_users = reviews['profile'].unique()
j = 0
user_old2new_id_dict = dict()
for u in unique_users:
    user_old2new_id_dict[u] = j
    j += 1
j = 0
movie_old2new_id_dict = dict()
for i in unique_AnimeID:
    movie_old2new_id_dict[i] = j
    j += 1

In [12]:
# Then, use the generated dictionaries to reindex UserID and MovieID in the data_df
user_list = reviews['profile'].values
movie_list = reviews['anime_uid'].values
for j in range(len(reviews)):
    user_list[j] = user_old2new_id_dict[user_list[j]]
    movie_list[j] = movie_old2new_id_dict[movie_list[j]]
reviews['profile'] = user_list
reviews['anime_uid'] = movie_list

# generate train_df with 70% samples and test_df with 30% samples, and there should have no overlap between them.
train_index = np.random.random(len(reviews)) <= 0.7
train_df = reviews[train_index]
test_df = reviews[~train_index]

# generate train_mat and test_mat
num_user = len(reviews['profile'].unique())
num_items = len(reviews['anime_uid'].unique())

train_mat = coo_matrix((train_df['score'].values, (train_df['profile'].values, train_df['anime_uid'].values)), shape=(num_user, num_items)).astype(float).toarray()
test_mat = coo_matrix((test_df['score'].values, (test_df['profile'].values, test_df['anime_uid'].values)), shape=(num_user, num_items)).astype(float).toarray()
train_data = torch.FloatTensor(train_mat)
test_data = torch.FloatTensor(test_mat)

In [13]:
def predict_rating(user_index, item_index, ratings, item_similarity, k=150):
    numerator = 0
    denominator = 0
    rated_items = np.where(ratings[user_index] != 0)[0]
    #simm_items = np.argsort(item_similarity[item_index])[::-1][:k + 1].tolist()
    simm_items = np.argsort(item_similarity[item_index])[::-1][:k].tolist()
    for i in range(len(simm_items)):
        if i != item_index  and ratings[user_index][i] != 0:
            numerator += ratings[user_index][i] * item_similarity[item_index][i]
            denominator += item_similarity[item_index][i]
    if denominator == 0:
        return 0
    else:
        return numerator / denominator

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

item_feature_dict = dict()
# synopsis_embeddings_dict = {k: v.detach().numpy() for k, v in synopsis_embeddings_dict.items()}

all_anime_uid_list = list(s2.intersection(s3))
item_feature_dict = {key: np.concatenate((visual_embeddings_dict[key][0], synopsis_embeddings_dict[key][0]), axis = 0) for key in all_anime_uid_list}

embeddings = [[] for _ in range(num_items)]
for i in item_feature_dict:
  try:
    embeddings[movie_old2new_id_dict[i]] = item_feature_dict[i].reshape(-1)
  except Exception as e:
    print(e)

simm_item_ij = cosine_similarity(embeddings)

In [25]:
with open('./visual_synopsis_item_item_sim.pickle', 'wb') as f:
  pickle.dump(simm_item_ij, f)

In [18]:
count = np.count_nonzero(simm_item_ij > 0.7)
print(count / simm_item_ij.size)

0.4821444695363359


In [19]:
# Compute predicted rating for a given user and item
def predict_rating_based_users_who_rated_similar_items(item_idx, item_name, review_rating, simm_item_ij, k=10, user_column='profile', item_column='anime_uid'):
    # Find k most similar items to the target item
    sim_items = np.argsort(simm_item_ij[item_idx])[::-1][:k].tolist()
    #print(sim_items)
    #sim_items = simm_item_ij[item_idx].sort_values(ascending=False)[:k].index.tolist()
    # Find all users who have rated those similar items
    
    sim_users = getattr(review_rating[getattr(review_rating,item_column).isin(sim_items)],user_column).tolist()
    # Compute weighted average of their ratings for the target item
    weights = [simm_item_ij[item_idx][sim_item] for sim_item in sim_items]
    
    #ratings_target = review_rating[(getattr(review_rating, user_column).isin(sim_users)) & (getattr(review_rating,item_column) == item_name)]
    ratings_target = review_rating[(getattr(review_rating, user_column).isin(sim_users)) & (getattr(review_rating,item_column) == item_name)]

    if len(ratings_target) == 0:
        return 5
    else:
        print(np.array(weights).shape)
        print(np.array(ratings_target.score).shape)
        return np.average(ratings_target.score, weights=weights)

In [20]:
import math

sum_diff = 0
indices_test = np.argwhere(test_mat > 0)
cnt = len(indices_test)
for index in indices_test:
  predicted_rating = predict_rating(index[0], index[1], train_data, simm_item_ij, 250)
  actual_rating = test_data[index[0]][index[1]]
  diff = (actual_rating - predicted_rating)*(actual_rating - predicted_rating)
  sum_diff = sum_diff + diff 

rmse = math.sqrt(sum_diff/cnt)
print("RMSE = ", rmse)

RMSE =  5.479323100865358


### Visual Based content - compressed 50 vit - RMSE =  5.508981653670089
### Visual Based content - 1000 vit - RMSE =  5.479323797063949
### Synopsis Based content - 768 mpnet - RMSE =  5.494660646826089
### Synopsis Based content - 50 mpnet - RMSE =  5.494175340940799
### Visual + Synopsis Based content - 1000 vit + 768 mpnet - RMSE =  5.479323100865358