# Combine reviews and genres similarities

In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm

In [2]:
users = 3974
movies = 3564

## Load data

### Load reviews

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
from collections import defaultdict

In [5]:
reviews_csv = pd.read_csv("../../data/movie_reviews.csv")
reviews_csv["movie_id"] = reviews_csv["movie_id"].apply(lambda x: x - 1)

movies_ids = reviews_csv["movie_id"].tolist()
reviews = reviews_csv["text"].tolist()

# Concatenate reviews for the same movie
concatenated_reviews = defaultdict(str)
for movie_id, review in tqdm(zip(reviews_csv["movie_id"].tolist(), reviews_csv["text"].tolist())):
    concatenated_reviews[movie_id] += review
    
movies_ids = list(concatenated_reviews.keys())
reviews = list(concatenated_reviews.values())

# Fit tfdf
vectorizer = TfidfVectorizer(stop_words="english")
vectorized_reviews = vectorizer.fit_transform(reviews).todense()

# Build full features matrix
vectorized_reviews_full = np.zeros((movies, vectorized_reviews.shape[1]))

for idx, movie_id in enumerate(movies_ids):
    vectorized_reviews_full[movie_id] = vectorized_reviews[idx]

0it [00:00, ?it/s]

Get all rows that don't have a review

In [6]:
from collections import defaultdict

In [7]:
no_review_idxs = defaultdict(bool)
for i, item_review_vec in enumerate(vectorized_reviews_full):
    if np.all(item_review_vec == 0):
        no_review_idxs[i] = True
        
print(len(no_review_idxs))

1076


In [8]:
@njit(parallel=True)
def compute_pairwise_similarities_dot_product(reviews, n_items):
    out = np.zeros((n_items, n_items))
    
    for i in range(n_items):
        for j in prange(n_items):
            if i != j:
                # Because the vectors are normalized
                # The cosine similarity is just the dot product
                out[i, j] = np.dot(reviews[i], reviews[j])
                
    return out

In [9]:
reviews_sims_mat = compute_pairwise_similarities_dot_product(vectorized_reviews_full, movies)

### Load genres

In [10]:
genres_map = {
    "Action": 0,
    "Adventure": 1,
    "Animation": 2,
    "Children's": 3,
    "Comedy": 4,
    "Crime": 5,
    "Documentary": 6,
    "Drama": 7,
    "Fantasy": 8,
    "Film-Noir": 9,
    "Horror": 10,
    "Musical": 11,
    "Mystery": 12,
    "Romance": 13,
    "Sci-Fi": 14,
    "Thriller": 15,
    "War": 16,
    "Western": 17,
}

movies_csv = pd.read_csv("../../data/movies_data.csv")
movies_csv["movie_id"] = movies_csv["movie_id"].apply(lambda x: x - 1)

In [11]:
def generate_genres_encoding(ids, genres, n_items):
    out = np.zeros((n_items, len(genres_map)))
    
    for movie_id, genre_list in zip(ids, genres):
        genre_list = genre_list.split("|")
        for genre in genre_list:
            out[movie_id, genres_map[genre]] = 1
                   
    return out

In [12]:
genres_encoded = generate_genres_encoding(movies_csv["movie_id"].tolist(), movies_csv["genres"].tolist(), movies)

In [13]:
@njit
def jaccard(v1, v2):
    intersection = np.logical_and(v1, v2).sum()
    union = np.logical_or(v1, v2).sum()
    
    return float(intersection)/float(union)

@njit(parallel=True)
def compute_pairwise_similarities_jaccard(genres, n_items):
    out = np.zeros((n_items, n_items))
    
    for i in range(n_items):
        for j in prange(n_items):
            if i != j:
                out[i, j] = jaccard(genres[i], genres[j])
                
    return out

In [14]:
genres_sims_mat = compute_pairwise_similarities_jaccard(genres_encoded, movies)

In [15]:
# Check values ranges in similarities
print(reviews_sims_mat.min(), reviews_sims_mat.max())
print(genres_sims_mat.min(), genres_sims_mat.max())

0.0 0.8453488606804234
0.0 1.0


In [16]:
np.save("reviews_sims_mat", reviews_sims_mat)
np.save("genres_sims_mat", genres_sims_mat)

## Average similarities

In [17]:
reviews_sims_mat = np.load("reviews_sims_mat.npy")
genres_sims_mat = np.load("genres_sims_mat.npy")

In [26]:
# Rescale reviews sims mat because dot product's range is -1 to 1
# reviews_sims_mat = (reviews_sims_mat + 1)/2
# print(reviews_sims_mat.min(), reviews_sims_mat.max())

In [18]:
combined_sims_mat = (reviews_sims_mat + genres_sims_mat)/2
print(combined_sims_mat.min(), combined_sims_mat.max())

np.save("combined_sims_mat", combined_sims_mat)

0.0 0.9226744303402117


In [19]:
# Combine similarities, if an item doesn't have a review, use the genre similarity
combined_sims_mat = np.zeros_like(reviews_sims_mat)

for i in range(movies):
    for j in range(movies):
        if no_review_idxs[i] or no_review_idxs[j]:
            combined_sims_mat[i, j] = genres_sims_mat[i, j]
        else:
            combined_sims_mat[i, j] = (reviews_sims_mat[i, j] + genres_sims_mat[i, j])/2
            
print(combined_sims_mat.min(), combined_sims_mat.max())
np.save("combined_sims_mat_2", combined_sims_mat)

0.0 1.0
