In [1]:
import numpy as np
from numba import njit, prange
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

train_set = train_csv.sample(frac=.8, axis=0)
val_set = train_csv.drop(train_set.index, axis=0)

assert train_csv.shape[0] == train_set.shape[0] + val_set.shape[0]

In [3]:
@njit
def generate_ratings_mat(uir_mat, n_users, n_items):
    out = np.zeros((n_users, n_items), dtype=np.float32)
    
    for u, i, r in uir_mat:
        out[u,i] = r
        
    return out

users = 3974
movies = 3564
train_ratings_mat = generate_ratings_mat(train_set.values[:,:-1], users, movies)

## Load reviews

In [4]:
reviews_csv = pd.read_csv("../../data/movie_reviews.csv")
reviews_csv["movie_id"] = reviews_csv["movie_id"].apply(lambda x: x - 1)
reviews_csv.head()

Unnamed: 0,movie_id,text
0,0,Andy's toys live a reasonable life of fun and ...
1,0,I am a big fan of the animated movies coming f...
2,0,This is a very clever animated story that was ...
3,0,Toy Story – 5/5 stars Children play with toys....
4,0,"Y'know, I always suspected that my toys were c..."


### Concatenate reviews for the same movie

In [5]:
from collections import defaultdict

In [6]:
concatenated_reviews = defaultdict(str)

for movie_id, review in tqdm(zip(reviews_csv["movie_id"].tolist(), reviews_csv["text"].tolist())):
    concatenated_reviews[movie_id] += review

0it [00:00, ?it/s]

In [7]:
movies_ids = list(concatenated_reviews.keys())
reviews = list(concatenated_reviews.values())

## Vectorizing reviews

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# Removes stop words, radicalizes and applies l2 norm
vectorizer = TfidfVectorizer(stop_words="english")

In [10]:
vectorized_reviews = vectorizer.fit_transform(reviews).todense()
vectorized_reviews.shape

(2488, 75724)

#### Build matrix of features per item with zeros for movies that don't have a review

In [11]:
vectorized_reviews_full = np.zeros((movies, vectorized_reviews.shape[1]))

for idx, movie_id in enumerate(movies_ids):
    vectorized_reviews_full[movie_id] = vectorized_reviews[idx]

We have the matrix of vectorized reviews (n_movies, n_terms)

## Compute similarities

In [12]:
@njit(parallel=True)
def compute_pairwise_similarities(reviews, n_items):
    out = np.zeros((n_items, n_items))
    
    for i in range(n_items):
        for j in prange(n_items):
            if i != j:
                # Because the vectors are normalized
                # The cosine similarity is just the dot product
                out[i, j] = np.dot(reviews[i], reviews[j])
                
    return out

In [17]:
sims_mat = compute_pairwise_similarities(vectorized_reviews_full, movies)

## Compute prediction

In [18]:
@njit
def predict(u, i, sims_mat, ratings_mat, k):
    sim_items_rated_by_u = [
        (sims_mat[i, j], ratings_mat[u, j])
        for j in range(ratings_mat.shape[1]) 
        if ratings_mat[u, j] != 0
    ]
    
    num = 0
    den = 0
    for sim_ij, r_uj in sorted(sim_items_rated_by_u, reverse=True)[:k]:
        num += (sim_ij*r_uj)
        den += sim_ij
        
    return num/(den+1e-15)

In [19]:
predict(0, 10, sims_mat, train_ratings_mat, 4)

4.219143390740921

In [20]:
@njit(parallel=True)
def predict_batch(ui_mat, sims_mat, ratings_mat, k):
    predictions = np.zeros(ui_mat.shape[0])
    
    for idx in prange(ui_mat.shape[0]):
        u, i = ui_mat[idx]
        predictions[idx] = predict(u, i, sims_mat, ratings_mat, k)
        
    return np.clip(predictions, 1, 5)

In [21]:
@njit(parallel=True)
def rmse(pred, expected):
    return np.sqrt(np.sum((pred-expected)**2)/pred.shape[0])

## Choose best $k$

In [22]:
val_uir = val_set.values[:, :-1]

In [23]:
min_error = np.inf
best_k = 1
for k in tqdm(range(1, 100)):
    val_preds = predict_batch(val_uir[:, :-1], sims_mat, train_ratings_mat, k=k)

    err = rmse(val_preds, val_uir[:, -1])
    if min_error > err:
        min_error = err
        best_k = k
    
print(best_k, min_error)

  0%|          | 0/99 [00:00<?, ?it/s]

36 1.553623518612029


## Predict

In [24]:
# Create ratings mat on full train data
total_ratings_mat = generate_ratings_mat(train_csv.values[:,:-1], users, movies)

In [25]:
# Predict
test_ui = test_csv.values[:, 1:3]
test_preds = predict_batch(test_ui, sims_mat, total_ratings_mat, k=best_k)

In [26]:
out_csv = pd.DataFrame.from_dict({
    "id": test_csv["id"].tolist(),
    "rating": test_preds
})

out_csv.head()

Unnamed: 0,id,rating
0,0,3.735731
1,1,1.0
2,2,3.742742
3,3,3.795614
4,4,1.0


In [None]:
out_csv.to_csv("out_fbc_knn_reviews_1.csv", index=False)