In [1]:
import numpy as np
from numba import njit, prange
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

train_set = train_csv.sample(frac=.8, axis=0)
val_set = train_csv.drop(train_set.index, axis=0)

assert train_csv.shape[0] == train_set.shape[0] + val_set.shape[0]

In [3]:
@njit
def generate_ratings_mat(uir_mat, n_users, n_items):
    out = np.zeros((n_users, n_items), dtype=np.float32)
    
    for u, i, r in uir_mat:
        out[u,i] = r
        
    return out

users = 3974
movies = 3564
train_ratings_mat = generate_ratings_mat(train_set.values[:,:-1], users, movies)

In [4]:
@njit
def predict(u, i, sims_mat, ratings_mat, k):
    sim_items_rated_by_u = [
        (sims_mat[i, j], ratings_mat[u, j])
        for j in range(ratings_mat.shape[1]) 
        if ratings_mat[u, j] != 0 and i != j
    ]
    
    num = 0
    den = 0
    for sim_ij, r_uj in sorted(sim_items_rated_by_u, reverse=True)[:k]:
        num += (sim_ij*r_uj)
        den += sim_ij
        
    return num/(den+1e-15)

In [5]:
@njit(parallel=True)
def predict_batch(ui_mat, sims_mat, ratings_mat, k):
    predictions = np.zeros(ui_mat.shape[0])
    
    for idx in prange(ui_mat.shape[0]):
        u, i = ui_mat[idx]
        predictions[idx] = predict(u, i, sims_mat, ratings_mat, k)
        
    return np.clip(predictions, 1, 5)

In [6]:
@njit(parallel=True)
def rmse(pred, expected):
    return np.sqrt(np.sum((pred-expected)**2)/pred.shape[0])

### Predictions

In [10]:
# Load similarity matrix
# sims_mat = np.load("combined_sims_mat.npy")
sims_mat = np.load("combined_sims_mat_2.npy")

In [11]:
val_uir = val_set.values[:, :-1]

In [12]:
min_error = np.inf
best_k = 1
for k in tqdm(range(1, 200)):
    val_preds = predict_batch(val_uir[:, :-1], sims_mat, train_ratings_mat, k=k)

    err = rmse(val_preds, val_uir[:, -1])
    if min_error > err:
        min_error = err
        best_k = k
    
print(best_k, min_error)

  0%|          | 0/199 [00:00<?, ?it/s]

154 1.0050397029980014
