In [45]:
import numpy as np
from numba import njit, prange
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

train_set = train_csv.sample(frac=.8, axis=0)
val_set = train_csv.drop(train_set.index, axis=0)

assert train_csv.shape[0] == train_set.shape[0] + val_set.shape[0]

In [3]:
@njit
def generate_ratings_mat(uir_mat, n_users, n_items):
    out = np.zeros((n_users, n_items), dtype=np.float32)
    
    for u, i, r in uir_mat:
        out[u,i] = r
        
    return out

users = 3974
movies = 3564
train_ratings_mat = generate_ratings_mat(train_set.values[:,:-1], users, movies)

## Load genres

In [4]:
movies_csv = pd.read_csv("../../data/movies_data.csv")
movies_csv["movie_id"] = movies_csv["movie_id"].apply(lambda x: x - 1)
movies_csv.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [5]:
genres_map = {
    "Action": 0,
    "Adventure": 1,
    "Animation": 2,
    "Children's": 3,
    "Comedy": 4,
    "Crime": 5,
    "Documentary": 6,
    "Drama": 7,
    "Fantasy": 8,
    "Film-Noir": 9,
    "Horror": 10,
    "Musical": 11,
    "Mystery": 12,
    "Romance": 13,
    "Sci-Fi": 14,
    "Thriller": 15,
    "War": 16,
    "Western": 17,
}

In [6]:
def generate_genres_encoding(ids, genres, n_items):
    out = np.zeros((n_items, len(genres_map)))
    
    for movie_id, genre_list in zip(ids, genres):
        genre_list = genre_list.split("|")
        for genre in genre_list:
            out[movie_id, genres_map[genre]] = 1
                   
    return out

genres_encoded = generate_genres_encoding(movies_csv["movie_id"].tolist(), movies_csv["genres"].tolist(), movies)

## Compute similarities

In [7]:
@njit
def jaccard(v1, v2):
    intersection = np.logical_and(v1, v2).sum()
    union = np.logical_or(v1, v2).sum()
    
    return float(intersection)/float(union)

In [8]:
@njit(parallel=True)
def compute_pairwise_similarities(genres, n_items):
    out = np.zeros((n_items, n_items))
    
    for i in range(n_items):
        for j in prange(n_items):
            if i != j:
                out[i, j] = jaccard(genres[i], genres[j])
                
    return out

In [47]:
sims_mat = compute_pairwise_similarities(genres_encoded, movies)

## Compute prediction

In [31]:
@njit
def predict(u, i, sims_mat, ratings_mat, k):
    sim_items_rated_by_u = [
        (sims_mat[i, j], ratings_mat[u, j])
        for j in range(ratings_mat.shape[1]) 
        if ratings_mat[u, j] != 0
    ]
    
    num = 0
    den = 0
    for sim_ij, r_uj in sorted(sim_items_rated_by_u, reverse=True)[:k]:
        num += (sim_ij*r_uj)
        den += sim_ij
        
    return num/(den+1e-15)

In [32]:
predict(0, 10, sims_mat, train_ratings_mat, 4)

3.4117647058823506

In [33]:
@njit(parallel=True)
def predict_batch(ui_mat, sims_mat, ratings_mat, k):
    predictions = np.zeros(ui_mat.shape[0])
    
    for idx in prange(ui_mat.shape[0]):
        u, i = ui_mat[idx]
        predictions[idx] = predict(u, i, sims_mat, ratings_mat, k)
        
    return np.clip(predictions, 1, 5)

In [34]:
@njit(parallel=True)
def rmse(pred, expected):
    return np.sqrt(np.sum((pred-expected)**2)/pred.shape[0])

## Choose best $k$

In [43]:
val_uir = val_set.values[:, :-1]

In [74]:
min_error = np.inf
best_k = 1
for k in tqdm(range(99, 200)):
    val_preds = predict_batch(val_uir[:, :-1], sims_mat, train_ratings_mat, k=k)

    err = rmse(val_preds, val_uir[:, -1])
    if min_error > err:
        min_error = err
        best_k = k
    
print(best_k, min_error)

  0%|          | 0/101 [00:00<?, ?it/s]

199 1.0169192637504492


# Fit on total data

In [48]:
total_ratings_mat = generate_ratings_mat(train_csv.values[:,:-1], users, movies)

## Predict on test data

In [55]:
test_ui = test_csv.values[:, 1:3]

In [71]:
test_preds = predict_batch(test_ui, sims_mat, total_ratings_mat, k=best_k)

In [72]:
out_csv = pd.DataFrame.from_dict({
    "id": test_csv["id"].tolist(),
    "rating": test_preds
})

out_csv.head()

Unnamed: 0,id,rating
0,0,3.774648
1,1,3.9
2,2,4.0
3,3,4.0
4,4,3.833333


In [73]:
out_csv.to_csv("out_fbc_knn_generos_2.csv", index=False)