In [1]:
import numpy as np
from numba import njit, prange
from numba.typed import Dict, List
import pandas as pd

from tqdm.notebook import tqdm

In [2]:
# Fixed seed so the train/validation split (and every metric derived from it)
# is the same on each Restart & Run All.
SPLIT_SEED = 42
ID_COLS = ["user_id", "movie_id"]

train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Shift both id columns down by one so they can be used directly as
# 0-based array indices (vectorised instead of an element-wise apply).
train_csv[ID_COLS] = train_csv[ID_COLS] - 1
test_csv[ID_COLS] = test_csv[ID_COLS] - 1

# 80 % of the rows go to training, the remaining rows form the validation set.
train_set = train_csv.sample(frac=.8, axis=0, random_state=SPLIT_SEED)
val_set = train_csv.drop(train_set.index, axis=0)

# The two parts must partition the labelled data exactly.
assert train_csv.shape[0] == train_set.shape[0] + val_set.shape[0]

In [3]:
# Hard-coded sizes passed below as n_users / n_items.
# Presumably (largest 0-based id + 1) for each axis — TODO confirm against the data.
users, movies = 3974, 3564

## Load genres

In [4]:
movies_csv = pd.read_csv("../../data/movies_data.csv")
# Shift movie ids down by one so they line up with the 0-based ids used for the ratings.
movies_csv["movie_id"] = movies_csv["movie_id"] - 1
movies_csv.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [5]:
# Genre names in the order of their column index in the genre encoding
# ("Action" -> 0, ..., "Western" -> 17).
genre_names = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]
genres_raw_dict = {name: idx for idx, name in enumerate(genre_names)}

# Copy the mapping into a numba typed Dict, entry by entry.
genres_map = Dict()
for name, idx in genres_raw_dict.items():
    genres_map[name] = idx

In [6]:
@njit(parallel=True)
def generate_genres_encoding(ids, genres, n_items, genres_map):
    """Build a multi-hot genre matrix.

    ids and genres are iterated in lockstep: each entry of ``genres`` is a
    "|"-separated string of genre names belonging to the row index given by
    the matching entry of ``ids``. ``genres_map`` maps a genre name to its
    column index.

    Returns an array of shape (n_items, len(genres_map)) holding 1 where the
    item has the genre and 0 elsewhere.
    """
    n_genres = len(genres_map)
    encoding = np.zeros((n_items, n_genres))

    for row, joined in zip(ids, genres):
        for name in joined.split("|"):
            encoding[row, genres_map[name]] = 1

    return encoding

# matrix (n_items, n_features)
# The pandas columns are converted to numba typed Lists before being handed
# to the jitted encoder.
movie_ids = List(movies_csv["movie_id"].tolist())
movie_genres = List(movies_csv["genres"].tolist())
genres_encoded = generate_genres_encoding(movie_ids, movie_genres, movies, genres_map)

## Train model

In [7]:
@njit
def fit(train_uir, features, n_users, n_items, α=0.05, λ=0.002, n_iters=20):
    """Fit one linear model per user on fixed item features.

    The prediction for a pair (u, i) is
    ``dot(profiles[u], features[i]) + biases[u]``. Parameters are updated one
    sample at a time, in the order the rows appear in ``train_uir`` (no
    shuffling), for ``n_iters`` full passes.

    Parameters
    ----------
    train_uir : each row is unpacked as (u, i, r); u indexes ``profiles`` and
        ``biases``, i indexes ``features``, r is the target value.
    features : 2-D array whose second dimension is the number of features;
        rows are indexed by i.
    n_users : number of rows allocated for ``profiles`` and ``biases``.
    n_items : not used in the body.
    α : step size applied to every update.
    λ : weight of the ``profiles[u]`` shrinkage term; biases are not shrunk.
    n_iters : number of passes over ``train_uir``.

    Returns
    -------
    (profiles, biases) with shapes (n_users, n_features) and (n_users, 1).

    Notes
    -----
    Profiles start from normal(0, 0.1) draws and no seed is set in this
    function, so results may differ between runs.
    """
    n_features = features.shape[1]

    profiles = np.random.normal(0, .1, (n_users, n_features))
    biases = np.zeros((n_users, 1))
    
    for it in range(n_iters):
        for u, i, r in train_uir:
            # Signed prediction error; a length-1 array because biases[u] has shape (1,).
            e_ui = (np.dot(profiles[u], features[i]) + biases[u]) - r
            
            # Step on the profile using the error computed above, plus λ-weighted shrinkage.
            profiles[u] -= α*(e_ui*features[i] + λ*profiles[u])
            # Bias step uses the same (pre-update) error and has no shrinkage term.
            biases[u] -= α*e_ui
            
    return profiles, biases

In [8]:
# Fit with the default α, λ and n_iters on the training split; the last
# column of train_set is dropped before the rows are handed to fit.
train_fitted = fit(
    train_uir=train_set.values[:, :-1],
    features=genres_encoded,
    n_users=users,
    n_items=movies,
)

## Predictions

In [9]:
train_profiles, train_biases = train_fitted
# Spot check: predicted value for user 0 on item 10.
print(train_profiles[0] @ genres_encoded[10] + train_biases[0])

[2.94606727]


In [10]:
def predict_batch(ui_mat, features_mat, profiles, biases):
    """Predict a score for each (user, item) pair listed in ``ui_mat``.

    Parameters
    ----------
    ui_mat : 2-D integer array; column 0 holds user indices and column 1 item
        indices. Any further columns are ignored.
    features_mat : array of shape (n_items, n_features).
    profiles : array of shape (n_users, n_features).
    biases : array broadcastable to (n_users, n_items), e.g. (n_users, 1).

    Returns
    -------
    1-D array with one prediction per row of ``ui_mat``:
    ``profiles[u] . features_mat[i] + bias(u, i)``.

    Only the requested pairs are evaluated; the full (n_users x n_items)
    prediction matrix is never materialised.
    """
    user_idx = ui_mat[:, 0]
    item_idx = ui_mat[:, 1]

    # Row-wise dot product between each selected profile and item feature vector.
    scores = np.einsum("ij,ij->i", profiles[user_idx], features_mat[item_idx])

    # Zero-copy broadcast view, so any bias shape that was valid for
    # `profiles @ features_mat.T + biases` is still accepted.
    bias_view = np.broadcast_to(biases, (profiles.shape[0], features_mat.shape[0]))

    return scores + bias_view[user_idx, item_idx]

In [11]:
@njit(parallel=True)
def rmse(pred, expected):
    """Root-mean-square error of ``pred`` against ``expected``.

    The squared differences are summed and divided by ``pred.shape[0]``
    before the square root is taken.
    """
    residual = pred - expected
    squared_sum = np.sum(residual**2)
    return np.sqrt(squared_sum/pred.shape[0])

In [12]:
# Validation rows as a plain array, with the frame's last column dropped.
val_uir = val_set.to_numpy()[:, :-1]

In [13]:
# Score the validation pairs with the model fitted on the training split.
preds_val = predict_batch(
    ui_mat=val_uir,
    features_mat=genres_encoded,
    profiles=train_profiles,
    biases=train_biases,
)

# The last column of val_uir holds the expected values.
rmse(preds_val, val_uir[:, -1])

1.0616414711878002

## Choose best params

In [14]:
# Inputs reused by every grid-search iteration: the training rows (last
# frame column dropped) and the validation targets (last column of val_uir).
uir_train = train_set.to_numpy()[:, :-1]
expected_val = val_uir[..., -1]

In [15]:
# Exhaustive grid search over learning rate, shrinkage weight and epoch
# count, keeping the combination with the lowest validation RMSE.
lr_grid = np.linspace(0.005, 0.1, num=10)
λ_grid = np.linspace(0.001, 0.01, num=10)
epoch_grid = range(20, 50, 10)

min_error = np.inf
best_params = [0.005, 0.001, 20]
for lr in tqdm(lr_grid):
    for λ in tqdm(λ_grid):
        for epochs in epoch_grid:
            # Every combination is fitted from scratch on the training split.
            fitted_profiles, fitted_biases = fit(
                uir_train, genres_encoded, users, movies, lr, λ, n_iters=epochs
            )
            val_preds = predict_batch(val_uir, genres_encoded, fitted_profiles, fitted_biases)
            error = rmse(val_preds, expected_val)

            # Strict improvement only, so ties keep the earliest combination.
            if error < min_error:
                min_error, best_params = error, [lr, λ, epochs]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [22]:
# Lowest validation RMSE found by the grid search, followed by the
# [lr, λ, epochs] combination that produced it.
print(min_error, best_params)

1.0237881078960265 [0.005, 0.01, 40]


## Fit on the full training data

In [16]:
# Refit on every labelled row (training + validation splits together).
# best_params is ordered [α, λ, n_iters], which matches the order of fit's
# trailing parameters, so it can be unpacked positionally.
total_profiles, total_biases = fit(
    train_csv.values[:, :-1], genres_encoded, users, movies, *best_params
)

## Predict on test data

In [17]:
# (user, movie) index pairs for the test rows. The columns are selected by
# name rather than by position, so the result does not depend on the column
# order of the CSV or on the dtypes of its other columns.
test_ui = test_csv[["user_id", "movie_id"]].to_numpy()

In [18]:
# Score the test pairs with the model fitted on all labelled data.
test_preds = predict_batch(
    ui_mat=test_ui,
    features_mat=genres_encoded,
    profiles=total_profiles,
    biases=total_biases,
)

In [19]:
# Submission table: one predicted rating per test row id.
submission_columns = {
    "id": test_csv["id"].tolist(),
    "rating": test_preds,
}
out_csv = pd.DataFrame(submission_columns)

out_csv.head()

Unnamed: 0,id,rating
0,0,3.346339
1,1,3.83958
2,2,3.751718
3,3,3.751718
4,4,3.320649


In [20]:
# Write the submission file without the DataFrame index column.
out_csv.to_csv(path_or_buf="out_linearcbf_generos_1.csv", index=False)