In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

import math

In [2]:
# Dataset dimensions, used below to size the per-user, per-item and per-tag
# structures and parameter arrays.
# NOTE(review): presumably produced by the preprocessing step — confirm they
# still match the CSVs if the data is regenerated.
users = 226570
items = 231637
genres = 552

# Training / validation tables; later cells read their rows as
# (user, item, rating) triples.
train_data = pd.read_csv("../../Preprocessing/processed_dataframes/train.csv")
validation_data = pd.read_csv("../../Preprocessing/processed_dataframes/val.csv")

In [3]:
# NumPy array of the training frame; rows are consumed as (user, item, rating).
uir_train = train_data.values

# Same for the validation frame.
uir_val = validation_data.values
# Number of validation rows, used when averaging the squared validation error.
n_val = uir_val.shape[0]

In [4]:
@njit
def get_items_rated_by_users(train_data, n_users):
    """Group the item ids of the training rows by user.

    train_data: rows unpacked as (user, item, <ignored>).
    n_users: number of per-user lists to allocate.

    Returns a typed list with one int64 typed list per user; each inner list
    holds the items found for that user, sorted in ascending order.
    """
    items_by_user = List([List.empty_list(types.int64) for _ in range(n_users)])

    # Walk the rows by position and file every item under its user.
    for row in range(train_data.shape[0]):
        user, item, _ = train_data[row]
        items_by_user[user].append(item)

    # Sort each user's items in place.
    for user in range(n_users):
        items_by_user[user].sort()

    return items_by_user

# Per-user sorted lists of the item ids that appear in the training rows.
irbu = get_items_rated_by_users(uir_train, users)

In [5]:
import scipy.sparse as sparse

# Sparse item/tag matrix; later cells treat rows as items and columns as tags.
tags_mat = sparse.load_npz("../../Preprocessing/objects/tags_matrix.npz")

In [6]:
def get_tags_item_belongs_to(tags_mat, n_items):
    """Collect, for every item, the column indices of its nonzero tag entries.

    tags_mat: matrix exposing ``nonzero()``; rows index items, columns index tags.
    n_items: number of per-item lists to allocate.

    Returns a typed list with one int64 typed list per item, filled in the
    order in which ``tags_mat.nonzero()`` reports the entries.
    """
    tags_by_item = List([List.empty_list(types.int64) for _ in range(n_items)])

    item_ids, tag_ids = tags_mat.nonzero()
    for pos in range(len(item_ids)):
        tags_by_item[item_ids[pos]].append(tag_ids[pos])

    return tags_by_item

# Per-item lists of tag (column) indices with a nonzero entry in tags_mat.
tibt = get_tags_item_belongs_to(tags_mat, items)

In [7]:
def get_count_known_ratings_of_item(train_data, n_items):
    """Count how many training rows reference each item.

    train_data: 2-D integer array whose second column holds the item id.
    n_items: length of the returned vector.

    Returns a float64 array ``res`` of shape (n_items,) where ``res[i]`` is
    the number of rows whose item id equals ``i``. An empty ``train_data``
    yields all zeros. An item id >= n_items raises IndexError.
    """
    res = np.zeros((n_items,))

    rows = np.asarray(train_data)
    if rows.size == 0:
        # Nothing to count; also avoids slicing a column out of a 1-D empty array.
        return res

    # Vectorised accumulation: np.add.at adds once per occurrence, so repeated
    # item ids are all counted (plain fancy-index `+=` would count them once).
    np.add.at(res, rows[:, 1], 1)

    return res

# Ris: number of training rows per item, indexed by item id
kri = get_count_known_ratings_of_item(uir_train, items)

In [8]:
def get_count_known_items_of_tag(tags_mat, n_tags):
    """Count the nonzero entries in every tag column of ``tags_mat``.

    tags_mat: matrix exposing ``nonzero()``, laid out as n_items x tags.
    n_tags: length of the returned vector.

    Returns a float64 array ``res`` of shape (n_tags,) where ``res[g]`` is the
    number of rows (items) with a nonzero entry in column ``g``. A nonzero
    entry in a column >= n_tags raises IndexError.
    """
    # Tags mat n_items x tags
    res = np.zeros((n_tags,))

    # The column index of each nonzero entry is its tag id, so no transpose is
    # needed. np.add.at adds once per occurrence, so repeated tags are counted.
    tag_ids = tags_mat.nonzero()[1]
    np.add.at(res, tag_ids, 1)

    return res

# Ggs: number of items with a nonzero entry per tag column of tags_mat
kit = get_count_known_items_of_tag(tags_mat, genres)

In [9]:
@njit
def step(
    train_data, 
    Rus,
    Gis,
    Ris, # Njs
    Ggs,
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    μ, bi, bu, P, Q, Y, X,
):
    """Run one SGD pass over the training rows and return the training RMSE.

    For each (u, i, r) row the prediction is

        μ + bu[u] + bi[i]
          + (Q[i] + mean of X[g] for g in Gis[i])
            · (P[u] + sum of Y[j] for j in Rus[u] / sqrt(len(Rus[u])))

    and ``bu``, ``bi``, ``P``, ``Q``, ``Y`` and ``X`` are updated IN PLACE.

    train_data: rows unpacked as (user, item, rating).
    Rus: Rus[u] is the collection of item ids used as implicit feedback for u.
    Gis: Gis[i] is the collection of tag ids of item i (may be empty).
    Ris: Ris[j] scales the regularisation of bi, Q and Y as 1/sqrt(Ris[j]).
    Ggs: Ggs[g] scales the regularisation of X[g] as 1/sqrt(Ggs[g]).
    n_users, n_items: not used in the body; kept for the existing call signature.
    k: length of the latent-factor vectors.
    α1: learning rate for the biases.
    α2: learning rate for P, Q, Y and X.
    μ: global offset added to every prediction.

    Returns sqrt(sum of squared errors / number of rows); each error is
    measured before that row's update is applied.
    """
    loss = 0.0
    for u, i, r in train_data:
        Ru = Rus[u]
        n_rated = len(Ru)
        sqrt_Ru = np.sqrt(n_rated)

        implicit_feedback = np.zeros(k)
        for j in Ru:
            implicit_feedback += Y[j]
        implicit_feedback /= (sqrt_Ru+1e-15)

        Gi = Gis[i]
        n_genres = len(Gi)
        genres_feedback = np.zeros(k)
        # An item without tags contributes a zero genre vector instead of
        # dividing by zero.
        if n_genres > 0:
            for g in Gi:
                genres_feedback += X[g]
            genres_feedback /= n_genres

        pred = μ + bu[u] + bi[i] + np.dot(Q[i] + genres_feedback, P[u] + implicit_feedback)
        error = r - pred

        # Updating
        λ1 = 0.05*1/(sqrt_Ru+1e-15)
        λ2 = 0.05*1/(np.sqrt(Ris[i])+1e-15)
        λ3 = 1/(sqrt_Ru+1e-15)
        λ4 = 1/(np.sqrt(Ris[i])+1e-15)

        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α1 * (error - λ2*bi[i])

        # Snapshot the factor rows: P[u] / Q[i] are views, so without the copy
        # the in-place updates below would change Pu / Qi and every later
        # gradient in this iteration would mix old and new values.
        Pu = P[u].copy()
        Qi = Q[i].copy()
        # NOTE(review): the prediction uses Q[i] + genres_feedback, but this
        # gradient uses only Qi — confirm against the reference update rule.
        P[u] += α2*(error*Qi - λ3*Pu)
        Q[i] += α2*(error*(Pu+implicit_feedback) - λ4*Qi)

        if n_genres > 0:
            term_1 = error*(1/n_genres)*(Pu + implicit_feedback)
            for g in Gi:
                λ5 = 1/(np.sqrt(Ggs[g])+1e-15)
                X[g] += α2*(term_1 - λ5*X[g])

        # Guarded so an empty Rus[u] cannot divide by zero.
        if n_rated > 0:
            term_2 = error*(1/sqrt_Ru)*(Qi + genres_feedback)
            for j in Ru:
                λ6 = 1/(np.sqrt(Ris[j])+1e-15)
                Y[j] += α2*(term_2 - λ6*Y[j])

        loss += error**2

    return np.sqrt(loss/len(train_data))

In [10]:
# RS HD page 171 (chrome), 84 book
def fit_gsvdpp(
    train_data, val_data, Rus, Gis, Ris, Ggs, n_users, n_items, n_tags, k, 
    α1=.01, α2=.01, n_iters=20, seed=None
):
    """
    Train the model with SGD and keep the parameters of the best validation epoch.

    train_data: array Nx3, rows read as (user, item, rating).
    val_data: array Mx3; the first two columns are passed to ``predict_batch``
        and the last column is the expected rating.
    Rus, Gis, Ris, Ggs: forwarded unchanged to ``step`` / ``predict_batch``.
    n_users, n_items, n_tags: sizes of the bias vectors and factor matrices.
    k: number of latent factors.
    α1, α2: learning rates forwarded to ``step``.
    n_iters: number of epochs.
    seed: optional seed for the factor initialisation. ``None`` (default)
        keeps drawing from NumPy's global random state.

    Returns the tuple (μ, bu, bi, P, Q, Y, X) copied at the epoch with the
    lowest validation RMSE. If no epoch improves on infinity (e.g. the
    validation loss is NaN, or n_iters is 0), the current parameters are
    returned rather than None.
    """
    # Validate on the data passed in, not on notebook-level globals.
    val_ui = val_data[:, :2]
    val_exp = val_data[:, -1]
    n_val_rows = val_data.shape[0]

    rng = np.random if seed is None else np.random.RandomState(seed)

    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)

    P = rng.normal(0, .1, (n_users, k))
    Q = rng.normal(0, .1, (n_items, k))
    Y = rng.normal(0, .1, (n_items, k))
    X = rng.normal(0, .1, (n_tags, k))

    μ = np.mean(train_data[:, 2])

    model_params = None
    prev_val_loss = math.inf

    t = trange(n_iters, leave=True)
    for it in t:
        loss = step(
            train_data, Rus, Gis, Ris, Ggs, n_users, n_items, k, 
            α1, α2, μ, bi, bu, P, Q, Y, X
        )

        val_preds = predict_batch(val_ui, Rus, Gis, (μ, bu, bi, P, Q, Y, X))
        val_loss = np.sqrt(1/n_val_rows * np.sum((val_preds - val_exp)**2))
        t.set_postfix({"Loss": loss, "Val": val_loss})

        if val_loss < prev_val_loss:
            prev_val_loss = val_loss
            # step() mutates the arrays in place, so the best epoch must be copied.
            model_params = (μ, bu.copy(), bi.copy(), P.copy(), Q.copy(), Y.copy(), X.copy())

    if model_params is None:
        # No epoch compared lower than infinity; return usable parameters
        # instead of None so callers can still predict.
        model_params = (μ, bu, bi, P, Q, Y, X)

    return model_params

## Fit

In [11]:
@njit
def predict(u, i, Rus, Gis, params):
    """Predict the (unclipped) rating of item ``i`` by user ``u``.

    u, i: user and item indices.
    Rus: Rus[u] is the collection of item ids used as implicit feedback for u.
    Gis: Gis[i] is the collection of tag ids of item i (may be empty).
    params: tuple (μ, bu, bi, P, Q, Y, X).

    Returns μ + bu[u] + bi[i] + (Q[i] + genre term) · (P[u] + implicit term).
    The genre term is the mean of X[g] over Gis[i], or zeros when the item has
    no tags. The implicit term is the sum of Y[j] over Rus[u] divided by
    sqrt(len(Rus[u])).
    """
    μ, bu, bi, P, Q, Y, X = params
    k = P.shape[1]

    Ru = Rus[u]
    sqrt_Ru = np.sqrt(len(Ru))

    implicit_feedback = np.zeros(k)
    for j in Ru:
        implicit_feedback += Y[j]
    implicit_feedback /= (sqrt_Ru+1e-15)

    Gi = Gis[i]
    n_genres = len(Gi)
    genres_feedback = np.zeros(k)
    # Without this guard an item with no tags divides 0 by 0 and the
    # prediction (and any loss built on it) becomes NaN.
    if n_genres > 0:
        for g in Gi:
            genres_feedback += X[g]
        genres_feedback /= n_genres

    pred = μ + bu[u] + bi[i] + np.dot(Q[i] + genres_feedback, P[u] + implicit_feedback)

    return pred

In [12]:
@njit(parallel=True, nogil=True)
def predict_batch(ui_mat, Rus, Gis, params):
    """Predict a rating for every (user, item) row of ``ui_mat``.

    ui_mat: 2-D array whose rows unpack as (user, item).
    Rus, Gis, params: forwarded unchanged to ``predict``.

    Returns a 1-D array with one prediction per row, clipped to [1, 5].
    """
    n_pairs = ui_mat.shape[0]
    raw_scores = np.zeros(n_pairs)

    # Rows are independent, so the loop is declared with prange.
    for row in prange(n_pairs):
        user, item = ui_mat[row]
        raw_scores[row] = predict(user, item, Rus, Gis, params)

    return np.clip(raw_scores, 1., 5.)

In [13]:
# α1 = 0.0007
# α2 = 0.01
# k = 4

# fitted_params = fit_gsvdpp(
#     uir_train, uir_val, irbu, tibt, kri, kit, users, items, genres, k, 
#     α1, α2, n_iters=25,
# )

# val_preds = predict_batch(uir_val[:, :2], irbu, tibt, fitted_params)
# val_expected = uir_val[:, 2]

# error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
# print(error)

# # ERROR 0.8987672000985515

## Multiple training runs (hyperparameter search)

In [14]:
from joblib import Parallel, delayed
from itertools import product

In [15]:
# Candidate learning rates and latent-factor counts for a full grid; only the
# commented-out `product(...)` line below reads them.
alphas = [0.005, 0.006, 0.007]
factors = [4, 5, 50, 100]

# Full grid (disabled):
# params_product = list(product(alphas, alphas, factors))
# print(params_product)
# Hand-picked (α1, α2, k) combinations that are actually trained.
params_product = [ 
    (0.007, 0.007, 100),
    (0.007, 0.007, 50), 
]

In [16]:
def train_val(
    uir_train,
    uir_val,
    users,
    movies,
    genres,
    k,
    α1,
    α2,
    irbu,
    tibt,
    kri, 
    kit,
    n_iters,
):
    """Fit one model and report its validation RMSE.

    uir_train, uir_val: arrays whose rows are read as (user, item, rating).
    users, movies, genres: number of users, items and tags, forwarded to
        ``fit_gsvdpp`` as n_users, n_items and n_tags.
    k: number of latent factors.
    α1, α2: learning rates.
    irbu, tibt, kri, kit: forwarded to ``fit_gsvdpp`` as Rus, Gis, Ris, Ggs.
    n_iters: number of training epochs.

    Prints and returns the tuple (α1, α2, k, error), where ``error`` is the
    RMSE of the clipped predictions on ``uir_val``.
    """
    # Use the `movies` argument for the item count, not a notebook-level global.
    fitted_params = fit_gsvdpp(
        uir_train, uir_val, irbu, tibt, kri, kit, users, movies, genres, k, 
        α1, α2, n_iters,
    )

    val_preds = predict_batch(uir_val[:, :2], irbu, tibt, fitted_params)
    val_expected = uir_val[:, 2]
    # Average over the rows of the validation set that was passed in.
    n_val_rows = uir_val.shape[0]
    error = np.sqrt(1/n_val_rows * np.sum((val_preds - val_expected)**2))

    print(α1, α2, k, error)
    return α1, α2, k, error

In [None]:
# Train and validate one model per (α1, α2, k) combination in params_product.
# Each entry of `out` is the (α1, α2, k, error) tuple returned by train_val.
out = [
    train_val(
        uir_train, 
        uir_val,
        users, 
        items,
        genres,
        k=f, 
        α1=lr1,
        α2=lr2,
        irbu=irbu,
        tibt=tibt,
        kri=kri,
        kit=kit,
        n_iters=25,
    )
    for lr1, lr2, f in tqdm(params_product)
]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

In [None]:
# Show the (α1, α2, k, error) tuple that sorts first by validation error
# (the last element of each tuple).
print(sorted(out, key=lambda x: x[-1])[0])