In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

In [2]:
users = 3974
movies = 3564

train_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/train_data.csv")
test_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

# Split into train and validation
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [3]:
uir_train = train_data.values

uir_val = validation_data.values
n_val = uir_val.shape[0]

In [4]:
@njit
def get_items_rated_by_users(train_data, n_users):
    res = List([List.empty_list(types.int64) for _ in range(n_users)])
    
    for u, i, _ in train_data:
        res[u].append(i)
    
    for u in range(n_users):
        res[u].sort()
    
    return res

irbu = get_items_rated_by_users(uir_train, users)

In [5]:
@njit
def predict(u, i, Rus, params):
    μ, bu, bi, P, Q, Y = params
    k = P.shape[1]
    
    Ru = Rus[u]
    sqrt_Ru = np.sqrt(len(Ru))

    implicit_feedback = np.zeros(k)
    for j in Ru:
        implicit_feedback += Y[j]
    implicit_feedback /= (sqrt_Ru+1e-15)

    pred = μ + bu[u] + bi[i] + np.dot(Q[i], P[u] + implicit_feedback)
    
    return pred

In [None]:
@njit
def step(
    train_data, 
    Rus, 
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    α3,
    α4,
    λ1, 
    λ2,
    μ, bi, bu, P, Q, Y,
):
    loss = 0
    for u, i, r in train_data:
        Ru = Rus[u]
        sqrt_Ru = np.sqrt(len(Ru))

        implicit_feedback = np.zeros(k)
        for j in Ru:
            implicit_feedback += Y[j]
        implicit_feedback /= (sqrt_Ru+1e-15)

        pred = μ + bu[u] + bi[i] + np.dot(Q[i], P[u] + implicit_feedback)
        error = r - pred

        # Updating
        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α2 * (error - λ1*bi[i])

        Pu = P[u]
        Qi = Q[i]
        P[u] += α3*(error*Qi - λ2*Pu)
        Q[i] += α4*(error*(Pu+implicit_feedback) - λ2*Qi)

        term_1 = error*(Qi/(sqrt_Ru+1e-15))
        for j in Ru:
            Y[j] += α*(term_1 - λ2*Y[j])
            
        loss += error**2
            
    return np.sqrt(loss/len(train_data))

In [7]:
# RS HD page 171 (chrome), 84 book
def fit_svdpp(train_data, Rus, n_users, n_items, k, α=.01, λ1=.01, λ2=.01, n_iters=20):
    """
    train_data: array Nx3
    """
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)
    
    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))
    Y = np.random.normal(0, .1, (n_items, k))
    
    μ = np.mean(train_data[:, 2])
    
    t = trange(n_iters, leave=True)
    for it in t:
        loss = step(train_data, Rus, n_users, n_items, k, α, λ1, λ2, μ, bi, bu, P, Q, Y)
        t.set_postfix({"Loss": loss})
        α *= 0.9
    
    return μ, bu, bi, P, Q, Y

# Fit

In [8]:
fitted_params = fit_svdpp(uir_train, irbu, users, movies, k=10, α=.007, λ1=.005, λ2=.015, n_iters=3)

  0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
@njit(parallel=True, nogil=True)
def predict_batch_inner(ui_mat, Rus, params, progress_hook):
    predictions = np.zeros(len(ui_mat))
    for it in prange(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, params)
        if np.isnan(predictions[it]):
            print(u, i)
            
        progress_hook.update(1)
        
    return np.clip(predictions, 1., 5.)

def predict_batch(ui_mat, Rus, params):
    with ProgressBar(total=len(ui_mat)) as progress:
        return predict_batch_inner(ui_mat, Rus, params, progress)

In [10]:
val_preds = predict_batch(uir_val[:, :2], irbu, fitted_params)
val_expected = uir_val[:, 2]

error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
print(error)

  0%|                                                | 0/107157 [00:00<?, ?it/s]

0.9082651435110298


# Predict

In [11]:
uir_total = train_csv.drop(["timestamp"], axis=1).values
irbu_total = get_items_rated_by_users(uir_total, users)

In [13]:
fitted_final_params = fit_svdpp(uir_total, irbu_total, users, movies, k=10,α=.007, λ1=.005, λ2=.015, n_iters=30)

  0%|          | 0/30 [00:00<?, ?it/s]

In [14]:
ui_test = test_csv.drop(["id", "timestamp"], axis=1).values
test_predictions = predict_batch(ui_test, irbu_total, fitted_final_params)

print(test_predictions[:10])

  0%|                                                  | 0/3970 [00:00<?, ?it/s]

[3.32194134 3.27725764 2.94825925 3.51011877 3.01377354 2.94072361
 3.72891223 2.99116142 3.00999794 3.34803891]


In [15]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.321941
1,1,3.277258
2,2,2.948259
3,3,3.510119
4,4,3.013774


In [16]:
out_df.to_csv("out_svd++1.csv", index=False)