In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

In [2]:
users = 3974
movies = 3564

train_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/train_data.csv")
test_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

# Split into train and validation
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [3]:
uir_train = train_data.values

uir_val = validation_data.values
n_val = uir_val.shape[0]

In [4]:
@njit
def get_items_rated_by_users(train_data, n_users):
    res = List([List.empty_list(types.int64) for _ in range(n_users)])
    
    for u, i, _ in train_data:
        res[u].append(i)
    
    for u in range(n_users):
        res[u].sort()
    
    return res

def get_irbu_matrix_with_len(irbu, n_users, n_items):
    out = np.ones((n_users, n_items), dtype=int)*-1
    len_irbu = np.zeros(n_users)

    for u in range(n_users):
        len_irbu[u] = len(irbu[u])
        for idx, i in enumerate(irbu[u]):
            out[u, idx] = i
            
    return out, len_irbu

irbu, len_irbu = get_irbu_matrix_with_len(get_items_rated_by_users(uir_train, users), users, movies)

In [5]:
@njit
def step(
    train_data, 
    Rus, 
    len_Rus,
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    λ1, 
    λ2,
    μ, bi, bu, P, Q, Y,
):
    loss = 0
    for u, i, r in train_data:
        Ru = Rus[u]
        sqrt_Ru = np.sqrt(len_Rus[u])

        implicit_feedback = np.zeros(k)
        for j in Ru:
            if j == -1:
                break
            implicit_feedback += Y[j]
        implicit_feedback /= (sqrt_Ru+1e-15)

        pred = μ + bu[u] + bi[i] + np.dot(Q[i], P[u] + implicit_feedback)
        error = r - pred

        # Updating
        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α2 * (error - λ1*bi[i])

        Pu = P[u]
        Qi = Q[i]
        P[u] += α3*(error*Qi - λ2*Pu)
        Q[i] += α4*(error*(Pu+implicit_feedback) - λ2*Qi)

        term_1 = error*(Qi/(sqrt_Ru+1e-15))
        for j in Ru:
            if j == -1:
                break
            Y[j] += α5*(term_1 - λ2*Y[j])
            
        loss += error**2
            
    return np.sqrt(loss/len(train_data))

In [11]:
# RS HD page 171 (chrome), 84 book
def fit_svdpp(train_data, val_data, Rus, len_Rus, n_users, n_items, k, α1=.01, α2=.01, α3=.01, α4=.01, α5=.01, λ1=.01, λ2=.01, n_iters=20):
    """
    train_data: array Nx3
    """
    val_ui = uir_val[:, :2]
    val_exp = uir_val[:, -1]
    
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)
    
    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))
    Y = np.random.normal(0, .1, (n_items, k))
    
    μ = np.mean(train_data[:, 2])
    
    t = trange(n_iters, leave=True)
    for it in t:
        loss = step(train_data, Rus, len_Rus, n_users, n_items, k, α1, α2, α3, α4, α5, λ1, λ2, μ, bi, bu, P, Q, Y)
#         α1 *= 0.9
#         α2 *= 0.9
#         α3 *= 0.9
#         α4 *= 0.9
#         α5 *= 0.9
        
        val_preds = predict_batch(val_ui, Rus, len_Rus, (μ, bu, bi, P, Q, Y))
        val_loss = np.sqrt(1/n_val * np.sum((val_preds - val_exp)**2))
        t.set_postfix({"Loss": loss, "Val": val_loss})
    
    return μ, bu, bi, P, Q, Y

# Fit

In [7]:
@njit
def predict(u, i, Rus, len_Rus, params):
    μ, bu, bi, P, Q, Y = params
    k = P.shape[1]
    
    Ru = Rus[u]
    sqrt_Ru = np.sqrt(len_Rus[u])

    implicit_feedback = np.zeros(k)
    for j in Ru:
        if j == -1:
            break
        implicit_feedback += Y[j]
    implicit_feedback /= (sqrt_Ru+1e-15)

    pred = μ + bu[u] + bi[i] + np.dot(Q[i], P[u] + implicit_feedback)
    
    return pred

In [8]:
@njit(parallel=True, nogil=True)
def predict_batch_inner(ui_mat, Rus, len_Rus, params, progress_hook):
    predictions = np.zeros(len(ui_mat))
    for it in prange(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, len_Rus, params)
        if np.isnan(predictions[it]):
            print(u, i)
            
        progress_hook.update(1)
        
    return np.clip(predictions, 1., 5.)

def predict_batch_progress_bar(ui_mat, Rus, len_Rus, params):
    with ProgressBar(total=len(ui_mat)) as progress:
        return predict_batch_inner(ui_mat, Rus, len_Rus, params, progress)
    
@njit(parallel=True, nogil=True)
def predict_batch(ui_mat, Rus, len_Rus, params):
    predictions = np.zeros(len(ui_mat))
    for it in prange(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, len_Rus, params)
        
    return np.clip(predictions, 1., 5.)

In [12]:
# lr = .007
# α1 = α2 = α3 = α4 = α5 = lr

α1 = 0.005
α2 = 0.005
α3 = 0.006
α4 = 0.006
α5 = 0.006
k = 5
fitted_params = fit_svdpp(
    uir_train, uir_val, irbu, len_irbu, users, movies, k, 
    α1, α2, α3, α4, α5, λ1=.02, λ2=.01,
    n_iters=30,
)

  0%|          | 0/30 [00:00<?, ?it/s]

In [13]:
val_preds = predict_batch(uir_val[:, :2], irbu, len_irbu, fitted_params)
val_expected = uir_val[:, 2]

error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
print(error)

0.8680973589568941


## Multiple train

In [None]:
from joblib import Parallel, delayed
from itertools import product

In [None]:
num = 5
alphas = np.linspace(0.003, 0.007, num=num)
lamb_1 = np.linspace(.004, .006, num=3)
lamb_2 = np.linspace(.015, .02, num=2)

params_product = list(product(alphas, alphas, lamb_1, lamb_2))

In [None]:
def train_val(
    uir_train, 
    arr_irbu,
    users, 
    movies,
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    λ1, 
    λ2, 
    n_iters,
    uir_val,
):
    fitted_params = fit_svdpp(uir_train, arr_irbu, users, movies, k, α1, α2, α3, α4, α5,λ1, λ2, n_iters)
    
    val_preds = predict_batch(uir_val[:, :2], arr_irbu, fitted_params)
    val_expected = uir_val[:, 2]
    error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
    
    return μ, α1, α2, α3, α4, λ1, λ2, error

In [None]:
out = Parallel(n_jobs=12)(
    delayed(train_val)(
        uir_train, 
        irbu,
        users, 
        movies,
        k=5, 
        α1=lr1,
        α2=lr1,
        α3=lr2,
        α4=lr2,
        α5=lr2,
        λ1=lamb1, 
        λ2=lamb2, 
        n_iters=30,
        uir_val=uir_val,
    )
    for lr1, lr2, lamb1, lamb2 in tqdm(params_product)
)

In [None]:
print(sorted(out, key=lambda x: x[-1]))

# Predict

In [14]:
uir_total = train_csv.drop(["timestamp"], axis=1).values
irbu_total, len_irbu_total = get_irbu_matrix_with_len(get_items_rated_by_users(uir_total, users), users, movies)

In [26]:
fitted_final_params = fit_svdpp(
    uir_total,
    uir_val,
    irbu_total, 
    len_irbu_total,
    users, 
    movies, 
    k=5, 
    α1=.005, 
    α2=.005, 
    α3=.006, 
    α4=.006, 
    α5=.006, 
    λ1=.02,
    λ2=.01, 
    n_iters=50
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [27]:
ui_test = test_csv.drop(["id", "timestamp"], axis=1).values
test_predictions = predict_batch(ui_test, irbu_total, len_irbu_total, fitted_final_params)

print(test_predictions[:10])

[3.10562099 3.4309013  2.80811193 3.54225531 3.27101877 3.0597957
 3.84120687 3.06299574 2.88759045 3.19954349]


In [28]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.105621
1,1,3.430901
2,2,2.808112
3,3,3.542255
4,4,3.271019


In [29]:
out_df.to_csv("out_svd++5.csv", index=False)