In [1]:
import pandas as pd
import numpy as np
from numba import njit, prange

from tqdm.notebook import tqdm, trange

from numba.typed import List
from numba import types

from numba_progress import ProgressBar

### Load Ratings

In [2]:
users = 3974
movies = 3564

train_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/train_data.csv")
test_csv = pd.read_csv("/home/nubol23/Desktop/Codes/USP/SCC5966/kaggle/data/test_data.csv")

train_csv["user_id"] = train_csv["user_id"].apply(lambda x: x - 1)
train_csv["movie_id"] = train_csv["movie_id"].apply(lambda x: x - 1)

test_csv["user_id"] = test_csv["user_id"].apply(lambda x: x - 1)
test_csv["movie_id"] = test_csv["movie_id"].apply(lambda x: x - 1)

# Split into train and validation
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [3]:
uir_train = train_data.values

uir_val = validation_data.values
n_val = uir_val.shape[0]

In [4]:
@njit
def get_items_rated_by_users(train_data, n_users):
    res = List([List.empty_list(types.int64) for _ in range(n_users)])
    
    for u, i, _ in train_data:
        res[u].append(i)
    
    for u in range(n_users):
        res[u].sort()
    
    return res

def get_irbu_matrix_with_len(irbu, n_users, n_items):
    out = np.ones((n_users, n_items), dtype=int)*-1
    len_irbu = np.zeros(n_users)

    for u in range(n_users):
        len_irbu[u] = len(irbu[u])
        for idx, i in enumerate(irbu[u]):
            out[u, idx] = i
            
    return out, len_irbu

irbu, len_irbu = get_irbu_matrix_with_len(get_items_rated_by_users(uir_train, users), users, movies)

### Load genres

In [5]:
movies_csv = pd.read_csv("../../data/movies_data.csv")
movies_csv["movie_id"] = movies_csv["movie_id"].apply(lambda x: x - 1)
movies_csv.head()

Unnamed: 0,movie_id,title,genres
0,0,Toy Story (1995),Animation|Children's|Comedy
1,1,Jumanji (1995),Adventure|Children's|Fantasy
2,2,Grumpier Old Men (1995),Comedy|Romance
3,3,Waiting to Exhale (1995),Comedy|Drama
4,4,Father of the Bride Part II (1995),Comedy


In [6]:
genres_map = {
    "Action": 0,
    "Adventure": 1,
    "Animation": 2,
    "Children's": 3,
    "Comedy": 4,
    "Crime": 5,
    "Documentary": 6,
    "Drama": 7,
    "Fantasy": 8,
    "Film-Noir": 9,
    "Horror": 10,
    "Musical": 11,
    "Mystery": 12,
    "Romance": 13,
    "Sci-Fi": 14,
    "Thriller": 15,
    "War": 16,
    "Western": 17,
}

In [13]:
def get_genres_item_belongs_to(ids, genres, n_items):
    out = List([List.empty_list(types.int64) for _ in range(n_items)])
    
    for movie_id, genre_list in zip(ids, genres):
        genre_list = genre_list.split("|")
        for genre in genre_list:
            out[movie_id].append(genres_map[genre])
                   
    return out

def get_gibt_matrix_with_len(genres_encoded, n_users, n_items):
    gibt = np.ones((n_items, len(genres_map)), dtype=int)*-1
    len_gibt = np.zeros(n_items)
    for i in range(n_items):
        len_gibt[i] = len(genres_encoded[i])
        for idx, g in enumerate(genres_encoded[i]):
            gibt[i, idx] = g
            
    return gibt, len_gibt
            
gibt, len_gibt = get_gibt_matrix_with_len(
    get_genres_item_belongs_to(
        movies_csv["movie_id"].tolist(), movies_csv["genres"].tolist(), movies
    ), users, movies,
)

## Train

In [32]:
@njit
def step(
    train_data, 
    Rus,
    len_Rus,
    Gis,
    len_Gis,
    n_users, 
    n_items, 
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    α6,
    λ1, 
    λ2,
    λ3,
    λ4,
    λ5,
    λ6,
    μ, bi, bu, P, Q, Y, X,
):
    loss = 0
    for u, i, r in train_data:
        Ru = Rus[u]
        sqrt_Ru = np.sqrt(len_Rus[u])
        implicit_feedback = np.zeros(k)
        for j in Ru:
            if j == -1:
                break
            implicit_feedback += Y[j]
        implicit_feedback /= (sqrt_Ru+1e-15)
        
        Gi = Gis[i]
        genres_feedback = np.zeros(k)
        for g in Gi:
            if g == -1:
                break
            genres_feedback += X[g]
        genres_feedback /= len_Gis[i]
        

        pred = μ + bu[u] + bi[i] + np.dot(Q[i] + genres_feedback, P[u] + implicit_feedback)
        error = r - pred

        # Updating
        bu[u] += α1 * (error - λ1*bu[u])
        bi[i] += α2 * (error - λ2*bi[i])

        Pu = P[u]
        Qi = Q[i]
        P[u] += α3*(error*Qi - λ3*Pu)
        Q[i] += α4*(error*(Pu+implicit_feedback) - λ4*Qi)
    
        term_1 = error*(1/len_Gis[i])*(Pu + implicit_feedback)
        for g in Gi:
            if g == -1:
                break
            X[g] += α6*(term_1 - λ5*X[g])
            
        term_2 = error*(1/sqrt_Ru)*(Qi + genres_feedback)
        for j in Ru:
            if j == -1:
                break
            Y[j] += α5*(term_2 - λ6*Y[j])
            
        loss += error**2
            
    return np.sqrt(loss/len(train_data))

In [33]:
# RS HD page 171 (chrome), 84 book
def fit_svdpp(
    train_data, Rus, len_Rus, Gis, len_Gis, n_users, n_items, k, 
    α1=.01, α2=.01, α3=.01, α4=.01, α5=.01, α6=.01, 
    λ1=.01, λ2=.01, λ3=.01, λ4=.01, λ5=.01, λ6=.01, n_iters=20
):
    """
    train_data: array Nx3
    """
    bu = np.zeros(n_users, np.double)
    bi = np.zeros(n_items, np.double)
    
    P = np.random.normal(0, .1, (n_users, k))
    Q = np.random.normal(0, .1, (n_items, k))
    Y = np.random.normal(0, .1, (n_items, k))
    X = np.random.normal(0, .1, (len(genres_map), k))
    
    μ = np.mean(train_data[:, 2])
    
    t = trange(n_iters, leave=True)
    for it in t:
        loss = step(
            train_data, Rus, len_Rus, Gis, len_Gis, n_users, n_items, k, 
            α1, α2, α3, α4, α5, α6, 
            λ1, λ2, λ3, λ4, λ5, λ6, μ, bi, bu, P, Q, Y, X
        )
        t.set_postfix({"Loss": loss})
#         α1 *= 0.9
#         α2 *= 0.9
#         α3 *= 0.9
#         α4 *= 0.9
    
    return μ, bu, bi, P, Q, Y, X

# Fit

In [43]:
# lr = .007

fitted_params = fit_svdpp(
    uir_train, irbu, len_irbu, gibt, len_gibt, users, movies, k=10, 
    α1=0.005, α2=0.005, α3=0.006, α4=0.006, α5=0.005, α6=0.005, 
    λ1=0.02, λ2=0.02, λ3=.015, λ4=.01, λ5=.01, λ6=.01, 
    n_iters=40
)

  0%|          | 0/40 [00:00<?, ?it/s]

In [35]:
@njit
def predict(u, i, Rus, len_Rus, Gis, len_Gis, params):
    μ, bu, bi, P, Q, Y, X = params
    k = P.shape[1]
    
    Ru = Rus[u]
    sqrt_Ru = np.sqrt(len_Rus[u])

    implicit_feedback = np.zeros(k)
    for j in Ru:
        if j == -1:
            break
        implicit_feedback += Y[j]
    implicit_feedback /= (sqrt_Ru+1e-15)
    
    Gi = Gis[i]
    genres_feedback = np.zeros(k)
    for g in Gi:
        if g == -1:
            break
        genres_feedback += X[g]
    genres_feedback /= len_Gis[i]

    pred = μ + bu[u] + bi[i] + np.dot(Q[i] + genres_feedback, P[u] + implicit_feedback)
    
    return pred

In [36]:
@njit(parallel=True, nogil=True)
def predict_batch_inner(ui_mat, Rus, params, progress_hook):
    predictions = np.zeros(len(ui_mat))
    for it in prange(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, params)
        if np.isnan(predictions[it]):
            print(u, i)
            
        progress_hook.update(1)
        
    return np.clip(predictions, 1., 5.)

def predict_batch_progress_bar(ui_mat, Rus, params):
    with ProgressBar(total=len(ui_mat)) as progress:
        return predict_batch_inner(ui_mat, Rus, params, progress)
    
def predict_batch(ui_mat, Rus, len_Rus, Gis, len_Gis, params):
    predictions = np.zeros(len(ui_mat))
    for it in prange(ui_mat.shape[0]):
        u, i = ui_mat[it]
        predictions[it] = predict(u, i, Rus, len_Rus, Gis, len_Gis, params)
        
    return np.clip(predictions, 1., 5.)

In [44]:
val_preds = predict_batch(uir_val[:, :2], irbu, len_irbu, gibt, len_gibt, fitted_params)
val_expected = uir_val[:, 2]

error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
print(error)

0.8794862643065426


## Multiple train

In [None]:
from joblib import Parallel, delayed
from itertools import product

In [None]:
num = 5
alphas = np.linspace(0.003, 0.007, num=num)
lamb_1 = np.linspace(.004, .006, num=3)
lamb_2 = np.linspace(.015, .02, num=2)

params_product = list(product(alphas, alphas, lamb_1, lamb_2))

In [None]:
def train_val(
    uir_train, 
    arr_irbu,
    users, 
    movies,
    k, 
    α1,
    α2,
    α3,
    α4,
    α5,
    λ1, 
    λ2, 
    n_iters,
    uir_val,
):
    fitted_params = fit_svdpp(uir_train, arr_irbu, users, movies, k, α1, α2, α3, α4, α5,λ1, λ2, n_iters)
    
    val_preds = predict_batch(uir_val[:, :2], arr_irbu, fitted_params)
    val_expected = uir_val[:, 2]
    error = np.sqrt(1/n_val * np.sum((val_preds - val_expected)**2))
    
    return α1, α2, α3, α4, λ1, λ2, error

In [None]:
out = Parallel(n_jobs=12)(
    delayed(train_val)(
        uir_train, 
        irbu,
        users, 
        movies,
        k=5, 
        α1=lr1,
        α2=lr1,
        α3=lr2,
        α4=lr2,
        α5=lr2,
        λ1=lamb1, 
        λ2=lamb2, 
        n_iters=30,
        uir_val=uir_val,
    )
    for lr1, lr2, lamb1, lamb2 in tqdm(params_product)
)

In [None]:
print(sorted(out, key=lambda x: x[-1]))

# Predict

In [45]:
uir_total = train_csv.drop(["timestamp"], axis=1).values
irbu_total, len_irbu_total = get_irbu_matrix_with_len(get_items_rated_by_users(uir_total, users), users, movies)

In [49]:
fitted_final_params = fit_svdpp(
    uir_total, irbu_total, len_irbu_total, gibt, len_gibt, users, movies, k=6, 
    α1=0.005, α2=0.005, α3=0.006, α4=0.006, α5=0.005, α6=0.005, 
    λ1=0.02, λ2=0.02, λ3=.015, λ4=.01, λ5=.01, λ6=.01, 
    n_iters=50
)

  0%|          | 0/50 [00:00<?, ?it/s]

In [51]:
ui_test = test_csv.drop(["id", "timestamp"], axis=1).values
test_predictions = predict_batch(ui_test, irbu_total, len_irbu_total, gibt, len_gibt, fitted_final_params)

print(test_predictions[:10])

[2.88917054 3.38749789 2.9163412  3.57492143 3.8933387  2.95660216
 3.68966146 3.4268573  3.0660798  3.26501636]


In [52]:
out_df = pd.DataFrame.from_dict(
    {
        "id": list(test_csv["id"]),
        "rating": test_predictions
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,2.889171
1,1,3.387498
2,2,2.916341
3,3,3.574921
4,4,3.893339


In [53]:
out_df.to_csv("out_gsvd++1.csv", index=False)