In [1]:
import numpy as np
import pandas as pd
from numba import njit, prange
from numpy.typing import NDArray
from tqdm.notebook import tqdm

import cupy as cp

In [2]:
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [3]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Fixed seed so the 80/20 split (and every number derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

# Split into train and validation; the timestamp column is not used afterwards.
ratings_all = train_csv.drop(columns=["timestamp"])
train_data = ratings_all.sample(frac=0.8, random_state=SPLIT_SEED)
validation_data = ratings_all.drop(index=train_data.index)

# The two parts must partition the original rows exactly.
assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [4]:
@njit
def build_matrix_jit(data: NDArray[np.integer], n_users: int, n_items: int):
    """
    Build a dense user x item ratings matrix from (user, item, rating) rows.

    data: Integer array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
        col 2: rating
    n_users: Largest user_id allowed in `data`.
    n_items: Largest item_id allowed in `data`.

    Returns a uint8 array of shape (n_users+1, n_items+1) where entry
    [user_id, item_id] holds the rating; pairs without a rating stay 0.
    If a (user_id, item_id) pair occurs more than once, the last row wins.

    Raises ValueError if a user_id or item_id lies outside
    [0, n_users] / [0, n_items].
    """
    ratings_mat = np.zeros((n_users+1, n_items+1), dtype=np.uint8)

    for n in range(data.shape[0]):
        u = data[n, 0]
        i = data[n, 1]
        # Compiled code does not bounds-check array indexing by default, so an
        # out-of-range id would silently write outside the matrix.
        if u < 0 or u > n_users or i < 0 or i > n_items:
            raise ValueError("user_id or item_id outside the range given by n_users/n_items")
        ratings_mat[u, i] = data[n, 2]

    return ratings_mat

# Matrix dimensions passed to build_matrix_jit (it allocates one extra row and
# column). Presumably the largest user_id / item_id in the data -- TODO confirm.
users = 3974
movies = 3564

# Dense ratings matrix built from the training split only.
train_entries = train_data.to_numpy()
ratings_mat = build_matrix_jit(train_entries, users, movies)

## Compute predictions

In [5]:
# Copy the host ratings matrix to the GPU and print its shape as a sanity check.
ratings_mat_gpu = cp.asarray(ratings_mat)
print(ratings_mat_gpu.shape)

(3975, 3565)


In [7]:
# Persist the dense training matrix; np.save appends ".npy", so this writes "mat.npy".
np.save("mat", ratings_mat)

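The SVD factorises the ratings matrix as $R = U \Sigma V^T$, with:

$U$: $|users| \times |features|$\
$\Sigma$: $|features| \times |features|$\
$V^T$: $|features| \times |items|$ (the third array returned by `svd`, named `V` in the code)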

## Fitting best $k$

In [6]:
@njit(nogil=True, parallel=True)
def predict_batch(X: NDArray[int], reconstructed_matrix: NDArray[np.float64]):
    """
    Look up the predicted rating for every (user, item) pair in X.

    X: Array of size (n_entries, 2) with columns representing:
        col 0: user_id
        col 1: item_id
    reconstructed_matrix: Array indexed as [user_id, item_id] holding the
        predicted ratings.

    Returns a float64 array of length n_entries with every prediction
    clipped to the range [1, 5].
    """
    n_entries = len(X)
    raw = np.zeros(n_entries, dtype=np.float64)

    # Each row is looked up independently, so the loop is a prange.
    for row in prange(n_entries):
        user_id, movie_id = X[row]
        raw[row] = reconstructed_matrix[user_id, movie_id]

    return np.clip(raw, 1, 5)

In [15]:
@njit(parallel=True)
def calc_error(preds, expected):
    """
    Root-mean-square error between two equally long 1-D sequences.

    preds: Predicted values.
    expected: Ground-truth values, aligned element-wise with `preds`.

    Returns sqrt(mean((preds - expected)**2)).
    """
    n_samples = len(preds)
    squared_error_sum = 0
    # `+=` on a scalar inside prange is a parallel sum reduction.
    for idx in prange(n_samples):
        diff = preds[idx] - expected[idx]
        squared_error_sum += diff**2

    return np.sqrt(squared_error_sum/n_samples)

In [16]:
# Validation (user_id, item_id) pairs and the ratings they should predict.
data_arr = validation_data.drop(["rating"], axis=1).values
expected_arr = validation_data["rating"].values

ratings_mat_gpu = cp.asarray(ratings_mat.astype(np.float64))
# Reduced SVD: only the first min(n_rows, n_cols) singular vectors are ever
# used below, so there is no need to compute the full square U and V.
# The third return value is V^T (rows are components).
U, S, V = cp.linalg.svd(ratings_mat_gpu, full_matrices=False)

K = len(S)
# Start from +inf so ranks that were never evaluated (k < 2, or everything
# after an interrupted run) can never look like the best score.
errors = np.full(K, np.inf)

# Only the validation entries are ever read from the reconstruction, and the
# rank-(k+1) prediction equals the rank-k prediction plus the single term
# S[k] * U[user, k] * V[k, item]. Keeping a running sum for just those entries
# replaces a full matrix product and a full device-to-host copy per k.
val_users = cp.asarray(data_arr[:, 0])
val_items = cp.asarray(data_arr[:, 1])
running = cp.zeros(len(data_arr), dtype=cp.float64)
for j in range(min(2, K)):
    running += S[j] * U[val_users, j] * V[j, val_items]

for k in tqdm(range(2, K)):
    # At this point `running` is the rank-k prediction (components 0..k-1).
    # Clip to the valid rating range, as predict_batch does.
    predicted = np.clip(cp.asnumpy(running), 1, 5)

    errors[k] = calc_error(predicted, expected_arr)

    # Add component k so the next iteration sees the rank-(k+1) prediction.
    running += S[k] * U[val_users, k] * V[k, val_items]

  0%|          | 0/3563 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [24]:
# Pick the rank with the lowest validation RMSE. Entries equal to 0 are
# skipped, and so is anything that is not strictly below +inf (inf or NaN).
# Ties go to the smallest index; with no candidate at all the result is
# (inf, 0).
scored_ranks = ((e, i) for i, e in enumerate(errors) if e != 0 and e < np.inf)
mini, idx = min(scored_ranks, default=(np.inf, 0))

print(mini, idx)

2.4738258440299004 25


## Predict

In [25]:
# Refit on all labelled data (train + validation) for the final predictions.
total_ratings = build_matrix_jit(train_csv.drop(["timestamp"], axis=1).values, users, movies)
# Cast to float64 before the decomposition, as in the validation fit, so the
# selected rank is applied to a factorisation computed in the same precision.
total_ratings_gpu = cp.asarray(total_ratings.astype(np.float64))

# Reduced SVD is enough: only the leading components are kept.
U_total, S_total, V_total = cp.linalg.svd(total_ratings_gpu, full_matrices=False)

# Truncate at the rank with the lowest validation RMSE (`idx`). The loop
# variable `k` from the rank sweep must not be used here: it only holds
# whichever rank the sweep happened to stop at.
best_k = idx
assert best_k >= 1, "no valid rank was selected; run the rank sweep first"
Uk_total, Sk_total, Vk_total = U_total[:, :best_k], S_total[:best_k], V_total[:best_k]
total_reconstructed = cp.asnumpy(Uk_total@cp.diag(Sk_total)@Vk_total)

In [26]:
# After dropping "id" and "timestamp", the remaining columns are passed to
# predict_batch as the (user_id, item_id) pairs it expects.
test_pairs = test_csv.drop(columns=["id", "timestamp"]).values
test_predictions = predict_batch(test_pairs, total_reconstructed)

In [27]:
# Submission table: one predicted rating per test-row id.
submission_columns = {
    "id": list(test_csv["id"]),
    "rating": test_predictions,
}
out_df = pd.DataFrame(submission_columns)

out_df.head()

Unnamed: 0,id,rating
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0


In [30]:
# Write the submission file; index=False keeps the row index out of the CSV.
out_df.to_csv("out_svd_1.csv", index=False)