In [1]:
import numpy as np
import pandas as pd
from numba import njit, prange
from numpy.typing import NDArray

In [2]:
# Print floating-point arrays in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

## Load data

In [3]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

# Split into train and validation (80% / 20%).
# A fixed random_state makes the split, and therefore every metric computed
# from it, identical across kernel restarts.
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8, random_state=42)
# Validation set = every row of train_csv whose index label was not sampled above.
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

# The two parts must account for every row of the original frame.
assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [4]:
@njit
def build_matrix_jit(data: NDArray[np.integer], n_users: int, n_items: int):
    """
    Build a dense ratings matrix from (user, item, rating) triples.

    data: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
        col 2: rating
    n_users: largest user_id that may appear in `data`.
    n_items: largest item_id that may appear in `data`.

    Returns a uint8 array of shape (n_users + 1, n_items + 1) so that ids can
    be used directly as indices. Cells without a triple stay 0; if a
    (user, item) pair occurs more than once, the last occurrence wins.

    Raises IndexError when a user_id or item_id falls outside the matrix.
    """
    ratings_mat = np.zeros((n_users + 1, n_items + 1), dtype=np.uint8)

    for row in data:
        u, i, r = row
        # Numba does not bounds-check array indexing by default, so an id
        # outside the matrix would write out of bounds (or wrap around for a
        # negative id) without any error. Fail loudly instead.
        if u < 0 or u > n_users or i < 0 or i > n_items:
            raise IndexError("user_id or item_id outside the ratings matrix")
        ratings_mat[u, i] = r

    return ratings_mat

# Hard-coded matrix dimensions; presumably the largest user id and movie id
# present in the dataset (TODO confirm against the CSV files).
users = 3974
movies = 3564

# Dense ratings matrix built from the training split only.
ratings_mat = build_matrix_jit(train_data.values, users, movies)
## Compute predictions

## Functions

In [5]:
# Toy ratings matrix used by the sanity checks below: one row per user, one
# column per item, with 0 standing for "not rated" (row 3, column 1).
test_mat = np.array(
    [
        [5, 2, 4, 3, 2, 3],  # user 0
        [4, 3, 5, 4, 3, 2],  # user 1
        [1, 5, 3, 4, 4, 5],  # user 2
        [1, 0, 2, 3, 4, 2],  # user 3
    ]
)

In [6]:
@njit(nogil=True, parallel=True)
def global_nonzer_mean(ratings: NDArray[np.uint8]) -> float:
    """
    Mean of all non-zero entries of `ratings`.

    ratings: 2-D ratings matrix in which 0 is skipped (treated as "no rating").

    Returns the sum of the non-zero entries divided by their count. The count
    and the sum are scalar accumulators updated inside the parallel row loop.
    A matrix without any non-zero entry leads to a division by zero.
    """
    n_rows, n_cols = ratings.shape
    rated_count = 0
    rating_total = 0

    for u in prange(n_rows):
        for i in range(n_cols):
            value = ratings[u, i]
            if value != 0:
                rated_count += 1
                rating_total += value

    return rating_total / rated_count

# Sanity check on the toy matrix: 23 non-zero entries summing to 74, and 74 / 23 ≈ 3.217391.
assert np.round(global_nonzer_mean(test_mat), 6) == 3.217391

In [7]:
@njit(nogil=True, parallel=True)
def item_biases(ratings: NDArray[np.uint8], λ: float, μ: float) -> NDArray[np.float64]:
    """
    Regularised per-item bias.

    For every item, sums (rating - μ) over the users who rated it and divides
    by (λ + number of those users).

    ratings: (n_users, n_items) matrix in which 0 means "not rated".
    λ: regularisation term added to each item's rating count.
    μ: global mean rating subtracted from every observed rating.

    Returns a float64 array of length n_items.
    """
    n_users, n_items = ratings.shape

    n_users_rated_items = np.zeros(n_items)
    biases_per_item = np.zeros(n_items)

    # The parallel loop must run over items, not users: the accumulators are
    # indexed by item, so parallelising over users would let several threads
    # update the same element at once (a data race that silently corrupts the
    # counts and sums). Here each prange iteration owns exactly one index `i`.
    for i in prange(n_items):
        n_raters = 0.0
        residual_sum = 0.0
        for u in range(n_users):
            rating = ratings[u, i]
            if rating != 0:
                n_raters += 1
                residual_sum += rating - μ
        n_users_rated_items[i] = n_raters
        biases_per_item[i] = residual_sum

    return 1/(λ + n_users_rated_items) * biases_per_item
    
# Sanity check: with λ = 0 each bias is the item's mean observed rating minus
# the global mean of the toy matrix.
assert np.allclose(
    item_biases(test_mat, 0, global_nonzer_mean(test_mat)),
    [-0.4673913, 0.1159420, 0.2826087, 0.2826087, 0.0326087, -0.2173913],
)

In [8]:
@njit(nogil=True, parallel=True)
def user_biases(ratings: NDArray[np.uint8], λ: float, item_biases: NDArray[np.float64], μ: float) -> NDArray[np.float64]:
    """
    Regularised per-user bias.

    For every user, sums (rating - μ - item bias) over the items that user
    rated and divides by (λ + number of those items).

    ratings: (n_users, n_items) matrix in which 0 means "not rated".
    λ: regularisation term added to each user's rating count.
    item_biases: per-item biases, indexed by item column.
    μ: global mean rating.

    Returns a float64 array of length n_users.
    """
    n_users, n_items = ratings.shape

    rated_counts = np.zeros(n_users)
    residual_sums = np.zeros(n_users)

    # Each parallel iteration only touches its own index `u`.
    for u in prange(n_users):
        for i in range(n_items):
            rating = ratings[u, i]
            if rating != 0:
                rated_counts[u] += 1
                residual_sums[u] += (rating - μ - item_biases[i])

    return 1/(λ + rated_counts) * residual_sums

# Sanity check of the user biases on the toy matrix (λ = 0).
bi = item_biases(test_mat, 0, global_nonzer_mean(test_mat))
assert np.allclose(
    user_biases(test_mat, 0, bi, global_nonzer_mean(test_mat)),
    [-0.05555556, 0.27777778, 0.44444444, -0.80000000],
)

In [9]:
@njit
def predict(μ: float, bi: NDArray[np.float64], bu: NDArray[np.float64], u, i):
    """
    Baseline prediction for one (user, item) pair.

    μ: global mean rating.
    bi: per-item biases, indexed by item id.
    bu: per-user biases, indexed by user id.
    u, i: user index and item index.

    Returns μ + bi[i] + bu[u].
    """
    item_offset = bi[i]
    user_offset = bu[u]
    return μ + item_offset + user_offset

# Fit the baseline on the toy matrix without regularisation (λ = 0).
test_mean = global_nonzer_mean(test_mat)
test_item_biases = item_biases(test_mat, 0, test_mean)
test_user_biases = user_biases(test_mat, 0, test_item_biases, test_mean)

# User 3 / item 1 is the one unrated cell of the toy matrix.
assert round(
    predict(test_mean, test_item_biases, test_user_biases, u=3, i=1),
    6,
) == 2.533333

## Fit parameters

In [10]:
# Fit the baseline on the training split. The tiny λ leaves the biases
# practically unregularised while keeping the denominator non-zero for rows
# and columns that have no ratings.
train_mean = global_nonzer_mean(ratings_mat)
train_item_biases = item_biases(
    ratings_mat,
    λ=1e-15,
    μ=train_mean,
)
train_user_biases = user_biases(
    ratings_mat,
    λ=1e-15,
    item_biases=train_item_biases,
    μ=train_mean,
)

In [28]:
@njit(nogil=True, parallel=True)
def predict_batch(X: NDArray[int], μ: float, bi: NDArray[np.float64], bu: NDArray[np.float64]):
    """
    Baseline predictions for many (user, item) pairs at once.

    X: Array of size (n_entries, 2) with columns representing:
        col 0: user_id
        col 1: item_id
    μ: global mean rating.
    bi: per-item biases, indexed by item_id.
    bu: per-user biases, indexed by user_id.

    Returns a float64 array of length n_entries, clipped to the range [1, 5].
    """
    n_entries = len(X)
    raw_scores = np.zeros(n_entries, dtype=np.float64)

    # Every iteration writes only its own output slot.
    for row in prange(n_entries):
        uid, iid = X[row]
        raw_scores[row] = predict(μ, bi, bu, uid, iid)

    return np.clip(raw_scores, 1, 5)

## Test on validation set

In [29]:
# Drop the "rating" column from the validation split before predicting.
data_arr = validation_data.drop(columns=["rating"]).to_numpy()

predictions = predict_batch(
    data_arr,
    train_mean,
    train_item_biases,
    train_user_biases,
)

In [30]:
# Root-mean-squared error of the validation predictions, computed with
# vectorised numpy operations instead of a Python-level loop.
num = np.sum((predictions - validation_data["rating"].values) ** 2)

rmse = np.sqrt(num/len(predictions))
print(rmse)

0.9078230917608796


## Fit on all data and predict test set

In [31]:
# Refit the baseline on every labelled row (train + validation).
total_ratings = build_matrix_jit(
    train_csv.drop(columns=["timestamp"]).to_numpy(),
    users,
    movies,
)

total_mean = global_nonzer_mean(total_ratings)
total_item_biases = item_biases(
    total_ratings,
    λ=1e-15,
    μ=total_mean,
)
total_user_biases = user_biases(
    total_ratings,
    λ=1e-15,
    item_biases=total_item_biases,
    μ=total_mean,
)

# Predict the test rows from whatever columns remain after dropping "id" and "timestamp".
test_predictions = predict_batch(
    test_csv.drop(columns=["id", "timestamp"]).to_numpy(),
    total_mean,
    total_item_biases,
    total_user_biases,
)

In [34]:
# Pair every test row id with its predicted rating.
out_df = pd.DataFrame(
    {
        "id": test_csv["id"].tolist(),
        "rating": test_predictions,
    }
)

out_df.head()

Unnamed: 0,id,rating
0,0,3.387973
1,1,3.240358
2,2,2.74283
3,3,3.613519
4,4,2.986986


In [35]:
# Write the id/rating predictions to CSV, leaving out the DataFrame index column.
out_df.to_csv("out_baseline_2.csv", index=False)