In [1]:
import numpy as np
import pandas as pd

from numba import jit, prange
from numpy.typing import NDArray
from typing import Tuple

from typing import List

from tqdm.notebook import tqdm

For colaborative filtering, we will only be using the ratings data

In [2]:
train_csv = pd.read_csv("../../data/train_data.csv")
test_csv = pd.read_csv("../../data/test_data.csv")

In [3]:
print(train_csv.shape)
train_csv.head()

(535784, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1160,5,974769817
1,1,1129,3,974769817
2,1,3328,4,974769817
3,1,2659,2,974769817
4,1,980,3,974769817


## Split into train and validation

In [4]:
train_data = train_csv.drop(["timestamp"], axis=1).sample(frac=0.8)
validation_data = train_csv.drop(train_data.index).drop(["timestamp"], axis=1)

assert train_data.shape[0] + validation_data.shape[0] == train_csv.shape[0]

In [5]:
train_data.head()

Unnamed: 0,user_id,movie_id,rating
13216,174,2312,4
451103,3261,3145,4
395798,2919,980,5
151721,1382,3311,4
459109,3316,1464,2


## Build train matrix

In [6]:
def build_matrix(df, n_users, n_items):
    ratings_mat = np.ones((n_users+1, n_items+1), dtype=np.int8)*-1

    for _, row in df.iterrows():
        u = row["user_id"]
        i = row["movie_id"]
        r = row["rating"]

        ratings_mat[u, i] = r
        
    return ratings_mat

# train_mat = build_matrix(train_data, 3974, 3564)

In [7]:
@jit(nopython=True)
def build_matrix_jit(data: NDArray[np.uint8], n_users: int, n_items: int):
    """
    data: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
        col 2: rating
    """
    ratings_mat = np.ones((n_users+1, n_items+1), dtype=np.int8)*-1

    for row in data:
        u, i, r = row
        ratings_mat[u, i] = r
        
    return ratings_mat

In [8]:
users = 3974
movies = 3564

ratings_mat = build_matrix_jit(train_data.values, users, movies)

## Compute predictions

In [9]:
@jit(nopython=True, parallel=True)
def get_means(ratings: np.array, eps=1e-15) -> List[int]:
    means = np.zeros(ratings.shape[1])
    counts = np.zeros(ratings.shape[1])
    
    for i in prange(ratings.shape[1]):
        for u in range(ratings.shape[0]):
            if ratings[u, i] != -1:
                means[i] += ratings[u, i]
                counts[i] += 1
                
    return means/(counts+eps)

@jit(nopython=True)
def sim(i: int, j: int, ratings: np.array, means: np.array, eps=1e-15) -> float:
    user_idxs = []
    
    # Get users (indexes) that rated both items i and j
    for u in range(ratings.shape[0]):
        if ratings[u, i] != -1 and ratings[u, j] != -1:
            user_idxs.append(u)
    
    # Calculate similarity        
    n_ui, n_uj, cov_ui_uj = 0, 0, 0
    for u in user_idxs:
        r_ui_r = ratings[u, i] - means[i]
        r_uj_r = ratings[u, j] - means[j]
        
        cov_ui_uj += (r_ui_r * r_uj_r)
        n_ui += r_ui_r**2
        n_uj += r_uj_r**2
        
    return cov_ui_uj/(np.sqrt(n_ui)*np.sqrt(n_uj) + eps)

@jit(nopython=True)
def pred(u: int, i: int, k: int, ratings: np.array, means: np.array, eps=1e-15) -> float:
    similarities = []
    for j in range(ratings.shape[1]):
        if i != j and ratings[u, j] != -1:
            similarities.append((sim(i, j, ratings, means), ratings[u, j]))
            
    numerator, denominator = 0, 0
    for similarity_ij, rating_uj in sorted(similarities, reverse=True)[:k]:
        numerator += similarity_ij*rating_uj
        denominator += similarity_ij
    
    return int(round(numerator/(denominator + eps)))

In [10]:
print(validation_data.shape)
validation_data.head()

(107157, 3)


Unnamed: 0,user_id,movie_id,rating
2,1,3328,4
4,1,980,3
6,1,1099,5
8,1,32,3
9,1,2286,5


In [11]:
@jit(nopython=True, nogil=True, parallel=True)
def predict_batch(X, ratings, neighbours=5):
    """
    X: Array of size (n_entries, 3) with columns representing:
        col 0: user_id
        col 1: item_id
    """
    means = get_means(ratings)

    predictions = np.zeros(len(X), dtype=np.int8)
    
    for i in prange(len(X)):
        user_id, movie_id = X[i]
        predictions[i] = pred(user_id, movie_id, neighbours, ratings, means)
        
    return predictions

## Making predictions for different k values on the validation data

In [12]:
# predictions_per_k = []
# data_arr = validation_data.drop(["rating"], axis=1).values

# for k in tqdm(range(1, 10+1)):
#     predictions_per_k.append(
#         predict_batch(data_arr, ratings_mat, neighbours=k)
#     )

## Calculate RMSE on validation data

In [13]:
# errors = []

# for k in range(len(predictions_per_k)):
#     num = 0
#     for pred_r, real_r in zip(predictions_per_k[k], validation_data["rating"].values):
#         num += (pred_r - real_r)**2
        
#     rmse = np.sqrt(num/len(predictions_per_k[k]))
    
#     print(f"k={k+1}: {rmse=}")
#     errors.append(rmse)

## Plot rmse

In [14]:
import plotly.graph_objects as go
import numpy as np

In [15]:
# fig = go.Figure(
#     data=go.Scatter(
#         x=np.arange(1, 10+1),
#         y=errors
#     )
# )
# fig.show()

Choose $k=8$

# Test prediction

## Build matrix with all the data

In [16]:
total_ratings = build_matrix_jit(train_csv.drop(["timestamp"], axis=1).values, users, movies)

In [17]:
test_csv.head()

Unnamed: 0,id,user_id,movie_id,timestamp
0,0,5,2962,974769784
1,1,5,3177,974769768
2,2,5,3153,974769768
3,3,5,501,974769768
4,4,5,3159,974769768


## Predict test set

In [18]:
test_predictions = predict_batch(
    test_csv.drop(["id", "timestamp"], axis=1).values,
    total_ratings,
    neighbours=8
)

In [27]:
test_predictions.min()

-23

In [24]:
test_csv.iloc[3893]

id                3893
user_id           3858
movie_id          3481
timestamp    968125906
Name: 3893, dtype: int64

In [39]:
# @jit(nopython=True)
def new_pred(u: int, i: int, k: int, ratings: np.array, means: np.array, eps=1e-15) -> float:
    similarities = []
    for j in range(ratings.shape[1]):
        if i != j and ratings[u, j] != -1:
            similarities.append((sim(i, j, ratings, means), ratings[u, j], means[j]))
            
    print(similarities)
            
    numerator, denominator = 0, 0
    for similarity_ij, rating_uj, mean_j in sorted(similarities, reverse=True)[:k]:
        print(similarity_ij*(rating_uj-mean_j))
        numerator += similarity_ij*rating_uj
        denominator += similarity_ij
        
    print(numerator, denominator)
    
    return int(round(numerator/(denominator + eps)))

In [40]:
new_pred(u=3858, i=3481, k=8, ratings=total_ratings, means=get_means(total_ratings))

[(-0.21757255577325, 4, 3.270935960591133), (-0.02676400265653024, 4, 3.33953488372093), (0.32063140275313634, 3, 3.3521739130434782), (-0.06235929326451677, 5, 3.4495798319327733)]
-0.11291801575219149
-0.017676690126638576
-0.09668310594372558
-0.15862432637655666
-0.32724849178229565 0.01393555105883934


-23