## Imports

In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from numba import jit, njit

# Data Preprocessing

In [18]:
# Load the raw ratings; every index label has the format rX_cY
# (X = user number, Y = movie number).
data = pd.read_csv('../data/data_train.csv', index_col=0)

# rename the target column and turn it into uint8
# (rename returns a new frame, so no in-place mutation is needed)
data = data.rename(columns={'Prediction': 'Rating'})
data['Rating'] = data['Rating'].astype('uint8')

# split each index label once into its 'rX' and 'cY' parts, then drop the
# leading letter and parse the remaining digits as the user / movie id
label_parts = data.index.str.split('_')
data['UserId'] = label_parts.str[0].str[1:].astype('int32')
data['MovieId'] = label_parts.str[1].str[1:].astype('int32')

# subtract min UserId and MovieId to get indices starting at 0
data['UserId'] = data['UserId'] - data['UserId'].min()
data['MovieId'] = data['MovieId'] - data['MovieId'].min()

# reorder columns to UserId, MovieId, Rating
data = data[['UserId', 'MovieId', 'Rating']]

# hold out 20% of the ratings for validation; the fixed seed keeps the split
# identical across re-runs
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
def get_sparse_matrix(data, n_rows, n_cols):
    """Build a CSR matrix of ratings from a long-format dataframe.

    Parameters
    ----------
    data : DataFrame
        Must provide the columns 'UserId' (row index), 'MovieId' (column
        index) and 'Rating' (stored value).
    n_rows, n_cols : int
        Shape of the resulting matrix.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix of shape (n_rows, n_cols) holding each rating at position
        (UserId, MovieId); positions without a rating are not stored.
    """
    ratings = data['Rating'].values
    user_idx = data['UserId'].values
    movie_idx = data['MovieId'].values
    coordinates = (user_idx, movie_idx)
    return csr_matrix((ratings, coordinates), shape=(n_rows, n_cols))

# Size both matrices from the full dataset rather than from train_data alone:
# the random split can put the largest UserId / MovieId into val_data only, in
# which case a shape derived from train_data would be too small and
# csr_matrix would raise on the out-of-range index.
n_rows = int(data['UserId'].max()) + 1
n_cols = int(data['MovieId'].max()) + 1
train_matrix = get_sparse_matrix(train_data, n_rows, n_cols)
val_matrix = get_sparse_matrix(val_data, n_rows, n_cols)

## Load the dataset into Surprise

In [34]:
from surprise import Dataset, SVD, BaselineOnly, Reader, NormalPredictor, KNNBasic
from surprise.model_selection import cross_validate

In [35]:
# Tell Surprise that ratings lie on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# The complete ratings frame is loaded here (not only train_data) because
# cross_validate builds its own train/test folds from this dataset.
train_dataset = Dataset.load_from_df(data[['UserId', 'MovieId', 'Rating']], reader)

# Seed SVD so repeated runs start from the same random initialisation;
# 42 matches the seed used for the train/validation split.
algo = SVD(random_state=42)
# Alternative model, kept for comparison.
# NOTE(review): the saved cross-validation output reports "KNNBasic", so it
# was produced with the line below rather than with SVD - re-run to refresh.
# algo = KNNBasic()

In [36]:
# 5-fold cross-validation of `algo` on the full dataset, reporting RMSE and
# MAE; the returned dict holds per-fold 'test_rmse', 'test_mae', 'fit_time'
# and 'test_time'.
# NOTE(review): with an integer `cv` the fold shuffling is presumably not
# seeded - pass a seeded Surprise KFold if identical folds are needed.
cv = cross_validate(
    algo,
    train_dataset,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0274  1.0294  1.0279  1.0253  1.0286  1.0277  0.0014  
MAE (testset)     0.8400  0.8409  0.8388  0.8370  0.8409  0.8395  0.0015  
Fit time          96.14   29867.18 99.78   103.99  102.19  6053.86 11906.66
Test time         526.70  26745.90 268.01  274.91  287.28  5620.56 10563.12


{'test_rmse': array([1.02743206, 1.02941419, 1.02788003, 1.02525599, 1.02855448]),
 'test_mae': array([0.83997464, 0.84089572, 0.83882355, 0.83704645, 0.84088354]),
 'fit_time': (96.14492225646973,
  29867.17919898033,
  99.78141617774963,
  103.98659014701843,
  102.18910479545593),
 'test_time': (526.6988308429718,
  26745.90246105194,
  268.01205801963806,
  274.90552020072937,
  287.2830331325531)}