In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the project's `src` package imported below) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [112]:
import os
# Walk up the directory tree until the working-directory path no longer
# contains "notebooks" -- presumably so the `src.*` imports and the relative
# `data/` paths used later resolve from the repository root.
while True:
    if 'notebooks' not in os.getcwd():
        break
    os.chdir("..")

import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
from statistics import mean
from tqdm import tqdm

from src.mapping import get_movies_id_map
from src.collaborative_filtering.preprocessing.make_dataset \
    import read_ratings_df

## Getting User x Movie review matrix

In [6]:
# Load the ratings through the project helper. The cells below rely on the
# result exposing `userID`, `movieID` and `rating` columns.
ratings_df = read_ratings_df()

In [8]:
# Distinct user and movie identifiers found in the ratings table.
users, movies = (
    ratings_df[id_column].unique()
    for id_column in ('userID', 'movieID')
)

In [9]:
# Build the user x movie matrix with one row per entry of `users` and one
# column per entry of `movies`.
#
# Note: handing `ratings_df['rating']` to the DataFrame constructor as `data=`
# does not work here. pandas treats a named Series as a single column called
# "rating", and since that label is not among the movie IDs requested via
# `columns=`, every cell comes out NaN. Pivoting the long table and then
# re-labelling both axes yields the intended matrix.
ratings_matrix = (
    ratings_df
    .pivot(index='userID', columns='movieID', values='rating')
    .reindex(index=users, columns=movies)
)

In [13]:
# Reshape the long ratings table into a wide matrix: one row per user, one
# column per movie, the rating as the cell value (NaN where the pair has no
# rating).
pivot_axes = {
    'index': 'userID',
    'columns': 'movieID',
    'values': 'rating',
}
ratings_matrix = ratings_df.pivot(**pivot_axes)

ratings_matrix

movieID,1,2,3,4,5,6,7,8,9,10,...,64997,64999,65006,65011,65037,65088,65091,65126,65130,65133
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
75,,,1.0,,,,,,,,...,,,,,,,,,,
78,,,,,,,,,,,...,,,,,,,,,,
127,,,,,,,,,,,...,,,,,,,,,,
170,3.0,2.0,,,,,,,,3.5,...,,,,,,,,,,
175,4.0,,,,,5.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71497,5.0,3.5,,,,,,,,,...,,,,,,,,,,
71509,4.0,,,,1.5,,,,,,...,,,,,,,,,,
71525,,,,,,,,,,,...,,,,,,,,,,
71529,4.5,,2.0,,,,,,,,...,,,,,,,,,,


### Matrix density

In [18]:
# Total number of cells in the user x movie matrix (rows * columns).
row_count, column_count = ratings_matrix.shape
number_of_entries = row_count * column_count

In [22]:
# Count the missing cells: sum the null mask per column, then across columns.
number_of_nulls = (
    ratings_matrix
    .isna()
    .sum()
    .sum()
)

# Density is the share of cells that hold a rating. Multiplying it back by the
# cell count recovers the number of ratings (up to float rounding).
density = 1 - number_of_nulls / number_of_entries
(density, density * number_of_entries)
 

(0.04005549168582101, 855598.0000000012)

Approximately 96% of the dataframe consists of null values.

## Train-test split

Here, the `train-test split` is done per user rather than over the dataset as a whole:

* For each user, 30% of their ratings are held out for testing
* The remaining 70% of their ratings are used for training

In [114]:
# The training matrix starts as a full copy of the ratings; the entries chosen
# for testing are overwritten with 0 by the split loop.
train_matrix = ratings_matrix.copy()

# The test matrix starts all-zero with the same labels. It is filled with
# 0.0 (float) rather than 0 (int): the ratings written into it are floats
# such as 3.5, and writing those into int64 columns forces a per-column
# upcast (deprecated in recent pandas) and leaves the frame with mixed dtypes.
test_matrix = pd.DataFrame(
    data=0.0,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

In [115]:
TEST_SIZE = 0.3
SEED = 42  # fixed seed so the split is identical on every Restart & Run All

rng = np.random.default_rng(SEED)

# Work on plain NumPy arrays: row-by-row `.loc` writes on a frame this wide
# are very slow, whereas array indexing is cheap. The frames are rebuilt from
# the arrays at the end, which also makes this cell safe to re-run.
ratings_values = ratings_matrix.to_numpy(dtype=float)
train_values = ratings_values.copy()        # NaN = unrated, 0 = held out
test_values = np.zeros_like(ratings_values)  # 0 = not part of the test set

for row_pos in tqdm(range(ratings_values.shape[0])):
    user_ratings = ratings_values[row_pos]
    rated_cols = np.flatnonzero(~np.isnan(user_ratings))

    # int() floors, so a user with fewer than 1 / TEST_SIZE ratings
    # contributes nothing to the test set.
    test_cols = rng.choice(
        rated_cols,
        size=int(TEST_SIZE * rated_cols.size),
        replace=False,
    )

    # Move the sampled ratings from the training matrix to the test matrix.
    train_values[row_pos, test_cols] = 0
    test_values[row_pos, test_cols] = user_ratings[test_cols]

train_matrix = pd.DataFrame(
    train_values,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)
test_matrix = pd.DataFrame(
    test_values,
    index=ratings_matrix.index,
    columns=ratings_matrix.columns,
)

2113it [05:51,  6.01it/s]


In [124]:
# Persist the training split, storing unrated cells (NaN) as 0.
(
    train_matrix
    .replace(to_replace=np.nan, value=0)
    .to_pickle("data/collaborative-filtering/train_matrix.pkl")
)

In [123]:
# Persist the test split; cells that were not selected for testing hold 0.
test_matrix.to_pickle("data/collaborative-filtering/test_matrix.pkl")

In [132]:
# Number of movies each user has NOT rated (NaN cells per row).
ratings_matrix.isna().sum(axis="columns")

userID
75       10054
78        9641
127      10076
170      10026
175       9833
         ...  
71497     9859
71509     8953
71525     9695
71529    10008
71534     9939
Length: 2113, dtype: int64

In [141]:
# From here on the in-memory training matrix also uses 0 for "no rating".
train_matrix = train_matrix.replace(to_replace=np.nan, value=0)

In [142]:
# Per-user count of non-zero cells in each split (0 marks "no rating here").
non_null_train = train_matrix.ne(0).sum(axis=1)
non_null_test = test_matrix.ne(0).sum(axis=1)

userID
75       0.709091
78       0.700855
127      0.727273
170      0.710843
175      0.702899
           ...   
71497    0.700000
71509    0.700692
71525    0.700483
71529    0.702970
71534    0.700000
Length: 2113, dtype: float64

In [143]:
# Share of each user's ratings that ended up in the training split.
non_null_train.div(ratings_matrix.notna().sum(axis=1))

userID
75       0.709091
78       0.700855
127      0.727273
170      0.710843
175      0.702899
           ...   
71497    0.700000
71509    0.700692
71525    0.700483
71529    0.702970
71534    0.700000
Length: 2113, dtype: float64

In [144]:
# Share of each user's ratings that ended up in the test split.
non_null_test.div(ratings_matrix.notna().sum(axis=1))

userID
75       0.290909
78       0.299145
127      0.272727
170      0.289157
175      0.297101
           ...   
71497    0.300000
71509    0.299308
71525    0.299517
71529    0.297030
71534    0.300000
Length: 2113, dtype: float64