## Imports

In [1]:
import os, sys, gzip
from ast import literal_eval
from itertools import product

import numpy as np
import pandas as pd

from polara import get_movielens_data
from polara.preprocessing.dataframes import leave_one_out, reindex

from scipy.sparse import coo_matrix, csr_matrix, diags
from scipy.sparse.linalg import svds, norm as spnorm

from tqdm.notebook import tqdm

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Download data

In [3]:
def amazon_data_reader(path):
    with gzip.open(path, 'rt') as gz:
        for line in gz:
            yield literal_eval(line)


In [4]:
if 'reviews_Electronics_5.json.gz' in os.listdir():
    pass
else:
    !wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz

col_names_mapping = dict(zip(
    ['reviewerID', 'asin', 'overall', 'unixReviewTime'],
    ['userid', 'itemid', 'rating', 'timestamp']
))


data_glob = pd.DataFrame.from_records(
        amazon_data_reader(f'reviews_Electronics_5.json.gz'),
        columns=['reviewerID', 'asin', 'overall', 'unixReviewTime']
).rename(columns=col_names_mapping)

## Prepare data

In [5]:
def get_random_subset(data, size, unique_users=None):
    if unique_users is None:
        unique_users = data['userid'].unique()
    user_subset = np.random.choice(unique_users, size=size)
    data = data[data['userid'].isin(user_subset)]
    return data

In [6]:
# Get data subset
# Just for making calculations faster
data = get_random_subset(data_glob, size=20000)

# Change user and item ids to be indexed by first-occurence-time index 
user_ids = data.sort_values('timestamp').userid.unique()
user_map = dict(zip(user_ids, range(len(user_ids))))
data['userid'] = data['userid'].map(user_map)

item_ids = data.sort_values('timestamp').itemid.unique()
item_map = dict(zip(item_ids, range(len(item_ids))))
data['itemid'] = data['itemid'].map(item_map)

print(data.shape)
data.head()

(168025, 4)


Unnamed: 0,userid,itemid,rating,timestamp
15,16675,35527,3.0,1377907200
20,10834,11269,5.0,1323993600
25,10267,11269,4.0,1368835200
50,12324,11269,5.0,1388620800
55,3909,11269,5.0,1390348800


### Split data

In [7]:
def train_test_val_split(data, train_size=0.5, test_size=0.2):
    assert train_size < 1 - test_size, "test data should not overlap train data"

    train_ts = data['timestamp'].quantile(train_size)
    test_ts = data['timestamp'].quantile(1 - test_size)

    train = data.query(f"timestamp <= @train_ts")
    val = data.query(f"timestamp > @train_ts and timestamp <= @test_ts")
    val = val.sort_values('timestamp')
    test = data.query(f"timestamp > @test_ts")
    return train, val, test

In [10]:
train, val, test = train_test_val_split(data, train_size=.5, test_size=.2)

train.shape, val.shape, test.shape

((84074, 4), (50351, 4), (33600, 4))

## Experiments

In [11]:
def algorithm(data, holdout):
    pass


def calc_metrics(preds, holdout):
    pass

In [20]:
max_user_id = train.userid.max()
val.query('userid <= @max_user_id').shape, val.shape

((35264, 4), (50351, 4))

In [12]:
def get_validation_subset(val, start_idx=0, bs=2048):
    return val.iloc[start_idx: start_idx + bs]
    #return val.iloc[start_idx: ] - in case of getting all the validation data
    

def prepare_holdout(val_set, train_set):
    # Get first interaction for each user in validation set
    holdout = val_set.drop_duplicates(subset='userid', keep='first')
    
    # We do not predict for user having 0 interactions
    max_user_id = train_set.userid.max()
    holdout = holdout.query("userid <= @max_user_id")
    if holdout.shape[0] == 0:
        warnings.warn("WARNING: holdout has no valid user for predicting")
    return holdout

In [13]:
def update_train(train, val):
    return pd.concat([train, val], axis=0)

In [14]:
def get_sparse_matrix(data, sparse_format='csr'):
    # Extract required data 
    user_idx = data['userid'].values
    item_idx = data['itemid'].values
    ratings = data['rating'].values

    n_users = data['userid'].nunique()
    n_items = data['itemid'].nunique()
    shape = (n_users, n_items)
        
    # Create a sparse user-item interaction matrix of specified format
    sparse_matrix_foos = {
        'csr': csr_matrix,
        'coo': coo_matrix
    }
    sparse_foo = sparse_matrix_foos[sparse_format] 
    user_item_mtx = sparse_foo((ratings, (user_idx, item_idx)), shape=shape, dtype='float64')
    return user_item_mtx

In [16]:
# Add validation data batch by batch
bs = 2048
start_idx = 0
cur_train = train.copy()

metrics_arr = []

for i in tqdm(range(val.shape[0] // bs)):
    # get current validation data
    cur_val = get_validation_subset(val, start_idx, bs)
    # turn it into a holdout
    holdout = prepare_holdout(cur_val, cur_train)

    # apply algorithms, get predictions
    preds = algorithm(cur_train, holdout)
    # calculate metrics
    metrics = calc_metrics(preds, holdout)
    metrics_arr.append(metrics)
    
    # update train data
    cur_train = update_train(cur_train, cur_val)
    
    start_idx += bs

  0%|          | 0/24 [00:00<?, ?it/s]