In [1]:
import re

import numpy as np 
import pandas as pd

from matplotlib import pyplot as plt

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import RandomModel, CombinedMeanModel, ItemItemCollaborationModel, RNNModel

%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
# Build (or load) the Amazon Books dataset, keeping only users with at least
# `min_user_ratings` ratings, then pull out the two splits used below.
ds = AmazonBooks(min_user_ratings=10).get_dataset(verbose=True)
train, val = ds['train'], ds['val']

loading raw data
cleaning review lenghts
removing products with no description
removing poducts with too few reviews
removing users with too few (or too many) ratings
compute test/validation user/product/cell holdout
constructing dataset split
relabelling raw users/products
saving dataset to disk


In [4]:
# Number of (user, product, rating) rows in the training split.
n_train_ratings = len(train['user_product_ratings'])
n_train_ratings

138944

### Accuracy Metrics

In [5]:
def mean_squared_error(pred, ground_truth):
    """Return the mean of the squared element-wise differences.

    Args:
        pred: predicted values (anything supporting ``pred - ground_truth``).
        ground_truth: reference values, broadcast-compatible with ``pred``.

    Returns:
        ``np.mean`` of ``(pred - ground_truth) ** 2``.
    """
    residual = pred - ground_truth
    squared_residual = residual ** 2.
    return np.mean(squared_residual)

In [6]:
def accuracy(pred, ground_truth):
    """Return the fraction of predictions that match after rounding.

    Each prediction is rounded with ``np.round`` and compared for equality
    against the corresponding ground-truth value.

    Args:
        pred: predicted values (array-like with a length).
        ground_truth: reference values aligned with ``pred``.

    Returns:
        Number of exact matches divided by ``len(pred)`` as a float.
    """
    is_hit = np.round(pred) == ground_truth
    n_total = float(len(pred))
    return is_hit.sum() / n_total

In [7]:
def evaluate_model(model, train, val, loss_fn):
    """Fit ``model`` and report its loss on the train set and on each
    validation holdout quadrant.

    The validation ratings are partitioned by whether their user and product
    appear in the training ratings:

    * A -- user and product both seen in training (cell holdout)
    * B -- unseen user, seen product (user holdout)
    * C -- seen user, unseen product (product holdout)
    * D -- unseen user and unseen product

    Args:
        model: object exposing ``fit(train, val)`` and ``predict(frame)``,
            where ``frame`` holds the ``user_id`` and ``product_id`` columns.
        train: mapping whose ``'user_product_ratings'`` entry is a DataFrame
            with ``user_id``, ``product_id`` and ``rating`` columns.
        val: mapping with the same layout as ``train``; it is not mutated
            (predictions are written to a copy).
        loss_fn: callable ``loss_fn(pred, ground_truth)`` returning a scalar.

    Returns:
        Tuple ``(val_loss, A_loss, B_loss, C_loss, D_loss)``. The loss of an
        empty quadrant is ``nan``; ``loss_fn`` is not called for it.
    """
    # The original passed an undefined name (`val_fn`) here, which raised
    # NameError on every call; the validation split is what fit() expects.
    model.fit(train, val)

    id_cols = ['user_id', 'product_id']
    train_up_rat = train['user_product_ratings']
    # Work on a copy so the caller's validation frame does not gain a column.
    val_up_rat = val['user_product_ratings'].copy()
    val_up_rat['pred'] = model.predict(val_up_rat[id_cols])

    train_users = set(train_up_rat.user_id)
    train_products = set(train_up_rat.product_id)

    def subset_loss(data):
        # An empty quadrant has no meaningful loss; avoid feeding an empty
        # input to loss_fn (division by zero / mean-of-empty warnings).
        if len(data) == 0:
            return float('nan')
        return loss_fn(data.pred, data.rating)

    train_loss = loss_fn(model.predict(train_up_rat[id_cols]),
                         train_up_rat.rating)

    # Reuse the predictions stored above instead of running the model on the
    # validation set a second time.
    val_loss = subset_loss(val_up_rat)

    # Membership masks are computed once and combined for the four quadrants.
    known_user = val_up_rat.user_id.isin(train_users)
    known_product = val_up_rat.product_id.isin(train_products)

    A_data = val_up_rat[known_user & known_product]
    A_loss = subset_loss(A_data)

    B_data = val_up_rat[~known_user & known_product]
    B_loss = subset_loss(B_data)

    C_data = val_up_rat[known_user & ~known_product]
    C_loss = subset_loss(C_data)

    D_data = val_up_rat[~known_user & ~known_product]
    D_loss = subset_loss(D_data)

    print('\n'.join(
        ['     Products',
         '    -----------',
         '   |       |   |  (A.x) TRAIN SET',
         ' U | (A.x) |   |  (A.o) VAL CELL HOLDOUT',
         ' s | (A.o) |(C)|    (B) VAL USER HOLDOUT',
         ' e |       |   |    (C) VAL PRODUCT HOLDOUT',
         ' r |       |   |    (D) VAL PRODUCT & USER HOLDOUT',
         ' s |-------|---|    (V) VAL SET',
         '   |  (B)  |(D)|',
         '    -----------',
         '================================',
         '']))

    def stats(data, loss, lbl):
        # Print the size of one subset together with its loss.
        print(lbl)
        print(f'      Number of users: {len(set(data.user_id))}')
        print(f'      Number of proucts: {len(set(data.product_id))}')
        print(f'      Number of ratings: {len(data)}')
        print(f'      loss: {loss}\n')

    stats(train_up_rat, train_loss, '(A.x) TRAIN SET')
    stats(val_up_rat, val_loss, '  (V) VAL SET')
    stats(A_data, A_loss, '(A.o) VAL CELL HOLDOUT')
    stats(B_data, B_loss, '  (B) VAL USER HOLDOUT')
    stats(C_data, C_loss, '  (C) VAL PRODUCT HOLDOUT')
    stats(D_data, D_loss, '  (D) VAL PRODUCT & USER HOLDOUT')
    return val_loss, A_loss, B_loss, C_loss, D_loss

In [None]:
# Instantiate the RNN model with its default constructor arguments and fit it
# directly, passing the validation split as the second argument to fit().
# NOTE(review): this bypasses evaluate_model, so no per-holdout loss breakdown
# is produced by this cell.
model = RNNModel()
model.fit(train, val)

copying required data ... done! (3.0 ms)
cleaning product descriptions and reviews ... done! (73.7 ms)
building product descriptions and review vocabs ... done! (238.1 ms)
building product description and review index sequence ... done! (1302.7 ms)
copying required data ... done! (2.4 ms)
cleaning product descriptions and reviews ... done! (76.3 ms)
building product description and review index sequence ... done! (1382.3 ms)
initializing description reader ... done! (6.5 ms)
initializing review reader ... done! (5.8 ms)
initializing user embeddings ... done! (0.1 ms)
using Adam optimizer with lr=0.001 ... done! (0.1 ms)
using SparseAdam optimizer with lr=0.001 ... done! (0.1 ms)
epoch 0
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0000, batch_mse = 14.82, batch_mun = 39.44, train_mse = 1.48, train_acc = 0.00%, p_norm = 552.10
tensor([ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  1,
         0,  0

tensor([ 1,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0034, batch_mse = 15.28, batch_mun = 37.75, train_mse = 14.03, train_acc = 0.00%, p_norm = 559.34
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0035, batch_mse = 13.90, batch_mun = 39.98, train_mse = 14.01, train_acc = 0.00%, p_norm = 559.62
tensor([ 0,  0,  0,  0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0036, batch_mse = 10.48, batch_mun = 37.18, train_mse = 13.66, train_acc = 0.00%, p_norm = 559.89
tensor([ 0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0037, batch_mse = 13.72, batch_mun = 43.01, train_mse = 13.67, train_acc = 0.00%, p_norm = 560.16
tensor([ 0,  0,  0,  1,  0,  1,  1,  1,  0,  0,  0,  0,  0,  0,
         1,  0], dtype=torch

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0071, batch_mse = 9.33, batch_mun = 42.40, train_mse = 11.66, train_acc = 0.00%, p_norm = 572.29
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0072, batch_mse = 12.82, batch_mun = 41.40, train_mse = 11.78, train_acc = 0.00%, p_norm = 572.72
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0073, batch_mse = 5.87, batch_mun = 42.31, train_mse = 11.18, train_acc = 0.00%, p_norm = 573.15
tensor([ 0,  1,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0074, batch_mse = 9.68, batch_mun = 36.86, train_mse = 11.03, train_acc = 0.00%, p_norm = 573.58
tensor([ 0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  1,  0,  0,  1,
         1,  0], dtype=torch.ui

tensor([ 0,  0,  0,  0,  1,  1,  1,  0,  0,  1,  1,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0108, batch_mse = 5.90, batch_mun = 38.18, train_mse = 8.24, train_acc = 0.00%, p_norm = 586.87
tensor([ 0,  0,  0,  0,  1,  0,  0,  0,  1,  0,  1,  0,  0,  0,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0109, batch_mse = 9.27, batch_mun = 39.93, train_mse = 8.34, train_acc = 0.00%, p_norm = 587.27
tensor([ 0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  1,  0,  0,  0,
         1,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0110, batch_mse = 5.43, batch_mun = 37.67, train_mse = 8.05, train_acc = 0.00%, p_norm = 587.65
tensor([ 0,  0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  1,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0111, batch_mse = 9.19, batch_mun = 38.56, train_mse = 8.16, train_acc = 0.00%, p_norm = 588.04
tensor([ 0,  0,  1,  1,  0,  1,  1,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)


tensor([ 0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  0,  1,  1,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0145, batch_mse = 6.79, batch_mun = 40.20, train_mse = 7.38, train_acc = 0.00%, p_norm = 600.81
tensor([ 0,  0,  0,  1,  0,  0,  0,  1,  1,  0,  0,  1,  0,  0,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0146, batch_mse = 6.55, batch_mun = 40.49, train_mse = 7.30, train_acc = 0.00%, p_norm = 601.22
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0147, batch_mse = 8.30, batch_mun = 39.32, train_mse = 7.40, train_acc = 0.00%, p_norm = 601.63
tensor([ 1,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0148, batch_mse = 4.49, batch_mun = 35.95, train_mse = 7.11, train_acc = 0.00%, p_norm = 602.00
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,
         1,  0], dtype=torch.uint8)


tensor([ 0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0182, batch_mse = 5.75, batch_mun = 40.73, train_mse = 6.34, train_acc = 0.00%, p_norm = 613.97
tensor([ 0,  0,  0,  0,  0,  1,  1,  1,  1,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0183, batch_mse = 6.52, batch_mun = 40.15, train_mse = 6.35, train_acc = 0.00%, p_norm = 614.27
tensor([ 0,  0,  1,  1,  0,  0,  0,  0,  0,  0,  1,  0,  0,  1,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0184, batch_mse = 6.21, batch_mun = 39.28, train_mse = 6.34, train_acc = 0.00%, p_norm = 614.57
tensor([ 0,  0,  1,  0,  0,  1,  0,  0,  0,  1,  0,  0,  1,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0185, batch_mse = 5.53, batch_mun = 40.53, train_mse = 6.26, train_acc = 0.00%, p_norm = 614.89
tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)


tensor([ 0,  0,  0,  0,  0,  1,  1,  0,  1,  0,  1,  0,  0,  0,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0219, batch_mse = 4.98, batch_mun = 38.32, train_mse = 6.10, train_acc = 0.00%, p_norm = 624.37
tensor([ 0,  0,  1,  1,  0,  0,  0,  1,  0,  1,  0,  0,  1,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0220, batch_mse = 6.95, batch_mun = 39.69, train_mse = 6.18, train_acc = 0.00%, p_norm = 624.52
tensor([ 0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,  0,  1,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0221, batch_mse = 6.37, batch_mun = 39.44, train_mse = 6.20, train_acc = 0.00%, p_norm = 624.63
tensor([ 0,  0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  0,  0,  0,
         0,  1], dtype=torch.uint8)
tensor(0)
train, batch_num = 0222, batch_mse = 5.51, batch_mun = 36.22, train_mse = 6.13, train_acc = 0.00%, p_norm = 624.71
tensor([ 0,  0,  0,  0,  1,  0,  0,  0,  0,  0,  1,  0,  1,  1,
         0,  0], dtype=torch.uint8)


tensor([ 0,  1,  0,  0,  1,  0,  1,  0,  0,  1,  0,  0,  1,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0256, batch_mse = 5.07, batch_mun = 38.53, train_mse = 5.84, train_acc = 0.00%, p_norm = 632.19
tensor([ 0,  0,  0,  0,  0,  1,  0,  1,  1,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0257, batch_mse = 6.13, batch_mun = 38.03, train_mse = 5.87, train_acc = 0.00%, p_norm = 632.50
tensor([ 0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0258, batch_mse = 8.31, batch_mun = 40.89, train_mse = 6.12, train_acc = 0.00%, p_norm = 632.80
tensor([ 0,  1,  1,  0,  0,  1,  1,  1,  1,  0,  0,  0,  0,  1,
         1,  0], dtype=torch.uint8)
tensor(0)
train, batch_num = 0259, batch_mse = 5.23, batch_mun = 38.51, train_mse = 6.03, train_acc = 0.00%, p_norm = 633.09
tensor([ 0,  0,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1,  0], dtype=torch.uint8)
