In [1]:
import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import RandomModel, CombinedMeanModel, ItemItemCollaborationModel, RNNModel

%matplotlib inline

In [2]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to project modules (e.g. the `data` and
# `model` modules imported by this notebook) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

### Load Data

In [3]:
# Build (or load from disk) the dataset, keeping only what the
# `min_user_ratings=5` filter of RandomData lets through.
ds = RandomData(min_user_ratings=5).get_dataset(verbose=True)

# Pull out the two splits used by the rest of the notebook.
train, val = ds['train'], ds['val']

loading preprocessed dataset from disk


In [4]:
# Number of entries in the training split's 'user_product_ratings' collection.
len(train['user_product_ratings'])

8781

### Accuracy Metrics

In [5]:
def mean_squared_error(pred, ground_truth):
    """Return the mean of the squared element-wise differences.

    Both inputs are converted to float NumPy arrays before subtracting, so
    values are paired by *position*. This avoids three pitfalls of operating
    on the raw inputs:

    * pandas Series with different index labels would be aligned by label,
      silently yielding NaN;
    * plain Python lists do not support ``-``;
    * unsigned integer arrays would wrap around on subtraction.

    Args:
        pred: array-like of predicted values.
        ground_truth: array-like of true values, broadcast-compatible with
            ``pred`` (normally the same length).

    Returns:
        The mean squared error as a float; NaN if there are no elements.
    """
    pred_arr = np.asarray(pred, dtype=float)
    truth_arr = np.asarray(ground_truth, dtype=float)
    diff = pred_arr - truth_arr
    if diff.size == 0:
        # np.mean of an empty array is NaN plus a RuntimeWarning; return the
        # same value without the warning.
        return float('nan')
    return np.mean(diff ** 2.)

In [6]:
def accuracy(pred, ground_truth):
    """Return the fraction of predictions that round to the true value.

    Inputs are converted to NumPy arrays first so they are compared by
    position; comparing two pandas Series with different index labels would
    otherwise raise instead of producing a score.

    Args:
        pred: array-like of (possibly fractional) predicted values.
        ground_truth: array-like of true values, same length as ``pred``.

    Returns:
        Number of positions where ``round(pred) == ground_truth`` divided by
        ``len(pred)``; NaN if ``pred`` is empty.
    """
    pred_arr = np.asarray(pred)
    truth_arr = np.asarray(ground_truth)
    n_pred = len(pred_arr)
    if n_pred == 0:
        # 0 / 0.0 on NumPy scalars yields NaN with a RuntimeWarning; return
        # the same value without the warning.
        return float('nan')
    return (np.round(pred_arr) == truth_arr).sum() / float(n_pred)

In [7]:
def evaluate_model(model, train, val, loss_fn):
    """Fit ``model`` and report its loss on the train set and validation holdouts.

    The validation ratings are partitioned by whether their user and product
    also occur in the training ratings:

    * A -- user and product both seen in training (cell holdout)
    * B -- unseen user, seen product (user holdout)
    * C -- seen user, unseen product (product holdout)
    * D -- unseen user and unseen product

    A summary (user / product / rating counts and loss) is printed for the
    train set, the whole validation set and each group.

    Args:
        model: object exposing ``fit(train, val)`` and ``predict(frame)``,
            where ``frame`` has the columns ``user_id`` and ``product_id``.
        train: mapping whose ``'user_product_ratings'`` entry is a DataFrame
            with ``user_id``, ``product_id`` and ``rating`` columns.
        val: mapping with the same layout as ``train``; its frame is copied,
            not modified.
        loss_fn: callable ``loss_fn(pred, ground_truth)`` returning a score.

    Returns:
        Tuple ``(val_loss, A_loss, B_loss, C_loss, D_loss)``. A group with no
        rows gets NaN and ``loss_fn`` is not called for it.
    """
    # The original passed an undefined name (`val_fn`) here, so every call
    # raised NameError; the validation split is what fit() is given.
    model.fit(train, val)

    pair_cols = ['user_id', 'product_id']
    train_up_rat = train['user_product_ratings']
    # Copy so that adding the 'pred' column does not leak into the caller's data.
    val_up_rat = val['user_product_ratings'].copy()
    val_up_rat['pred'] = model.predict(val_up_rat[pair_cols])

    def group_loss(data):
        # Empty holdout groups are legitimate (e.g. no unseen products);
        # report NaN rather than handing loss_fn an empty input.
        if len(data) == 0:
            return float('nan')
        return loss_fn(data.pred, data.rating)

    train_loss = loss_fn(model.predict(train_up_rat[pair_cols]),
                         train_up_rat.rating)
    # Reuse the predictions stored above instead of running predict() again.
    val_loss = group_loss(val_up_rat)

    # Membership masks are computed once and combined for the four groups.
    seen_user = val_up_rat.user_id.isin(set(train_up_rat.user_id))
    seen_product = val_up_rat.product_id.isin(set(train_up_rat.product_id))

    A_data = val_up_rat[seen_user & seen_product]
    B_data = val_up_rat[~seen_user & seen_product]
    C_data = val_up_rat[seen_user & ~seen_product]
    D_data = val_up_rat[~seen_user & ~seen_product]

    A_loss = group_loss(A_data)
    B_loss = group_loss(B_data)
    C_loss = group_loss(C_data)
    D_loss = group_loss(D_data)

    print('\n'.join(
        ['     Products',
         '    -----------',
         '   |       |   |  (A.x) TRAIN SET',
         ' U | (A.x) |   |  (A.o) VAL CELL HOLDOUT',
         ' s | (A.o) |(C)|    (B) VAL USER HOLDOUT',
         ' e |       |   |    (C) VAL PRODUCT HOLDOUT',
         ' r |       |   |    (D) VAL PRODUCT & USER HOLDOUT',
         ' s |-------|---|    (V) VAL SET',
         '   |  (B)  |(D)|',
         '    -----------',
         '================================',
         '']))

    def stats(data, loss, lbl):
        # One summary paragraph per data slice.
        print(lbl)
        print(f'      Number of users: {len(set(data.user_id))}')
        print(f'      Number of products: {len(set(data.product_id))}')
        print(f'      Number of ratings: {len(data)}')
        print(f'      loss: {loss}\n')

    stats(train_up_rat, train_loss, '(A.x) TRAIN SET')
    stats(val_up_rat, val_loss, '  (V) VAL SET')
    stats(A_data, A_loss, '(A.o) VAL CELL HOLDOUT')
    stats(B_data, B_loss, '  (B) VAL USER HOLDOUT')
    stats(C_data, C_loss, '  (C) VAL PRODUCT HOLDOUT')
    stats(D_data, D_loss, '  (D) VAL PRODUCT & USER HOLDOUT')
    return val_loss, A_loss, B_loss, C_loss, D_loss

In [8]:
# Build an RNNModel with its default constructor arguments and train it.
# The validation split is passed to fit() together with the training split;
# the captured log below shows val_mse / val_acc lines reported during training.
model = RNNModel()
model.fit(train, val)

copying required data ... done! (3.5 ms)
cleaning product descriptions and reviews ... done! (79.3 ms)
building product descriptions and review vocabs ... done! (249.5 ms)
building product description and review index sequence ... done! (1336.0 ms)
copying required data ... done! (2.6 ms)
cleaning product descriptions and reviews ... done! (80.5 ms)
building product description and review index sequence ... done! (1422.0 ms)
initializing description reader ... done! (6.8 ms)
initializing review reader ... done! (6.0 ms)
initializing user embeddings ... done! (0.1 ms)
using Adam optimizer with lr=0.001 ... done! (0.1 ms)
using SparseAdam optimizer with lr=0.001 ... done! (0.1 ms)
epoch 0
train, batch_num = 0000, batch_mse = 22.53, batch_mun = 41.30, train_mse = 2.25, train_acc = 0.01, p_norm = 549.78
train, batch_num = 0001, batch_mse = 21.21, batch_mun = 41.80, train_mse = 4.15, train_acc = 0.02, p_norm = 549.79
train, batch_num = 0002, batch_mse = 18.84, batch_mun = 40.88, train_mse =

train, batch_num = 0030, batch_mse = 6.80, batch_mun = 40.49, train_mse = 7.58, train_acc = 0.17, p_norm = 620.26
train, batch_num = 0031, batch_mse = 6.67, batch_mun = 40.95, train_mse = 7.48, train_acc = 0.17, p_norm = 621.76
train, batch_num = 0032, batch_mse = 6.93, batch_mun = 40.21, train_mse = 7.43, train_acc = 0.18, p_norm = 623.27
train, batch_num = 0033, batch_mse = 6.76, batch_mun = 40.37, train_mse = 7.36, train_acc = 0.18, p_norm = 624.75
train, batch_num = 0034, batch_mse = 6.11, batch_mun = 40.29, train_mse = 7.24, train_acc = 0.18, p_norm = 626.15
test: val_mse = 03.25, val_acc = 20.29%
epoch 2
train, batch_num = 0000, batch_mse = 6.14, batch_mun = 40.24, train_mse = 7.13, train_acc = 0.18, p_norm = 627.50
train, batch_num = 0001, batch_mse = 6.22, batch_mun = 40.48, train_mse = 7.04, train_acc = 0.18, p_norm = 628.84
train, batch_num = 0002, batch_mse = 6.11, batch_mun = 40.02, train_mse = 6.94, train_acc = 0.19, p_norm = 630.14
train, batch_num = 0003, batch_mse = 6.4

train, batch_num = 0032, batch_mse = 5.72, batch_mun = 40.08, train_mse = 5.70, train_acc = 0.20, p_norm = 680.41
train, batch_num = 0033, batch_mse = 5.49, batch_mun = 39.69, train_mse = 5.68, train_acc = 0.20, p_norm = 680.88
train, batch_num = 0034, batch_mse = 6.37, batch_mun = 39.30, train_mse = 5.75, train_acc = 0.20, p_norm = 681.30
test: val_mse = 02.68, val_acc = 20.82%
epoch 4
train, batch_num = 0000, batch_mse = 5.79, batch_mun = 40.36, train_mse = 5.75, train_acc = 0.20, p_norm = 681.74
train, batch_num = 0001, batch_mse = 5.62, batch_mun = 39.65, train_mse = 5.74, train_acc = 0.20, p_norm = 682.19
train, batch_num = 0002, batch_mse = 5.79, batch_mun = 39.90, train_mse = 5.74, train_acc = 0.20, p_norm = 682.62
train, batch_num = 0003, batch_mse = 5.53, batch_mun = 40.47, train_mse = 5.72, train_acc = 0.20, p_norm = 683.06
train, batch_num = 0004, batch_mse = 5.48, batch_mun = 39.88, train_mse = 5.70, train_acc = 0.20, p_norm = 683.46
train, batch_num = 0005, batch_mse = 5.8

train, batch_num = 0034, batch_mse = 5.40, batch_mun = 39.18, train_mse = 5.51, train_acc = 0.20, p_norm = 701.22
test: val_mse = 02.61, val_acc = 20.37%
epoch 6
train, batch_num = 0000, batch_mse = 5.73, batch_mun = 39.21, train_mse = 5.53, train_acc = 0.19, p_norm = 701.51
train, batch_num = 0001, batch_mse = 5.24, batch_mun = 39.23, train_mse = 5.50, train_acc = 0.20, p_norm = 701.78
train, batch_num = 0002, batch_mse = 5.59, batch_mun = 38.59, train_mse = 5.51, train_acc = 0.20, p_norm = 702.06
train, batch_num = 0003, batch_mse = 5.47, batch_mun = 40.45, train_mse = 5.51, train_acc = 0.20, p_norm = 702.30
train, batch_num = 0004, batch_mse = 5.40, batch_mun = 39.38, train_mse = 5.50, train_acc = 0.20, p_norm = 702.52
train, batch_num = 0005, batch_mse = 5.42, batch_mun = 39.10, train_mse = 5.49, train_acc = 0.20, p_norm = 702.71
train, batch_num = 0006, batch_mse = 5.53, batch_mun = 38.55, train_mse = 5.49, train_acc = 0.20, p_norm = 702.91
train, batch_num = 0007, batch_mse = 5.0

epoch 8
train, batch_num = 0000, batch_mse = 5.53, batch_mun = 38.32, train_mse = 5.28, train_acc = 0.20, p_norm = 713.89
train, batch_num = 0001, batch_mse = 5.35, batch_mun = 39.24, train_mse = 5.28, train_acc = 0.20, p_norm = 713.96
train, batch_num = 0002, batch_mse = 5.42, batch_mun = 38.66, train_mse = 5.30, train_acc = 0.20, p_norm = 714.07
train, batch_num = 0003, batch_mse = 5.12, batch_mun = 38.64, train_mse = 5.28, train_acc = 0.20, p_norm = 714.19
train, batch_num = 0004, batch_mse = 5.10, batch_mun = 38.54, train_mse = 5.26, train_acc = 0.20, p_norm = 714.27
train, batch_num = 0005, batch_mse = 5.32, batch_mun = 38.27, train_mse = 5.27, train_acc = 0.21, p_norm = 714.36
train, batch_num = 0006, batch_mse = 5.22, batch_mun = 38.02, train_mse = 5.26, train_acc = 0.21, p_norm = 714.48
train, batch_num = 0007, batch_mse = 5.38, batch_mun = 38.23, train_mse = 5.28, train_acc = 0.20, p_norm = 714.60
train, batch_num = 0008, batch_mse = 5.07, batch_mun = 37.85, train_mse = 5.25, 