In [19]:
import numpy as np 
import pandas as pd

from data import RandomData, AmazonBooks, ToyData, MovieLensData
from model import RandomModel, CombinedMeanModel, ItemItemCollaborationModel

%matplotlib inline

In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Load Data

In [21]:
# Fetch the Amazon Books dataset and keep handles to the train and validation splits.
ds = AmazonBooks().get_dataset(verbose=True)
train, val = ds['train'], ds['val']

loading preprocessed dataset from disk


In [22]:
# Number of user-product rating rows per split, shown as (test, val, train).
tuple(len(ds[split]['user_product_ratings'])
      for split in ('test', 'val', 'train'))

(119992, 124008, 233681)

In [6]:
# Count how often each distinct rating value occurs in the training split.
# NOTE(review): the output saved with this cell is an empty Series even though
# the train split is reported above as having 233,681 rows — the output looks
# stale (or the ratings were all missing at the time); re-run on a fresh kernel.
train['user_product_ratings'].rating.value_counts()

Series([], Name: rating, dtype: int64)

In [6]:
# Number of distinct user ids that appear in the training ratings.
len(set(ds['train']['user_product_ratings'].user_id))

31840

In [7]:
# Number of distinct user ids that appear in the validation ratings.
len(set(ds['val']['user_product_ratings'].user_id))

32613

In [8]:
# Number of product-description rows per split, shown as (test, val, train).
tuple(len(ds[split]['product_descriptions'])
      for split in ('test', 'val', 'train'))

(42287, 42767, 61076)

In [9]:
# Number of product-review rows per split, shown as (test, val, train).
tuple(len(ds[split]['product_reviews'])
      for split in ('test', 'val', 'train'))

(860816, 864242, 874476)

In [None]:
# Histogram of how many ratings each user contributed to the training split;
# log=True is passed through to the histogram so the counts are log-scaled.
ds['train']['user_product_ratings'].groupby('user_id')['rating'].count().hist(log=True)

### Accuracy Metrics

In [15]:
def mean_squared_error(pred, ground_truth):
    """Return the mean of the squared differences between predictions and targets.

    Args:
        pred: Predicted values; must support elementwise subtraction with
            ``ground_truth`` (e.g. a NumPy array or pandas Series).
        ground_truth: True values to compare against.

    Returns:
        ``np.mean`` of the squared residuals.
    """
    residual = pred - ground_truth
    return np.mean(residual ** 2.)

In [16]:
def accuracy(pred, ground_truth):
    """Return the fraction of predictions that match the target once rounded.

    Each prediction is rounded with ``np.round`` and compared elementwise with
    ``ground_truth``; the number of matches is divided by ``len(pred)``.

    Args:
        pred: Predicted (possibly fractional) values.
        ground_truth: True values to compare the rounded predictions against.

    Returns:
        Number of exact matches divided by the number of predictions.
    """
    hits = np.round(pred) == ground_truth
    n_total = float(len(pred))
    return hits.sum() / n_total

In [17]:
def evaluate_model(model, train, val, loss_fn):
    """Fit ``model`` on ``train`` and report its loss on ``val`` per holdout type.

    The validation ratings are split into four partitions according to whether
    each row's user id and product id also occur in the training ratings:

        A: user seen,   product seen     (cell holdout)
        B: user unseen, product seen     (user holdout)
        C: user seen,   product unseen   (product holdout)
        D: user unseen, product unseen   (product & user holdout)

    A legend plus user/product/rating counts and the loss of the train set,
    the whole validation set and every partition are printed.

    Args:
        model: Object exposing ``fit(train)`` and ``predict(frame)``, where
            ``frame`` holds the ``user_id`` and ``product_id`` columns.
        train: Mapping whose ``'user_product_ratings'`` entry is a DataFrame
            with ``user_id``, ``product_id`` and ``rating`` columns.
        val: Mapping with the same layout as ``train``; it is not modified.
        loss_fn: Callable ``loss_fn(pred, ground_truth)`` returning the loss.

    Returns:
        Tuple ``(val_loss, A_loss, B_loss, C_loss, D_loss)``. The loss of an
        empty validation subset is ``nan``.
    """
    model.fit(train)
    train_up_rat = train['user_product_ratings']
    # Work on a copy so the caller's validation frame never gains a 'pred' column.
    val_up_rat = val['user_product_ratings'].copy()

    id_cols = ['user_id', 'product_id']
    # Predict the validation set exactly once and reuse the stored column for
    # the overall loss and for every partition: this avoids a redundant
    # predict() call and keeps all validation losses based on the very same
    # predictions even when the model is not deterministic.
    val_up_rat['pred'] = model.predict(val_up_rat[id_cols])

    def subset_loss(data):
        # An empty subset has no defined loss; report NaN rather than letting
        # loss_fn divide by zero or emit warnings on empty input.
        if len(data) == 0:
            return float('nan')
        return loss_fn(data.pred, data.rating)

    train_loss = loss_fn(model.predict(train_up_rat[id_cols]),
                         train_up_rat.rating)
    val_loss = subset_loss(val_up_rat)

    # Membership masks are computed once and combined for the four partitions.
    train_users = set(train_up_rat.user_id)
    train_products = set(train_up_rat.product_id)
    seen_user = val_up_rat.user_id.isin(train_users)
    seen_product = val_up_rat.product_id.isin(train_products)

    A_data = val_up_rat[seen_user & seen_product]
    A_loss = subset_loss(A_data)

    B_data = val_up_rat[~seen_user & seen_product]
    B_loss = subset_loss(B_data)

    C_data = val_up_rat[seen_user & ~seen_product]
    C_loss = subset_loss(C_data)

    D_data = val_up_rat[~seen_user & ~seen_product]
    D_loss = subset_loss(D_data)

    print('\n'.join(
        ['     Products',
         '    -----------',
         '   |       |   |  (A.x) TRAIN SET',
         ' U | (A.x) |   |  (A.o) VAL CELL HOLDOUT',
         ' s | (A.o) |(C)|    (B) VAL USER HOLDOUT',
         ' e |       |   |    (C) VAL PRODUCT HOLDOUT',
         ' r |       |   |    (D) VAL PRODUCT & USER HOLDOUT',
         ' s |-------|---|    (V) VAL SET',
         '   |  (B)  |(D)|',
         '    -----------',
         '================================',
         '']))

    def stats(data, loss, lbl):
        # Print the size of one subset (distinct users, distinct products,
        # rating rows) followed by its loss.
        print(lbl)
        print(f'      Number of users: {len(set(data.user_id))}')
        print(f'      Number of products: {len(set(data.product_id))}')
        print(f'      Number of ratings: {len(data)}')
        print(f'      loss: {loss}\n')

    stats(train_up_rat, train_loss, '(A.x) TRAIN SET')
    stats(val_up_rat, val_loss, '  (V) VAL SET')
    stats(A_data, A_loss, '(A.o) VAL CELL HOLDOUT')
    stats(B_data, B_loss, '  (B) VAL USER HOLDOUT')
    stats(C_data, C_loss, '  (C) VAL PRODUCT HOLDOUT')
    stats(D_data, D_loss, '  (D) VAL PRODUCT & USER HOLDOUT')
    return val_loss, A_loss, B_loss, C_loss, D_loss

In [None]:
# Fit the item-item collaboration model and print its MSE report;
# the returned losses are assigned to `_` so the cell shows no extra output.
model = ItemItemCollaborationModel()
_ = evaluate_model(model, train, val, mean_squared_error)

In [None]:
# Fit SimpleMeanModel and print its MSE report.
# NOTE(review): SimpleMeanModel is not among the names imported from `model`
# in the import cell, so this raises NameError on a fresh kernel — confirm the
# class exists in model.py and add it to that import (or drop this cell).
model = SimpleMeanModel()
_ = evaluate_model(model, train, val, mean_squared_error)

In [None]:
# Fit UserMeanModel and print its MSE report.
# NOTE(review): UserMeanModel is not among the names imported from `model`
# in the import cell, so this raises NameError on a fresh kernel — confirm the
# class exists in model.py and add it to that import (or drop this cell).
model = UserMeanModel()
_ = evaluate_model(model, train, val, mean_squared_error)

In [None]:
# Fit ProductMeanModel and print its MSE report.
# NOTE(review): ProductMeanModel is not among the names imported from `model`
# in the import cell, so this raises NameError on a fresh kernel — confirm the
# class exists in model.py and add it to that import (or drop this cell).
model = ProductMeanModel()
_ = evaluate_model(model, train, val, mean_squared_error)

In [None]:
# Fit CombinedMeanModel and print its MSE report;
# the returned losses are assigned to `_` so the cell shows no extra output.
model = CombinedMeanModel()
_ = evaluate_model(model, train, val, mean_squared_error)