# This source code is from seminar (CoFFee).

In [1]:
import numpy as np
import pandas as pd
from polara.preprocessing.dataframes import leave_one_out, reindex
from dataprep import transform_indices
from evaluation import topn_recommendations
from polara.lib.tensor import hooi
from polara.lib.sparse import tensor_outer_at

In [2]:
# --------------- data from kaggle ----------------

data_train = pd.read_csv('training.csv').dropna()
data_train['rating'] = data_train['rating'].round(0).astype(int)
data_test = pd.read_csv('testset.csv')
data_test['rating'] = data_test['rating'].round(0).astype(int)
print(f'train shape = {data_train.shape},\ntest shape = {data_test.shape}\n')

train_items = list(data_train['movieid'].unique())
test_items = list(data_test['movieid'].unique())

all_items = set(train_items + test_items)
print(f'Number of movies:\nall items = {len(all_items)},\ntrain items = {len(train_items)}, \ntest items = {len(test_items)}\n')

n_not_in_train = data_test.query('movieid not in @data_train.movieid.unique()').movieid.nunique()
print(f'n items not in train = {n_not_in_train}')

train shape = (6371806, 4),
test shape = (1368134, 4)

Number of movies:
all items = 17995,
train items = 17092, 
test items = 17102

n items not in train = 903


In [4]:
n_not_in_test = data_train.query('movieid not in @data_test.movieid.unique()').movieid.nunique()
n_in_test = data_train.query('movieid in @data_test.movieid.unique()').movieid.nunique()

print(f'n items not in test = {n_not_in_test}')
print(f'n items in test = {n_not_in_test}')

n items not in test = 893
n items in test = 893


In [5]:
training, train_data_index = transform_indices(data_train, 'userid', 'movieid')
testset, test_data_index = transform_indices(data_test, 'userid', 'movieid')

training['rating'].min(), testset['rating'].min()

(0, 0)

In [123]:
data_description = dict(
    users = train_data_index['users'].name,
    items = train_data_index['items'].name,
    feedback = 'rating',
    n_users = len(train_data_index['users']),
    n_items = len(train_data_index['items']),
    n_ratings = training['rating'].nunique(),
    min_rating = training['rating'].min(),
    test_users = testset[test_data_index['users'].name].drop_duplicates().values  
)

In [124]:
def tf_model_build(config, data, data_description):
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    idx = data[[userid, itemid, feedback]].values
    idx[:, -1] = idx[:, -1] - data_description['min_rating']    # works only for integer ratings!
    val = np.ones(idx.shape[0], dtype='f8')
    
    n_users = data_description["n_users"]
    n_items = data_description["n_items"]
    n_ratings = data_description["n_ratings"]
    shape = (n_users, n_items, n_ratings)
    core_shape = config['mlrank']
    num_iters = config["num_iters"]
    
    u0, u1, u2, g = hooi(
        idx, val, shape, core_shape,
        return_core=False, num_iters=num_iters,
        parallel_ttm=False, growth_tol=0.01,
    )
    return u0, u1, u2
        

In [125]:
config = {
    'mlrank': (10, 10, 2),
    "num_iters": 5,
}

In [126]:
tf_params = tf_model_build(config, training, data_description)

growth of the core: 1.0
growth of the core: 0.17199603498425142
growth of the core: 0.013735870929472668
growth of the core: 0.002955473459604283
Done


In [127]:
tf_params[0].shape

(71297, 10)

In [128]:
tf_params[1].shape

(17092, 10)

In [129]:
tf_params[2].shape

(6, 2)

# Generating predictions

In [134]:
def tf_scoring(params, data, data_description):
    user_factors, item_factors, feedback_factors = params
    userid = data_description["users"]
    itemid = data_description["items"]
    feedback = data_description["feedback"]

    data = data.sort_values(userid)
    useridx = data[userid].values
    itemidx = data[itemid].values
    ratings = data[feedback].values
    ratings = ratings - data_description['min_rating'] # works only for integer ratings!
    
    tensor_outer = tensor_outer_at('cpu')
    # use the fact that test data is sorted by users for reduction:
    scores = tensor_outer(
        1.0,
        item_factors,
        feedback_factors,
        itemidx,
        ratings
    )
    scores = np.add.reduceat(scores, np.r_[0, np.where(np.diff(useridx))[0]+1])
    scores = np.tensordot(
        scores,
        feedback_factors[-1, :],
        axes=(2, 0)
    ).dot(item_factors.T)
    return scores

In [135]:
userid = data_description['users']
seen_data = testset

In [136]:
tf_scores = tf_scoring(tf_params, seen_data, data_description)

In [141]:
testset.userid.nunique(), testset.movieid.nunique()

(2963, 17102)

In [139]:
tf_scores.shape, testset.shape

((2963, 17092), (1368134, 4))

In [144]:
tf_scores

array([[ 1.56606231e-02, -7.91213484e-05, -1.34813865e-02, ...,
        -1.59988298e-05,  2.03536969e-07, -6.64459577e-06],
       [-4.52375189e-04,  1.56695055e-02,  2.81648105e-02, ...,
         2.50992690e-05,  9.07331490e-08,  2.47021743e-05],
       [ 1.02932254e-02, -8.14245386e-04,  1.10547316e-03, ...,
        -3.27238476e-06,  5.65534720e-07, -8.76584079e-07],
       ...,
       [ 6.23958116e-02,  9.74045822e-03, -9.52910180e-03, ...,
        -7.56884737e-07, -1.99889364e-06,  1.87427331e-05],
       [-9.01737302e-05, -2.01185962e-05, -3.83353610e-04, ...,
        -4.77109182e-07, -3.14531608e-08, -1.06011370e-07],
       [ 3.33706788e-02,  1.14251361e-02, -9.08470023e-03, ...,
        -8.55553890e-06,  9.64259106e-07,  8.79485464e-06]])

In [145]:
tf_recs = topn_recommendations(tf_scores, topn=20)

In [146]:
tf_recs.shape

(2963, 20)

In [147]:
tf_recs

array([[ 106,  206,   62, ...,  410,  165,  156],
       [ 262,  146,  226, ...,  415,  164,  591],
       [ 106,  110,  165, ...,  121,  214,   61],
       ...,
       [ 110,   42,  864, ...,  732, 1246,  788],
       [ 966,  732,  643, ...,  762,  472,  698],
       [ 530,  421,  577, ...,  121,  206,  398]])

In [173]:
tf_recs[:, 0]

array([106, 262, 106, ..., 110, 966, 530])

In [149]:
test_data_index['users']

Int64Index([    55,    133,    136,    213,    539,    604,    718,   1029,
              1088,   1179,
            ...
            282326, 282382, 282402, 282420, 282522, 282663, 282836, 282999,
            283047, 283183],
           dtype='int64', name='userid', length=2963)

In [174]:
top_20_recs = {}
for i, user in enumerate(test_data_index['users'].tolist()):
    top_20_recs[user] = tf_recs[i,:]

In [177]:
pd_top_20_recs = pd.DataFrame(top_20_recs)
pd_top_20_recs

Unnamed: 0,55,133,136,213,539,604,718,1029,1088,1179,...,282326,282382,282402,282420,282522,282663,282836,282999,283047,283183
0,106,262,106,671,165,376,965,643,421,216,...,146,791,110,146,966,965,421,110,966,530
1,206,146,110,530,110,162,791,537,376,230,...,179,788,106,732,965,711,206,42,732,421
2,62,226,165,965,121,146,732,637,110,215,...,121,965,62,121,732,968,106,864,643,577
3,190,221,190,421,106,221,121,711,106,376,...,262,732,421,162,968,668,61,867,491,617
4,530,491,410,791,965,121,421,732,165,101,...,472,764,376,472,794,732,165,216,634,376
5,421,162,421,668,767,472,770,668,61,221,...,73,793,190,179,750,790,62,711,221,668
6,767,101,62,774,230,491,61,617,62,214,...,162,966,165,262,711,950,770,193,591,106
7,768,230,376,770,62,179,793,577,121,222,...,491,794,530,376,756,637,376,965,794,410
8,168,179,154,617,791,617,376,566,530,233,...,221,790,215,965,643,671,54,1037,233,671
9,225,392,530,565,410,577,966,591,770,537,...,171,750,61,491,762,774,530,637,198,774


In [178]:
pd_top_20_recs.to_csv('pd_top_20_recs_coffee.csv')