#**Proyecto - Sistemas Recomendadores - IIC3633**

## Session-Based RNNs for Recommendation with content-aware attention

### The main difference with respect to the baseline model is that,
### instead of receiving a simple ItemId token, the session is made up of
### the average GloVe encoding of the tags that the item has on the db

### The model then tries to predict the closest average GloVe embedding, 
### given the session, and we compare with the truth given by the label

### At test time, we simply take the kNN of the predicted embedding, and 
### that is the recommended that the system outputs


In [12]:
import os
import sys
import subprocess
import math
import pickle
import pandas as pd
import numpy as np
import sklearn
from tqdm import tqdm
import psutil
import humanize
import pyreclab
import GPUtil as GPU
import matplotlib.pyplot as plt
from sys import getsizeof
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers.core import Permute, Reshape, RepeatVector
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply, merge, Flatten
from keras.callbacks import ModelCheckpoint


In [2]:
# Cargamos dataframes preprocesados de MovieLens20MM
PATH_TO_TRAIN = './data/all_train.csv'
PATH_TO_DEV = './data/dev.csv'
PATH_TO_TEST = './data/test.csv'
PATH_TO_TAGS = './data/tags.pickle'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

with open(PATH_TO_TAGS, 'rb') as file:
    tags = pickle.load(file)

In [3]:
# Obtencion de GLoVE. Referencia: github.com/abisee/cs224n-win18-squad
_PAD = b"<pad>" # padding
_UNK = b"<unk>" # unknown
_START_VOCAB = [_PAD, _UNK]
PAD_ID = 0
UNK_ID = 1


def get_glove(glove_path, glove_dim):
    #print("Loading GLoVE vectors from file: %s" % glove_path)
    vocab_size = int(4e5)

    emb_matrix = np.zeros((vocab_size + len(_START_VOCAB), glove_dim))
    word2id = {}
    id2word = {}

    random_init = True
    if random_init:
        emb_matrix[:len(_START_VOCAB), :] = np.random.randn(len(_START_VOCAB), glove_dim)

    idx = 0
    for word in _START_VOCAB:
        word2id[word] = idx
        id2word[idx] = word
        idx += 1

    with open(glove_path, 'r') as fh:
        for line in tqdm(fh, total=vocab_size):
            line = line.lstrip().rstrip().split(" ")
            word = line[0]
            vector = list(map(float, line[1:]))
            if glove_dim != len(vector):
                raise Exception("You set --glove_path=%s but --embedding_size=%i. If you set --glove_path yourself then make sure that --embedding_size matches!" % (glove_path, glove_dim))
            emb_matrix[idx, :] = vector
            word2id[word] = idx
            id2word[idx] = word
            idx += 1

    final_vocab_size = vocab_size + len(_START_VOCAB)
    assert len(word2id) == final_vocab_size
    assert len(id2word) == final_vocab_size
    assert idx == final_vocab_size
    
    # retorno
    # emb_matrix: (400002, glove_dim), glove embeddings (PAD and UNK first two rows)
    # word2id: dictionary mapping word (string) to word id (int)
    # id2word: dictionary mapping word id (int) to word (string)
    
    return emb_matrix, word2id, id2word   

In [14]:
# Generacion de batches. Referencia: github.com/abisee/cs224n-win18-squad
glove_dimn = 50
emb_matrix, word2id, id2word = get_glove("./data/glove.6B.{}d.txt".format(glove_dimn), glove_dimn)

  2%|▏         | 6941/400000 [00:00<00:05, 69403.61it/s]

Loading GLoVE vectors from file: ./data/glove.6B.50d.txt


100%|██████████| 400000/400000 [00:04<00:00, 83579.09it/s]


In [18]:
def get_avg_embedding(tags, emb_matrix):
    # considering all tags for a given movie
    all_results = None # np.zeros((emb_matrix.shape[0], word2id[0].shape[1]))
    counter = 1
    
    for tag in tags:
        if type(all_results) == np.ndarray:
            print(counter)
            all_results = np.average([all_results, np.dot(emb_matrix, word2id[tag])], 
                                     axis=0, 
                                     weights=(1-1/counter, 1/counter))
        else:
            all_results = np.dot(emb_matrix, word2id[tag])
        counter += 1                             
        
    #all_results = np.array(all_results)
    #all_results = np.average(all_results)
    return all_results

print(tags[1])

# To use in the batch_generator
# for movie in tags_df:
#     get_avg_embedding(movie, emb_matrix)

get_avg_embedding(tags[1], emb_matrix)

{'lots', 'comedy', 'witty', 'heroic', 'children', 'adventure', 'boy', 'mission', 'weekly', 'unlikely', 'top', 'life', 'video', 'light', 'rotten', 'buddy', 'voice', 'buzz', 'friendship', 'travel', 'stereoscopic', 'woody', 'bright', 'engaging', 'computer', 'star', 'good', 'heart', 'fun', 'family', 'animated', 'clever', 'reissue', 'acting', 'entertainment', 'want', 'movie', 'funny', 'warm', 'animation', 'action', 'feature', 'humorous', 'story', 'see', 'figure', 'rousing', 'almost', 'national', 'cute', 'buy', 'film', 'come', 'daring', 'favorite', 'soothing', 'registry', 'innovative', 'toy', 'every', 'watched', 'best', 'rated', 'fantasy', 'classic', 'time', 'fanciful', 'cartoon', 'first'}
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69


array([[16769.16761031, -3841.06081035, -2105.25180516, ...,
        10177.61202515, -7660.82383032,  1448.41599331],
       [ 2262.25710803, -7477.76616702,  2873.25372288, ...,
        -1744.65777403,  4658.42713516, -5572.08397868],
       [ 2660.50942029,  1589.17701449, -2624.99352899, ...,
        -1171.83346739,  -732.84941304, -5001.56676449],
       ...,
       [-3257.59647464,   373.65518188,  6945.96634058, ...,
        -1591.40471377, -7160.46195652, 10096.56960145],
       [-4830.7977029 , -3018.59616667,  3015.03184783, ...,
         5025.30767391,   -89.8462942 ,  4104.05855072],
       [  462.1966808 , -3271.08996739,  3009.30347826, ...,
        -1203.40314855, -3756.60111232,  3536.24982971]])

In [11]:
len(tags)

19545

In [16]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None, aug = True):
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    
    # add column
    data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') 

    # sort by session
    data.sort_values([session_key, time_key], inplace=True) 

    length = len(data['ItemId'])
    
    # array with cummulative offset that gives each session start in the array
    offset_sessions = np.zeros(data[session_key].nunique()+1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum() 
    
    actual_session = 0 + offset
    
    batch_feats = None
    batch_labels = None

    while True:
        # session info
        datum = data[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  
        datum = datum.values.reshape(-1,1)          
        
        for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
            if not aug:
                if (i != offset_sessions[actual_session+1]-offset_sessions[actual_session]-2):
                    continue
                    
            feats = datum[0:i+1]

            if feats.shape[0] > session_max_len:
                # take newest events
                feats = feats[feats.shape[0]-session_max_len:] 
            else:
                # left pad with zeros
                feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats)

            feats = feats.reshape(1,-1) 
            label = datum[i+1]
            label = np.expand_dims(label, axis=0)  # Termina siendo (1, dimn_previa)

            if not isinstance(batch_feats, type(feats)):
                batch_feats = feats
            else:
                batch_feats = np.append(batch_feats, feats, axis=0)

            if not isinstance(batch_labels, type(label)):
                batch_labels = label
            else:
                batch_labels = np.append(batch_labels, label, axis=0)


            if batch_labels.shape[0] == batch_size:
                if not embedding:
                    batch_labels = to_categorical(itemidmap[batch_labels.flatten()], num_classes=n_items)
                    pass

                yield batch_feats, batch_labels
                
                # resume batch generation
                batch_feats = None
                batch_labels = None

        actual_session = (actual_session + 1) % len(offset_sessions)

      


In [17]:
batch_size = 512 #como en el paper
session_max_len = 100
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 12811
Items unicos dev: 10103
Items unicos testing: 10365
Sesiones training: 80466
Sesiones validation: 5747
Sesiones testing: 5270


In [None]:
train_fraction = 1 #256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1 #2

train_offset_step=train_samples_qty//batch_size
dev_offset_step=dev_samples_qty//batch_size
test_offset_step=test_samples_qty//batch_size


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [None]:
# Modelo

# ToDo: self-attention

def attention_3d_block(inputs, TIME_STEPS, SINGLE_ATTENTION_VECTOR=True):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    #a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul
    
emb_size = 50
size = emb_size
#size = emb_size if embeddingp else n_items

inputs = Input(shape=(session_max_len,))
#emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=session_max_len)(inputs)
emb = Embedding(400002, glove_dimn, weights=[emb_matrix], trainable=False, input_length=session_max_len)(inputs)
drop1 = Dropout(0.25)(emb)
gru = CuDNNGRU(100)(drop1) # , return_sequences=True
drop2 = Dropout(0.25)(gru)
#attention_mul = attention_3d_block(drop2, session_max_len)
#attention_mul = Flatten()(attention_mul)
predictions = Dense(n_items, activation='softmax')(drop2)#(attention_mul)
model = Model(input=inputs, output=[predictions])
#custom_loss = custom_cosine_loss(itemidmap, n_items)
# lr original es 0.0001
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

#filepath='./bast/model_checkpoint'
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = []#[checkpoint]

In [None]:
real_epoca = 1
for epoch in range(50):
    #filepath='./weights/model_{}'.format(real_epoca)
    #model.load_weights('./weights/model_{}'.format(real_epoca-1))
    #model.save_weights(filepath)
    train_generator = batch_generator(train_data, 
                                      batch_size=batch_size, 
                                  session_max_len=session_max_len,
                                      fraction=train_fraction, 
                                      offset=train_offset_step*epoch,
                                     embedding=embeddingp,
                                      n_items=n_items,
                                     itemids=itemids,
                                     itemidmap=itemidmap)
    
    dev_generator = batch_generator(dev_data, 
                                    batch_size=batch_size,
                                  session_max_len=session_max_len,
                                    fraction=dev_fraction, 
                                    offset=dev_offset_step*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                     itemidmap=itemidmap)
    
    history = model.fit_generator(train_generator,
                                steps_per_epoch=train_offset_step,
                                epochs=1,
                                validation_data=dev_generator,
                                validation_steps=dev_offset_step,
                                callbacks=callbacks_list)
    
    real_epoca += 1

In [None]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors

# RECALL @ 10
recall_k = 20

#nbrs = NearestNeighbors(n_neighbors=recall_k, algorithm='ball_tree').fit(weights)
#distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_generator = batch_generator(test_data, 
                                  batch_size=batch_size,
                                  session_max_len=session_max_len,
                                  fraction=train_fraction, 
                                  offset=0,
                                 embedding=embeddingp,
                                  n_items=n_items,
                                 itemids=itemids,
                                 itemidmap=itemidmap)


n = 0
suma = 0
suma_baseline = 0
while True:
    try:
        test_batch = next(test_generator)
        pred = model.predict(test_batch[0]) # batch_size, n_items => 512, 37484
        

        label = test_batch[1]               

        if n%100 == 0:
            print(n)
        #print(pred.shape)
        #print(label.shape) 

        for row_idx in range(test_batch[0].shape[0]):
          #print(test_batch[0][row_idx])
          #baseline_pred = obj.recommend( str(test_batch[0][row_idx][-1]), 20 )
          pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
          label_row = label[row_idx]        #.reshape(1, -1) # 50,

          #print(pred_row.shape)
          #print(label_row.shape)

          idx1 = pred_row.argsort()[-recall_k:][::-1]
          idx2 = label_row.argsort()[-1:][::-1]

          n += 1
          #print(idx1)
          #print(idx2)
          if idx2[0] in idx1:
            suma += 1

          #if idx2[0] in baseline_pred:
          #  suma_baseline += 1

    except:
        break
print("Recall@{} epoch {}: {}".format(recall_k, epoch, suma/n))

#print("Recall@{} baseline: {}".format(recall_k, suma_baseline/n))

# All train set
Recall@10 epoch 29: 0.08087340943113773
Recall@20: 0.10473194236526946

vs Hidasi

Recall @ 20 0.2177499329156604
MRR@20: 0.06513681594077811



# Train Set
Recall@10 epoch ..100?: 0.09546921781437126

Recall@10 epoch 14: 0.06404879908501715

Recall @20 epoch 99: 0.08705440681137724

Con session_max_len = 100:

Recall @20 epoch 9: 0.12195335890718563

Con dwell_time NO FUNCIONA BIEN. Hacer ese supuesto en este dataset no tiene sentido.
