#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico

In [1]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import pyreclab
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers.core import Permute, Reshape, RepeatVector
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply, merge, Flatten
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# Cargamos dataframes preprocesados de MovieLens20MM
PATH_TO_TRAIN = './data/all_train.csv'
PATH_TO_DEV = './data/dev.csv'
PATH_TO_TEST = './data/test.csv'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [16]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None, aug = True):
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    
    # add column
    data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') 

    # sort by session
    data.sort_values([session_key, time_key], inplace=True) 

    length = len(data['ItemId'])
    
    # array with cummulative offset that gives each session start in the array
    offset_sessions = np.zeros(data[session_key].nunique()+1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum() 
    
    actual_session = 0 + offset
    
    batch_feats = None
    batch_labels = None

    while True:
        # session info
        datum = data[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  
        datum = datum.values.reshape(-1,1)          
        
        for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
            if not aug:
                if (i != offset_sessions[actual_session+1]-offset_sessions[actual_session]-2):
                    continue
                    
            feats = datum[0:i+1]

            if feats.shape[0] > session_max_len:
                # take newest events
                feats = feats[feats.shape[0]-session_max_len:] 
            else:
                # left pad with zeros
                feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats)

            feats = feats.reshape(1,-1) 
            feats = np.expand_dims(feats, axis=2)
            label = datum[i+1]
            label = np.expand_dims(label, axis=0)  # Termina siendo (1, dimn_previa)

            if not isinstance(batch_feats, type(feats)):
                batch_feats = feats
            else:
                batch_feats = np.append(batch_feats, feats, axis=0)

            if not isinstance(batch_labels, type(label)):
                batch_labels = label
            else:
                batch_labels = np.append(batch_labels, label, axis=0)


            if batch_labels.shape[0] == batch_size:
                if not embedding:
                    batch_labels = to_categorical(itemidmap[batch_labels.flatten()], num_classes=n_items)
                    pass

                yield batch_feats, batch_labels
                
                # resume batch generation
                batch_feats = None
                batch_labels = None

        actual_session = (actual_session + 1) % len(offset_sessions)

In [4]:
batch_size = 512 #como en el paper
session_max_len = 100
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 12811
Items unicos dev: 10103
Items unicos testing: 10365
Sesiones training: 80466
Sesiones validation: 5747
Sesiones testing: 5270


In [5]:
train_fraction = 1#256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1#2

train_offset_step=train_samples_qty//batch_size
dev_offset_step=dev_samples_qty//batch_size
test_offset_step=test_samples_qty//batch_size


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [25]:
# Modelo

# ToDo: self-attention

def attention_3d_block(inputs, TIME_STEPS, SINGLE_ATTENTION_VECTOR=True):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    #a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul
    
emb_size = 50
size = emb_size
#size = emb_size if embeddingp else n_items

inputs = Input(shape=(session_max_len,1))
#emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=session_max_len)(inputs)
#drop1 = Dropout(0.25)(emb)
gru = CuDNNGRU(10, return_sequences=True)(inputs)# drop1) #
drop2 = Dropout(0.25)(gru)
attention_mul = attention_3d_block(drop2, session_max_len)
attention_mul = Flatten()(attention_mul)
predictions = Dense(n_items, activation='softmax')(attention_mul)#(drop2)
model = Model(input=inputs, output=[predictions])
#custom_loss = custom_cosine_loss(itemidmap, n_items)
# lr original es 0.0001
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

#filepath='./bast/model_checkpoint'
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = []#[checkpoint]

  from ipykernel import kernelapp as app


ValueError: Only layers of same output shape can be merged using mul mode. Layer shapes: [(None, 100, 100), (None, 10, 100)]

In [18]:
real_epoca = 1
for epoch in range(50):
    #filepath='./weights/model_{}'.format(real_epoca)
    #model.load_weights('./weights/model_{}'.format(real_epoca-1))
    #model.save_weights(filepath)
    train_generator = batch_generator(train_data, 
                                      batch_size=batch_size, 
                                  session_max_len=session_max_len,
                                      fraction=train_fraction, 
                                      offset=train_offset_step*epoch,
                                     embedding=embeddingp,
                                      n_items=n_items,
                                     itemids=itemids,
                                     itemidmap=itemidmap)
    
    dev_generator = batch_generator(dev_data, 
                                    batch_size=batch_size,
                                  session_max_len=session_max_len,
                                    fraction=dev_fraction, 
                                    offset=dev_offset_step*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                     itemidmap=itemidmap)
    
    history = model.fit_generator(train_generator,
                                steps_per_epoch=train_offset_step,
                                epochs=1,
                                validation_data=dev_generator,
                                validation_steps=dev_offset_step,
                                callbacks=callbacks_list)
    
    real_epoca += 1

Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1
Epoch 1/1


In [19]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors

# RECALL @ 10
recall_k = 20

#nbrs = NearestNeighbors(n_neighbors=recall_k, algorithm='ball_tree').fit(weights)
#distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_generator = batch_generator(test_data, 
                                  batch_size=batch_size,
                                  session_max_len=session_max_len,
                                  fraction=train_fraction, 
                                  offset=0,
                                 embedding=embeddingp,
                                  n_items=n_items,
                                 itemids=itemids,
                                 itemidmap=itemidmap)


n = 0
suma = 0
suma_baseline = 0
while True:
    try:
        test_batch = next(test_generator)
        pred = model.predict(test_batch[0]) # batch_size, n_items => 512, 37484
        

        label = test_batch[1]               

        if n%100 == 0:
            print(n)
        #print(pred.shape)
        #print(label.shape) 

        for row_idx in range(test_batch[0].shape[0]):
          #print(test_batch[0][row_idx])
          #baseline_pred = obj.recommend( str(test_batch[0][row_idx][-1]), 20 )
          pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
          label_row = label[row_idx]        #.reshape(1, -1) # 50,

          #print(pred_row.shape)
          #print(label_row.shape)

          idx1 = pred_row.argsort()[-recall_k:][::-1]
          idx2 = label_row.argsort()[-1:][::-1]

          n += 1
          #print(idx1)
          #print(idx2)
          if idx2[0] in idx1:
            suma += 1

          #if idx2[0] in baseline_pred:
          #  suma_baseline += 1

    except:
        break
print("Recall@{} epoch {}: {}".format(recall_k, epoch, suma/n))

#print("Recall@{} baseline: {}".format(recall_k, suma_baseline/n))

0
12800
25600
38400
51200
64000
76800
89600
102400
115200
128000
140800
153600
166400
Recall@20 epoch 49: 0.09473825785928144


# All train set
Recall@10 epoch 29: 0.08087340943113773
Recall@20: 0.10473194236526946

vs Hidasi

Recall @ 20 0.2177499329156604
MRR@20: 0.06513681594077811

Pruebas atencion
Baseline
Recall@20 epoch 49: 0.09473825785928144
MultAttn



# Train Set
Recall@10 epoch ..100?: 0.09546921781437126

Recall@10 epoch 14: 0.06404879908501715

Recall @20 epoch 99: 0.08705440681137724

Con session_max_len = 100:

Recall @20 epoch 9: 0.12195335890718563

Con dwell_time NO FUNCIONA BIEN. Hacer ese supuesto en este dataset no tiene sentido.
