#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V1: modelo con embedding, GRU de 1000, densa sobre one-hot encoding de items


Preliminar: Configuración entorno GPUs, Google Drive, entre otros.

In [None]:
# Manejo de Google Drive
!pip install -U -q PyDrive

from google.colab import drive, auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

drive.mount("/content/drive/")

In [None]:
# Librerias varias

!pip install gputil
!pip install humanize

In [2]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [3]:
# Configuracion GPUs
#!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

GPUs = GPU.getGPUs()
gpu = GPUs[0]

def print_gpu_info():
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize(
          psutil.virtual_memory().available), " I Proc size: "  +
          humanize.naturalsize(process.memory_info().rss))
  print("GPU RAM Free {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total \
         {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, 
                           gpu.memoryTotal))
  
print_gpu_info()

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [(x.name, x.DESCRIPTOR, x.DEVICE_TYPE_FIELD_NUMBER, x.NAME_FIELD_NUMBER, x.PHYSICAL_DEVICE_DESC_FIELD_NUMBER) for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

Gen RAM Free: 12.8 GB  I Proc size: 236.6 MB
GPU RAM Free 7686MB | Used: 419MB | Util   5% | Total          8105MB


[('/device:GPU:0',
  <google.protobuf.pyext._message.MessageDescriptor at 0x7f43b48646d0>,
  2,
  1,
  7)]

In [4]:
# Cargamos dataframes preprocesados de RSC15
PATH_TO_TRAIN = '../processedData/rsc15_train_tr.txt'
PATH_TO_DEV = '../processedData/rsc15_train_valid.txt'
PATH_TO_TEST = '../processedData/rsc15_test.txt'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [5]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None):
    """
    Generador de batches para RSC15.
    Se utiliza para entrenar, validar y testear en demanda.
        args:
            data: DataFrame pandas con info. a procesar
            batch_size: tamanho batch
            session_max_len: largo fijo para sesiones. De ser mayor, se trunca. De ser menor, left-zero-padding
            fraction: fraccion mas reciente de sesiones que se consideran para la generacion
            offset: parametro que permite salto manual de sesiones
            embedding: Booleano que indica si el modelo actual predice embeddings, o one-hot encodings
            n_items: cantidad de items unicos a predecir
            itemids: lista con los IDs unicos de items en el set de datos a considerar
            itemidmap: biyeccion de IDs en dataset a rango simple (0, n_items)
    """
    
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    
    # inner join dataframe con itemidmap
    data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna
    data.sort_values([session_key, time_key], inplace=True) # ordenamos por sesion

    length = len(data['ItemId'])
    #data.sort_values([time_key], inplace=True)
    #data = data[length-length//fraction:]
    
    # arreglo con offset acumulativo de inicio de cada sesion
    offset_sessions = np.zeros(data[session_key].nunique()+1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum() 
    #offset_sessions = offset_sessions[length-length//fraction:]
    
    actual_session = 0 + offset
    batch_feats = None
    batch_labels = None
    
    # K.reset_states(GRU_LAYER) si usamos session parallel approach

    while True:
        # la info de la sesion a considerar
        datum = data[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  
        datum = datum.values.reshape(-1,1)
        
        # data augmentation
        for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
            feats = datum[0:i+1]
            
            # truncate
            if feats.shape[0] > session_max_len:
                feats = feats[:session_max_len]
            # zero padding
            else:
                feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros
                      
            feats = feats.reshape(1,-1) # (1, 19)        
            label = datum[i+1]
            label = np.expand_dims(label, axis=0)
            
            # add to batch
            if not isinstance(batch_feats, type(feats)):
                batch_feats = feats
            else:
                batch_feats = np.append(batch_feats, feats, axis=0)

            if not isinstance(batch_labels, type(label)):
                batch_labels = label
            else:
                batch_labels = np.append(batch_labels, label, axis=0)

            # if batch is ready
            if batch_labels.shape[0] == batch_size:
                if not embedding:
                    # do one hot if necessary
                    batch_labels = to_categorical(itemidmap[batch_labels.flatten()], num_classes=n_items)
                
                # return batch
                yield batch_feats, batch_labels
                
                # resume generation
                batch_feats = None
                batch_labels = None
        
        actual_session = (actual_session + 1) % len(offset_sessions)

In [12]:
batch_size = 512
session_max_len = 19
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 37484
Items unicos dev: 6360
Items unicos testing: 6752
Sesiones training: 7953885
Sesiones validation: 12372
Sesiones testing: 15324


In [11]:
# Modelo

def custom_cosine_loss(model):
    """Funcion custom para el modelo con embedding, m4"""
    emb = model.layers[1]
    # y_pred: con embedding
    # y_true: one-hot
    def fn(y_true, y_pred):
        y_true_emb = emb.call(y_true)
        #y_true_emb = np.array([y_true], dtype='int32')
        #y_true_emb = tf.convert_to_tensor(y_true_emb)
        #y_true_emb = model.layers[0].call(y_true)
        #y_true_emb = K.get_value(y_true_emb)[0][0] # 50,
        return 1 - cosine_proximity(y_true_emb, y_pred)
    return fn
    
emb_size = 50
size = emb_size if embeddingp else n_items

inputs = Input(shape=(19,))
emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=19)(inputs)
drop1 = Dropout(0.25)(emb)
gru = CuDNNGRU(1000)(drop1)
drop2 = Dropout(0.25)(gru)
predictions = Dense(size, activation='softmax')(drop2)
model = Model(input=inputs, output=[predictions])
#custom_loss = custom_cosine_loss(model)
model.compile(loss=categorical_crossentropy, optimizer='adam')
model.summary()

filepath="model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 19)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 19, 50)            1874200   
_________________________________________________________________
dropout_3 (Dropout)          (None, 19, 50)            0         
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 1000)              3156000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 37484)             37521484  
Total params: 42,551,684
Trainable params: 42,551,684
Non-trainable params: 0
________________________________________________________________



In [13]:
train_fraction = 1 # 1/fraction: cantidad sesiones mas recientes a considerar
dev_fraction = 1

train_offset_step=45000 #*batch_size = 23MM
dev_offset_step=72      #*batch_size = 37k


aux = [0]  # 0 es token especial
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [14]:
# Training loop
for epoch in range(1, 15):
    train_generator = batch_generator(train_data, 
                                      batch_size=batch_size, 
                                      fraction=train_fraction, 
                                      offset=train_offset_step*epoch,
                                      embedding=embeddingp,
                                      n_items=n_items,
                                      itemids=itemids,
                                      itemidmap=itemidmap)
    
    dev_generator = batch_generator(dev_data, 
                                    batch_size=batch_size, 
                                    fraction=dev_fraction, 
                                    offset=dev_offset_step*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                    itemidmap=itemidmap)
    
    history = model.fit_generator(  train_generator,
                                    steps_per_epoch=train_offset_step,#15530,
                                    epochs=1,
                                    validation_data=dev_generator,
                                    validation_steps=dev_offset_step,#105,
                                    callbacks=callbacks_list)

Epoch 1/1

Epoch 00001: loss improved from inf to 9.38474, saving model to model.hdf5
Epoch 1/1
  295/45000 [..............................] - ETA: 4:15:56 - loss: 8.2129

KeyboardInterrupt: 

In [25]:
# Test performance over test set

test_generator = batch_generator(   test_data, 
                                    batch_size=batch_size, 
                                    fraction=train_fraction, 
                                    offset=0*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                    itemidmap=itemidmap)

model.evaluate_generator(   test_generator, 
                            steps=40, 
                            max_queue_size=10, 
                            workers=1, 
                            use_multiprocessing=False)

14.242373847961426

In [28]:
# Recall score

test_generator = batch_generator(test_data, 
                                  batch_size=batch_size, 
                                  fraction=train_fraction, 
                                  offset=0,
                                 embedding=embeddingp,
                                  n_items=n_items,
                                 itemids=itemids,
                                 itemidmap=itemidmap)


n = 0
suma = 0

while True:
    try:
        test_batch = next(test_generator)
        pred = model.predict(test_batch[0]) # batch_size, n_items => 512, 37484
        label = test_batch[1]               

        for row_idx in range(test_batch[0].shape[0]):
            pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
            label_row = label[row_idx]        #.reshape(1, -1) # 50,

            #print(pred_row.shape)
            #print(label_row.shape)

            idx1 = pred_row.argsort()[-20:][::-1]
            idx2 = label_row.argsort()[-1:][::-1]

            #print(idx1)
            #print(idx2)
            
            n += 1
            if idx2[0] in idx1:
                suma += 1
    except:
        break
        
print("Recall: ", suma/n)

Recall:  0.0010571961009174311
