#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico


Preliminar: Configuración entorno GPUs, Google Drive, entre otros.

In [None]:
# Manejo de Google Drive
!pip install -U -q PyDrive

from google.colab import drive, auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

drive.mount("/content/drive/")

In [None]:
# Librerias varias

!pip install gputil
!pip install humanize

In [1]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# Configuracion GPUs
#!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

GPUs = GPU.getGPUs()
gpu = GPUs[0]

def print_gpu_info():
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize(
          psutil.virtual_memory().available), " I Proc size: "  +
          humanize.naturalsize(process.memory_info().rss))
  print("GPU RAM Free {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total \
         {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, 
                           gpu.memoryTotal))
  
print_gpu_info()

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [(x.name, x.DESCRIPTOR, x.DEVICE_TYPE_FIELD_NUMBER, x.NAME_FIELD_NUMBER, x.PHYSICAL_DEVICE_DESC_FIELD_NUMBER) for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

Gen RAM Free: 14.2 GB  I Proc size: 237.6 MB
GPU RAM Free 7682MB | Used: 423MB | Util   5% | Total          8105MB


[('/device:GPU:0',
  <google.protobuf.pyext._message.MessageDescriptor at 0x7fb92aee16b0>,
  2,
  1,
  7)]

In [3]:
# Cargamos dataframes preprocesados de RSC15
PATH_TO_TRAIN = '../processedData/rsc15_train_tr.txt'
PATH_TO_DEV = '../processedData/rsc15_train_valid.txt'
PATH_TO_TEST = '../processedData/rsc15_test.txt'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [4]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None):
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    
    #if not embedding:
    #    itemidmap=itemidmap
    #else:
    #    itemids = data[item_key].unique()
    #    print(itemids.shape)
    #    n_items = len(itemids)
    #    # Mapeo desde los 37.5k a (0, 37.5k) id
    #    itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 
    
    
    data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna
    
    #print("Cantidad de samples: {}".format(len(data)//fraction))

    data.sort_values([session_key, time_key], inplace=True) # ordenamos por sesion

    #data.sort_values([time_key], inplace=True)
    length = len(data['ItemId'])
    #data = data[length-length//fraction:]
    
    offset_sessions = np.zeros(data[session_key].nunique()+1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum() # arreglo con offset acumulativo de inicio de cada sesion
    #offset_sessions = offset_sessions[length-length//fraction:]
    
    actual_session = 0 + offset
    
    batch_feats = None
    batch_labels = None
    
    # GRU_LAYER.reset_states() 

    while True:
        datum = data[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  # aqui toda la info de la sesion
        datum = datum.values.reshape(-1,1)
        
        for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
            feats = datum[0:i+1]
            if feats.shape[0] > session_max_len:
                feats = feats[:session_max_len] # aca cambiar a mas nuevos
            else:
                feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros
                      
            feats = feats.reshape(1,-1) # (1, 19)
            
            #if False:#not embedding:
            #    new_feats = np.zeros((1, session_max_len, n_items))
            #    for item_idx in range(feats[0].shape[0]):
            #        new_feats[0][item_idx][:] = to_categorical(itemidmap[feats[0][item_idx]], num_classes=n_items)
            #    feats = new_feats
                    
            label = datum[i+1]
            label = np.expand_dims(label, axis=0)  # Termina siendo (1, dimn_previa)
            
            if not embedding:
                new_label = np.zeros((1, n_items))
                new_label[0][:] = to_categorical(itemidmap[label[0][0]], num_classes=n_items)
                label = new_label
            
            if not isinstance(batch_feats, type(feats)):
                batch_feats = feats
            else:
                batch_feats = np.append(batch_feats, feats, axis=0)

            if not isinstance(batch_labels, type(label)):
                batch_labels = label #np.expand_dims(label, axis=0)
            else:
                batch_labels = np.append(batch_labels, label, axis=0)#np.expand_dims(label, axis=0), axis=0)

            if batch_labels.shape[0] == batch_size:
                #print("Yielding batch with shape {} train, {} target".format(batch_feats.shape, batch_labels.shape))
                yield batch_feats, batch_labels
                
                # resume batch generation
                batch_feats = None
                batch_labels = None
            
        # TODO: Dropout random como en el paper
        
        actual_session = (actual_session + 1) % len(offset_sessions)

In [5]:
batch_size = 512 # como en el paper
session_max_len = 19
embeddingp=True

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 37484
Items unicos dev: 6360
Items unicos testing: 6752
Sesiones training: 7953885
Sesiones validation: 12372
Sesiones testing: 15324


In [13]:
# Modelo

# ToDo:
# meterle self-attention (hay implementaciones en Keras)

def custom_cosine_loss(emb):
    # y_pred ya viene con embedding, y_true solo como one-hot
    def fn(y_true, y_pred):
        y_true_emb = emb.call(y_true)[0][0]
        print(y_true)
        print(y_true_emb)
        print(y_pred)
        #y_true_emb = np.array([y_true], dtype='int32')
        #y_true_emb = tf.convert_to_tensor(y_true_emb)
        #y_true_emb = model.layers[0].call(y_true)
        #y_true_emb = K.get_value(y_true_emb)[0][0] # 50,
        
        return 1 - cosine_proximity(y_true_emb, y_pred)
    return fn
    
emb_size = 50

model = Sequential()
emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=19)
model.add(emb)
model.add(Dropout(0.25))
model.add(CuDNNGRU(1000)) 
model.add(Dropout(0.25))
if embeddingp:
    model.add(Dense(emb_size, activation='softmax'))
    custom_loss = custom_cosine_loss(emb)  ## DUDA: Esta usando los pesos actuales?
    model.compile(loss=custom_loss, optimizer='adam')
else:
    model.add(Dense(n_items, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

filepath="model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

Tensor("dense_2_target:0", shape=(?, ?), dtype=float32)
Tensor("loss_1/dense_2_loss/strided_slice_1:0", shape=(50,), dtype=float32)
Tensor("dense_2/Softmax:0", shape=(?, 50), dtype=float32)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 19, 50)            1874200   
_________________________________________________________________
dropout_3 (Dropout)          (None, 19, 50)            0         
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 1000)              3156000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                50050     
Total params: 5,080,250
Trainable params: 5,080,250
Non-trainable params: 0
________

In [14]:
train_fraction = 1#256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1#2

train_offset_step=40000#15530
dev_offset_step=71#240


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [15]:
#todo meterle un offset de sesiones al generador para poder continuar training al cargar pesos
for epoch in range(1, 20):
    train_generator = batch_generator(train_data, 
                                      batch_size=batch_size, 
                                      fraction=train_fraction, 
                                      offset=train_offset_step*epoch,
                                     embedding=embeddingp,
                                      n_items=n_items,
                                     itemids=itemids,
                                     itemidmap=itemidmap)
    
    dev_generator = batch_generator(dev_data, 
                                    batch_size=batch_size, 
                                    fraction=dev_fraction, 
                                    offset=dev_offset_step*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                     itemidmap=itemidmap)
    
    history = model.fit_generator(train_generator,
                                steps_per_epoch=train_offset_step,#15530,
                                epochs=1,
                                validation_data=dev_generator,
                                validation_steps=dev_offset_step,#105,
                                callbacks=callbacks_list)

Epoch 1/1

Epoch 00001: loss improved from inf to 1.00000, saving model to model.hdf5
Epoch 1/1
  513/40000 [..............................] - ETA: 1:26:18 - loss: 1.0000

KeyboardInterrupt: 

In [None]:
# Test performance on test set

test_generator = batch_generator(test_data, batch_size=batch_size)
#model.load_weights('./drive/My Drive/Cursos/2018/IIC3633/model_8.h5')
model.evaluate_generator(test_generator, steps=400, max_queue_size=10, workers=1, use_multiprocessing=False)

In [None]:
# Obtencion de metricas

# Paso 1: Tomar el train set, y para cada ItemId sacar su one hot y luego su embedding. Guardar esto en una matriz
# CONCLUSION: Esto ya está tal cual en la matriz de pesos de embedding. Para sacar el de un item, basta encontrar su itemidmap y luego comparar con la columna respectiva en ella
weights = model.layers[0].get_weights()[0]
print(weights.shape)


In [None]:
# Paso 2: Dado un embedding de output desde el modelo, obtener los k=20 vectores mas cercanos en distancia sobre el espacio de embedding

from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(weights)
distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)



In [None]:
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_generator = batch_generator(test_data, batch_size=batch_size)
n = 0
suma = 0
while True:
  test_batch = next(test_generator)
  pred = model.predict(test_batch[0]) # 128, 50
  label = test_batch[1]               # 128, 1


  for row_idx in range(test_batch[0].shape[0]):
    pred_row = pred[row_idx] # 50,
    label_row = label[row_idx] # 50,

    # embedding a label
    elt = np.array([label_row], dtype='int32')
    elt = tf.convert_to_tensor(elt)
    called = model.layers[0].call(elt)
    print(called.shape)
    emb_label = K.get_value(called)[0][0] # 50,

    # ahora, comparamos distancias
    label_distances, label_indices = nbrs.kneighbors(emb_label.reshape(1, -1))
    pred_distances, pred_indices = nbrs.kneighbors(pred_row.reshape(1, -1))


    # OJO: Verificar que no ocurra que uno este sobre itemidmap y el otro sobre el rango normal
    #print(label_distances)
    #print(pred_distances)
    print(label_indices)
    print(pred_indices)
    recall = recall_score(label_indices[0], pred_indices[0], average='macro')
    print(recall)
    suma += recall
    n+=1
    
print(suma/n)

In [None]:
# Pasar params a fn

def test2(data):
  item_key = 'ItemId'
  session_key = 'SessionId'
  time_key = 'Time'

  itemids = data[item_key].unique()
  n_items = len(itemids)

  itemidmap = pd.Series(data=np.arange(n_items), index=itemids) # Mapeo desde los 37.5k a (0, 37.5k) id
  data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna

  for elt in indices[0]:
    print()
    
  for dist in distances:
    print(dist)
    
test2(train_data)

# Paso 4: Ya tenemos toda la informacion: el output y los 20 más cercanos a éste
# Paso 5: Calcular recall y MRR con librerías de manera sencilla (sklearn ofrece una, creo)

# LUEGO DE ESTO
# Si da muy mal comparado a M4 del paper, probar con 1000 hidden units.
# Si sigue mal, entonces entrenar el v1 por mucho tiempo, copiar los pesos de esa embedding, pegarlos aca, y entrenar de nuevo
# Si sigue mal, asumir pérdida por diferencia de implementación, y pasar a probar mecanismos de atención



In [None]:
# Chequeo veracidad paso 1

def test(train_data):
  item_key = 'ItemId'
  session_key = 'SessionId'
  time_key = 'Time'

  itemids = train_data[item_key].unique()
  n_items = len(itemids)

  itemidmap = pd.Series(data=np.arange(n_items), index=itemids) # Mapeo desde los 37.5k a (0, 37.5k) id
  train_data = pd.merge(train_data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna

  for iii in range(15):
    feats = np.array([train_data['ItemIdx'].unique()[iii]], dtype='int32')
    print(feats)
    if feats.shape[0] > session_max_len:
        feats = feats[:session_max_len]
    else:
        feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros
    print(feats)
    feats = tf.convert_to_tensor(feats)
    print(feats)
    print(feats.shape)
    emb_elt = K.get_value(model.layers[0].call(feats))
    print(emb_elt[-1]==weights[0][iii])
  
test(train_data)

def get_train_embs(train_data, model, emb_size):
  out = np.zeros((n_items, emb_size))
  idx = 0
  #for name, values in train_data.iteritems():
  #  if name=='ItemId':
  #for elt in values:
  for elt_idx in range(len(train_data['ItemId'].unique())):
    if elt_idx % 1000 == 0:
      print(elt_idx)
    elt = np.array([train_data['ItemId'].unique()[elt_idx]], dtype='int32')
    elt = tf.convert_to_tensor(elt)
    emb_elt = K.get_value(model.layers[0].call(elt))
    print(emb_elt)
    out[idx, :] = emb_elt
    idx += 1
  print(out.shape)
  return out

emb_items = get_train_embs(train_data, model, emb_size)