#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico


Preliminar: Configuración entorno GPUs, Google Drive, entre otros.

In [0]:
# Manejo de Google Drive
!pip install -U -q PyDrive

from google.colab import drive, auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

drive.mount("/content/drive/")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive/


In [0]:
# Librerias varias

!pip install gputil
!pip install humanize

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/45/99/837428d26b47ebd6b66d6e1b180e98ec4a557767a93a81a02ea9d6242611/GPUtil-1.3.0.tar.gz
Building wheels for collected packages: gputil
  Running setup.py bdist_wheel for gputil ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/17/0f/04/b79c006972335e35472c0b835ed52bfc0815258d409f560108
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.3.0
Collecting humanize
  Downloading https://files.pythonhosted.org/packages/8c/e0/e512e4ac6d091fc990bbe13f9e0378f34cf6eecd1c6c268c9e598dcf5bb9/humanize-0.5.1.tar.gz
Building wheels for collected packages: humanize
  Running setup.py bdist_wheel for humanize ... [?25l- done
[?25h  Stored in directory: /root/.cache/pip/wheels/69/86/6c/f8b8593bc273ec4b0c653d3827f7482bb2001a2781a73b7f44
Successfully built humanize
Installing collected packages: humanize
Successfully installed humanize-0.5.1


In [1]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# Configuracion GPUs
#!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

GPUs = GPU.getGPUs()
gpu = GPUs[0]

def print_gpu_info():
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: " + humanize.naturalsize(
          psutil.virtual_memory().available), " I Proc size: "  +
          humanize.naturalsize(process.memory_info().rss))
  print("GPU RAM Free {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total \
         {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, 
                           gpu.memoryTotal))
  
print_gpu_info()

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [(x.name, x.DESCRIPTOR, x.DEVICE_TYPE_FIELD_NUMBER, x.NAME_FIELD_NUMBER, x.PHYSICAL_DEVICE_DESC_FIELD_NUMBER) for x in local_device_protos if x.device_type == 'GPU']

get_available_gpus()

Gen RAM Free: 12.8 GB  I Proc size: 237.2 MB
GPU RAM Free 7493MB | Used: 612MB | Util   8% | Total          8105MB


[('/device:GPU:0',
  <google.protobuf.pyext._message.MessageDescriptor at 0x7ff7f70646b0>,
  2,
  1,
  7)]

In [3]:
# Cargamos dataframes preprocesados de RSC15
PATH_TO_TRAIN = '../processedData/rsc15_train_tr.txt'
PATH_TO_DEV = '../processedData/rsc15_train_valid.txt'
PATH_TO_TEST = '../processedData/rsc15_test.txt'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [9]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None, aug = True):
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'
    
    data = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna
    
    #print("Cantidad de samples: {}".format(len(data)//fraction))

    data.sort_values([session_key, time_key], inplace=True) # ordenamos por sesion

    #data.sort_values([time_key], inplace=True)
    length = len(data['ItemId'])
    #data = data[length-length//fraction:]
    
    offset_sessions = np.zeros(data[session_key].nunique()+1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum() # arreglo con offset acumulativo de inicio de cada sesion
    #offset_sessions = offset_sessions[length-length//fraction:]
    
    actual_session = 0 + offset
    
    batch_feats = None
    batch_labels = None
    # GRU_LAYER.reset_states() si usamos session parallel

    while True:
      datum = data[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  # aqui toda la info de la sesion
      datum = datum.values.reshape(-1,1)           
      for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
        if not aug:
          if (i != offset_sessions[actual_session+1]-offset_sessions[actual_session]-2):
            continue
        feats = datum[0:i+1]
   
        if feats.shape[0] > session_max_len:
            feats = feats[:session_max_len] # aca cambiar a mas nuevos
        else:
            feats = np.append(np.zeros((session_max_len-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros

        feats = feats.reshape(1,-1) # (1, 19)

        label = datum[i+1]
        label = np.expand_dims(label, axis=0)  # Termina siendo (1, dimn_previa)

        if not isinstance(batch_feats, type(feats)):
            batch_feats = feats
        else:
            if batch_labels.shape[0] < batch_size:
                batch_feats = np.append(batch_feats, feats, axis=0)

        if not isinstance(batch_labels, type(label)):
            batch_labels = label
        else:
            if batch_labels.shape[0] < batch_size:
                batch_labels = np.append(batch_labels, label, axis=0)

        #print(batch_feats)
        #print(batch_labels)
        
        #if batch_labels.shape[0] == batch_size:
        #if not embedding:
            # batch_labels.shape = (batch_size, 1)
            #new_labels = np.zeros((batch_size, n_items))
            #new_labels[0][:] = to_categorical(itemidmap[label[0][0]], num_classes=n_items)
        #    batch_labels = to_categorical(itemidmap[batch_labels.flatten()], num_classes=n_items)
            #print("Yielding batch with shape {} train, {} target".format(batch_feats.shape, batch_labels.shape))
            #pass

        #yield batch_feats, batch_labels
        # resume batch generation
        #batch_feats = None
        #batch_labels = None

    # TODO: Dropout random como en el paper

    actual_session = (actual_session + 1) % len(offset_sessions)
    print("Yielding batch with shape {} train, {} target".format(batch_feats.shape, batch_labels.shape))
    yield batch_feats, batch_labels
    # resume batch generation
    batch_feats = None
    batch_labels = None

      


In [5]:
batch_size = 512 #como en el paper
session_max_len = 19
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 37484
Items unicos dev: 6360
Items unicos testing: 6752
Sesiones training: 7953885
Sesiones validation: 12372
Sesiones testing: 15324


In [6]:
train_fraction = 1#256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1#2

train_offset_step=35000#40000#15530
dev_offset_step=65#240


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [7]:
# Modelo

# ToDo:
# meterle self-attention (hay implementaciones en Keras)

def custom_cosine_loss(itemidmap, n_items):
    #emb = model.layers[1]
    emb = itemidmap
    nu_items = n_items
    # y_pred ya viene con embedding, y_true solo como one-hot
    def fn(y_true, y_pred):
        #print(y_true.shape, y_pred.shape)
        y_pred_emb = to_categorical(emb[y_pred], num_classes=nu_items)
        #print(y_true_emb)
        #y_pred_emb = emb.call(y_pred)

    #y_true_emb = np.array([y_true], dtype='int32')
    #y_true_emb = tf.convert_to_tensor(y_true_emb)
    #y_true_emb = model.layers[0].call(y_true)
    #y_true_emb = K.get_value(y_true_emb)[0][0] # 50,

        return 1 - cosine_proximity(y_true, y_pred_emb)
        #return cosine_proximity(y_true_emb, y_pred_emb)
    return fn
    
emb_size = 50
size = emb_size
#size = emb_size if embeddingp else n_items

"""
model = Sequential()
emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=19)
model.add(emb)
model.add(Dropout(0.25))
model.add(CuDNNGRU(1000)) 
model.add(Dropout(0.25))
if embeddingp:
    model.add(Dense(emb_size, activation='softmax'))
    custom_loss = custom_cosine_loss(emb)  ## DUDA: Esta usando los pesos actuales?
    model.compile(loss=custom_loss, optimizer='adam')
else:
    model.add(Dense(n_items, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()
"""

inputs = Input(shape=(19,))
emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=19)(inputs)
drop1 = Dropout(0.25)(emb)
gru = CuDNNGRU(100)(drop1)
drop2 = Dropout(0.25)(gru)
predictions = Dense(n_items, activation='softmax')(drop2)
model = Model(input=inputs, output=[predictions])
custom_loss = custom_cosine_loss(itemidmap, n_items)
opt = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

filepath='./OneSessionPerBatch/model_checkpoint'
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 19)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 19, 50)            1874200   
_________________________________________________________________
dropout_1 (Dropout)          (None, 19, 50)            0         
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (None, 100)               45600     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 37484)             3785884   
Total params: 5,705,684
Trainable params: 5,705,684
Non-trainable params: 0
_________________________________________________________________




In [10]:
real_epoca = 0
for epoch in range(1):
    try:
        model.load_weights('./bast/model_{}'.format(real_epoca-1))
    except:
        pass
    train_generator = batch_generator(train_data, 
                                      batch_size=batch_size, 
                                      fraction=train_fraction, 
                                      offset=train_offset_step*epoch,
                                     embedding=embeddingp,
                                      n_items=n_items,
                                     itemids=itemids,
                                     itemidmap=itemidmap)
    
    dev_generator = batch_generator(dev_data, 
                                    batch_size=batch_size, 
                                    fraction=dev_fraction, 
                                    offset=dev_offset_step*epoch,
                                    embedding=embeddingp,
                                    n_items=n_items,
                                    itemids=itemids,
                                     itemidmap=itemidmap)
    
    history = model.fit_generator(train_generator,
                                steps_per_epoch=train_offset_step,#15530,
                                epochs=1,
                                validation_data=dev_generator,
                                validation_steps=dev_offset_step,#105,
                                callbacks=callbacks_list)
    
    model.save_weights(filepath)
    
    weights = model.layers[1].get_weights()[0]
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(weights)
    distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
    # Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
    from sklearn.metrics import recall_score

    test_generator = batch_generator(test_data, 
                                      batch_size=batch_size, 
                                      fraction=train_fraction, 
                                      offset=0,
                                     embedding=embeddingp,
                                      n_items=n_items,
                                     itemids=itemids,
                                     itemidmap=itemidmap)


    n = 0
    suma = 0
    while True:
          try:
            test_batch = next(test_generator)
            pred = model.predict(test_batch[0]) # batch_size, n_items => 512, 37484
            label = test_batch[1]               

            #print(pred.shape)
            #print(label.shape) 

            for row_idx in range(test_batch[0].shape[0]):
              pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
              label_row = label[row_idx]        #.reshape(1, -1) # 50,

              #print(pred_row.shape)
              #print(label_row.shape)

              idx1 = pred_row.argsort()[-20:][::-1]
              idx2 = label_row.argsort()[-1:][::-1]

              n += 1
              #print(idx1)
              #print(idx2)
              if idx2[0] in idx1:
                suma += 1

          except:
            break
    print("Recall epoch {}: {}".format(epoch, suma/n))
    real_epoca += 1

Epoch 1/1


KeyboardInterrupt: 

In [19]:
filepath='./OneSessionPerBatch/model_{}'.format(epoch)
model.save_weights(filepath)

Started @ 9.2

En 8.01 se puso lento... ojo

Goin up 7.8464

Epoch time: 2:00 aprox.

In [20]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors
nbrs = NearestNeighbors(n_neighbors=20, algorithm='ball_tree').fit(weights)
distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_generator = batch_generator(test_data, 
                                  batch_size=batch_size, 
                                  fraction=train_fraction, 
                                  offset=0,
                                 embedding=embeddingp,
                                  n_items=n_items,
                                 itemids=itemids,
                                 itemidmap=itemidmap)


n = 0
suma = 0
while True:
  try:
    test_batch = next(test_generator)
    pred = model.predict(test_batch[0]) # batch_size, n_items => 512, 37484
    label = test_batch[1]               

    #print(pred.shape)
    #print(label.shape) 

    for row_idx in range(test_batch[0].shape[0]):
      pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
      label_row = label[row_idx]        #.reshape(1, -1) # 50,

      #print(pred_row.shape)
      #print(label_row.shape)

      idx1 = pred_row.argsort()[-20:][::-1]
      idx2 = label_row.argsort()[-1:][::-1]

      n += 1
      #print(idx1)
      #print(idx2)
      if idx2[0] in idx1:
        suma += 1

  except:
    break
print("Recall epoch {}: {}".format(epoch, suma/n))

Recall epoch 0: 0.009550602064220183


Recall epoch 0: 0.0005554759174311927 # seste se disparo y lo corte

Recall epoch 0: 0.009550602064220183 # primera con lr mas bajo
Recall epoch 1: 0.009084719036697247

# TODO

## Batcher solo entrega una sesion a la vez, si no se pierde independencia