# **Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft attention


Preliminar: configuración del entorno (GPUs, Google Drive, entre otros).

In [1]:
import os
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
# Ask TensorFlow to grow its GPU allocation on demand rather than up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# A ConfigProto only takes effect through the session created with it, so
# register such a session as the one Keras will use for the model cells.
import keras
keras.backend.set_session(tf.Session(config=config))

In [2]:
# GPU configuration
# Shell escape: symlink nvidia-smi into /usr/bin. In the recorded run this
# fails with "Permission denied" and the rest of the cell still executes.
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi

# GPUtil's device list; the first entry is kept in the global `gpu`, which
# print_gpu_info() reads.
# NOTE(review): indexing [0] presumably raises IndexError when no GPU is
# reported - confirm what getGPUs() returns on a CPU-only machine.
GPUs = GPU.getGPUs()
gpu = GPUs[0]

def print_gpu_info():
  """Print host memory usage and the memory stats of the global ``gpu``.

  First line: available system RAM and the resident size of this process.
  Second line: free / used / utilisation / total memory read from the
  module-level ``gpu`` object (the first GPUtil device).

  Both lines are built as single strings. The previous version continued
  the format string with a backslash, which embedded the source
  indentation in the output ("Total          8105MB"), and passed two
  arguments to ``print`` with an ``I`` where the second line uses ``|``.
  """
  process = psutil.Process(os.getpid())
  print("Gen RAM Free: {} | Proc size: {}".format(
      humanize.naturalsize(psutil.virtual_memory().available),
      humanize.naturalsize(process.memory_info().rss)))
  # Adjacent string literals are concatenated without extra whitespace.
  print("GPU RAM Free {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | "
        "Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed,
                                 gpu.memoryUtil*100, gpu.memoryTotal))

print_gpu_info()

def get_available_gpus():
    """List the local TensorFlow devices whose ``device_type`` is ``'GPU'``.

    Returns one 5-tuple per GPU: the device name followed by the entry's
    ``DESCRIPTOR``, ``DEVICE_TYPE_FIELD_NUMBER``, ``NAME_FIELD_NUMBER`` and
    ``PHYSICAL_DEVICE_DESC_FIELD_NUMBER`` attributes.

    NOTE(review): in the recorded output the last three print as the plain
    integers 2, 1 and 7 and the descriptor as a MessageDescriptor object, so
    they look like message-level constants rather than per-device values -
    confirm whether ``device_type`` / ``physical_device_desc`` were intended.
    """
    gpu_entries = []
    for device in device_lib.list_local_devices():
        if device.device_type != 'GPU':
            continue
        gpu_entries.append((
            device.name,
            device.DESCRIPTOR,
            device.DEVICE_TYPE_FIELD_NUMBER,
            device.NAME_FIELD_NUMBER,
            device.PHYSICAL_DEVICE_DESC_FIELD_NUMBER,
        ))
    return gpu_entries

get_available_gpus()

ln: cannot remove '/usr/bin/nvidia-smi': Permission denied
Gen RAM Free: 14.8 GB  I Proc size: 237.3 MB
GPU RAM Free 7805MB | Used: 300MB | Util   4% | Total          8105MB


[('/device:GPU:0',
  <google.protobuf.pyext._message.MessageDescriptor at 0x7feb140636b0>,
  2,
  1,
  7)]

In [3]:
# Keras

import sys
import keras
from keras.models import Sequential
from keras.layers.normalization import BatchNormalization
from keras.layers import SimpleRNN, Dense, Flatten, Dropout, TimeDistributed, LSTM
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [4]:
# Load the preprocessed RSC15 dataframes (tab-separated, ItemId read as int64).
# PATH_TO_TEST points at the validation split, which is loaded into `valid`.
PATH_TO_TRAIN = '../processedData/rsc15_train_tr.txt'
PATH_TO_TEST = '../processedData/rsc15_train_valid.txt'

data, valid = (
    pd.read_csv(path, sep='\t', dtype={'ItemId': np.int64})
    for path in (PATH_TO_TRAIN, PATH_TO_TEST)
)

In [5]:
# Scratch experiments (disabled): the statements below are wrapped in a
# string literal, so nothing in them runs; the cell only echoes the string.
# They sketch the item-id -> index mapping and the per-session offsets that
# batch_generator() builds.
"""
item_key = 'ItemId'
session_key = 'SessionId'
time_key = 'Time'

itemids = data[item_key].unique()
n_items = len(itemids)

itemidmap = pd.Series(data=np.arange(n_items), index=itemids) # Mapeo desde los 37.5k a (0, 37.5k) id
mdata = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna

mdata.sort_values([session_key, time_key], inplace=True) # ordenamos por sesion
offset_sessions = np.zeros(mdata[session_key].nunique()+1, dtype=np.int32)
offset_sessions[1:] = mdata.groupby(session_key).size().cumsum() # arreglo con offset acumulativo de inicio de cada sesion
actual_session = 0
"""

"\nitem_key = 'ItemId'\nsession_key = 'SessionId'\ntime_key = 'Time'\n\nitemids = data[item_key].unique()\nn_items = len(itemids)\n\nitemidmap = pd.Series(data=np.arange(n_items), index=itemids) # Mapeo desde los 37.5k a (0, 37.5k) id\nmdata = pd.merge(data, pd.DataFrame({item_key:itemids, 'ItemIdx':itemidmap[itemids].values}), on=item_key, how='inner') # agrego esa columna\n\nmdata.sort_values([session_key, time_key], inplace=True) # ordenamos por sesion\noffset_sessions = np.zeros(mdata[session_key].nunique()+1, dtype=np.int32)\noffset_sessions[1:] = mdata.groupby(session_key).size().cumsum() # arreglo con offset acumulativo de inicio de cada sesion\nactual_session = 0\n"

In [6]:
#mdata.head()

In [7]:
#list(mdata[mdata['ItemId']==214536500]['ItemIdx'])[0]
#print(mdata[mdata['ItemId']==214652220]['ItemIdx'].unique()[0])
#print(mdata[mdata['ItemId']==214652220]['ItemIdx'].unique()[0])

In [8]:
# Scratch experiments (disabled): the statements below are wrapped in a
# string literal, so nothing in them runs; the cell only echoes the string.
# They are an earlier draft of the batching loop in batch_generator() and
# refer to names (mdata, offset_sessions, item_key, n_items) that only the
# other disabled scratch cell would define.
"""
actual_session = 0

batch_size = 128
batch_feats = None
batch_labels = None

# Entrega tensores de shape (batch_size, n_items+1, 19)

while True:
    datum = mdata[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  # aqui toda la info de la sesion
    datum = datum.values.reshape(-1,1)
    
    for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):
        
        feats = datum[0:i+1]
        if feats.shape[0] > 19:
            feats = feats[:19]
        else:
            feats = np.append(np.zeros((19-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros

        encoded_feats = None
        for elt in feats:
            if elt == 0:
                idx = n_items
            else:
                idx = mdata[mdata['ItemId']==elt]['ItemIdx'].unique()[0]
            encoded = to_categorical(idx, num_classes=n_items+1)
            encoded = encoded.reshape(1, -1)
            
            if not isinstance(encoded_feats, type(feats)):
                encoded_feats = encoded
            else:
                encoded_feats = np.append(encoded_feats, encoded, axis=0) # Termina siendo de (19, n_items)
                
        label_idx = mdata[mdata['ItemId']==datum[i+1][0]]['ItemIdx'].unique()[0]
        label = to_categorical(label_idx, num_classes=n_items+1)
        label = np.expand_dims(label, axis=0)  # Termina siendo (1, n_items)

        if not isinstance(batch_feats, type(feats)):
            batch_feats = np.expand_dims(encoded_feats, axis=0)
        else:
            batch_feats = np.append(batch_feats, np.expand_dims(encoded_feats, axis=0), axis=0)

        if not isinstance(batch_labels, type(label)):
            batch_labels = np.expand_dims(label, axis=0)
        else:
            batch_labels = np.append(batch_labels, np.expand_dims(label, axis=0), axis=0)

        if batch_labels.shape[0] == batch_size:
            print(batch_feats.shape)
            print(batch_labels.shape)
            break
    
    if batch_labels.shape[0] == batch_size:
        break
        
    actual_session += 1
"""

"\nactual_session = 0\n\nbatch_size = 128\nbatch_feats = None\nbatch_labels = None\n\n# Entrega tensores de shape (batch_size, n_items+1, 19)\n\nwhile True:\n    datum = mdata[offset_sessions[actual_session]:offset_sessions[actual_session+1]][item_key]  # aqui toda la info de la sesion\n    datum = datum.values.reshape(-1,1)\n    \n    for i in range(offset_sessions[actual_session+1]-offset_sessions[actual_session]-1):\n        \n        feats = datum[0:i+1]\n        if feats.shape[0] > 19:\n            feats = feats[:19]\n        else:\n            feats = np.append(np.zeros((19-feats.shape[0],1), dtype=np.int8), feats) # left pad with zeros\n\n        encoded_feats = None\n        for elt in feats:\n            if elt == 0:\n                idx = n_items\n            else:\n                idx = mdata[mdata['ItemId']==elt]['ItemIdx'].unique()[0]\n            encoded = to_categorical(idx, num_classes=n_items+1)\n            encoded = encoded.reshape(1, -1)\n            \n            i

In [9]:
def _one_hot(indices, n_classes):
    """Return a float32 one-hot encoding of ``indices`` along a new last axis."""
    indices = np.asarray(indices, dtype=np.int64)
    encoded = np.zeros(indices.shape + (n_classes,), dtype=np.float32)
    # A freshly allocated array is contiguous, so this 2-D reshape is a view
    # and the assignment writes straight into ``encoded``.
    encoded.reshape(-1, n_classes)[np.arange(indices.size), indices.ravel()] = 1.0
    return encoded


def batch_generator(data, batch_size=128, session_max_len=19, item_ids=None):
    """Endlessly yield one-hot (features, labels) batches for next-item prediction.

    Every session is expanded into one sample per prefix: the items seen so
    far are the features and the following item is the label. Samples are
    accumulated across session boundaries until ``batch_size`` is reached, and
    the sessions are cycled forever.

    Parameters
    ----------
    data : pandas.DataFrame
        Click log with the columns ``SessionId``, ``Time`` and ``ItemId``.
        The frame is not modified.
    batch_size : int
        Number of samples per yielded batch.
    session_max_len : int
        Length of the feature window. Shorter prefixes are left-padded with
        the padding class; longer ones keep their most recent items.
    item_ids : array-like, optional
        Item ids that define the one-hot vocabulary, in index order. Defaults
        to the unique items of ``data`` in order of appearance. Pass the
        training items when building a validation generator so both produce
        tensors of the same width; rows whose item is not in the vocabulary
        are dropped.

    Yields
    ------
    (features, labels) : tuple of numpy.ndarray (float32)
        ``features`` has shape ``(batch_size, session_max_len, n_items + 1)``
        and ``labels`` has shape ``(batch_size, n_items + 1)``, where
        ``n_items`` is the vocabulary size and index ``n_items`` is padding.

    Raises
    ------
    ValueError
        If ``batch_size`` or ``session_max_len`` is smaller than 1, or if no
        session has at least two events (no sample could ever be produced).
    """
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'

    if batch_size < 1 or session_max_len < 1:
        raise ValueError("batch_size and session_max_len must be >= 1")

    # Vocabulary: item id -> contiguous index in [0, n_items).
    vocabulary = pd.Index(pd.unique(
        data[item_key] if item_ids is None else np.asarray(item_ids)))
    n_items = len(vocabulary)
    pad_idx = n_items          # extra class reserved for left padding
    n_classes = n_items + 1

    # Resolve every event to its index once, instead of scanning the frame for
    # each element of each sample. get_indexer returns -1 for unknown items.
    events = data[[session_key, time_key, item_key]].copy()
    events['ItemIdx'] = vocabulary.get_indexer(events[item_key])
    events = events[events['ItemIdx'] >= 0].sort_values([session_key, time_key])

    item_idx = events['ItemIdx'].values
    # groupby sorts by session id, which matches the row order after sorting,
    # so the cumulative sizes are the start offsets of consecutive sessions.
    session_sizes = events.groupby(session_key).size().values
    offset_sessions = np.concatenate(([0], np.cumsum(session_sizes)))
    n_sessions = len(session_sizes)

    if not (session_sizes > 1).any():
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("data has no session with at least two events")

    batch_feats = []
    batch_labels = []
    actual_session = 0

    while True:
        session = item_idx[offset_sessions[actual_session]:offset_sessions[actual_session + 1]]

        for i in range(len(session) - 1):
            # Keep the most recent items; taking the first session_max_len
            # ones would give every longer prefix the same input.
            window = session[max(0, i + 1 - session_max_len):i + 1]
            feats = np.full(session_max_len, pad_idx, dtype=np.int64)
            feats[session_max_len - len(window):] = window  # left pad

            batch_feats.append(feats)
            batch_labels.append(session[i + 1])

            if len(batch_labels) == batch_size:
                yield (_one_hot(np.stack(batch_feats), n_classes),
                       _one_hot(np.asarray(batch_labels), n_classes))

                # start collecting the next batch
                batch_feats = []
                batch_labels = []

        # TODO: Dropout

        # There are n_sessions sessions but n_sessions + 1 offsets; wrapping on
        # the number of offsets would index past the end of the array.
        actual_session = (actual_session + 1) % n_sessions

In [10]:
# Training hyper-parameters shared by the model and generator cells.
batch_size = 128  # 512
session_max_len = 19

# One-hot width: the distinct training items plus one extra class.
n_items = data['ItemId'].nunique() + 1
print(n_items)

37484


In [12]:
# Model
# TODO: re-implement the paper
# TODO: add self-attention (Keras implementations exist)

import warnings
#warnings.filterwarnings("ignore")

from keras.models import Sequential
from keras.layers import Bidirectional, Dense, Dropout, CuDNNGRU, GRU, Embedding, Flatten
from keras.callbacks import ModelCheckpoint

model = Sequential()

#model.add(Embedding(n_items, 50, input_length=19)) # input_length=19,
#model.add(Dropout(0.25))
# The window length comes from session_max_len instead of a repeated literal,
# so the model input and the configured window cannot drift apart.
# Try 100 and 1000 units once embeddings are in place; without them the run
# fails for lack of memory.
model.add(CuDNNGRU(10, input_shape=(session_max_len, n_items)))
model.add(Dropout(0.25))
model.add(Dense(n_items, activation='softmax'))
#model.add(Dropout(0.2)) # Try this later
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()

# Keep only the weights with the lowest training loss seen so far.
filepath="model.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnngru_1 (CuDNNGRU)       (None, 10)                1124880   
_________________________________________________________________
dropout_2 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 37484)             412324    
Total params: 1,537,204
Trainable params: 1,537,204
Non-trainable params: 0
_________________________________________________________________


In [13]:
# Build the training and validation batch streams; nothing is read from the
# frames until the first batch is requested.
# NOTE(review): called like this, each generator derives its one-hot width
# from the items of its own frame, so validation batches do not have the
# width the model was built for (see the ValueError raised by fit_generator).
train_generator, dev_generator = (
    batch_generator(frame, batch_size=batch_size) for frame in (data, valid)
)

In [14]:
# Fit from the generators: 10 epochs of 500 training batches each, with 35
# batches drawn from dev_generator for validation; callbacks_list holds the
# checkpoint callback.
# NOTE(review): the recorded run stops in epoch 1 with a ValueError - the
# model expects inputs of shape (19, 37484) but receives (19, 6360).
history = model.fit_generator(train_generator,
                              steps_per_epoch=500,
                              epochs=10,
                              validation_data=dev_generator,
                              validation_steps=35,
                              callbacks=callbacks_list)

Epoch 1/10

ValueError: Error when checking input: expected cu_dnngru_1_input to have shape (19, 37484) but got array with shape (19, 6360)

In [0]:
# Upload the checkpoint written by ModelCheckpoint to Google Drive.
# The handle was previously bound to `insfile` but used as `file`, and the
# local path said 'models.hdf5' while the checkpoint is saved as 'model.hdf5'.
# NOTE(review): `drive` is not defined in the cells shown here - it presumably
# comes from a Drive authentication cell; confirm it runs before this one.
model_file = drive.CreateFile({'title': 'model.hdf5'})
model_file.SetContentFile('./model.hdf5')
model_file.Upload()

In [0]:
3   10.5315
4   10.5314
8   10.5305
13  10.5290
21  10.5265
62  10.4782
74  10.4348
101 10.3140
114 10.2392
165  9.9084 x
174  9.8366
287  9.3494
499  8.8895