# **Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft attention

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico


Preliminar: Configuración entorno GPUs, Google Drive, entre otros.

In [1]:
import os
import sys
import subprocess
import math
import torch
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import pyreclab
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
# Build a TF1-style session configuration with on-demand GPU memory growth.
# NOTE(review): `config` is not passed to any session in the visible code, so
# this setting presumably has no effect unless it is applied elsewhere (for
# example through a session registered with the Keras backend) — TODO confirm.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# Load the preprocessed RSC15 splits: tab-separated files whose ItemId column
# is read as int64.
PATH_TO_TRAIN = './data/train.csv'
PATH_TO_DEV = './data/dev.csv'
PATH_TO_TEST = './data/test.csv'

train_data, dev_data, test_data = (
    pd.read_csv(csv_path, sep='\t', dtype={'ItemId': np.int64})
    for csv_path in (PATH_TO_TRAIN, PATH_TO_DEV, PATH_TO_TEST)
)

In [3]:
class SessionDataset:
    """Click log ordered by session and time, with item indices and session offsets.

    After construction:
        df: copy of the input frame sorted by (session_key, time_key), with an
            extra ``item_idx`` column and a fresh 0..n-1 index.
        itemmap: frame mapping ``item_key`` values to ``item_idx`` values.
        click_offsets: row position where each session starts, plus a final
            entry equal to the number of rows.
        session_idx_arr: order in which sessions should be visited.
    """

    def __init__(self, data, sep='\t', session_key='SessionId', item_key='ItemId', time_key='Time', n_samples=-1, itemmap=None, time_sort=False):
        """
        Args:
            data (pd.DataFrame): click log, one row per click. It is not modified.
            sep: unused; kept so existing call sites keep working.
            session_key, item_key, time_key: column names of the session id,
                item id and click time.
            n_samples: unused; kept so existing call sites keep working.
            itemmap (pd.DataFrame): optional mapping with columns ``item_key``
                and ``item_idx``. Built from ``data`` when None. Rows whose item
                is missing from the mapping are dropped by the inner merge.
            time_sort: if True, sessions are visited by ascending start time
                instead of by ascending session id.
        """
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort

        sort_keys = [session_key, time_key]
        # Sort a copy: the caller's frame must not be reordered as a side effect.
        # Sorting before building the item map keeps index assignment in
        # session/time order of first appearance.
        self.df = data.sort_values(sort_keys)
        self.add_item_indices(itemmap=itemmap)
        # The offsets below are row positions, so rows of one session must be
        # contiguous and time-ordered. pd.merge does not guarantee the left row
        # order on every pandas version, hence the explicit re-sort.
        self.df = self.df.sort_values(sort_keys).reset_index(drop=True)

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()

    def get_click_offsets(self):
        """
        Return the row offset of the first click of every session.

        The array has one entry per session plus a trailing entry holding the
        total number of rows, so session ``i`` spans
        ``offsets[i]:offsets[i + 1]``.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # groupby orders groups by session id, matching the row order of self.df
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets

    def order_session_idx(self):
        """Return session positions in visiting order (by start time or by id)."""
        if self.time_sort:
            # earliest click time of each session, in session-id order
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr

    def add_item_indices(self, itemmap=None):
        """
        Add an ``item_idx`` column to ``self.df`` and store the mapping used.

        Args:
            itemmap (pd.DataFrame): mapping between item ids and indices. When
                None, indices 0..n-1 are assigned in order of first appearance
                in ``self.df``.
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key: item_ids,
                                    'item_idx': item2idx[item_ids].values})

        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')

    @property
    def items(self):
        """Unique item ids present in the item map."""
        # Look the column up by the configured key rather than a fixed name.
        return self.itemmap[self.item_key].unique()
        

class SessionDataLoader:
    """Iterable that yields session-parallel mini-batches from a session dataset."""

    def __init__(self, dataset, batch_size=50):
        """
        Args:
             dataset (SessionDataset): provides ``df`` (with an ``item_idx``
                 column), ``click_offsets`` and ``session_idx_arr``.
             batch_size (int): number of sessions processed in parallel. The
                 dataset needs at least this many sessions, otherwise indexing
                 the offsets fails.
        """
        self.dataset = dataset
        self.batch_size = batch_size

    def __iter__(self):
        """Yield one (input, target, mask) triple per parallel click step.

        Also sets ``self.n_items`` to the number of distinct item ids in the
        dataset plus one.

        Yields:
            input (B,): numpy array with the item index of the current click of
                each of the B parallel sessions.
            target (B,): numpy array with the item index of the next click of
                the same sessions.
            mask: batch positions whose session was replaced by a new one just
                before this step (an empty list until the first replacement).
        """
        df = self.dataset.df
        # Use the dataset's configured item column; fall back to the historical
        # default for dataset objects that do not expose it.
        item_key = getattr(self.dataset, 'item_key', 'ItemId')
        self.n_items = df[item_key].nunique() + 1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr
        item_indices = df.item_idx.values

        iters = np.arange(self.batch_size)  # session currently held by each slot
        maxiter = iters.max()               # highest session position handed out
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []  # slots whose session was just replaced
        finished = False

        while not finished:
            # All slots can advance together until the shortest session ends.
            minlen = (end - start).min()
            idx_target = item_indices[start]
            for i in range(minlen - 1):
                idx_input = idx_target
                idx_target = item_indices[start + i + 1]
                yield idx_input, idx_target, mask

            # Move every slot to the last click it consumed as an input target.
            start = start + (minlen - 1)
            # Slots with no remaining (input, target) pair get a new session.
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    # No sessions left to fill the batch: stop iterating.
                    finished = True
                    break
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [11]:
def batch_generator(data, batch_size=128, session_max_len=19, fraction=1, offset=0, embedding=True, n_items=None, itemids=None, itemidmap=None, aug = True, repeat=False):
    """Yield (features, labels) batches of next-item prediction samples.

    Every session contributes one sample per click prefix (or only its longest
    prefix when ``aug`` is False): the prefix is the feature sequence and the
    click that follows it is the label. Only full batches are yielded; a
    trailing partial batch is dropped when the generator ends.

    Args:
        data (pd.DataFrame): click log with 'SessionId', 'ItemId' and 'Time'
            columns. Rows whose item is not in ``itemids`` are discarded.
        batch_size (int): number of samples per yielded batch.
        session_max_len (int): length of every feature sequence. Shorter
            prefixes are left-padded with zeros; longer ones keep their FIRST
            ``session_max_len`` clicks.
        fraction: unused; kept so existing call sites keep working.
        offset (int): position of the first session to process.
        embedding (bool): if True, labels are raw item ids with shape
            (batch_size, 1); if False they are one-hot encoded to
            (batch_size, n_items) through ``itemidmap``.
        n_items (int): number of classes for the one-hot labels.
        itemids (np.ndarray): item ids to keep.
        itemidmap (pd.Series): item id -> item index.
        aug (bool): whether to emit every prefix of a session (True) or only
            the longest one (False).
        repeat (bool): if False (default) the generator ends after the last
            session; if True it wraps around to the first session forever.

    Yields:
        batch_feats: array of shape (batch_size, session_max_len, 1) holding
            raw item ids.
        batch_labels: see ``embedding``.
    """
    item_key = 'ItemId'
    session_key = 'SessionId'
    time_key = 'Time'

    # Attach the item index column; the inner join drops unknown items.
    known_items = pd.DataFrame({item_key: itemids, 'ItemIdx': itemidmap[itemids].values})
    data = pd.merge(data, known_items, on=item_key, how='inner')
    data = data.sort_values([session_key, time_key])

    # offset_sessions[i]:offset_sessions[i + 1] is the row range of session i.
    offset_sessions = np.zeros(data[session_key].nunique() + 1, dtype=np.int32)
    offset_sessions[1:] = data.groupby(session_key).size().cumsum()
    n_sessions = len(offset_sessions) - 1
    item_values = data[item_key].values

    # Without a session of at least two clicks no sample can ever be built;
    # returning here also keeps repeat=True from spinning forever.
    if not (np.diff(offset_sessions) >= 2).any():
        return

    feats_buffer = []
    labels_buffer = []
    actual_session = offset % n_sessions if repeat else offset

    # The original wrap-around used `% len(offset_sessions)` (one more than the
    # number of sessions), so it never wrapped and died with an IndexError
    # after the last session. The single pass is kept as the default, but now
    # ends cleanly.
    while actual_session < n_sessions:
        session_start = offset_sessions[actual_session]
        session_end = offset_sessions[actual_session + 1]
        datum = item_values[session_start:session_end].reshape(-1, 1)
        n_clicks = datum.shape[0]

        # Without augmentation only the longest prefix of the session is used.
        first_step = 0 if aug else max(n_clicks - 2, 0)
        for i in range(first_step, n_clicks - 1):
            feats = datum[:i + 1]
            if feats.shape[0] > session_max_len:
                # NOTE(review): keeps the oldest clicks; the original author
                # left a reminder to switch this to the most recent ones.
                feats = feats[:session_max_len]
            else:
                padding = np.zeros((session_max_len - feats.shape[0], 1), dtype=np.int8)
                feats = np.append(padding, feats)  # left pad with zeros

            feats_buffer.append(feats.reshape(1, -1, 1))       # (1, session_max_len, 1)
            labels_buffer.append(datum[i + 1].reshape(1, -1))  # (1, 1)

            if len(labels_buffer) == batch_size:
                batch_feats = np.concatenate(feats_buffer, axis=0)
                batch_labels = np.concatenate(labels_buffer, axis=0)
                if not embedding:
                    batch_labels = to_categorical(itemidmap[batch_labels.flatten()], num_classes=n_items)

                yield batch_feats, batch_labels
                feats_buffer = []
                labels_buffer = []

        actual_session += 1
        if repeat and actual_session >= n_sessions:
            actual_session = 0

      


In [4]:
batch_size = 512  # same value as in the paper
session_max_len = 100
embeddingp = False

# Distinct item ids per split, plus one.
# NOTE(review): the extra slot presumably accounts for the padding id 0 that is
# prepended to the item list further down — confirm.
n_items = train_data['ItemId'].nunique(dropna=False) + 1
print("Items unicos training:", n_items)

dev_n_items = dev_data['ItemId'].nunique(dropna=False) + 1
print("Items unicos dev:", dev_n_items)

test_n_items = test_data['ItemId'].nunique(dropna=False) + 1
print("Items unicos testing:", test_n_items)

# Number of sessions per split, before any prefix augmentation.
train_samples_qty = train_data['SessionId'].nunique(dropna=False)
print("Sesiones training:", train_samples_qty)

dev_samples_qty = dev_data['SessionId'].nunique(dropna=False)
print("Sesiones validation:", dev_samples_qty)

test_samples_qty = test_data['SessionId'].nunique(dropna=False)
print("Sesiones testing:", test_samples_qty)

Items unicos training: 11619
Items unicos dev: 10103
Items unicos testing: 10365
Sesiones training: 19850
Sesiones validation: 5747
Sesiones testing: 5270


In [5]:
# Passed to batch_generator as `fraction`; 1 means the whole split.
train_fraction = 1
dev_fraction = 1

# Per-split step counts, used both as steps per epoch and as the per-epoch
# session offset in the training loop.
train_offset_step = 39
dev_offset_step = 12
test_offset_step = 11

# Item vocabulary: id 0 first, then the training item ids in order of first
# appearance; itemidmap sends each id to its position in that list.
aux = [0, *train_data['ItemId'].unique()]
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids)

In [7]:
# Modelo

# ToDo:
# meterle self-attention (hay implementaciones en Keras)

def custom_cosine_loss(itemidmap, n_items):
    """Build a loss closure comparing one-hot targets with re-encoded predictions.

    Args:
        itemidmap: mapping indexed with the predictions to obtain item indices.
        n_items: number of classes used for the one-hot encoding.

    Returns:
        A function ``fn(y_true, y_pred)`` that one-hot encodes
        ``itemidmap[y_pred]`` and returns ``1 - cosine_proximity`` between
        ``y_true`` and that encoding.

    NOTE(review): `to_categorical` and the `itemidmap[...]` lookup look like
    NumPy/pandas operations; they presumably cannot run on the symbolic tensors
    Keras passes to a loss — confirm before using this in `model.compile`.
    NOTE(review): Keras 2 documents `cosine_proximity` as the negative cosine
    similarity, in which case this value grows as the vectors align — verify
    the intended sign.
    """
    def fn(y_true, y_pred):
        one_hot_pred = to_categorical(itemidmap[y_pred], num_classes=n_items)
        return 1 - cosine_proximity(y_true, one_hot_pred)

    return fn
    
emb_size = 50
size = emb_size
#size = emb_size if embeddingp else n_items
# Overrides the session_max_len set with the other hyper-parameters: from here
# on every input sequence holds a single click (previous value: 100).
session_max_len=1#100

# Earlier Sequential/Embedding variant of the model, kept disabled inside a
# string literal.
"""
model = Sequential()
emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=19)
model.add(emb)
model.add(Dropout(0.25))
model.add(CuDNNGRU(1000)) 
model.add(Dropout(0.25))
if embeddingp:
    model.add(Dense(emb_size, activation='softmax'))
    custom_loss = custom_cosine_loss(emb)  ## DUDA: Esta usando los pesos actuales?
    model.compile(loss=custom_loss, optimizer='adam')
else:
    model.add(Dense(n_items, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()
"""

# Functional model: (session_max_len, 1) sequence -> GRU(100) -> dropout ->
# softmax over n_items.
inputs = Input(shape=(session_max_len, 1))
#emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=session_max_len)(inputs)
#drop1 = Dropout(0.25)(emb)
gru = CuDNNGRU(100)(inputs)
drop2 = Dropout(0.25)(gru)
predictions = Dense(n_items, activation='softmax')(drop2)
# Keras 2 names these arguments `inputs`/`outputs`; the singular Keras 1
# spellings are only accepted through a deprecation shim.
model = Model(inputs=inputs, outputs=[predictions])
# Built but not used below: the model is compiled with categorical
# cross-entropy.
custom_loss = custom_cosine_loss(itemidmap, n_items)
# The original learning rate was 0.0001.
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

#filepath='./bast/model_checkpoint'
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = []#[checkpoint]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 1, 1)              0         
_________________________________________________________________
cu_dnngru_2 (CuDNNGRU)       (None, 100)               30900     
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 11619)             1173519   
Total params: 1,204,419
Trainable params: 1,204,419
Non-trainable params: 0
_________________________________________________________________




In [None]:
# Train for 10 outer epochs. Each one builds fresh generators whose starting
# session is shifted by `<split>_offset_step * epoch`, then runs a single Keras
# epoch of `train_offset_step` batches, validated on `dev_offset_step` batches.
real_epoca = 1
shared_generator_kwargs = dict(
    batch_size=batch_size,
    session_max_len=session_max_len,
    embedding=embeddingp,
    n_items=n_items,
    itemids=itemids,
    itemidmap=itemidmap,
)
for epoch in range(10):
    train_generator = batch_generator(
        train_data,
        fraction=train_fraction,
        offset=train_offset_step * epoch,
        **shared_generator_kwargs
    )

    dev_generator = batch_generator(
        dev_data,
        fraction=dev_fraction,
        offset=dev_offset_step * epoch,
        **shared_generator_kwargs
    )

    history = model.fit_generator(
        train_generator,
        steps_per_epoch=train_offset_step,
        epochs=1,
        validation_data=dev_generator,
        validation_steps=dev_offset_step,
        callbacks=callbacks_list,
    )

    real_epoca += 1

Epoch 1/1


In [1]:
# Session-parallel training: every loader step yields the current item index
# of each parallel session together with the index of the item clicked next.
dataset = SessionDataset(train_data)
loader = SessionDataLoader(dataset, batch_size=32)
for epoch in range(1):
    for inp, target, mask in loader:
        print(inp.shape)
        target_oh = to_categorical(target, num_classes=loader.n_items)
        print(target.shape)
        print(target_oh.shape)
        # The loader emits a flat (batch,) array of item indices, while the
        # model was built with input shape (None, 1, 1): one time step with one
        # feature per sample. Reshape so the batch is accepted.
        tr_loss = model.train_on_batch(inp.reshape(-1, 1, 1), target_oh)
        print(tr_loss)
        # `mask` (slots whose session was replaced) is not used here.
        #print(mask)

NameError: name 'SessionDataset' is not defined

In [19]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors

# RECALL @ 10
recall_k = 10

from sklearn.metrics import recall_score

test_generator = batch_generator(test_data, 
                                  batch_size=batch_size,
                                  session_max_len=session_max_len,
                                  fraction=train_fraction, 
                                  offset=0,
                                 embedding=embeddingp,
                                  n_items=n_items,
                                 itemids=itemids,
                                 itemidmap=itemidmap)


n = 0              # samples evaluated so far
suma = 0           # samples whose true item is among the top recall_k predictions
suma_baseline = 0  # placeholder for a baseline recommender (not computed)
while True:
    # Only exhaustion of the generator ends the evaluation. StopIteration is a
    # clean end; IndexError is how the generator historically failed once it
    # ran past the last session. Errors raised while predicting or scoring are
    # no longer swallowed.
    try:
        test_batch = next(test_generator)
    except (StopIteration, IndexError):
        break

    pred = model.predict(test_batch[0])  # (batch_size, n_items)
    label = test_batch[1]

    if n%100 == 0:
        print(n)

    # Column indices of the recall_k highest scores of every row.
    top_k = pred.argsort(axis=1)[:, -recall_k:]
    # Column index of the largest label entry of every row (the true class for
    # one-hot labels).
    true_idx = label.argsort(axis=1)[:, -1]

    suma += int((top_k == true_idx[:, None]).any(axis=1).sum())
    n += test_batch[0].shape[0]

# `epoch` is whatever value the last executed loop left behind.
recall = suma / n if n else float('nan')
print("Recall@{} epoch {}: {}".format(recall_k, epoch, recall))

#print("Recall@{} baseline: {}".format(recall_k, suma_baseline/n))

0
12800
25600
38400
51200
64000
76800
89600
102400
115200
128000
140800
153600
166400
Recall@10 epoch 9: 0.08086756175149701


Recall @20 epoch 99: 0.08705440681137724

Con session_max_len = 100:
Recall @20 epoch 9: 0.12075458458083832

Con dwell_time NO FUNCIONA BIEN. Hacer ese supuesto en este dataset no tiene sentido.
