#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico

In [0]:
# Manejo de Google Drive
!pip install -U -q PyDrive
!pip install humanize
!pip install GPUtil

from google.colab import drive, auth
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from oauth2client.client import GoogleCredentials

drive.mount("/content/drive/")

In [0]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
#import pyreclab
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers.core import Permute, Reshape, RepeatVector
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply, merge, Flatten
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [0]:
# Cargamos dataframes preprocesados de MovieLens20MM
PATH_TO_ALL_TRAIN = './drive/My Drive/Cursos/2018/IIC3633/processedData/movie_all_train_tr.txt'
PATH_TO_TRAIN = './drive/My Drive/Cursos/2018/IIC3633/processedData/movie_train_tr.txt'
PATH_TO_DEV = './drive/My Drive/Cursos/2018/IIC3633/processedData/movie_train_valid.txt'
PATH_TO_TEST = './drive/My Drive/Cursos/2018/IIC3633/processedData/movie_test.txt'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [0]:
train_data.head()

Unnamed: 0,SessionId,ItemId,Time
0,18,186,1267347706
1,18,858,1236356241
2,18,912,1283426281
3,18,1221,1236356224
4,18,1230,1236293194


In [0]:
class SessionDataset:
    def __init__(self, data, sep='\t', session_key='SessionId', item_key='ItemId', time_key='Time', n_samples=-1, itemmap=None, time_sort=False, sort = True):
        """
        Args:
            path: path of the csv file
            sep: separator for the csv
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        self.add_item_indices(itemmap=itemmap)
        if sort:
          self.df.sort_values([session_key, time_key], inplace=True)
          #print(self.df.head)
        #self.add_item_indices(itemmap=itemmap)

        #Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        #clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
        
    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets
    

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr
    
    
    def add_item_indices(self, itemmap=None):
        """ 
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})
        
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')
        
    
    @property    
    def items(self):
        return self.itemmap.ItemId.unique()
        

class SessionDataLoader:
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
             dataset (SessionDataset): the session dataset to generate the batches from
             batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        
        
    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,): torch.FloatTensor. Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        # initializations
        df = self.dataset.df
        session_key='SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the sessions to be terminated
        finished = False        

        while not finished:
            minlen = (end - start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = idx_input
                target = idx_target
                yield input, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [0]:
batch_size = 512 #como en el paper
session_max_len = 100
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 11619
Items unicos dev: 10105
Items unicos testing: 10366
Sesiones training: 19853
Sesiones validation: 5749
Sesiones testing: 5271


In [0]:
train_fraction = 1#256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1#2

train_offset_step=train_samples_qty//batch_size
dev_offset_step=dev_samples_qty//batch_size
test_offset_step=test_samples_qty//batch_size


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [0]:
# Modelo

# ToDo: self-attention

def attention_3d_block(inputs, TIME_STEPS, SINGLE_ATTENTION_VECTOR=True):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    #a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul
    
emb_size = 50
hidden_units = 100
size = emb_size
#size = emb_size if embeddingp else n_items

#MODELO BASELINE + DROPOUT

inputs = Input(batch_shape=(batch_size, 1, n_items))
#emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=session_max_len)(inputs)
#drop1 = Dropout(0.25)(emb)
gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs)# drop1) #
drop2 = Dropout(0.25)(gru)
#attention_mul = attention_3d_block(drop2, session_max_len)
#attention_mul = Flatten()(attention_mul)
predictions = Dense(n_items, activation='softmax')(drop2)#(attention_mul)#
model = Model(input=inputs, output=[predictions])
#custom_loss = custom_cosine_loss(itemidmap, n_items)
# lr original es 0.0001
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()


#MODELO BASELINE + EMBEDDING + DROPOUT
inputs2 = Input(shape=(n_items,), batch_shape = (batch_size,n_items))
emb2 = Embedding(input_dim = n_items, output_dim = emb_size, embeddings_initializer='uniform')(inputs2)
drop12 = Dropout(0.25)(emb2)
gru12 = CuDNNGRU(100, stateful=True)(drop12)
drop22 = Dropout(0.25)(gru12)
predictions2 = Dense(n_items, activation='softmax')(drop22)
model2 = Model(input=inputs2, output=[predictions2])
opt2 = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model2.compile(loss=categorical_crossentropy, optimizer=opt2)
model2.summary()
#filepath='./bast/model_checkpoint'
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = []#[checkpoint]



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (512, 1, 11619)           0         
_________________________________________________________________
cu_dnngru_9 (CuDNNGRU)       [(512, 100), (512, 100)]  3516300   
_________________________________________________________________
dropout_11 (Dropout)         (512, 100)                0         
_________________________________________________________________
dense_8 (Dense)              (512, 11619)              1173519   
Total params: 4,689,819
Trainable params: 4,689,819
Non-trainable params: 0
_________________________________________________________________
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (512, 11619)              0         
_________________________________________________________________




In [0]:
def get_states(model):
    return [K.get_value(s) for s,_ in model.state_updates]

def set_states(model, states):
    for (d,_), s in zip(model.state_updates, states):
        K.set_value(d, s)

In [0]:
#
train_dataset = SessionDataset(train_data)

#print(dataset.df.head())

for epoch in range(3):
    loader = SessionDataLoader(train_dataset, batch_size=batch_size)
    for feat, target, mask in loader:
                
        input_oh = to_categorical(feat, num_classes=loader.n_items) 
        input_oh = np.expand_dims(input_oh, axis=1)
        
        target_oh = to_categorical(target, num_classes=loader.n_items)
        
        tr_loss = model.train_on_batch(input_oh, target_oh)
        
        real_mask = np.ones((batch_size, 1))
        for elt in mask:
            real_mask[elt, :] = 0
        
        hidden_states = get_states(model)[0]#512,100    #get_states(model)[0]
               
        hidden_states = np.multiply(real_mask, hidden_states)
        hidden_states = np.array(hidden_states, dtype=np.float32)
        #hidden_states = np.expand_dims(hidden_states, axis=0)
        
        #set_states(model, hidden_states)\):
        #print(hidden_states.shape)
        #K.set_value(model.layers[1].states, hidden_states)
        model.layers[1].reset_states(hidden_states)

        print(tr_loss)    
            
    #pass
    #print(input, target, mask)

In [0]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors

# RECALL @ 20
recall_k = 20

#nbrs = NearestNeighbors(n_neighbors=recall_k, algorithm='ball_tree').fit(weights)
#distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_dataset = SessionDataset(test_data, itemmap = train_dataset.itemmap)
test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)


n = 0
suma = 0
suma_baseline = 0

for feat, label, mask in test_generator:
    input_oh = to_categorical(feat, num_classes=loader.n_items) 
    input_oh = np.expand_dims(input_oh, axis=1)
    
    target_oh = to_categorical(label, num_classes=loader.n_items)
    
    pred = model.predict(input_oh, batch_size=batch_size)

    if n%100 == 0:
        try:
            print("{}:{}".format(n, suma/n))
        except:
            pass

    for row_idx in range(feat.shape[0]):
        #baseline_pred = obj.recommend( str(test_batch[0][row_idx][-1]), 20 )
        pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
        label_row = target_oh[row_idx]        #.reshape(1, -1) # 50,
        #print(pred_row)
        #print(label_row)
        idx1 = pred_row.argsort()[-recall_k:][::-1]
        idx2 = label_row.argsort()[-1:][::-1]

        n += 1
        #print(idx1)
        #print(idx2)
        if idx2[0] in idx1:
            suma += 1

        #if idx2[0] in baseline_pred:
        #  suma_baseline += 1

print("Recall@{} epoch {}: {}".format(recall_k, epoch, suma/n))

#print("Recall@{} baseline: {}".format(recall_k, suma_baseline/n))

12800:0.254609375
25600:0.246484375
38400:0.24130208333333333
51200:0.23837890625
64000:0.23809375
76800:0.23796875
89600:0.23910714285714285
102400:0.24033203125
115200:0.24151909722222223
128000:0.2429296875
140800:0.24310369318181818
153600:0.2426171875
Recall@20 epoch 2: 0.24271844660194175


In [0]:
weights = model.layers[1].get_weights()[0]
from sklearn.neighbors import NearestNeighbors

# MRR @ 20
mrr_k = 20

#nbrs = NearestNeighbors(n_neighbors=recall_k, algorithm='ball_tree').fit(weights)
#distances, indices = nbrs.kneighbors(weights) # Vienen ya ordenados! # Shape (37484, 20)
# Paso 3: Dado un vector embedding arbitrario, obtener el item más cercano a éste. Aplicarla sobre los 20 anteriores.
from sklearn.metrics import recall_score

test_dataset = SessionDataset(test_data, itemmap = train_dataset.itemmap)
test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)


n = 0
suma = 0
suma_baseline = 0

for feat, label, mask in test_generator:
    input_oh = to_categorical(feat, num_classes=loader.n_items) 
    input_oh = np.expand_dims(input_oh, axis=1)
    #print(label)
    target_oh = to_categorical(label, num_classes=loader.n_items)
    
    pred = model.predict(input_oh, batch_size=batch_size)

    if n%100 == 0:
        try:
            print("{}:{}".format(n, suma/n))
        except:
            pass

    for row_idx in range(feat.shape[0]):
        #baseline_pred = obj.recommend( str(test_batch[0][row_idx][-1]), 20 )
        pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
        label_row = target_oh[row_idx]        #.reshape(1, -1) # 50,

        idx1 = pred_row.argsort()[-mrr_k:][::-1]
        idx2 = label_row.argsort()[-1:][::-1]

        n += 1
        #print(idx1)
        #print(idx2)
        if idx2[0] in idx1:
            suma += 1/int((np.where(idx1 == idx2[0])[0]+1))        
        #print(suma)
        #print(n)
 
       
        #if idx2[0] in baseline_pred:
        #  suma_baseline += 1

print("MRR@{} epoch {}: {}".format(mrr_k, epoch, suma/n))

#print("Recall@{} baseline: {}".format(recall_k, suma_baseline/n))

12800:0.08033569431239306
25600:0.07825079227624515
38400:0.07604660234253038
51200:0.07523685116753252
64000:0.07503162113923621
76800:0.07518242360743978
89600:0.07528284135734004
102400:0.07581448387456885
115200:0.07640642249057486
128000:0.07667791720009232
140800:0.07644054686484039
153600:0.0764714043625866
MRR@20 epoch 2: 0.07655110124218388
