In [13]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import keras
import keras.backend as K
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding

In [42]:
train_path = './data/preprocessed/rsc15_train_tr.pkl'
dev_path = './data/preprocessed/rsc15_train_valid.pkl'
test_path = './data/preprocessed/rsc15_test.pkl'
batch_size = 50

train_data = pd.read_pickle(train_path)
#dev_data   = pd.read_pickle(dev_path)
test_data  = pd.read_pickle(test_path)

print('train data shape : ', train_data.shape)
#print('dev data shape : ', dev_data.shape)
print('test data shape : ', test_data.shape)

train data shape :  (31579006, 3)
test data shape :  (71222, 3)


데이터 너무 커서 좀 줄여서 모델링 및 predict

In [44]:
#오백만개
train_data = train_data.head(5000000)
print('train data shape : ', train_data.shape)

train data shape :  (5000000, 3)


In [45]:
train_n_items = len(train_data['ItemId'].unique()) + 1
print('the number of train items : ', train_n_items)

train_samples_qty = len(train_data.SessionId.unique()) + 1
test_samples_qty = len(test_data.SessionId.unique()) + 1

print('the number of train session ID : ', train_samples_qty)
print('the number of test session ID : ', test_samples_qty)

the number of train items :  27733
the number of train session ID :  1210740
the number of test session ID :  15325


In [46]:
class SessionDataset :

    """Credit to yhs-968/pyGRU4REC."""
    
    def __init__(self, data, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time',
                n_samples = -1, itemmap = None, time_sort = False) :
        
        """
        Args:
            path: path of the data file
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_sort = time_sort
        self.add_time_indices(itemmap=itemmap)
        self.df.sort_values([session_key, time_key], inplace = True)
        
        # new method
        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
    def get_click_offsets(self) :
        
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype = np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()
        
        return offsets
    
    def order_session_idx(self) :
        """order the session indices"""
        
        if self.time_sort :
            
            session_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            
            session_idx_arr = np.argsort(session_start_time)
        
        else :
            #0~
            session_idx_arr = np.arange(self.df[self.session_key].nunique())
            
        return session_idx_arr
    
    def add_time_indices(self, itemmap = None) :
        
        if itemmap is None :
            item_ids = self.df[self.item_key].unique() #unique item ids
            item2idx = pd.Series(data = np.arange(len(item_ids)), index = item_ids) # make index
            itemmap = pd.DataFrame({self.item_key : item_ids, 'item_idx' : item2idx[item_ids].values})
            
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on = self.item_key, how = 'inner')
        
    @property
    def items(self) :
        return self.itemmap.ItemId.unique()

In [47]:
class SessionDataLoader :
    """Credit to yhs-968/pyGRU4REC."""
    def __init__(self, dataset, batch_size = 50) :
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """
        
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_sessions_counter = 0
        
        
    def __iter__(self) :
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """
        
        df = self.dataset.df #???????
        session_key = 'SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr
        
        iters = np.arange(self.batch_size) #batch size
        maxiter = iters.max()
        
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = []
        
        finished = False
        
        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                inp = idx_input
                target = idx_target
                yield inp, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_sessions_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

# create model

In [48]:
print('create model...')
emb_size = 50
hidden_units = 100
size = emb_size

inputs = Input(batch_shape=(batch_size, 1, train_n_items))
gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs)
drop2 = Dropout(0.25)(gru)
predictions = Dense(train_n_items, activation='softmax')(drop2)
model = Model(input=inputs, output=[predictions])
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

create model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (50, 1, 27733)            0         
_________________________________________________________________
cu_dnngru_3 (CuDNNGRU)       [(50, 100), (50, 100)]    8350500   
_________________________________________________________________
dropout_3 (Dropout)          (50, 100)                 0         
_________________________________________________________________
dense_3 (Dense)              (50, 27733)               2801033   
Total params: 11,151,533
Trainable params: 11,151,533
Non-trainable params: 0
_________________________________________________________________


  # Remove the CWD from sys.path while we load stuff.


In [54]:
def get_states(model):
    return [K.get_value(s) for s,_ in model.state_updates]

# train model

In [49]:
import gc
gc.collect()

251

In [50]:
train_dataset = SessionDataset(train_data)

In [56]:
get_states(model_to_train)[0].shape

(50, 100)

In [59]:
print('prepare  model')
model_to_train = model

print('Start!')
for epoch in range(1) :
    with tqdm(total = train_samples_qty) as pbar :
        loader = SessionDataLoader(train_dataset, batch_size)
        
        for feat, target, mask in loader :
            real_mask = np.ones((batch_size, 1))
            for elt in mask:
                real_mask[elt, :] = 0

                hidden_states = get_states(model_to_train)[0] #hidden state value
                #print(hidden_states.shape)
                hidden_states = np.multiply(real_mask, hidden_states)
                hidden_states = np.array(hidden_states, dtype=np.float32)
                model_to_train.layers[1].reset_states(hidden_states)

                input_oh = to_categorical(feat, num_classes=loader.n_items) 
                input_oh = np.expand_dims(input_oh, axis=1)

                target_oh = to_categorical(target, num_classes=loader.n_items)

                tr_loss = model_to_train.train_on_batch(input_oh, target_oh)
                # fit 의 경우,  epochs ,  batch_size 를 한번에 넘겨주는데 반해
                # train_on_batch 의 경우는 현재 전달받은 데이터를 모두 활용해서 gradient vector를 계산해서 업데이트

                pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                pbar.update(loader.done_sessions_counter)
                
(rec, rec_k), (mrr, mrr_k) = get_metrics(model_to_train, train_dataset.itemmap)

print("\t - Recall@{} epoch {}: {:5f}".format(rec_k, epoch, rec))
print("\t - MRR@{}    epoch {}: {:5f}".format(mrr_k, epoch, mrr))
print("\n")

Epoch 0. Loss: 7.01599:   0%|          | 168/1210740 [00:00<27:05, 744.87it/s] 

prepare  model
Start!


Epoch 0. Loss: 7.37986:   9%|▉         | 110203/1210740 [01:48<18:01, 1017.52it/s]


KeyboardInterrupt: 

# evaluation
## get_metric

In [32]:
test_dataset = SessionDataset(test_data, itemmap=train_generator_map)
test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)

(512, 1)