In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf

import keras
import keras.backend as K
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding

# GPU memory policy: with allow_growth the TensorFlow runtime starts with a
# small allocation and extends its GPU memory region on demand, instead of
# reserving (nearly) all GPU memory when the session starts.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# The option only applies to sessions created from this config. Register such
# a session with Keras; otherwise Keras creates its own default session and the
# setting above is never used.
K.set_session(tf.Session(config=config))

Using TensorFlow backend.


# import dataset

In [2]:
# Locations of the pickled train / validation / test splits loaded in the next cell.
train_path = './data/preprocessed/rsc15_train_tr.pkl'
dev_path = './data/preprocessed/rsc15_train_valid.pkl'
test_path = './data/preprocessed/rsc15_test.pkl'
# Mini-batch size; also the fixed batch dimension of the stateful model input.
batch_size = 512

In [3]:
# Load the three splits as DataFrames.
# NOTE(review): read_pickle executes arbitrary code embedded in the file, so
# only load pickles that were produced locally or come from a trusted source.
train_data = pd.read_pickle(train_path)
dev_data   = pd.read_pickle(dev_path)
test_data  = pd.read_pickle(test_path)

# Report (rows, columns) for each split.
print('train data shape : ', train_data.shape)
print('dev data shape : ', dev_data.shape)
print('test data shape : ', test_data.shape)

train data shape :  (31579006, 3)
dev data shape :  (58233, 3)
test data shape :  (71222, 3)


In [5]:
# Show the first three rows of the training split.
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0


In [6]:
# Show the first three rows of the validation split.
dev_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,11264996,214859872,1411882000.0
1,11264996,214859870,1411882000.0
2,11264996,214859902,1411883000.0


In [7]:
# Show the first three rows of the test split.
test_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,11265009,214586805,1411971000.0
1,11265009,214509260,1411972000.0
2,11265017,214857547,1411985000.0


In [4]:
# Width of the one-hot item encoding: number of distinct training items plus one.
# NOTE(review): the reason for the extra slot is not evident from this notebook.
distinct_train_items = train_data['ItemId'].unique()
train_n_items = distinct_train_items.size + 1
print('the number of train items : ', train_n_items)

the number of train items :  37484


In [5]:
# Distinct session counts (plus one); used below as progress-bar totals.
train_samples_qty = train_data['SessionId'].unique().size + 1
test_samples_qty = test_data['SessionId'].unique().size + 1

print('the number of train session ID : ', train_samples_qty)
print('the number of test session ID : ', test_samples_qty)

the number of train session ID :  7953886
the number of test session ID :  15325


# preprocess

In [6]:
class SessionDataset :
    """Click log ordered by session, plus the index arrays used for session-parallel batching.

    Credit to yhs-968/pyGRU4REC.
    """

    def __init__(self, data, session_key = 'SessionId', item_key = 'ItemId', time_key = 'Time',
                n_samples = -1, itemmap = None, time_sort = False) :

        """
        Args:
            data (pd.DataFrame): click log with one row per click; must contain the
                session, item and time columns named below. The frame passed in is
                not modified (the merge in add_time_indices produces a new frame).
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: accepted for interface compatibility; this implementation does not use it.
            itemmap (pd.DataFrame): mapping between item IDs and item indices (columns
                `item_key` and 'item_idx'). If None, it is derived from `data`.
            time_sort: whether to order the sessions by their start time or not
        """

        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        # Required by order_session_idx() when time_sort is True.
        self.time_key = time_key
        self.time_sort = time_sort
        self.add_time_indices(itemmap=itemmap)
        # Make the clicks of each session contiguous and chronologically ordered;
        # get_click_offsets() relies on this layout.
        self.df = self.df.sort_values([session_key, time_key])

        self.click_offsets = self.get_click_offsets() # start row of every session (+ final end row)
        self.session_idx_arr = self.order_session_idx()

    def get_click_offsets(self) :

        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.

        The array has one more entry than there are sessions: entry k is the first
        row of session k and entry k + 1 is one past its last row.
        """

        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype = np.int32)
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets

    def order_session_idx(self) :
        """Return the order in which sessions are visited, as indices into click_offsets."""

        if self.time_sort :
            # Earliest click time of every session ...
            session_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # ... and the session indices sorted by that start time.
            session_idx_arr = np.argsort(session_start_time)

        else :
            # Natural order: 0 .. n_sessions - 1.
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr

    def add_time_indices(self, itemmap = None) :
        """
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices

        Rows whose item does not appear in the item map are dropped by the inner merge.
        """

        if itemmap is None :
            item_ids = self.df[self.item_key].unique() # distinct item ids, in order of first appearance
            itemmap = pd.DataFrame({self.item_key : item_ids,
                                   'item_idx' : np.arange(len(item_ids))})

        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on = self.item_key, how = 'inner')

    @property
    def items(self) :
        """Distinct item IDs present in the item map."""
        return self.itemmap[self.item_key].unique()

In [7]:
class SessionDataLoader :
    """Credit to yhs-968/pyGRU4REC."""
    def __init__(self, dataset, batch_size = 50) :
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """

        self.dataset = dataset
        self.batch_size = batch_size
        # Number of sessions that ended since the previously yielded batch.
        self.done_sessions_counter = 0


    def __iter__(self) :
        """ Returns the iterator for producing session-parallel training mini-batches.

        Every batch slot follows one session click by click; when a session runs
        out of clicks, the slot is handed the next unused session.

        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): item indices of the click that follows each input click
            masks: Numpy array with the batch slots whose session was replaced since
                the previously yielded batch (i.e. whose hidden state should be reset
                before this batch is processed). It is reported exactly once per
                replacement and is empty otherwise.

        Raises:
            ValueError: if batch_size is smaller than 1 or larger than the number of sessions.
        """

        df = self.dataset.df
        # Use the dataset's item column when it exposes one; fall back to the default name.
        item_key = getattr(self.dataset, 'item_key', 'ItemId')
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr
        item_indices = df.item_idx.values
        n_sessions = len(click_offsets) - 1

        if self.batch_size < 1 or self.batch_size > n_sessions:
            raise ValueError(
                'batch_size must be between 1 and the number of sessions '
                '({}), got {}'.format(n_sessions, self.batch_size))

        iters = np.arange(self.batch_size) # session position currently served by each slot
        maxiter = iters.max()

        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        # Slots replaced but not yet reported to the consumer.
        pending_mask = np.array([], dtype=int)
        self.done_sessions_counter = 0

        finished = False

        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = item_indices[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = item_indices[start + i + 1]
                yield idx_input, idx_target, pending_mask
                # The consumer has seen the replacements; do not report them again,
                # otherwise freshly started sessions would be reset on every step.
                pending_mask = np.array([], dtype=int)
                self.done_sessions_counter = 0

            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            # Accumulate, so nothing is lost if no batch is yielded before the next swap
            # (happens when a swapped-in session has a single click).
            pending_mask = np.union1d(pending_mask, mask)
            self.done_sessions_counter += len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    # No unused session left for this slot: stop iterating.
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

# create model

In [8]:
def create_model(hidden_units=100, dropout=0.25, learning_rate=0.001):
    """Build and compile the stateful GRU next-item model.

    The network takes one one-hot encoded click per batch slot
    (shape `(batch_size, 1, train_n_items)`, both read from module-level
    variables), feeds it through a stateful CuDNNGRU, applies dropout and
    predicts a softmax distribution over `train_n_items` items.

    Args:
        hidden_units (int): number of GRU units.
        dropout (float): dropout rate applied to the GRU output.
        learning_rate (float): Adam learning rate.

    Returns:
        The compiled Keras model. The GRU is `model.layers[1]`, which the
        training loop relies on when it resets hidden states.
    """
    print('create model...')

    inputs = Input(batch_shape=(batch_size, 1, train_n_items))
    # return_state=True also returns the final state; only the output is used here.
    gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs)
    drop2 = Dropout(dropout)(gru)
    predictions = Dense(train_n_items, activation='softmax')(drop2)
    # Keras 2 keyword names; the legacy `input=` / `output=` spelling triggers a warning.
    model = Model(inputs=inputs, outputs=[predictions])
    opt = keras.optimizers.Adam(lr=learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss=categorical_crossentropy, optimizer=opt)
    model.summary()

    return model


def get_states(model):
    """Return the current values of the model's state tensors.

    Values are fetched with the backend for the first element of every pair in
    `model.state_updates`, and returned in that order.
    """
    state_values = []
    for state_tensor, _ in model.state_updates:
        state_values.append(K.get_value(state_tensor))
    return state_values

In [9]:
# Checkpoint to resume from. `resume` is handed to keras.models.load_model, so
# set it to the path of a saved model file (not to True) to continue training
# from that checkpoint; leave it as None to build a fresh model.
resume = None

if resume :
    try :
        model = keras.models.load_model(resume)
        print('Model checkpoint {} loaded!'.format(resume))

    except OSError :
        # The checkpoint file could not be opened: fall back to a new model.
        print('Model checkpoinit could not be loaded. Training from scratch...')
        model = create_model()

else :
    model = create_model()

create model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (512, 1, 37484)           0         
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       [(512, 100), (512, 100)]  11275800  
_________________________________________________________________
dropout_1 (Dropout)          (512, 100)                0         
_________________________________________________________________
dense_1 (Dense)              (512, 37484)              3785884   
Total params: 15,061,684
Trainable params: 15,061,684
Non-trainable params: 0
_________________________________________________________________


  # This is added back by InteractiveShellApp.init_path()


# evaluation

In [10]:
def get_metrics(model, train_generator_map, recall_k=20, mrr_k=20):
    """Evaluate next-item prediction on the module-level `test_data`.

    Args:
        model: the stateful Keras model (its recurrent layer is `model.layers[1]`).
        train_generator_map (pd.DataFrame): item-id -> item-index mapping of the
            training set; test clicks on items missing from it are dropped when
            the test dataset is built.
        recall_k (int): cut-off for Recall@k.
        mrr_k (int): cut-off for MRR@k.

    Returns:
        ((recall, recall_k), (mrr, mrr_k)); both metrics are 0.0 when the test
        loader yields no batches.

    The model's hidden states are cleared before and after the evaluation so
    that training state does not leak into the metrics and vice versa.
    """

    test_dataset = SessionDataset(test_data, itemmap=train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)

    n = 0
    rec_sum = 0
    mrr_sum = 0

    # Start from a clean hidden state instead of whatever training left behind.
    model.reset_states()

    with tqdm(total=test_samples_qty) as pbar:
        pbar.set_description("Evaluating model")
        for feat, label, mask in test_generator:

            if len(mask):
                # Zero the hidden state of the slots that switched to a new session
                # (same procedure as in the training loop).
                keep = np.ones((batch_size, 1))
                keep[np.asarray(mask, dtype=int), :] = 0
                hidden_states = get_states(model)[0]
                hidden_states = np.array(np.multiply(keep, hidden_states), dtype=np.float32)
                model.layers[1].reset_states(hidden_states)

            input_oh  = to_categorical(feat,  num_classes=train_n_items)
            input_oh = np.expand_dims(input_oh, axis=1)

            pred = model.predict(input_oh, batch_size=batch_size)

            for row_idx in range(feat.shape[0]):
                # Item indices ordered from highest to lowest predicted score.
                ranking = pred[row_idx].argsort()[::-1]
                # The label already is the true item index; no one-hot round trip needed.
                true_idx = label[row_idx]

                n += 1

                if true_idx in ranking[:recall_k]:
                    rec_sum += 1

                hit_positions = np.where(ranking[:mrr_k] == true_idx)[0]
                if hit_positions.size:
                    # Reciprocal of the 1-based rank of the true item.
                    mrr_sum += 1 / (int(hit_positions[0]) + 1)

            pbar.update(test_generator.done_sessions_counter)

    # Do not hand evaluation state back to the caller's training loop.
    model.reset_states()

    if n == 0:
        return (0.0, recall_k), (0.0, mrr_k)

    recall = rec_sum/n
    mrr = mrr_sum/n
    return (recall, recall_k), (mrr, mrr_k)

# train model

In [12]:
def train_model(model, train_dataset, save_weights = False, n_epochs = 9) :
    """Train the stateful model with session-parallel mini-batches and evaluate after each epoch.

    Args:
        model: compiled stateful Keras model; its recurrent layer must be `model.layers[1]`.
        train_dataset (SessionDataset): training sessions; its item map is also
            used to index the test items during evaluation.
        save_weights (bool): if True, save the model to './GRU4REC_<epoch>.h5'
            after every epoch.
        n_epochs (int): number of passes over the training data (epochs are
            numbered from 1).

    Uses the module-level `batch_size` and `train_samples_qty`.
    """

    print('prepare model')

    print('epoch start!')
    for epoch in range(1, n_epochs + 1):
        # Every batch slot starts a brand-new session at the beginning of an epoch,
        # so no hidden state may carry over from the previous epoch or evaluation.
        model.reset_states()

        with tqdm(total=train_samples_qty) as pbar:
            loader = SessionDataLoader(train_dataset, batch_size=batch_size)
            for feat, target, mask in loader:

                if len(mask):
                    # Zero the hidden state of the slots listed in the mask and
                    # keep the others; skipped when no slot needs a reset.
                    real_mask = np.ones((batch_size, 1))
                    real_mask[np.asarray(mask, dtype=int), :] = 0

                    hidden_states = get_states(model)[0]
                    hidden_states = np.multiply(real_mask, hidden_states)
                    hidden_states = np.array(hidden_states, dtype=np.float32)
                    model.layers[1].reset_states(hidden_states)

                input_oh = to_categorical(feat, num_classes=loader.n_items)
                # Add the time axis: one click per step.
                input_oh = np.expand_dims(input_oh, axis=1)

                target_oh = to_categorical(target, num_classes=loader.n_items)

                tr_loss = model.train_on_batch(input_oh, target_oh)

                pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                pbar.update(loader.done_sessions_counter)

        if save_weights:
            print("Saving weights...")
            model.save('./GRU4REC_{}.h5'.format(epoch))

        (rec, rec_k), (mrr, mrr_k) = get_metrics(model, train_dataset.itemmap)

        print("\t - Recall@{} epoch {}: {:5f}".format(rec_k, epoch, rec))
        print("\t - MRR@{}    epoch {}: {:5f}".format(mrr_k, epoch, mrr))
        print("\n")

In [13]:
print('class Session Dataset')
# Build the training dataset; no item map is passed, so the item-index mapping
# is derived from train_data itself.
train_dataset = SessionDataset(train_data)

class Session Dataset


In [15]:
# Display the object's repr to confirm the dataset was constructed.
train_dataset

<__main__.SessionDataset at 0x7f403c9024a8>

In [16]:
# Session-parallel batch loader over the training dataset (used below to inspect one batch).
loader = SessionDataLoader(train_dataset, batch_size=batch_size)

In [17]:
# Display the object's repr to confirm the loader was constructed.
loader

<__main__.SessionDataLoader at 0x7f3ff86769e8>

In [19]:
# Sanity check: pull a single mini-batch (inputs, targets, reset mask) from the
# loader, print it, and stop.
for feat, target, mask in loader:
    print(feat, target, mask)
    break

[  0   4   9  12  14  16  18  19  20  29  30  32  36  34  32  37  39  42
  41  52  63  54  69  67  71  73  75  72  83  85  79  81  89  86  87  35
  90  91 106 104  99 115 113 110 121 120 118 117 137 128 125 143  89 145
 138 140 116 160 162 163 147 150 153 158 191 193 182 165 206 204  95 199
 196 202 218 219 212 213 210 209 146 235 233  20 222 224 198 246 243  94
  44  81 240 238 257 255 250 162 249 248  15 218 244 285 284 287  57 270
 273  35  20 279 278 240 306 296  36 310 290 284 294 295 296  97 137 327
 322 333 331 329 315 233 312 319 249 317 350 352   9 139 357 354 355  57
 336 338 199 346 345 159 300 377 370 368 374 372 361 367 363  27 360 358
 298 140 409 398  89 383 390 392 240 379 131 374 373 282 430 428  45 201
 249 415 415 199 411 458 455 454 383 450 449 202 439 433 496  89 488 203
 506 498 502 147  31 509 249 518 516 463 460 459 476 467 466 302 481 479
  20 486 121 483   9 532  31 485 162 534 537 538 498 199 495 542 301 250
 162 519 522 161 523 358 526  19  99 147 296 530 57

In [14]:
# Run training; save_weights=True saves the model to ./GRU4REC_<epoch>.h5 after every epoch.
train_model(model, train_dataset, save_weights=True)

  0%|          | 0/7953886 [00:00<?, ?it/s]

prepare model
epoch start!


Epoch 1. Loss: 9.01415:   0%|          | 10493/7953886 [00:11<2:29:55, 883.02it/s] 


KeyboardInterrupt: 