In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
# Build a TensorFlow session config with on-demand GPU memory growth enabled.
# NOTE(review): `config` is not handed to any session in the cells visible here —
# confirm it is applied later (e.g. through a tf.Session given to K.set_session).
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import keras
import keras.backend as K
from keras.models import Model
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding

Using TensorFlow backend.


# import dataset

In [2]:
# Locations of the preprocessed RSC15 pickles: train, validation and test splits.
train_path, dev_path, test_path = (
    './data/preprocessed/rsc15_train_tr.pkl',
    './data/preprocessed/rsc15_train_valid.pkl',
    './data/preprocessed/rsc15_test.pkl',
)
# Mini-batch size (a later walkthrough cell rebinds this name to 50).
batch_size = 512

In [15]:
# Load the three preprocessed splits from disk.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
train_data, dev_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in (train_path, dev_path, test_path)
)

# Report the (rows, columns) shape of every split.
for caption, split_frame in (
    ('train data shape : ', train_data),
    ('dev data shape : ', dev_data),
    ('test data shape : ', test_data),
):
    print(caption, split_frame.shape)

train data shape :  (31579006, 3)
dev data shape :  (58233, 3)
test data shape :  (71222, 3)


In [9]:
# Preview the first three rows of the training split.
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0


In [6]:
# Preview the first three rows of the validation split.
dev_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,11264996,214859872,1411882000.0
1,11264996,214859870,1411882000.0
2,11264996,214859902,1411883000.0


In [7]:
# Preview the first three rows of the test split.
test_data.head(3)

Unnamed: 0,SessionId,ItemId,Time
0,11265009,214586805,1411971000.0
1,11265009,214509260,1411972000.0
2,11265017,214857547,1411985000.0


In [8]:
# Count of distinct ItemIds in the training split, plus one.
# NOTE(review): the printed label reads "number of train items" but the value
# includes the extra +1 — presumably a reserved index; confirm the intent.
train_n_items = train_data['ItemId'].unique().size + 1
print('the number of train items : ', train_n_items)

the number of train items :  37484


In [9]:
# Distinct SessionId counts (plus one) for the training and test splits.
# NOTE(review): both values are one larger than the real number of sessions
# because of the +1 — confirm that this is what downstream code expects.
train_samples_qty = train_data['SessionId'].unique().size + 1
test_samples_qty = test_data['SessionId'].unique().size + 1

for caption, quantity in (
    ('the number of train session ID : ', train_samples_qty),
    ('the number of test session ID : ', test_samples_qty),
):
    print(caption, quantity)

the number of train session ID :  7953886
the number of test session ID :  15325


# SessionDataset class: line-by-line decomposition

In [19]:
# Run a garbage-collection pass; the cell output is the value returned by gc.collect().
# NOTE(review): consider moving this import into the import cell at the top of the notebook.
import gc
gc.collect()

2488

In [44]:
# Column names of the click log that the cells below refer to.
session_key, item_key, time_key = 'SessionId', 'ItemId', 'Time'
# Switch for ordering sessions by their start time instead of keeping the existing order.
time_sort = False

In [21]:
# Preview the first rows of the training split before any columns are added.
train_data.head()

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0
3,1,214577561,1396836000.0
4,2,214662742,1396847000.0


## add item indices

In [28]:
# Case "itemmap is None": derive the ItemId -> contiguous index mapping from train_data.

# Distinct ItemIds in order of first appearance.
item_ids = train_data[item_key].unique()
n_item_ids = len(item_ids)
print('the number of train data item id : ', n_item_ids)

# Series indexed by raw ItemId whose values are 0 .. n_item_ids - 1.
item2idx = pd.Series(np.arange(n_item_ids), index=item_ids)
item2idx.head()

the number of train data item id :  37483


214536502    0
214536500    1
214536506    2
214577561    3
214662742    4
dtype: int64

In [32]:
# Two-column lookup table: the raw ItemId next to its contiguous item_idx.
item_idx_column = item2idx[item_ids].values
itemmap = pd.DataFrame({item_key: item_ids, 'item_idx': item_idx_column})
itemmap.head(3)

Unnamed: 0,ItemId,item_idx
0,214536502,0
1,214536500,1
2,214536506,2


In [31]:
# Attach the item_idx column to every click by joining on the item column.
# NOTE(review): this cell rebinds train_data, so running it a second time merges
# again and yields suffixed duplicate item_idx columns — run it once per load.
train_data = train_data.merge(itemmap, how='inner', on=item_key)
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time,item_idx
0,1,214536502,1396835000.0,0
1,561,214536502,1396394000.0,0
2,877,214536502,1396868000.0,0


## sorting

In [33]:
# Order the clicks by session and, within each session, by timestamp, so that
# every session occupies one contiguous run of rows.
train_data = train_data.sort_values(by=[session_key, time_key])
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time,item_idx
0,1,214536502,1396835000.0,0
1770,1,214536500,1396836000.0,1
2312,1,214536506,1396836000.0,2


## click_offsets

In [56]:
# click_offsets[k] is the row at which the k-th session starts in the sorted
# train_data; the final entry equals the total number of clicks.
clicks_per_session = train_data.groupby(session_key).size()
session_count = train_data[session_key].nunique()

offsets = np.zeros(session_count + 1, dtype=np.int32)
print('length of offsets : ', len(offsets))
# The running total of session sizes gives each session's end (= next start).
offsets[1:] = clicks_per_session.cumsum()
click_offsets = offsets
click_offsets

length of offsets :  7953886


array([       0,        4,       10, ..., 31579001, 31579003, 31579006],
      dtype=int32)

## order_session_idx

In [46]:
# Decide the order in which sessions are handed to the batch slots.
# The `time_sort` flag selects between the two orderings; previously the
# time-sorted variant existed only as text inside a string literal and the
# flag was ignored.
if time_sort:
    # Earliest click time of every session ...
    session_start_time = train_data.groupby(session_key)[time_key].min().values
    # ... and the session positions ordered from the earliest start to the latest.
    session_idx_arr = np.argsort(session_start_time)
else:
    # Keep the sessions in the order they already have in the sorted train_data.
    session_idx_arr = np.arange(train_data[session_key].nunique())
session_idx_arr

array([      0,       1,       2, ..., 7953882, 7953883, 7953884])

# SessionDataLoader class decomposition
- A class for creating session-parallel mini-batches

In [50]:
# Walkthrough settings: a small batch of 50 parallel sessions and a counter of
# finished sessions that starts at zero.
batch_size, done_sessions_counter = 50, 0
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time,item_idx
0,1,214536502,1396835000.0,0
1770,1,214536500,1396836000.0,1
2312,1,214536506,1396836000.0,2


In [51]:
# Number of distinct items plus one.
n_items = 1 + train_data[item_key].nunique()
# iters[slot] is the position (in session_idx_arr) of the session that currently
# occupies that batch slot; the first batch_size sessions fill the slots.
iters = np.arange(batch_size)
# Highest session position handed out so far.
maxiter = np.max(iters)
maxiter

49

## while statement
### first batch

In [69]:
# First and one-past-last row offsets of the batch_size (50) sessions in the batch.
batch_sessions = session_idx_arr[iters]
start = click_offsets[batch_sessions]
end = click_offsets[batch_sessions + 1]
# No slot has finished yet, so the mask starts out empty.
mask = []

for caption, offsets_arr in (('length of start : ', start), ('length of end : ', end)):
    print(caption, len(offsets_arr))

for caption, offsets_arr in (('start : ', start), ('end : ', end)):
    print(caption, offsets_arr[:5])

length of start :  50
length of end :  50
start :  [ 0  4 10 13 15]
end :  [ 4 10 13 15 17]


In [68]:
# Becomes True in the slot-refill loop once no unused session is left.
finished = False

In [71]:
# Length of the shortest session currently in the batch.
# Sessions of length 1 were removed during preprocessing.
session_lengths = end - start
minlen = np.min(session_lengths)
minlen

2

In [77]:
# Look at the sorted frame again; item_idx is the column indexed in the next cells.
train_data.head(3)

Unnamed: 0,SessionId,ItemId,Time,item_idx
0,1,214536502,1396835000.0,0
1770,1,214536500,1396836000.0,1
2312,1,214536506,1396836000.0,2


In [76]:
# item_idx of the click every batch slot currently points at.
idx_target = train_data['item_idx'].values[start]
idx_target

array([  0,   4,   9,  12,  14,  16,  18,  19,  20,  29,  30,  32,  36,
        34,  32,  37,  39,  42,  41,  52,  63,  54,  69,  67,  71,  73,
        75,  72,  83,  85,  79,  81,  89,  86,  87,  35,  90,  91, 106,
       104,  99, 115, 113, 110, 121, 120, 118, 117, 137, 128])

In [79]:
# Emit one (input, target, mask) triple per step of the shortest session in the batch.
# The input of each step must be the target of the previous step; the earlier
# version reused the very first click as input on every iteration, which is only
# correct when minlen == 2. idx_target itself is left untouched so the cell can
# be re-run.
item_idx_values = train_data.item_idx.values
prev_target = idx_target  # item idx at the current start positions
for i in range(minlen - 1):
    # Build inputs & targets
    inp = prev_target
    target = item_idx_values[start + i + 1]  # item_idx of the click that follows the input

    # Values returned by SessionDataLoader, in order: input, target, mask
    print(inp, target, mask)

    # This step's target is the next step's input.
    prev_target = target

[  0   4   9  12  14  16  18  19  20  29  30  32  36  34  32  37  39  42
  41  52  63  54  69  67  71  73  75  72  83  85  79  81  89  86  87  35
  90  91 106 104  99 115 113 110 121 120 118 117 137 128] [  1   4  10  13  15  17  18  19  20  29  31  33  36  35  32  37  39  42
  41  52  64  55  69  68  71  74  76  16  84  71  80  82  89  86  88  34
  90  92 107 105 100 116 114 111 122 120 119 117 137  66] []


In [82]:
# Session position currently assigned to each batch slot.
iters

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [83]:
# Per-slot distance between the end offset and the current start offset.
# NOTE(review): the recorded output has a minimum of 1 although minlen above is 2,
# which suggests `start` had already been advanced when this ran — re-run the
# cells in order to confirm.
end - start

array([ 3,  5,  2,  1,  1,  1,  1,  2, 11,  1,  2,  2,  2,  1,  1,  3,  5,
       15,  3,  3,  3,  9,  2,  1,  1,  1, 15,  1,  1,  1,  2,  1,  4,  1,
        1,  7,  5,  2,  3,  1,  4,  1,  3,  5,  4,  1,  1,  1,  4,  9],
      dtype=int32)

In [86]:
# Move every slot forward to the last click that was consumed as an input.
# NOTE(review): this cell is not idempotent — each execution shifts `start` by
# (minlen - 1) again, so re-run the start/end cell before executing it a second time.
start = start + (minlen - 1)

# Slots with at most one click left cannot form another (input, target) pair,
# so their sessions terminate here.
clicks_left = end - start
slot_ids = np.arange(len(iters))
mask = slot_ids[clicks_left <= 1]

done_sessions_counter = len(mask)
done_sessions_counter  # 40 sessions ended above

40

In [89]:
# Batch slots whose session has finished and must be refilled.
mask

array([ 0,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20,
       22, 23, 24, 25, 27, 28, 29, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41,
       42, 44, 45, 46, 47, 48])

In [87]:
# Highest session position handed out so far (before refilling the finished slots).
maxiter

49

In [88]:
# Cumulative click counts per session; consecutive entries bound one session's rows.
click_offsets

array([       0,        4,       10, ..., 31579001, 31579003, 31579006],
      dtype=int32)

In [90]:
# Refill every finished slot with the next session that has not been used yet.
for idx in mask:
    maxiter += 1
    if maxiter >= len(click_offsets) - 1:
        # Every session has been handed out. Stop immediately: without the break,
        # session_idx_arr[maxiter] below would index past the end of the array.
        finished = True  # the end!
        break

    # update the next starting, ending point
    iters[idx] = maxiter
    start[idx] = click_offsets[session_idx_arr[maxiter]]
    end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

### second batch

In [95]:
# Shortest remaining session length in the batch after the finished slots were refilled.
remaining_lengths = end - start
minlen = np.min(remaining_lengths)
minlen

2

In [96]:
# Current start offset of every slot after the refill loop.
start

array([210,   8, 213, 216, 219, 221, 223, 225,  28, 227, 231, 233, 235,
       238, 246, 255,  59,  65, 257, 262, 266,  93, 277, 304, 306, 308,
       112, 310, 313, 317, 319, 321, 323, 325, 330, 148, 156, 332, 334,
       336, 340, 342, 344, 182, 347, 365, 367, 369, 372, 204], dtype=int32)

In [97]:
# End offset of every slot after the refill loop.
end

array([213,  10, 216, 219, 221, 223, 225, 227,  36, 231, 233, 235, 238,
       246, 255, 257,  61,  77, 262, 266, 277,  99, 304, 306, 308, 310,
       124, 313, 317, 319, 321, 323, 325, 330, 332, 152, 158, 334, 336,
       340, 342, 344, 347, 184, 365, 367, 369, 372, 374, 210], dtype=int32)

In [98]:
# item_idx of the click every slot points at for the second batch.
idx_target = train_data['item_idx'].values[start]
idx_target

array([125,   7, 143,  89, 145, 138, 140, 116,  21, 160, 162, 163, 147,
       150, 153, 158,  39,  43, 191, 193, 182,  58, 165, 206, 204,  95,
        77, 199, 196, 202, 218, 219, 212, 213, 210,  95,  90, 209, 146,
       235, 233,  20, 222, 111, 224, 198, 246, 243,  94, 131])

In [99]:
# Emit one (input, target, mask) triple per step of the shortest session in the batch.
# The input of each step must be the target of the previous step; the earlier
# version reused the very first click as input on every iteration, which is only
# correct when minlen == 2. idx_target itself is left untouched so the cell can
# be re-run.
item_idx_values = train_data.item_idx.values
prev_target = idx_target  # item idx at the current start positions
for i in range(minlen - 1):
    # Build inputs & targets
    inp = prev_target
    target = item_idx_values[start + i + 1]  # item_idx of the click that follows the input

    # Values returned by SessionDataLoader, in order: input, target, mask
    print(inp, target, mask)

    # This step's target is the next step's input.
    prev_target = target

[125   7 143  89 145 138 140 116  21 160 162 163 147 150 153 158  39  43
 191 193 182  58 165 206 204  95  77 199 196 202 218 219 212 213 210  95
  90 209 146 235 233  20 222 111 224 198 246 243  94 131] [126   8 144  89 146 139 141 142  22 160 162 164 148 150 154 159  40  44
 191  36 183  58 166 207 205 208  77 200 101 203 218 220 212 214 211  96
  90  44 146 202 234 235 223 112 224 221 247 244  20 132] [ 0  2  3  4  5  6  7  9 10 11 12 13 14 15 18 19 20 22 23 24 25 27 28 29
 30 31 32 33 34 37 38 39 40 41 42 44 45 46 47 48]
