In [21]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Resolve the dataset directory relative to the notebook's working directory.
# os.path.join picks the platform's path separator instead of a hard-coded '/'.
# The trailing '' keeps the trailing separator the original string had, so any
# code that concatenates `data_path + 'file'` keeps working.
cur_dir = os.getcwd()
data_path = os.path.join(cur_dir, 'div4rec', 'rc15_data', '')

In [22]:
# Load the raw click log. The file has no header row, so pandas numbers the
# columns 0..3.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which prevents mixed-type columns (at the cost of
# higher peak memory while parsing).
# NOTE(review): the saved output of this cell echoes this source line, which
# looks like a pandas mixed-dtype warning - confirm, and consider passing an
# explicit dtype= for the affected column instead.
click_df = pd.read_csv(
    os.path.join(data_path, 'yoochoose-clicks.dat'),
    header=None,
    low_memory=False,
)

  click_df = pd.read_csv(os.path.join(data_path, 'yoochoose-clicks.dat'), header=None)


## 0. step: raw input data
See the raw input data by running the following cell:

In [11]:
# Preview the first ten rows of the raw click log (columns are still unnamed).
click_df.head(n=10)

Unnamed: 0,0,1,2,3
0,1,2014-04-07T10:51:09.277Z,214536502,0
1,1,2014-04-07T10:54:09.868Z,214536500,0
2,1,2014-04-07T10:54:46.998Z,214536506,0
3,1,2014-04-07T10:57:00.306Z,214577561,0
4,2,2014-04-07T13:56:37.614Z,214662742,0
5,2,2014-04-07T13:57:19.373Z,214662742,0
6,2,2014-04-07T13:58:37.446Z,214825110,0
7,2,2014-04-07T13:59:50.710Z,214757390,0
8,2,2014-04-07T14:00:38.247Z,214757407,0
9,2,2014-04-07T14:02:36.889Z,214551617,0


## 1. step: sample_data_rc15.py
**Clicks** and **Buys** datasets:
 - adds column names
 - removes sessions shorter than 3
 - samples 200 000 sessions
 - changes item_id to smaller numbers

See the data after this step by running the following cell:

In [13]:
# Load the pickled click DataFrame produced by step 1 and show its first rows.
sampled_clicks_file = os.path.join(data_path, 'sampled_clicks.df')
sampled_click_df = pd.read_pickle(sampled_clicks_file)
sampled_click_df.head(n=10)

Unnamed: 0,session_id,timestamp,item_id,category
95,26,2014-04-06T16:42:55.741Z,5247,0
96,26,2014-04-06T16:44:58.482Z,15726,0
97,26,2014-04-06T16:45:11.344Z,10707,0
98,26,2014-04-06T16:46:19.569Z,6926,0
153,44,2014-04-07T05:51:30.888Z,18574,0
154,44,2014-04-07T05:52:24.351Z,19042,0
155,44,2014-04-07T05:53:47.689Z,18982,0
156,44,2014-04-07T05:54:49.212Z,11711,0
157,44,2014-04-07T05:56:02.520Z,21257,0
158,44,2014-04-07T06:01:14.185Z,2145,0


## 2. step: merge_and_sort_rc15.py
**Clicks** and **Buys** datasets:
 - drop unnecessary columns
 - add a column indicating whether each action is a click or a buy
 - merge clicks and buys to one dataset
 - sort dataset based on session_id and timestamp

See the data after this step by running the following cell:

In [14]:
# Load the merged, sorted click/buy sessions produced by step 2 and show the
# first hundred rows.
sampled_sessions_file = os.path.join(data_path, 'sampled_sessions.df')
sampled_sessions_df = pd.read_pickle(sampled_sessions_file)
sampled_sessions_df.head(n=100)

Unnamed: 0,session_id,timestamp,item_id,is_buy
0,26,2014-04-06T16:42:55.741Z,5247,0
1,26,2014-04-06T16:44:58.482Z,15726,0
2,26,2014-04-06T16:45:11.344Z,10707,0
3,26,2014-04-06T16:46:19.569Z,6926,0
4,44,2014-04-07T05:51:30.888Z,18574,0
...,...,...,...,...
83,831,2014-04-02T21:44:26.116Z,17104,0
84,831,2014-04-02T21:47:23.231Z,16391,0
85,831,2014-04-02T21:48:13.889Z,16391,0
86,831,2014-04-02T21:48:24.795Z,17104,0


## 3. step: split_data.py
Sessions dataset - Clicks and Buys merged
 - shuffle sessions randomly
 - split dataset 8:1:1 to train, test, val

The outputs have the same format as above, only for different subsets of sessions.


## 4. step: replay_buffer_rc15.py
On test, train, val datasets
 - set state size to 10
 - set item_num to total number of items (26702)
 - set pad_item to 26702
 - group data according to session_ids
 - compute various attributes and create replay_buffer

To see the replay buffer, run the following code:

In [15]:
# Load the pickled replay buffer built from the test split and show its first
# twenty transitions.
replay_buffer_file = os.path.join(data_path, 'replay_buffer_test.df')
replay_buffer_df = pd.read_pickle(replay_buffer_file)
replay_buffer_df.head(n=20)

Unnamed: 0,state,len_state,action,is_buy,next_state,len_next_states,is_done
0,"[26702, 26702, 26702, 26702, 26702, 26702, 267...",1,20904,0,"[20904, 26702, 26702, 26702, 26702, 26702, 267...",1,False
1,"[20904, 26702, 26702, 26702, 26702, 26702, 267...",1,20904,0,"[20904, 20904, 26702, 26702, 26702, 26702, 267...",2,False
2,"[20904, 20904, 26702, 26702, 26702, 26702, 267...",2,20904,0,"[20904, 20904, 20904, 26702, 26702, 26702, 267...",3,False
3,"[20904, 20904, 20904, 26702, 26702, 26702, 267...",3,20904,0,"[20904, 20904, 20904, 20904, 26702, 26702, 267...",4,False
4,"[20904, 20904, 20904, 20904, 26702, 26702, 267...",4,6077,0,"[20904, 20904, 20904, 20904, 6077, 26702, 2670...",5,False
5,"[20904, 20904, 20904, 20904, 6077, 26702, 2670...",5,6077,0,"[20904, 20904, 20904, 20904, 6077, 6077, 26702...",6,False
6,"[20904, 20904, 20904, 20904, 6077, 6077, 26702...",6,2720,0,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",7,False
7,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",7,2720,0,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",8,False
8,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",8,1246,0,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",9,False
9,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",9,1246,0,"[20904, 20904, 20904, 20904, 6077, 6077, 2720,...",10,False


## Issues
#### I1: Buy Items are not real user actions
Look at the timestamps of the buys - these are not actions of a real user but rather  
actions that happened in the system after the user clicked "buy" on the whole  
basket. Note: We do not see an event for adding an item to the basket.


In [17]:
# Session ids to inspect; every row belonging to one of them is displayed.
sessions = [2316938]
in_selected_sessions = sampled_sessions_df['session_id'].isin(sessions)
sampled_sessions_df[in_selected_sessions]

Unnamed: 0,session_id,timestamp,item_id,is_buy
247578,2316938,2014-05-10T18:22:12.726Z,19500,0
247579,2316938,2014-05-10T18:25:17.282Z,19774,0
247580,2316938,2014-05-10T18:26:01.744Z,19499,0
247581,2316938,2014-05-10T18:27:26.906Z,19499,0
247582,2316938,2014-05-10T18:30:52.061Z,19499,0
247583,2316938,2014-05-10T18:31:50.137Z,19501,0
247584,2316938,2014-05-10T18:32:42.161Z,19774,0
247585,2316938,2014-05-10T18:34:03.407Z,19499,0
247586,2316938,2014-05-10T18:35:26.326Z,19774,0
247587,2316938,2014-05-10T18:45:04.322Z,19502,0



#### I2: Replay Buffer contains empty sequences
Look at the replay buffer example above. It contains **state**s with only padding  
items, which lead to a **next_state** with a single item. This may be an  
unlucky choice for training and testing data, as predicting the first item in  
a sequence based on nothing is random guessing that happens exactly once per  
session. If the average session length is between 5 and 10 interactions, then  
10 to 20% of the dataset/testset is random guessing.