In [1]:
import numpy as np
import pandas as pd
import datetime as dt

In [3]:
#Load the preprocessed events; the first CSV column is used as the DataFrame index.
events_df_reorganized = pd.read_csv(
    '../../events_df_reorganized.csv',
    index_col=0,
)


In [4]:
#In a session-based recommender system, the RNN tries to predict the next item that will be bought in a session.
#To evaluate the performance of our RNN system, we should split the data in a way that matches this goal.

In [5]:
#A cutoff time point is chosen.
#Sessions whose last buying operation happened before this time point become train data.
#Sessions whose last buying operation happened at or after this time point become test data.

In [6]:
#Split whole sessions into train/test according to the time of their last event.
#NOTE(review): `time` looks like Unix epoch seconds (values around 1.59e9), which would make
#this window roughly 23 days -- confirm the unit against the preprocessing step.
test_window = 2000000
tmax = events_df_reorganized.time.max()
split_time = tmax - test_window
#Time of the last event of every session; a session goes to exactly one split as a whole.
session_max_times = events_df_reorganized.groupby('sessionid').time.max()
session_train = session_max_times[session_max_times < split_time].index
session_test = session_max_times[session_max_times >= split_time].index
#Series.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
train = events_df_reorganized[events_df_reorganized.sessionid.isin(session_train)]
test = events_df_reorganized[events_df_reorganized.sessionid.isin(session_test)]

In [7]:
#In this RNN structure, when an input is given, the RNN estimates for each item the probability of being the next item in the session.
#Because of that, the test data must not contain any item that does not appear in the train data.
#Series.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
#NOTE(review): isin treats NaN as equal to NaN; this only matters if productid can be missing -- confirm.
test = test[test.productid.isin(train.productid)]
#Removing events may have shortened some sessions, so re-check that every session
#left in the test data still contains at least 2 events.
tslength = test.groupby('sessionid').size()
test = test[test.sessionid.isin(tslength[tslength>=2].index)]

In [8]:
#Summarise the full train split (events, distinct sessions, distinct items) and save it tab-separated without the index.
train_summary = 'Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
    len(train),
    train.sessionid.nunique(),
    train.productid.nunique(),
)
print(train_summary)
train.to_csv('events_df_reorganized_train_full.txt', sep='\t', index=False)

#Same summary and export for the test split.
test_summary = 'Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
    len(test),
    test.sessionid.nunique(),
    test.productid.nunique(),
)
print(test_summary)
test.to_csv('events_df_reorganized_test.txt', sep='\t', index=False)

Full train set
	Events: 138253
	Sessions: 15424
	Items: 5952
Test set
	Events: 95508
	Sessions: 11753
	Items: 5767


In [9]:
#The same procedure is repeated on the full train set to create the train (tr) and validation (dev) data.
#NOTE(review): `time` looks like Unix epoch seconds, which would make this window roughly 2.9 days -- confirm the unit.
valid_window = 250000
tmax = train.time.max()
valid_split_time = tmax - valid_window
#Time of the last event of every session in the full train set.
session_max_times = train.groupby('sessionid').time.max()
print(session_max_times.shape)
#Sessions that ended before the cutoff are kept for training, the rest go to validation.
session_train = session_max_times[session_max_times < valid_split_time].index
print(len(session_train))
session_valid = session_max_times[session_max_times >= valid_split_time].index
print(len(session_valid))
#Series.isin is used instead of np.in1d, which is deprecated as of NumPy 2.0.
train_tr = train[train.sessionid.isin(session_train)]
valid = train[train.sessionid.isin(session_valid)]
#Keep only validation events whose item also appears in the reduced train set.
valid = valid[valid.productid.isin(train_tr.productid)]
#After that filter, keep only validation sessions that still contain at least 2 events.
tslength = valid.groupby('sessionid').size()
valid = valid[valid.sessionid.isin(tslength[tslength>=2].index)]
print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), train_tr.sessionid.nunique(), train_tr.productid.nunique()))
train_tr.to_csv('train_tr.txt', sep='\t', index=False)
print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), valid.sessionid.nunique(), valid.productid.nunique()))
valid.to_csv('train_valid.txt', sep='\t', index=False)

(15424,)
10737
4687
Train set
	Events: 97453
	Sessions: 10737
	Items: 5772
Validation set
	Events: 40204
	Sessions: 4620
	Items: 4777


Full train set
	Events: 138384
	Sessions: 15445
	Items: 5955
Test set
	Events: 95609
	Sessions: 11773
	Items: 5770


In [11]:
#Display the full train split for a visual check (the notebook shows a truncated view of the frame).
train

Unnamed: 0,sessionid,productid,time
1,d2ea7bd3-9235-4a9f-a9ea-d7f296e71318,HBV00000U2B18,1.590991e+09
2,5e594788-78a0-44dd-8e66-37022d48f691,OFIS3101-080,1.590991e+09
4,9e9d4f7e-898c-40fb-aae9-256c40779933,HBV00000NE0T4,1.590991e+09
5,9e9d4f7e-898c-40fb-aae9-256c40779933,HBV00000NE0T6,1.590991e+09
7,bf3a141e-ed91-4dfa-b4e1-de5aadf61d97,HBV00000U2B4I,1.590991e+09
...,...,...,...
387468,974f9d79-355d-456b-8815-cb42063258be,HBV00000ABCBB,1.591492e+09
387473,974f9d79-355d-456b-8815-cb42063258be,ZYHEN4438,1.591492e+09
387474,974f9d79-355d-456b-8815-cb42063258be,ZYHEN100683890,1.591492e+09
387477,5e25ee2b-52d2-42c7-8b35-867cedf48243,HBV00000NE1DG,1.591492e+09


(15445,)
10755
4690
Train set
	Events: 97562
	Sessions: 10755
	Items: 5775
Validation set
	Events: 40226
	Sessions: 4623
	Items: 4779


In [13]:
#Display the current value of tmax; when cells run top to bottom it was last set from train.time.max() in the validation-split cell.
tmax

1591572890.614