In [2]:
from datetime import datetime
import pandas as pd
import numpy as np
import random
import time

# Dataset Details

In [2]:
def properties(path, sid, iid):
    """Print summary statistics of a tab-separated session click log.

    The file is read with ``sep='\\t'`` and must contain a ``'Time'`` column in
    addition to the two columns named by the arguments. Rows are ordered by
    (session, time); a click that repeats the item of the immediately preceding
    click of the same session is discarded before counting.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.
    sid : str
        Name of the session-id column.
    iid : str
        Name of the item-id column.

    Returns
    -------
    None
        The four statistics (clicks, sessions, items, average session length)
        are printed, not returned.
    """
    data = pd.read_csv(path, sep = '\t')
    data = data.sort_values([sid, 'Time']).reset_index(drop=True)

    # Compare every row with the row above it in one vectorised pass instead of
    # walking the frame with iterrows(). Row 0 has no predecessor and is always kept.
    session_ids = data[sid].to_numpy()
    item_ids = data[iid].to_numpy()
    repeated = np.zeros(len(data), dtype=bool)
    repeated[1:] = (session_ids[1:] == session_ids[:-1]) & (item_ids[1:] == item_ids[:-1])
    data = data[~repeated]

    clicks = len(data)
    print('Number of Clicks', clicks)
    sessions = data[sid].nunique()
    print('Number of Sessions', sessions)
    items = data[iid].nunique()
    print('Number of Items', items)
    # An empty file has no sessions; report NaN rather than raising ZeroDivisionError.
    average_length = clicks / sessions if sessions else float('nan')
    print('Average Session Length', average_length)

In [41]:
properties('tmall15-raw/processed/tmall15_train_full.txt', 'SessionId', 'ItemId')

Number of Clicks 10731643
Number of Sessions 1567725
Number of Items 618770
Average Session Length 6.845360634039771


In [33]:
properties('Benchmarking/TMall/TMall_test.csv', 'SessionID', 'ItemID')

Number of Clicks 93526
Number of Sessions 10041
Number of Items 43439
Average Session Length 9.314410915247485


In [44]:
(1567725+10041)

1577766

In [42]:
(10731643+93526) / (1567725+10041)

6.861073822100362

# RecSys15

In [113]:
import matplotlib.pyplot as plt
data = pd.read_csv('Benchmarking/RecSys/recSys15Valid.csv')
#data = pd.read_csv('Benchmarking/TMall/TMall_test.csv')
#data = pd.read_csv('Benchmarking/Digintica/Digintica_test.csv')
#data = pd.read_csv('Benchmarking/RetailRocket/RetailRocket_test.csv')

In [111]:
# Column names of the frame loaded in the previous cell.
# NOTE(review): the rest of this section addresses the columns as 'SessionID' / 'ItemID';
# confirm which spelling the loaded file actually uses.
session_col, item_col = 'sessionId', 'itemId'

# Mark every click that repeats the item of the row directly above it within the same session.
# One vectorised neighbour comparison replaces the per-row iterrows() loop and removes the -1
# sentinels, which would wrongly have dropped a first row whose ids happen to be -1.
session_ids = data[session_col].to_numpy()
item_ids = data[item_col].to_numpy()
repeated = np.zeros(len(data), dtype=bool)
repeated[1:] = (session_ids[1:] == session_ids[:-1]) & (item_ids[1:] == item_ids[:-1])
inx = np.flatnonzero(repeated)  # positions of the rows to discard

data.drop(data.index[inx], inplace=True)

In [8]:
# Group the clicks by session; no earlier cell of this section creates data_grouped, so it is
# built here to let the cell run on a fresh kernel.
data_grouped = data.groupby('SessionID')

## Keep Only Short Sessions < 5
data_short = data_grouped.filter(lambda x: len(x) < 5)
print('Finished_Short')

## Keep Only Intermediate Sessions >= 5 and < 10
data_intermediate = data_grouped.filter(lambda x: 5 <= len(x) < 10)
print('Finished_Intermediate')

## Keep Only Long Sessions >= 10
data_long = data_grouped.filter(lambda x: len(x) >= 10)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [9]:
data_short.head()

Unnamed: 0,SessionID,Time,ItemID
0,1,1396857000.0,214536502
1,1,1396857000.0,214536500
2,1,1396857000.0,214536506
3,1,1396857000.0,214577561
10,3,1396434000.0,214716935


In [10]:
#Save Splits Short
data_short.to_csv('2-a-1(short).csv', index=False)
#Save Splits Intermediate
data_intermediate.to_csv('2-a-2(Intermediate).csv', index=False)
#Save Splits Long
data_long.to_csv('2-a-3(Long).csv', index=False)

In [11]:
del data_grouped
del data_short
del data_intermediate
del data_long

c. Which model performs better for small/large period of time of dataset collection?

In [25]:
sec_per_day = 86400
# Take the real extremes of the 'Time' column. The first/last row only holds the earliest/latest
# timestamp when the frame is sorted by time, which nothing in this notebook guarantees.
last_date = data['Time'].max()
first_date = data['Time'].min()
#two days
data_recent2 = data[data['Time'] > (last_date - sec_per_day * 2)]
#one Week
data_recent7 = data[data['Time'] > (last_date - sec_per_day * 7)]
#two weeks
data_recent14 = data[data['Time'] > (last_date - sec_per_day * 14)]
#One Month
data_recent30 = data[data['Time'] > (last_date - sec_per_day * 30)]
#More than One Month
#data

In [14]:
data_recent2.head()

Unnamed: 0,SessionID,Time,ItemID
29753408,11249479,1411403000.0,214834935
29753409,11249479,1411403000.0,214582942
29753410,11249479,1411403000.0,214582942
29753690,11249649,1411404000.0,214848605
29753691,11249649,1411404000.0,214853072


In [15]:
#Save 2 days
data_recent2.to_csv('2-c-1(2days).csv', index=False)
#Save 7 days
data_recent7.to_csv('2-c-2(7days).csv', index=False)
#Save 14 days
data_recent14.to_csv('2-c-3(14days).csv', index=False)
#Save 30 days
data_recent30.to_csv('2-c-4(30days).csv', index=False)

In [16]:
del data_recent2
del data_recent7
del data_recent14
del data_recent30

d. How model is affected by data recency? Recent data is enough or needs history?

In [51]:
#most recent 10 days
data_recent10 = data[data['Time'] > (last_date - sec_per_day * 10)]
#oldest 10 days
data_old10 = data[data['Time'] < (first_date + sec_per_day * 10)]
#mix: oldest 5 days followed by most recent 5 days
data_mix1 = data[data['Time'] < (first_date + sec_per_day * 5)]
data_mix2 = data[data['Time'] > (last_date - sec_per_day * 5)]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked, re-indexed frame.
data_mix = pd.concat([data_mix1, data_mix2], ignore_index=True)

In [52]:
#Save most recent
data_recent10.to_csv('2-d-1(recent).csv', index=False)
#Save most old
data_old10.to_csv('2-d-2(old).csv', index=False)
#Save mix
data_mix.to_csv('2-d-3(mix).csv', index=False)

In [53]:
del data_recent10
del data_old10
del data_mix

e. Which model performs better at cold start problem (new item (freq < 5) / short session (length < 5) ) ?

**Train Using 2-c-1(2days).csv and Validate using the following splits**

In [17]:
data_test = pd.read_csv('RecSys_Dataset_After/recSys15Valid.txt')
data_test_grouped = data_test.groupby('SessionID')

**Sessions Part**

In [18]:
# Test sessions with fewer than 5 clicks
data_test_short = data_test_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Test sessions with 5 to 9 clicks
data_test_intermediate = data_test_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Test sessions with 10 or more clicks
data_test_long = data_test_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [19]:
#Save Splits Short
data_test_short.to_csv('2-e-1(test-short).csv', index=False)
#Save Splits Intermediate
data_test_intermediate.to_csv('2-e-2(test-Intermediate).csv', index=False)
#Save Splits Long
data_test_long.to_csv('2-e-3(test-Long).csv', index=False)

In [20]:
del data_test_short
del data_test_intermediate
del data_test_long

**Items Part**

In [35]:
# data_recent2 was deleted right after it was saved, so rebuild the 2-day window here;
# otherwise this cell raises NameError when the notebook is run top to bottom.
data_recent2 = data[data['Time'] > (last_date - sec_per_day * 2)]
# Number of occurrences of each item in the 2-day training window.
items_counts = data_recent2['ItemID'].value_counts().to_dict()
data_test_cpy = data_test.copy(deep = True)

In [40]:
# Training frequency of the item behind every test click (NaN when the item is not in items_counts).
train_freq = data_test_cpy['ItemID'].map(items_counts)

# Label each click with the upper bound of its item's frequency bucket:
# <50 -> 50, <100 -> 100, <200 -> 200, <300 -> 300, everything else -> 301.
# NOTE(review): items missing from items_counts also land in the 301 bucket, exactly as in the
# former row-by-row loop, i.e. unseen items are grouped with the most frequent ones — confirm.
bucket_bounds = [50, 100, 200, 300]
data_test_cpy['freq_threshold'] = np.select(
    [train_freq < bound for bound in bucket_bounds],  # first matching condition wins
    bucket_bounds,
    default=301,
).astype(float)  # float matches the dtype produced by the former per-cell .loc assignment

In [41]:
#Save Splits Short
data_test_cpy.to_csv('2-e-4(freq-threshold).csv', index=False)

In [24]:
del data_test_cpy

f. Which model performs better at small/high number of recommended items K (ie. Recall@K - MRR@K) ?

**For each of the above experiments:**
1. Record Recall@3, Recall@5, Recall@10, Recall@20, Recall@30
2. Record MRR@3, MRR@5, MRR@10, MRR@20, MRR@30

### 3. What is the effect of adding Item Features on the previous questions?

**Not Applicable in this Dataset**

### 4. Under constraints of lack of data which model performs better?

In [54]:
# NOTE: these positional tails are reassigned by the session-based sampling further down
# before anything is saved.
# The start position is computed explicitly: data.iloc[-k:] returns the WHOLE frame when k == 0.
n_rows = len(data)
data_2 = data.iloc[n_rows - n_rows // 2 :]   #just take 1/2 last instances
data_8 = data.iloc[n_rows - n_rows // 8 :]   #just take 1/8 last instances
data_16 = data.iloc[n_rows - n_rows // 16 :]   #just take 1/16 last instances
data_64 = data.iloc[n_rows - n_rows // 64 :]   #just take 1/64 last instances
data_256 = data.iloc[n_rows - n_rows // 256 :]   #just take 1/256 last instances

In [59]:
all_ids = data['SessionID'].unique()
# Shuffle with a fixed seed so the sampled portions are the same on every run.
random.Random(42).shuffle(all_ids)

In [62]:
ids = all_ids[: int(len(all_ids) / 2)]
data_2 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 8)]
data_8 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 16)]
data_16 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 64)]
data_64 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 256)]
data_256 = data[data['SessionID'].isin(ids)]

data_2.to_csv('4-1(2 portion).csv', index=False)
data_8.to_csv('4-2(8 portion).csv', index=False)
data_16.to_csv('4-3(16 portion).csv', index=False)
data_64.to_csv('4-4(64 portion).csv', index=False)
data_256.to_csv('4-5(256 portion).csv', index=False)

Finished_Short
Finished_Short
Finished_Short
Finished_Short


In [63]:
del data_2
del data_8
del data_16
del data_64
del data_256

### 5. How to evaluate recommendation system in a standard way?

**Train Using 4-3(16 portion).csv and measure the following metrics on test set for K = 1, 3, 5, 10, 20**:

**Experiments should be done on the same machine**

a. items popularity = Average(Frequency of predicted item from training set / Frequency of most frequent item in training set)  

b. item coverage = (Number of distinct predicted items / Number of distinct total items in test set).

c. Mean Reciprocal Rank

d. Recall

e. Time (Training Time + Testing Time) and Memory complexities

# RetailRocket

In [144]:
data = pd.read_csv('retailrocket/processed/events_train_full.txt', sep = '\t')
data.columns = ['Time', 'UserID', 'ItemID', 'SessionID']

# Drop every click that repeats the item of the row directly above it within the same session.
# One vectorised neighbour comparison replaces the per-row iterrows() loop and removes the -1
# sentinels, which would wrongly have dropped a first row whose ids happen to be -1.
session_ids = data['SessionID'].to_numpy()
item_ids = data['ItemID'].to_numpy()
repeated = np.zeros(len(data), dtype=bool)
repeated[1:] = (session_ids[1:] == session_ids[:-1]) & (item_ids[1:] == item_ids[:-1])
inx = np.flatnonzero(repeated)  # positions of the rows to discard

data.drop(data.index[inx], inplace=True)

### 2. Which deep learning model is better for different dataset meta-features values using only Session ID and item ID
a. Which model performs better when we have short/long session length?

In [145]:
data_grouped = data.groupby('SessionID')

In [146]:
# Sessions with fewer than 5 clicks
data_short = data_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Sessions with 5 to 9 clicks
data_intermediate = data_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Sessions with 10 or more clicks
data_long = data_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [147]:
data_short.head()

Unnamed: 0,Time,UserID,ItemID,SessionID
0,1442004589,0,285930,0
1,1442004759,0,357564,0
2,1442004917,0,67045,0
11,1440916778,6,253615,7
12,1440916823,6,344723,7


In [148]:
#Save Splits Short
data_short.to_csv('2-a-1(short).csv', index=False)
#Save Splits Intermediate
data_intermediate.to_csv('2-a-2(Intermediate).csv', index=False)
#Save Splits Long
data_long.to_csv('2-a-3(Long).csv', index=False)

In [149]:
data.to_csv('Train.csv', index=False)

In [150]:
del data_grouped
del data_short
del data_intermediate
del data_long

c. Which model performs better for small/large period of time of dataset collection?

In [151]:
sec_per_day = 86400
# Series.max/min run vectorised; the builtin max()/min() iterate over the column element by element.
last_date = data['Time'].max()
first_date = data['Time'].min()
print(last_date, first_date)
#two days
data_recent2 = data[data['Time'] > (last_date - sec_per_day * 2)]
#one Week
data_recent7 = data[data['Time'] > (last_date - sec_per_day * 7)]
#two weeks
data_recent14 = data[data['Time'] > (last_date - sec_per_day * 14)]
#One Month
data_recent30 = data[data['Time'] > (last_date - sec_per_day * 30)]
#More than One Month
#data

1442372382 1430622033


In [152]:
data_recent2.head()

Unnamed: 0,Time,UserID,ItemID,SessionID
36,1442338531,54,388096,66
37,1442338665,54,283115,66
38,1442338748,54,38965,66
39,1442338841,54,319680,66
40,1442339111,54,283115,66


In [153]:
#Save 2 days
data_recent2.to_csv('2-c-1(2days).csv', index=False)
#Save 7 days
data_recent7.to_csv('2-c-2(7days).csv', index=False)
#Save 14 days
data_recent14.to_csv('2-c-3(14days).csv', index=False)
#Save 30 days
data_recent30.to_csv('2-c-4(30days).csv', index=False)

In [154]:
del data_recent2
del data_recent7
del data_recent14
del data_recent30

d. How model is affected by data recency? Recent data is enough or needs history?

In [155]:
#most recent 20 days (the variable keeps its historical "10" suffix)
data_recent10 = data[data['Time'] > (last_date - sec_per_day * 20)]
#oldest 20 days
data_old10 = data[data['Time'] < (first_date + sec_per_day * 20)]
#mix: oldest 10 days followed by most recent 10 days
data_mix1 = data[data['Time'] < (first_date + sec_per_day * 10)]
data_mix2 = data[data['Time'] > (last_date - sec_per_day * 10)]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked, re-indexed frame.
data_mix = pd.concat([data_mix1, data_mix2], ignore_index=True)

In [156]:
#Save most recent
data_recent10.to_csv('2-d-1(recent).csv', index=False)
#Save most old
data_old10.to_csv('2-d-2(old).csv', index=False)
#Save mix
data_mix.to_csv('2-d-3(mix).csv', index=False)

In [157]:
del data_recent10
del data_old10
del data_mix

e. Which model performs better at cold start problem (new item (freq < 5) / short session (length < 5) ) ?

**Train Using 2-c-4(2days).csv and Validate using the following splits**

In [159]:
data_test = pd.read_csv('retailrocket/processed/events_test.txt', sep = '\t')
data_test.columns = ['Time', 'UserID', 'ItemID', 'SessionID']
data_test_grouped = data_test.groupby('SessionID')

**Sessions Part**

In [160]:
# Test sessions with fewer than 5 clicks
data_test_short = data_test_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Test sessions with 5 to 9 clicks
data_test_intermediate = data_test_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Test sessions with 10 or more clicks
data_test_long = data_test_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [161]:
data_test.to_csv('Rocket_Test.csv', index=False)

In [162]:
#Save Splits Short
data_test_short.to_csv('2-e-1(test-short).csv', index=False)
#Save Splits Intermediate
data_test_intermediate.to_csv('2-e-2(test-intermediate).csv', index=False)
#Save Splits Long
data_test_long.to_csv('2-e-3(test-long).csv', index=False)

In [163]:
del data_test_short
del data_test_intermediate
del data_test_long

**Items Part**

In [164]:
items_counts = data['ItemID'].value_counts().to_dict()
data_test_cpy = data_test.copy(deep = True)

In [165]:
# Training frequency of the item behind every test click (NaN when the item is not in items_counts).
train_freq = data_test_cpy['ItemID'].map(items_counts)

# Label each click with the upper bound of its item's frequency bucket:
# <10 -> 10, <30 -> 30, <60 -> 60, <100 -> 100, everything else -> 101.
# NOTE(review): items missing from items_counts also land in the 101 bucket, exactly as in the
# former row-by-row loop, i.e. unseen items are grouped with the most frequent ones — confirm.
bucket_bounds = [10, 30, 60, 100]
data_test_cpy['freq_threshold'] = np.select(
    [train_freq < bound for bound in bucket_bounds],  # first matching condition wins
    bucket_bounds,
    default=101,
).astype(float)  # float matches the dtype produced by the former per-cell .loc assignment

In [166]:
#Save Splits Short
data_test_cpy.to_csv('2-e-4(freq-threshold).csv', index=False)

In [167]:
del data_test_cpy

f. Which model performs better at small/high number of recommended items K (ie. Recall@K - MRR@K) ?

**For each of the above experiments:**
1. Record Recall@3, Recall@5, Recall@10, Recall@20, Recall@30
2. Record MRR@3, MRR@5, MRR@10, MRR@20, MRR@30

### 4. Under constraints of lack of data which model performs better?

In [168]:
# NOTE: these positional tails are reassigned by the session-based sampling further down
# before anything is saved.
# The start position is computed explicitly: data.iloc[-k:] returns the WHOLE frame when k == 0.
n_rows = len(data)
data_2 = data.iloc[n_rows - n_rows // 2 :]   #just take 1/2 last instances
data_8 = data.iloc[n_rows - n_rows // 8 :]   #just take 1/8 last instances
data_16 = data.iloc[n_rows - n_rows // 16 :]   #just take 1/16 last instances
data_64 = data.iloc[n_rows - n_rows // 64 :]   #just take 1/64 last instances
data_256 = data.iloc[n_rows - n_rows // 256 :]   #just take 1/256 last instances

In [169]:
all_ids = data['SessionID'].unique()
# Shuffle with a fixed seed so the sampled portions are the same on every run.
random.Random(42).shuffle(all_ids)

In [170]:
ids = all_ids[: int(len(all_ids) / 2)]
data_2 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 8)]
data_8 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 16)]
data_16 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 64)]
data_64 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 256)]
data_256 = data[data['SessionID'].isin(ids)]

data_2.to_csv('4-1(2 portion).csv', index=False)
data_8.to_csv('4-2(8 portion).csv', index=False)
data_16.to_csv('4-3(16 portion).csv', index=False)
data_64.to_csv('4-4(64 portion).csv', index=False)
data_256.to_csv('4-5(256 portion).csv', index=False)

Finished_Short
Finished_Short
Finished_Short
Finished_Short


In [171]:
del data_2
del data_8
del data_16
del data_64
del data_256

### 5. How to evaluate recommendation system in a standard way?

**Train Using 4-3(16 portion).csv and measure the following metrics on test set for K = 1, 3, 5, 10, 20**:

**Experiments should be done on the same machine**

a. items popularity = Average(Frequency of predicted item from training set / Frequency of most frequent item in training set)  

b. item coverage = (Number of distinct predicted items / Number of distinct total items in test set).

c. Mean Reciprocal Rank

d. Recall

e. Time (Training Time + Testing Time) and Memory complexities

# TMall

In [178]:
data = pd.read_csv('tmall15-raw/processed/tmall15_train_full.txt', sep = '\t')
data.columns = ['UserID', 'ItemID', 'SessionID', 'Time']

# Drop every click that repeats the item of the row directly above it within the same session.
# The session column is 'SessionID' after the rename above; the former loop read 'SessionId',
# which no longer exists and raised KeyError. The vectorised neighbour comparison also removes
# the -1 sentinels, which would wrongly have dropped a first row whose ids happen to be -1.
session_ids = data['SessionID'].to_numpy()
item_ids = data['ItemID'].to_numpy()
repeated = np.zeros(len(data), dtype=bool)
repeated[1:] = (session_ids[1:] == session_ids[:-1]) & (item_ids[1:] == item_ids[:-1])
inx = np.flatnonzero(repeated)  # positions of the rows to discard

data.drop(data.index[inx], inplace=True)

In [179]:
data.head()

Unnamed: 0,UserID,ItemID,SessionID,Time
0,328862,706479,1813025,1442783000.0
1,328862,544690,1813025,1442783000.0
2,328862,93242,1813025,1442783000.0
3,328862,605963,1813025,1442783000.0
4,328862,589922,1813025,1442783000.0


### 2. Which deep learning model is better for different dataset meta-features values using only Session ID and item ID
a. Which model performs better when we have short/long session length?

In [180]:
data_grouped = data.groupby('SessionID')

In [181]:
# Sessions with fewer than 5 clicks
data_short = data_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Sessions with 5 to 9 clicks
data_intermediate = data_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Sessions with 10 or more clicks
data_long = data_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [182]:
data_short.head()

Unnamed: 0,UserID,ItemID,SessionID,Time
5,328862,313424,1813030,1444684000.0
6,328862,313424,1813030,1444684000.0
7,328862,1024236,1813027,1443733000.0
8,328862,99493,1813027,1443733000.0
9,328862,251929,1813027,1443733000.0


In [183]:
#Save Splits Short
data_short.to_csv('2-a-1(short).csv', index=False)
#Save Splits Intermediate
data_intermediate.to_csv('2-a-2(intermediate).csv', index=False)
#Save Splits Long
data_long.to_csv('2-a-3(long).csv', index=False)

In [184]:
del data_grouped
del data_short
del data_intermediate
del data_long

c. Which model performs better for small/large period of time of dataset collection?

In [185]:
# Series.max/min run vectorised; the builtin max()/min() iterate over the column element by element.
last_date = data['Time'].max()
first_date = data['Time'].min()
print(first_date, last_date)

1440968400.0 1446069895.0


In [199]:
sec_per_day = 86400
print(last_date)
#two days
data_recent2 = data[data['Time'] > (last_date - sec_per_day * 2)]
#one Week
data_recent7 = data[data['Time'] > (last_date - sec_per_day * 7)]
#two weeks
data_recent14 = data[data['Time'] > (last_date - sec_per_day * 14)]
#One Month
data_recent30 = data[data['Time'] > (last_date - sec_per_day * 30)]
#More than One Month
#data

1446069895.0


In [187]:
data_recent2.head()

Unnamed: 0,UserID,ItemID,SessionID,Time
368,237078,52943,1304381,1445983000.0
369,237078,1038902,1304381,1445983000.0
411,300681,155199,1657228,1446070000.0
412,300681,14674,1657228,1446070000.0
413,300681,440085,1657228,1446070000.0


In [188]:
#Save 2 days
data_recent2.to_csv('2-c-1(2days).csv', index=False)
#Save 7 days
data_recent7.to_csv('2-c-2(7days).csv', index=False)
#Save 14 days
data_recent14.to_csv('2-c-3(14days).csv', index=False)
#Save 30 days
data_recent30.to_csv('2-c-4(30days).csv', index=False)

In [189]:
del data_recent7
del data_recent14
del data_recent30

d. How model is affected by data recency? Recent data is enough or needs history?

In [190]:
#most recent 10 days
data_recent10 = data[data['Time'] > (last_date - sec_per_day * 10)]
#oldest 10 days
data_old10 = data[data['Time'] < (first_date + sec_per_day * 10)]
#mix: oldest 5 days followed by most recent 5 days
data_mix1 = data[data['Time'] < (first_date + sec_per_day * 5)]
data_mix2 = data[data['Time'] > (last_date - sec_per_day * 5)]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked, re-indexed frame.
data_mix = pd.concat([data_mix1, data_mix2], ignore_index=True)

In [191]:
#Save most recent
data_recent10.to_csv('2-d-1(recent).csv', index=False)
#Save most old
data_old10.to_csv('2-d-2(old).csv', index=False)
#Save mix
data_mix.to_csv('2-d-3(mix).csv', index=False)

In [192]:
del data_recent10
del data_old10
del data_mix

e. Which model performs better at cold start problem (new item (freq < 5) / short session (length < 5) ) ?

**Train Using 2-c-1(2days).csv and Validate using the following splits**

In [2]:
data_test = pd.read_csv('tmall15-raw/processed/tmall15_test.txt', sep = '\t')
data_test.columns = ['UserID', 'ItemID', 'SessionID', 'Time']
data_test_grouped = data_test.groupby('SessionID')
data_test.to_csv('TMall_test.csv', index = False)

In [194]:
last_datet = max(data_test['Time'])
first_datet = min(data_test['Time'])
print(first_datet, last_datet)

1446156000.0 1446156554.0


**Sessions Part**

In [195]:
# Test sessions with fewer than 5 clicks
data_test_short = data_test_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Test sessions with 5 to 9 clicks
data_test_intermediate = data_test_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Test sessions with 10 or more clicks
data_test_long = data_test_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [196]:
#Save Splits Short
data_test_short.to_csv('2-e-1(test-short).csv', index=False)
#Save Splits Intermediate
data_test_intermediate.to_csv('2-e-2(test-intermediate).csv', index=False)
#Save Splits Long
data_test_long.to_csv('2-e-3(test-long).csv', index=False)

In [197]:
del data_test_short
del data_test_intermediate
del data_test_long

**Items Part**

In [200]:
items_counts = data_recent2['ItemID'].value_counts().to_dict()
data_test_cpy = data_test.copy(deep = True)

In [201]:
# Training frequency of the item behind every test click (NaN when the item is not in items_counts).
train_freq = data_test_cpy['ItemID'].map(items_counts)

# Label each click with the upper bound of its item's frequency bucket:
# <50 -> 50, <100 -> 100, <200 -> 200, <300 -> 300, everything else -> 301.
# NOTE(review): items missing from items_counts also land in the 301 bucket, exactly as in the
# former row-by-row loop, i.e. unseen items are grouped with the most frequent ones — confirm.
bucket_bounds = [50, 100, 200, 300]
data_test_cpy['freq_threshold'] = np.select(
    [train_freq < bound for bound in bucket_bounds],  # first matching condition wins
    bucket_bounds,
    default=301,
).astype(float)  # float matches the dtype produced by the former per-cell .loc assignment

In [202]:
#Save Splits Short
data_test_cpy.to_csv('2-e-4(freq-threshold).csv', index=False)

In [203]:
del data_test_cpy

f. Which model performs better at small/high number of recommended items K (ie. Recall@K - MRR@K) ?

**For each of the above experiments:**
1. Record Recall@3, Recall@5, Recall@10, Recall@20, Recall@30
2. Record MRR@3, MRR@5, MRR@10, MRR@20, MRR@30

### 4. Under constraints of lack of data which model performs better?

In [204]:
# NOTE: these positional tails are reassigned by the session-based sampling further down
# before anything is saved.
# The start position is computed explicitly: data.iloc[-k:] returns the WHOLE frame when k == 0.
n_rows = len(data)
#data_2 = data.iloc[n_rows - n_rows // 2 :]   #just take 1/2 last instances
data_8 = data.iloc[n_rows - n_rows // 8 :]   #just take 1/8 last instances
data_16 = data.iloc[n_rows - n_rows // 16 :]   #just take 1/16 last instances
data_64 = data.iloc[n_rows - n_rows // 64 :]   #just take 1/64 last instances
data_256 = data.iloc[n_rows - n_rows // 256 :]   #just take 1/256 last instances

In [205]:
all_ids = data['SessionID'].unique()
# Shuffle with a fixed seed so the sampled portions are the same on every run.
random.Random(42).shuffle(all_ids)

In [206]:
#ids = all_ids[: int(len(all_ids) / 2)]
#data_2 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 8)]
data_8 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 16)]
data_16 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 64)]
data_64 = data[data['SessionID'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 256)]
data_256 = data[data['SessionID'].isin(ids)]

#data_2.to_csv('4-1(2 portion).csv', index=False)
data_8.to_csv('4-2(8 portion).csv', index=False)
data_16.to_csv('4-3(16 portion).csv', index=False)
data_64.to_csv('4-4(64 portion).csv', index=False)
data_256.to_csv('4-5(256 portion).csv', index=False)

Finished_Short
Finished_Short
Finished_Short
Finished_Short


In [207]:
#del data_2
del data_8
del data_16
del data_64
del data_256

### 5. How to evaluate recommendation system in a standard way?

**Train Using 4-3(16 portion).csv and measure the following metrics on test set for K = 1, 3, 5, 10, 20**:

**Experiments should be done on the same machine**

a. items popularity = Average(Frequency of predicted item from training set / Frequency of most frequent item in training set)  

b. item coverage = (Number of distinct predicted items / Number of distinct total items in test set).

c. Mean Reciprocal Rank

d. Recall

e. Time (Training Time + Testing Time) and Memory complexities

# CIKMCUP

In [28]:
data = pd.read_csv('Digintica/Digintica_train.csv')
data_test = pd.read_csv('Digintica/Digintica_test.csv')

# Drop every click that repeats the item of the row directly above it within the same session.
# One vectorised neighbour comparison replaces the per-row iterrows() loop and removes the -1
# sentinels, which would wrongly have dropped a first row whose ids happen to be -1.
session_ids = data['sessionId'].to_numpy()
item_ids = data['itemId'].to_numpy()
repeated = np.zeros(len(data), dtype=bool)
repeated[1:] = (session_ids[1:] == session_ids[:-1]) & (item_ids[1:] == item_ids[:-1])
inx = np.flatnonzero(repeated)  # positions of the rows to discard

data.drop(data.index[inx], inplace=True)

In [9]:
data.head()

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate,category,price
0,4076,1794.0,30895,17862,2016-01-01,88.0,9.0
1,4077,1795.0,79203,9430,2016-01-01,619.0,9.0
2,4733,2072.0,36031,15524,2016-01-01,822.0,9.0
3,4734,2073.0,89649,8400,2016-01-01,446.0,7.0
4,4735,2074.0,5602,75323,2016-01-01,495.0,6.0


### 2. Which deep learning model is better for different dataset meta-features values using only Session ID and item ID
a. Which model performs better when we have short/long session length?

In [68]:
data_grouped = data.groupby('sessionId')

In [69]:
# Sessions with fewer than 5 clicks
data_short = data_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Sessions with 5 to 9 clicks
data_intermediate = data_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Sessions with 10 or more clicks
data_long = data_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [70]:
data_short.head()

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate,category,price
0,4076,1794.0,30895,17862,2016-01-01,88.0,9.0
1,4077,1795.0,79203,9430,2016-01-01,619.0,9.0
2,4733,2072.0,36031,15524,2016-01-01,822.0,9.0
3,4734,2073.0,89649,8400,2016-01-01,446.0,7.0
4,4735,2074.0,5602,75323,2016-01-01,495.0,6.0


In [71]:
#Save Splits Short
data_short.to_csv('2-a-1(short).csv', index=False)
#Save Splits Intermediate
data_intermediate.to_csv('2-a-2(Intermediate).csv', index=False)
#Save Splits Long
data_long.to_csv('2-a-3(Long).csv', index=False)

In [72]:
del data_grouped
del data_short
del data_intermediate
del data_long

c. Which model performs better for small/large period of time of dataset collection?

In [30]:
times = data['eventdate'].to_numpy()

In [None]:
# Convert the 'YYYY-MM-DD' strings to epoch seconds. Each distinct date is parsed once and the
# result reused for every row carrying it, instead of calling strptime for every row.
# time.mktime interprets the date as local midnight, as the former per-row loop did.
# `times` is left untouched (the loop used to overwrite it in place), so re-running this cell
# on its own yields the same result. The column is now float64 rather than object.
epoch_by_date = {
    day: time.mktime(datetime.strptime(day, "%Y-%m-%d").timetuple())
    for day in pd.unique(times)
}
data['eventdate'] = [epoch_by_date[day] for day in times]

In [33]:
data.head()

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate,category,price
0,4076,1794.0,30895,17862,1451600000.0,88.0,9.0
1,4077,1795.0,79203,9430,1451600000.0,619.0,9.0
2,4733,2072.0,36031,15524,1451600000.0,822.0,9.0
3,4734,2073.0,89649,8400,1451600000.0,446.0,7.0
4,4735,2074.0,5602,75323,1451600000.0,495.0,6.0


In [79]:
sec_per_day = 86400
# Take the real extremes of the 'eventdate' column. The first/last row only holds the
# earliest/latest date when the frame is sorted by date, which nothing in this notebook guarantees.
last_date = data['eventdate'].max()
first_date = data['eventdate'].min()
#two days
data_recent2 = data[data['eventdate'] > (last_date - sec_per_day * 2)]
#one Week
data_recent7 = data[data['eventdate'] > (last_date - sec_per_day * 7)]
#two weeks
data_recent14 = data[data['eventdate'] > (last_date - sec_per_day * 14)]
#One Month
data_recent30 = data[data['eventdate'] > (last_date - sec_per_day * 30)]
#More than One Month
#data

In [36]:
data_recent2.head()

Unnamed: 0,sessionId,userId,itemId,timeframe,eventdate,category,price
1190647,2192,975.0,33914,11747,1464560000.0,88.0,9.0
1190648,2195,,18363,10356,1464560000.0,191.0,7.0
1190649,2663,,389860,17571,1464560000.0,47.0,10.0
1190650,2664,,134497,106376,1464560000.0,528.0,9.0
1190651,2664,,103537,400197,1464560000.0,528.0,9.0


In [37]:
#Save 2 days
data_recent2.to_csv('2-c-1(2days).csv', index=False)
#Save 7 days
data_recent7.to_csv('2-c-2(7days).csv', index=False)
#Save 14 days
data_recent14.to_csv('2-c-3(14days).csv', index=False)
#Save 30 days
data_recent30.to_csv('2-c-4(30days).csv', index=False)

In [38]:
del data_recent2
del data_recent7
del data_recent14
del data_recent30

d. How model is affected by data recency? Recent data is enough or needs history?

In [47]:
#first_date = last_date - sec_per_day * 150
# Earliest date in the frame; does not rely on the first row being the oldest one.
first_date = data['eventdate'].min()

In [62]:
#most recent 10 days
data_recent10 = data[data['eventdate'] > (last_date - sec_per_day * 10)]
#oldest 45 days (the variable keeps its historical "10" suffix)
data_old10 = data[data['eventdate'] < (first_date + sec_per_day * 45)]
#mix: oldest 35 days followed by most recent 5 days
data_mix1 = data[data['eventdate'] < (first_date + sec_per_day * 35)]
data_mix2 = data[data['eventdate'] > (last_date - sec_per_day * 5)]
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same stacked, re-indexed frame.
data_mix = pd.concat([data_mix1, data_mix2], ignore_index=True)

In [63]:
#Save most recent
data_recent10.to_csv('2-d-1(recent).csv', index=False)
#Save most old
data_old10.to_csv('2-d-2(old).csv', index=False)
#Save mix
data_mix.to_csv('2-d-3(mix).csv', index=False)

In [None]:
# Release the recency splits now that they are saved.
del data_recent10
del data_old10  # the frame is named data_old10; 'data_old150' never existed and raised NameError
del data_mix

e. Which model performs better at cold start problem (new item (freq < 50) / short session (length < 5) ) ?

**Train Using 2-c-4(30days).csv and Validate using the following splits**

In [74]:
data_test = pd.read_csv('Digintica/Digintica_test.csv')
data_test_grouped = data_test.groupby('sessionId')

**Sessions Part**

In [75]:
# Test sessions with fewer than 5 clicks
data_test_short = data_test_grouped.filter(lambda session: len(session) < 5)
print('Finished_Short')

# Test sessions with 5 to 9 clicks
data_test_intermediate = data_test_grouped.filter(lambda session: 5 <= len(session) < 10)
print('Finished_Intermediate')

# Test sessions with 10 or more clicks
data_test_long = data_test_grouped.filter(lambda session: len(session) > 9)
print('Finished_Long')

Finished_Short
Finished_Intermediate
Finished_Long


In [76]:
#Save Splits Short
data_test_short.to_csv('2-e-1(test-short).csv', index=False)
#Save Splits Intermediate (numbered 2-e-2 like in the other dataset sections; it was mislabelled 2-e-3)
data_test_intermediate.to_csv('2-e-2(test-Intermediate).csv', index=False)
#Save Splits Long
data_test_long.to_csv('2-e-3(test-Long).csv', index=False)

In [77]:
del data_test_short
del data_test_intermediate
del data_test_long

**Items Part**

In [85]:
# data_recent30 was deleted right after it was saved, so rebuild the 30-day window here;
# otherwise this cell raises NameError when the notebook is run top to bottom.
data_recent30 = data[data['eventdate'] > (last_date - sec_per_day * 30)]
# Number of occurrences of each item in the 30-day training window.
items_counts = data_recent30['itemId'].value_counts().to_dict()
data_test_cpy = data_test.copy(deep = True)

In [86]:
# Training frequency of the item behind every test click (NaN when the item is not in items_counts).
train_freq = data_test_cpy['itemId'].map(items_counts)

# Label each click with the upper bound of its item's frequency bucket:
# <10 -> 10, <30 -> 30, <60 -> 60, <100 -> 100, everything else -> 101.
# NOTE(review): items missing from items_counts also land in the 101 bucket, exactly as in the
# former row-by-row loop, i.e. unseen items are grouped with the most frequent ones — confirm.
bucket_bounds = [10, 30, 60, 100]
data_test_cpy['freq_threshold'] = np.select(
    [train_freq < bound for bound in bucket_bounds],  # first matching condition wins
    bucket_bounds,
    default=101,
).astype(float)  # float matches the dtype produced by the former per-cell .loc assignment

In [87]:
#Save Splits Short
data_test_cpy.to_csv('2-e-4(freq-threshold).csv', index=False)

In [88]:
del data_test_cpy

f. Which model performs better at small/high number of recommended items K (ie. Recall@K - MRR@K) ?

**For each of the above experiments:**
1. Record Recall@3, Recall@5, Recall@10, Recall@20, Recall@30
2. Record MRR@3, MRR@5, MRR@10, MRR@20, MRR@30

### 4. Under constraints of lack of data which model performs better?

In [89]:
# NOTE: these positional tails are reassigned by the session-based sampling further down
# before anything is saved.
# The start position is computed explicitly: data.iloc[-k:] returns the WHOLE frame when k == 0.
n_rows = len(data)
data_2 = data.iloc[n_rows - n_rows // 2 :]   #just take 1/2 last instances
data_8 = data.iloc[n_rows - n_rows // 8 :]   #just take 1/8 last instances
data_16 = data.iloc[n_rows - n_rows // 16 :]   #just take 1/16 last instances
data_64 = data.iloc[n_rows - n_rows // 64 :]   #just take 1/64 last instances
data_256 = data.iloc[n_rows - n_rows // 256 :]   #just take 1/256 last instances

In [90]:
all_ids = data['sessionId'].unique()
# Shuffle with a fixed seed so the sampled portions are the same on every run.
random.Random(42).shuffle(all_ids)

In [91]:
ids = all_ids[: int(len(all_ids) / 2)]
data_2 = data[data['sessionId'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 8)]
data_8 = data[data['sessionId'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 16)]
data_16 = data[data['sessionId'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 64)]
data_64 = data[data['sessionId'].isin(ids)]
print('Finished_Short')

ids = all_ids[: int(len(all_ids) / 256)]
data_256 = data[data['sessionId'].isin(ids)]

data_2.to_csv('4-1(2 portion).csv', index=False)
data_8.to_csv('4-2(8 portion).csv', index=False)
data_16.to_csv('4-3(16 portion).csv', index=False)
data_64.to_csv('4-4(64 portion).csv', index=False)
data_256.to_csv('4-5(256 portion).csv', index=False)

Finished_Short
Finished_Short
Finished_Short
Finished_Short


In [None]:
del data_2
del data_8
del data_16
del data_64
del data_256

### 5. How to evaluate recommendation system in a standard way?

**Train Using 2-c-4(30days).csv and measure the following metrics on test set for K = 5, 10, 20, 30**:

**Experiments should be done on the same machine**

a. items popularity = Average(Frequency of predicted item from training set / Frequency of most frequent item in training set)  

b. item coverage = (Number of distinct predicted items / Number of distinct total items in test set).

c. Mean Reciprocal Rank

d. Recall

e. Time (Training Time + Testing Time) and Memory complexities

In [77]:
# Final cleanup of the session-length splits. An earlier cell of this section already deletes
# these names, so a plain `del` raises NameError on a top-to-bottom run; pop() drops whichever
# of them still exist and ignores the rest.
for _name in ('data_grouped', 'data_short', 'data_intermediate', 'data_long'):
    globals().pop(_name, None)