  <h1 align="center">E-commerce behaviour predictions </h1> 



# Dataset description

The training data contains full e-commerce session information. The aim is to predict, for each session in the test data, the `aid` values of each event type that occur after the last timestamp `ts` of that session. In other words, the test data contains sessions truncated at a timestamp, and the model should predict what happens after the point of truncation.

> train.csv - the training data, which contains full session data: 

`session` - the unique session id 

`aid` - the article id (product code) of the associated event 

`ts` - the Unix timestamp of the event 

`type` - the event type, i.e., whether a product was clicked, added to the user's cart, or ordered during the session: 
0.  'clicks', 
1.  'carts', 
2. 'orders' 

> test.csv - the test data, which contains truncated session data.
Your task is to predict the next aid clicked after the session truncation, as well as the remaining aids that are added to carts and orders; you may predict up to 20 values for each session type.


> Acknowledgements:
> > Copyright (c) 2022 Otto (GmbH & Co KG), https://www.otto.de/jobs/technology/ueberblick/

# Loading and exploring dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.image import imread
import seaborn as sns

from datetime import datetime


import warnings
warnings.filterwarnings('ignore')

import gc

from scipy.sparse import csr_matrix

from sklearn.neighbors import NearestNeighbors

import tqdm.notebook as tq

import joblib


In [2]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Colab Notebooks/Na GITa/

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/Na GITa


In [3]:
# Load the train and test event logs. usecols=[1, 2, 3, 4] skips CSV column 0
# (presumably a saved row index - TODO confirm against the CSV files).
train = pd.read_csv('data/onlineshop/train_colab.csv', usecols=[1, 2, 3, 4])
test = pd.read_csv('data/onlineshop/test_colab.csv', usecols=[1, 2, 3, 4])

In [4]:
train.head()

Unnamed: 0,session,aid,ts,type
0,0,1349536,1661634295,0
1,0,165096,1661634321,0
2,0,315914,1661634351,0
3,0,315914,1661634431,1
4,0,1680276,1661634664,0


In [5]:
train.tail()

Unnamed: 0,session,aid,ts,type
12941604,12899776,1737908,1661723987,0
12941605,12899777,384045,1661723976,0
12941606,12899777,384045,1661723986,0
12941607,12899778,561560,1661723983,0
12941608,12899778,32070,1661723994,0


In [6]:
test.head()

Unnamed: 0,session,aid,ts,type
0,12899779,59625,1661724000,0
1,12899780,1142000,1661724000,0
2,12899780,582732,1661724058,0
3,12899780,973453,1661724109,0
4,12899780,736515,1661724136,0


In [7]:
test.tail()

Unnamed: 0,session,aid,ts,type
6540533,14571577,1141710,1662328774,0
6540534,14571578,519105,1662328775,0
6540535,14571579,739876,1662328775,0
6540536,14571580,202353,1662328781,0
6540537,14571581,1100210,1662328791,0


Replacing `ts` with the day of the week and the hour of each event

In [8]:
#datetime.fromtimestamp(train.ts[1]).strftime('%a')

In [9]:
#datetime.fromtimestamp(train.ts[1]).strftime('%H%M')

In [10]:
# Parse `ts` (seconds since the Unix epoch) into pandas datetimes, overwriting the column in both frames.
train['ts'] = pd.to_datetime(train['ts'], unit='s')
test['ts'] = pd.to_datetime(test['ts'], unit='s')

In [11]:
train['day'] = train['ts'].dt.day_name()
test['day'] = test['ts'].dt.day_name()

In [12]:
train['hour'] = train['ts'].dt.hour
test['hour'] = test['ts'].dt.hour

In [13]:
train_time = train.drop(columns=['ts'])
test_time = test.drop(columns=['ts'])

In [14]:
del train
del test

In [15]:
gc.collect()

36

# KNN

In [16]:
data = pd.concat([train_time, test_time])

In [17]:
# Shift event types from 0/1/2 to 1/2/3 so that a real event is never confused with
# the 0 that fills the empty cells of the session x aid pivot (NaN replaced by 0).
# The column is rebuilt from train_time/test_time - the frames `data` was concatenated
# from, in the same order - so re-running this cell does not add 1 a second time.
data['type'] = pd.concat([train_time['type'], test_time['type']]).to_numpy() + 1

In [None]:
data.session.nunique()

3366233

In [None]:
test_time.session.nunique()

1617733

In [None]:
data.aid.nunique()

1027688

In [None]:
#df.groupby(['userId','movieId'])['rating'].max().unstack()

In [None]:
first_chunk = data[data['aid'].isin(data.aid.unique()[:1000])]

In [None]:
first_chunk.head() 

Unnamed: 0,session,aid,type,day,hour
0,0,1349536,1,Saturday,21
1,0,165096,1,Saturday,21
2,0,315914,1,Saturday,21
3,0,315914,2,Saturday,21
4,0,1680276,1,Saturday,21


In [None]:
# chunk_size = 10000
# chunks = [x for x in range(0, df.shape[0], chunk_size)]
# type_2_df = pd.concat([df.iloc[chunks[i]:chunks[i + 1] - 1].pivot_table(index = 'session', columns = 'aid', values = 'type', aggfunc='mean').fillna(0) for i in range(0, len(chunks) - 1)])

In [None]:
first_chunk_df = first_chunk.pivot_table(index = 'session', columns = 'aid', values = 'type').fillna(0)

In [None]:
first_chunk_df.head()

aid,2027,4322,4525,5606,6362,6851,7651,8017,9827,9891,...,1830578,1836610,1837737,1837818,1845526,1847491,1847685,1849394,1854762,1854872
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Sparse copy of the (dense) session x aid pivot table.
first_chunk_matrix = csr_matrix(first_chunk_df.values)

# Brute-force nearest-neighbour search over the session rows using cosine distance;
# 20 neighbours per query by default, all CPU cores (n_jobs=-1).
model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(first_chunk_matrix)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [None]:
query_index = 105000 #random index
print(query_index)
distances_1, indices_1 = model_knn.kneighbors(first_chunk_df.iloc[query_index,:].values.reshape(1, -1), n_neighbors = 20)

105000


In [None]:
# Print a header for the queried session, then one line per neighbour.
# Position 0 of the result is used for the header only and is not listed.
flat_distances = distances_1.flatten()
flat_indices = indices_1.flatten()
for rank, (dist, row_pos) in enumerate(zip(flat_distances, flat_indices)):
  if rank:
    print('{0}: {1}, with distance of {2}:'.format(rank, first_chunk_df.index[row_pos], dist))
  else:
    print('Recommendations for {0}:\n'.format(first_chunk_df.index[query_index]))

Recommendations for 12432611:

1: 6516776, with distance of 0.0:
2: 13698819, with distance of 0.0:
3: 1906444, with distance of 0.0:
4: 14339840, with distance of 0.0:
5: 13282071, with distance of 0.0:
6: 9827242, with distance of 0.0:
7: 13801181, with distance of 0.0:
8: 14276910, with distance of 0.0:
9: 12804110, with distance of 0.0:
10: 14406942, with distance of 0.0:
11: 2949144, with distance of 0.0:
12: 12759161, with distance of 0.0:
13: 12607492, with distance of 0.0:
14: 12641138, with distance of 0.0:
15: 5315225, with distance of 0.0:
16: 9828604, with distance of 0.0:
17: 7420120, with distance of 0.0:
18: 13411039, with distance of 0.0:
19: 13745515, with distance of 0.0:


In [None]:
del first_chunk_matrix
del first_chunk_df
gc.collect()

24

In [None]:
# Rebuild the session x aid table for the next 1000 distinct aids (positions 1000-1999).
second_chunk = data.loc[data['aid'].isin(data.aid.unique()[1000:2000])]
second_chunk_df = second_chunk.pivot_table(values='type', index='session', columns='aid').fillna(0)
second_chunk_matrix = csr_matrix(second_chunk_df.values)

# Fit a fresh brute-force cosine KNN on this chunk.
model_knn = NearestNeighbors(n_neighbors=20, algorithm='brute', metric='cosine', n_jobs=-1)
model_knn.fit(second_chunk_matrix)

# Query with the row of session 12432611.
query_rows = second_chunk_df[second_chunk_df.index == 12432611]
distances_2, indices_2 = model_knn.kneighbors(query_rows.values.reshape(1, -1), n_neighbors=20)

# Header first, then neighbours 1..19 (position 0 is not listed).
print('Recommendations for {0}:\n'.format(query_rows.index[0]))
for i in range(1, 20):
  print('{0}: {1}, with distance of {2}:'.format(i, second_chunk_df.index[indices_2.flatten()[i]], distances_2.flatten()[i]))

Recommendations for 12432611:

1: 13215372, with distance of 0.0:
2: 2888676, with distance of 0.0:
3: 1835561, with distance of 0.0:
4: 12694377, with distance of 0.0:
5: 12770847, with distance of 0.0:
6: 12770890, with distance of 0.0:
7: 10606496, with distance of 0.0:
8: 13553598, with distance of 0.0:
9: 13400137, with distance of 0.0:
10: 2886424, with distance of 0.0:
11: 12624609, with distance of 0.0:
12: 14281750, with distance of 0.0:
13: 12538073, with distance of 0.0:
14: 11740851, with distance of 0.0:
15: 1836195, with distance of 0.0:
16: 12872379, with distance of 0.0:
17: 12770758, with distance of 0.0:
18: 13553036, with distance of 0.0:
19: 13215418, with distance of 0.0:


In [None]:
# Neighbour graph of the fitted KNN for the selected query sessions: one row per
# selected session, one column per fitted session, non-zero where that fitted
# session is among the query's neighbours.
A = model_knn.kneighbors_graph(second_chunk_df[second_chunk_df.index.isin([12432611, 14571363])].values)
B = A.toarray()  # dense copy so its rows can be used as boolean masks

In [None]:
B != 0

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [None]:
res = second_chunk_df[B[0] != 0]
res

aid,2306,3923,6643,12782,14161,21885,24496,24614,24649,25530,...,1846140,1846519,1846802,1848540,1848943,1849385,1852263,1852609,1853288,1854775
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1835561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1836195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2886424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2888676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10606496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11740851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12538073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12624609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12694377,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12770758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
res = second_chunk_df[B[1] != 0]
res

aid,2306,3923,6643,12782,14161,21885,24496,24614,24649,25530,...,1846140,1846519,1846802,1848540,1848943,1849385,1852263,1852609,1853288,1854775
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
102358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3073076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3836139,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5949764,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10611623,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12553176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13007629,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13295022,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13564154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13564195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Per aid column: does any session row in `res` have a non-zero entry?
m2 = (res != 0).any()
# Keep the aids (column labels) for which that is true.
products = m2.index[m2].tolist()
products

[868327]

In [None]:
# One counter per aid, indexed by the aid itself. The array needs max(aid) + 1
# slots; with only max(aid) slots the largest aid is an out-of-range index.
recommend = np.zeros(data.aid.max() + 1)
recommend[products] += 1

In [None]:
np.where(recommend>0)

(array([868327]),)

In [None]:
m1 = (second_chunk_df[second_chunk_df.index == 12432611] != 0).any()
used_products = m1.index[m1].tolist()
used_products 

[496180]

In [None]:
second_chunk_df[B[1] != 0].index.to_list()

[102358,
 3073076,
 3836139,
 5949764,
 10611623,
 12553176,
 13007629,
 13295022,
 13564154,
 13564195,
 13747035,
 13901119,
 14170161,
 14260909,
 14302716,
 14303334,
 14337956,
 14368637,
 14416274,
 14539326]

In [None]:
# Broadcast the session ids across the rows of B: cells that are non-zero in B
# end up holding the session id of their column, all other cells stay 0.
c = second_chunk_df.index.values*B

In [None]:
c[0][c[0]>0]

array([ 1835561.,  1836195.,  2886424.,  2888676., 10606496., 11740851.,
       12538073., 12624609., 12694377., 12770758., 12770847., 12770890.,
       12872379., 13215372., 13215418., 13400137., 13553036., 13553598.,
       14281750., 14282589.])

In [None]:
c[1][c[1]>0]

array([  102358.,  3073076.,  3836139.,  5949764., 10611623., 12553176.,
       13007629., 13295022., 13564154., 13564195., 13747035., 13901119.,
       14170161., 14260909., 14302716., 14303334., 14337956., 14368637.,
       14416274., 14539326.])

In [None]:
del second_chunk_matrix
del second_chunk_df
gc.collect()

321

## Functions' definitions

In [44]:
def KNN_chunk(chunk, targets, n_neighbors=20, metric='cosine'):
    """ KNN model for chunks

    Sessions are represented by a sparse session x aid matrix holding the mean
    `type` of every (session, aid) pair found in `chunk` (implicitly 0 where
    there is no event) and are compared with a brute-force neighbour search.

    Arguments:
        chunk: part of data; DataFrame with `session`, `aid` and `type` columns
        targets: sessions from test dataset in chunk
        n_neighbors: neighbours to look up per target; capped at the number of
            sessions in the chunk, because asking for more raises in scikit-learn
        metric: distance metric passed to NearestNeighbors

    Returns:
        csr_matrix of shape (len(targets), number of sessions in chunk): row i
        belongs to targets[i] and stores the session ids of its neighbours as
        float values (read them with `.getrow(i).data`). A target that does not
        occur in the chunk gets an empty row.
    """
    target_ids = np.asarray(targets)

    # Mean event type per (session, aid) - the aggregation pivot_table applies by
    # default - kept in long form so the dense session x aid table is never built.
    pair_means = chunk.groupby(['session', 'aid'])['type'].mean().dropna().reset_index()
    session_ids, row_pos = np.unique(pair_means['session'].to_numpy(), return_inverse=True)
    aid_ids, col_pos = np.unique(pair_means['aid'].to_numpy(), return_inverse=True)
    n_sessions = len(session_ids)
    chunk_matrix = csr_matrix(
        (pair_means['type'].to_numpy(dtype=float), (row_pos, col_pos)),
        shape=(n_sessions, len(aid_ids)),
    )

    # Row position of every target inside the chunk matrix (-1 when it is absent).
    target_rows = pd.Index(session_ids).get_indexer(target_ids)
    found = target_rows >= 0
    if not found.any():
        # Nothing to query: return the documented shape with only empty rows.
        return csr_matrix((len(target_ids), n_sessions), dtype=float)

    model_knn = NearestNeighbors(
        metric=metric,
        algorithm='brute',
        n_neighbors=min(n_neighbors, n_sessions),
        n_jobs=-1,
    )
    model_knn.fit(chunk_matrix)

    # Connectivity graph for the targets that exist, queried in `targets` order.
    nn = model_knn.kneighbors_graph(chunk_matrix[target_rows[found]])

    # Re-use the graph's sparsity pattern and store the neighbour's session id as
    # the value. Writing the ids explicitly keeps session id 0, which is lost when
    # the ids are multiplied into a dense array and converted back to sparse.
    row_sizes = np.zeros(len(target_ids), dtype=np.int64)
    row_sizes[found] = np.diff(nn.indptr)
    indptr = np.concatenate(([0], np.cumsum(row_sizes)))
    result = csr_matrix(
        (session_ids[nn.indices].astype(float), nn.indices, indptr),
        shape=(len(target_ids), n_sessions),
    )
    result.sort_indices()

    del chunk_matrix, model_knn, nn
    gc.collect()
    return result


def recommend_orders(data_orders, target, sessions, recommend=None):
  """Collect, for every target session, the aids found in its neighbour sessions.

  Arguments:
      data_orders: DataFrame with `session` and `aid` columns
      target: sequence of target sessions; target[i] belongs to row i of `sessions`
      sessions: sparse matrix whose row i stores the neighbour session ids of
          target[i] as its data (read with `.getrow(i).data`)
      recommend: dict mapping a target session to a list of aid lists; it is
          updated in place. When omitted, the module-level `recommend` dict is
          used, which is what callers passing three arguments rely on.

  Raises:
      NameError: if `recommend` is omitted and no module-level `recommend` exists
  """
  if recommend is None:
    try:
      recommend = globals()['recommend']
    except KeyError:
      raise NameError("name 'recommend' is not defined") from None

  for i in range(len(target)):
    sess = sessions.getrow(i).data
    products = data_orders.loc[data_orders.session.isin(sess), 'aid'].values
    if len(products) > 0:
      # setdefault: a target missing from the dict gets its own list instead of a KeyError
      recommend.setdefault(target[i], []).append(products.tolist())

def recommend_products(data, recommend, suffix, recommendations, top_n=20):
  """Turn the collected neighbour aids into a ranked recommendation string per session.

  For every session in `recommend` the aid lists are flattened, aids the session
  itself already has in `data` are removed, and the remaining aids are ranked by
  how often they occur (ties keep first-seen order). The `top_n` best are stored
  as a space-separated string; a session with nothing left gets an empty string.

  Arguments:
      data: DataFrame with `session` and `aid` columns (data_orders, data_clicks etc.)
      recommend: dict mapping a session to a list of aid lists
      suffix: label appended to the session to build the key, e.g. 'orders'
      recommendations: dict updated in place with '<session>_<suffix>' keys
      top_n: maximum number of aids kept per session
  """
  from collections import Counter
  from itertools import chain

  for k, v in recommend.items():
    # aids the target session already used; these are not recommended again
    omit = set(data.loc[data.session == k, 'aid'].tolist())
    candidates = [aid for aid in chain.from_iterable(v) if aid not in omit]
    # most_common orders by count, descending, and is stable for equal counts
    ranked = [aid for aid, _ in Counter(candidates).most_common(top_n)]
    recommendations[str(k) + '_' + suffix] = " ".join(str(aid) for aid in ranked)

In [19]:
targets_all = test_time.session.unique() #all sessions in test dataset

In [47]:
test_time.session.nunique()

1617733

In [42]:
# Batch boundaries over the test sessions: 0, 10000, 20000, ... plus the total count as the final boundary.
chunks_targets = [*range(0, test_time.session.nunique(), 10000), test_time.session.nunique()]

### KNN for orders

In [None]:
data.aid.nunique()

1027688

In [None]:
list(range(0, data.aid.nunique(), 1000))[-1]

1027000

In [41]:
# Batch boundaries over the distinct aids: 0, 1000, 2000, ... plus the total count as the final boundary.
chunks_products = [*range(0, data.aid.nunique(), 1000), data.aid.nunique()]

In [39]:
recomm_orders = {}

In [None]:
data_orders = data[data.type == 3]  # type 3 is 'orders' once the types have been shifted by +1
all_aids = data.aid.unique()  # loop-invariant, so computed once instead of once per product chunk

# Consecutive boundaries give the [start, stop) range of every batch. The last
# boundary is the total count, so the final, shorter batch is processed as well.
# NOTE: memory per KNN_chunk call grows with the batch sizes; use smaller steps
# when building chunks_targets / chunks_products if this does not fit in RAM.
for t_start, t_stop in tq.tqdm(list(zip(chunks_targets[:-1], chunks_targets[1:]))):
  target_batch = targets_all[t_start:t_stop]
  # Module-level name on purpose: recommend_orders appends to this dict.
  recommend = {key: [] for key in target_batch}

  for a_start, a_stop in zip(chunks_products[:-1], chunks_products[1:]):
    chunk = data[data['aid'].isin(all_aids[a_start:a_stop])]
    # Test sessions of this batch that occur in the chunk (the ones to predict).
    # Sorted so that targets[i] lines up with row i of the KNN result.
    targets = np.sort(chunk[chunk.session.isin(target_batch)].session.unique())
    if len(targets) > 0:
      res = KNN_chunk(chunk, targets, n_neighbors=5)
      recommend_orders(data_orders, targets, res)

    del chunk
    gc.collect()

  recommend_products(data_orders, recommend, 'orders', recomm_orders)
  # Checkpoint after every target batch so an interrupted run keeps its results.
  joblib.dump(recomm_orders,'orders_part1.joblib');

  0%|          | 0/163 [00:00<?, ?it/s]

In [None]:
#Hamming Distance 
#KDTree

In [None]:
#use SVD to factorization and use different features than type