Updated: June 18th, 2022

In [1]:
from hmcollab import datasets
from hmcollab import transactions

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import warnings
# Silence every warning category from this point on.
# NOTE(review): this also hides deprecation warnings raised by dependencies — confirm that is intended.
warnings.filterwarnings("ignore")

## Aim
Create a target set with the transactions from the last 7 days, organized by customer. We would like it to have the same format that the Kaggle competition requires for its outputs. The idea is to use this dataset (`relevant_set`) to identify which results are relevant when computing the scoring functions (MAP@K, etc.).

### Let's use our toy dataset to get an idea of the time to run it

In [4]:
# Time how long it takes to construct the toy-sized dataset.
begin = datetime.now()
toy = datasets.HMDataset(toy=True)
load_seconds = (datetime.now() - begin).total_seconds()
print('Seconds: ', load_seconds)

# Report the size of the transactions table and how many distinct customer ids it holds.
print('Transactions shape: ', toy.transactions.shape)
unique_customers = toy.transactions.customer_id.unique()
print('Unique customers: ', unique_customers.shape)

Seconds:  3.817961
Transactions shape:  (252406, 5)
Unique customers:  (10899,)


In [6]:
# Print the shape of each train/test split exposed by the toy dataset,
# reading the attributes one at a time in the same order as before.
for _label, _attr in (
    ('train_x shape: ', 'train_x'),
    ('train_y shape: ', 'train_y'),
    ('test_x shape: ', 'test_x'),
    ('test_y shape: ', 'test_y'),
):
    print(_label, getattr(toy, _attr).shape)

train_x shape:  (197978, 5)
train_y shape:  (1603, 5)
test_x shape:  (52393, 5)
test_y shape:  (432, 5)


### Old class to create the target (transactions from the last 7 days)
This class brings back the `transactions_x` and `transactions_y` attributes that we had in the past and adds an attribute called `relevant_set`. This approach was very slow when used with the full dataset.

In [13]:
# Build the target with the old implementation and print the elapsed seconds.
begin = datetime.now()
toy_slow = datasets.TargetSlow(toy.transactions)
elapsed = datetime.now() - begin
print(elapsed.total_seconds())

0.377086


In [14]:
# Print the shape of each attribute produced by the old target class.
for _label, _attr in (
    ('x: ', 'transactions_x'),
    ('y: ', 'transactions_y'),
    ('Relevant: ', 'relevant_set'),
):
    print(_label, getattr(toy_slow, _attr).shape)

x:  (250371, 5)
y:  (2035, 5)
Relevant:  (578, 2)


In [15]:
# Preview the first rows of the relevant set built by the old class
# (one row per customer_id with its target string, as shown in the output below).
toy_slow.relevant_set.head()

Unnamed: 0,customer_id,target
0,00fb0bee7b78e16bd37587e47a124f86e2aad252a34bb1...,0860285001 0909371001
1,02c67868a2a3f83eaf229ae3dac2373d77e5bf451b1a40...,0905945001
2,095c3fb8eb6a606a9e58b31402dc9ffd72c2736e5bfc69...,0456163060 0865921001 0708485003 0886390002 07...
3,0aad6bcfe77f2dcfe74a280df77fc37cb3731b85cb5207...,0907957001 0827968002 0923340001 0867966009 08...
4,1d07c4f4067165e9297fde4e788889e15215c400ccc1a0...,0693246007 0524529004


### This is the new and improved class to generate the relevant_set

In [16]:

# Build the target with the new implementation and print the elapsed seconds.
begin = datetime.now()
toy_faster = datasets.Target(toy.transactions)
elapsed = datetime.now() - begin
print(elapsed.total_seconds())

0.06895


In [17]:
# Print the shape of each attribute produced by the new target class.
for _label, _attr in (
    ('x: ', 'transactions_x'),
    ('y: ', 'transactions_y'),
    ('Relevant: ', 'relevant_set'),
):
    print(_label, getattr(toy_faster, _attr).shape)

x:  (250371, 5)
y:  (2035, 5)
Relevant:  (578, 2)


In [18]:
# Preview the first rows of the relevant set built by the new class
# (one row per customer_id with its target string, as shown in the output below).
toy_faster.relevant_set.head()

Unnamed: 0,customer_id,target
0,004a51b9de9d21672dcb80ff5b76928b83e0865f614c7a...,0794575005 0924243001 0865799006
1,00fb0bee7b78e16bd37587e47a124f86e2aad252a34bb1...,0860285001 0909371001
2,010443087c1ea6f77578cde8b49751573175ef048e2896...,0831384003 0876125002 0871517008 0831384003 08...
3,010f87b59415ddb7e3dcb8228476028420aacd77ec42c8...,0921090003 0854020001 0759871034 0860336001
4,01adb64dfbae7ceca59fe2252906784573b45e5893ff4a...,0909014001


In [19]:
# Optional export, left disabled: uncomment to write the relevant set to ../data/toy_relevant_set.csv.
# toy_faster.relevant_set.to_csv('../data/toy_relevant_set.csv', index=False)