Updated: May 16th, 2023

In [1]:
from hmcollab import datasets
from hmcollab import transactions

In [2]:
import pandas as pd
from datetime import datetime, timedelta
import warnings
# Install a global "ignore" filter: from here on, every warning is suppressed.
# NOTE(review): this blanket filter also hides deprecation/runtime warnings
# raised by the libraries used below — consider narrowing it to a category.
warnings.filterwarnings("ignore")

## Aim: Demonstrate usage of the split strategies (originally two, now three)
+ Folds=twosets: split the customers into 2 datasets (train, test), each with its own target (y) set
+ Folds=threesets: split the customers into 3 datasets (train, val, test), each with its own target (y) set
+ Folds=standard: use all customers for training, leaving the last two weeks for validation and test

### Folds=threesets (train_x, train_y; val_x, val_y; test_x, test_y)
Creating three datasets of customers: one for training, one for validation and one for testing, each with its respective target variable y.

In [3]:
# Build the toy dataset with folds="threesets" and measure how long construction takes.
begin = datetime.now()
toy3 = datasets.HMDataset(toy=True, folds="threesets")
elapsed = datetime.now() - begin

# Report wall-clock seconds, then the size of the transactions table and the
# shape of the array of distinct customer ids.
print('Seconds: ', elapsed.total_seconds())
customer_ids = toy3.transactions.customer_id
print('Transactions shape: ', toy3.transactions.shape)
print('Unique customers: ', customer_ids.unique().shape)

Seconds:  4.137853
Transactions shape:  (252406, 5)
Unique customers:  (10899,)


In [4]:
# Caveat: this split may be overly simplistic — it is not stratified.
# Each entry pairs a section header with the dataset attributes reported under it.
sections = (
    ('TRAIN SET:', ('train_x', 'train_y')),
    ('VALIDATION SET:', ('val_x', 'val_y')),
    ('TEST SET:', ('test_x', 'test_y')),
    ('RELEVANT SET:', ('relevant_set',)),
)
for header, attrs in sections:
    print(header)
    for attr in attrs:
        # Same "<name> shape:  <tuple>" line for every attribute of toy3.
        print(f'{attr} shape: ', getattr(toy3, attr).shape)

TRAIN SET:
train_x shape:  (150018, 5)
train_y shape:  (1111, 5)
VALIDATION SET:
val_x shape:  (47960, 5)
val_y shape:  (420, 5)
TEST SET:
test_x shape:  (52393, 5)
test_y shape:  (432, 5)
RELEVANT SET:
relevant_set shape:  (578, 2)


### Folds=twosets (train_x, test_x; train_y, test_y)
Creating two sets of customers (train and test), each with its corresponding target (y) set

In [8]:
# Build the toy dataset with folds="twosets" and measure how long construction takes.
begin = datetime.now()
toy2 = datasets.HMDataset(toy=True, folds="twosets")
elapsed = datetime.now() - begin

# Report wall-clock seconds, then the size of the transactions table and the
# shape of the array of distinct customer ids.
print('Seconds: ', elapsed.total_seconds())
customer_ids = toy2.transactions.customer_id
print('Transactions shape: ', toy2.transactions.shape)
print('Unique customers: ', customer_ids.unique().shape)

Seconds:  4.067938
Transactions shape:  (252406, 5)
Unique customers:  (10899,)


In [12]:
# Caveat: this split may be overly simplistic — it is not stratified.
# NOTE(review): test_x/test_y are printed under the 'TRAIN SET:' header; unlike
# the threesets report, no separate 'TEST SET:' header is emitted here.
print('TRAIN SET:')
for attr in ('train_x', 'train_y', 'test_x', 'test_y'):
    # Same "<name> shape:  <tuple>" line for every attribute of toy2.
    print(f'{attr} shape: ', getattr(toy2, attr).shape)

print('RELEVANT SET:')
print('relevant_set shape: ', toy2.relevant_set.shape)


TRAIN SET:
train_x shape:  (197978, 5)
train_y shape:  (1603, 5)
test_x shape:  (52393, 5)
test_y shape:  (432, 5)
RELEVANT SET:
relevant_set shape:  (578, 2)


### Folds=standard (train_x, train_vy, train_y)
Creating only one training set with validation and test sets

In [5]:
# Build the toy dataset with folds="standard" and measure how long construction takes.
begin = datetime.now()
toy = datasets.HMDataset(toy=True, folds="standard")
elapsed = datetime.now() - begin

# Report wall-clock seconds, then the size of the transactions table and the
# shape of the array of distinct customer ids.
print('Seconds: ', elapsed.total_seconds())
customer_ids = toy.transactions.customer_id
print('Transactions shape: ', toy.transactions.shape)
print('Unique customers: ', customer_ids.unique().shape)

Seconds:  3.989024
Transactions shape:  (252406, 5)
Unique customers:  (10899,)


In [6]:
# Caveat: this split may be overly simplistic — it is not stratified.
# (label, attribute) pairs: the labels differ from the attribute names here,
# so each printed label is spelled out explicitly.
print('TRAIN SET:')
for label, attr in (
    ('train_x shape: ', 'train_x'),
    ('validation (train_vy shape): ', 'train_vy'),
    ('test (train_y shape): ', 'train_y'),
):
    print(label, getattr(toy, attr).shape)

print('RELEVANT SET:')
print('relevant_set shape: ', toy.relevant_set.shape)

TRAIN SET:
train_x shape:  (247953, 5)
validation (train_vy shape):  (2418, 5)
test (train_y shape):  (2035, 5)
RELEVANT SET:
relevant_set shape:  (578, 2)
