In [73]:
from hmcollab import datasets
from hmcollab.directory_tree import HMDatasetDirectoryTree
from hmcollab import directories
from datetime import datetime
import warnings
# Install a global "ignore" filter: every warning category is suppressed from
# this point on, presumably to keep the cell outputs below compact.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# in use — confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

There are several versions of the toy dataset. In the original one, only a small fraction of customers had transactions in the last two weeks, while the current one has almost all of those available in the training set.

In [74]:
# Select which toy dataset variant the directory tree points at.
# Other variants that can be passed to directories.data(), each with the
# shape annotation the author recorded next to it:
#   "toy"      -> (10000, 7)
#   "toy500"   -> (500, 7)
#   "toy1k"    -> (1000, 7)
toy_base = directories.data("toy_orig")     # (10000, 7)
tree = HMDatasetDirectoryTree(base=toy_base)

In [75]:
# Show the three CSV locations resolved by the directory tree, one per line.
print(tree.transactions, tree.articles, tree.customers, sep="\n")

/Users/gina/Desktop/Gina/MachineLearning/Proyectos/HM/HM-clothing-project/data/toy_orig/transactions_train.csv
/Users/gina/Desktop/Gina/MachineLearning/Proyectos/HM/HM-clothing-project/data/toy_orig/articles.csv
/Users/gina/Desktop/Gina/MachineLearning/Proyectos/HM/HM-clothing-project/data/toy_orig/customers.csv


### Standard setup

In [76]:
# Build the standard dataset and report how long construction took.
begin = datetime.now()
dataset = datasets.HMDatasetStandard(tree=tree)
elapsed_seconds = (datetime.now() - begin).total_seconds()
print('Seconds: ', elapsed_seconds)

# Row/column counts of every transaction frame exposed by the dataset.
print('TRANSACTIONS')
for label, frame in (
    ('All transactions: ', dataset.transactions),   # This shouldn't be accessible
    ('transactions_x?: ', dataset.transactions_x),
    ('train_x', dataset.train_x),
    ('train_vy', dataset.train_vy),
    ('transactions_y?: ', dataset.transactions_y),  # This shouldn't be accessible
    ('train_y', dataset.train_y),
):
    print(label, frame.shape)

# Number of distinct customer ids in each frame.
print('CUSTOMERS')
for label, frame in (
    ('All: ', dataset.transactions),
    ('train_x', dataset.train_x),
    ('train_vy', dataset.train_vy),
    ('train_y', dataset.train_y),
):
    print(label, frame.customer_id.unique().shape)

# Names of the instance attributes set on the dataset object.
print('Attribute', vars(dataset).keys())

Seconds:  1.149837
TRANSACTIONS
All transactions:  (252406, 5)
transactions_x?:  (250371, 5)
train_x (247953, 5)
train_vy (2418, 5)
transactions_y?:  (2035, 5)
train_y (2035, 5)
CUSTOMERS
All:  (10899,)
train_x (10815,)
train_vy (655,)
train_y (578,)
Attribute dict_keys(['tree', 'articles', 'customers', 'transactions', 'transactions_x', 'transactions_y', 'relevant_set', 'train_y', 'train_x', 'train_vy'])


Keep only those customers with transactions in both train and target:

In [77]:
# Customers present in both train_x and train_y.
# The trailing "orig=… / new=…" notes are the counts the author recorded for
# two versions of the toy dataset.
train_x_customer_ids_set = set(dataset.train_x.customer_id)
customers_at_y = train_x_customer_ids_set & set(dataset.train_y.customer_id)
print("customers_at_y: ", len(customers_at_y))    # orig=539, new=9160

# Customers present in both train_x and train_vy.
customers_at_vy = train_x_customer_ids_set & set(dataset.train_vy.customer_id)
print("customers_at_vy: ", len(customers_at_vy))  # orig=607, new=1980

customers_at_y:  539
customers_at_vy:  607


### Three sets: 
train_x, train_y, val_x, val_y, test_x, test_y

In [78]:
# Build the three-way (train / val / test) dataset and time its construction.
begin = datetime.now()
dataset = datasets.HMDatasetThreeSets(tree=tree)
elapsed_seconds = (datetime.now() - begin).total_seconds()
print('Seconds: ', elapsed_seconds)

# Each section is a heading followed by (label, frame) rows; for every row the
# number of distinct customer ids in the frame is printed.
customer_report = (
    ('\nCUSTOMERS', (
        ('All: ', dataset.transactions),
        ('transactions_x?: ', dataset.transactions_x),
        ('transactions_y?: ', dataset.transactions_y),
    )),
    ('\nSET1=train', (
        ('train_x', dataset.train_x),
        ('train_y', dataset.train_y),
    )),
    ('\nSET2=val', (
        ('val_x', dataset.val_x),
        ('val_y', dataset.val_y),
    )),
    ('\nSET3=test', (
        ('test_x', dataset.test_x),
        ('test_y', dataset.test_y),
    )),
)
for heading, rows in customer_report:
    print(heading)
    for label, frame in rows:
        print(label, frame.customer_id.unique().shape)

# Names of the instance attributes set on the dataset object.
print('\nAttribute', vars(dataset).keys())

Seconds:  1.078072

CUSTOMERS
All:  (10899,)
transactions_x?:  (10863,)
transactions_y?:  (578,)

SET1=train
train_x (6518,)
train_y (316,)

SET2=val
val_x (2173,)
val_y (107,)

SET3=test
test_x (2172,)
test_y (127,)

Attribute dict_keys(['tree', 'articles', 'customers', 'transactions', 'transactions_x', 'transactions_y', 'relevant_set', 'test_x', 'test_y', 'train_x', 'val_x', 'train_y', 'val_y'])


Keep only those customers with transactions in both train and target:

In [79]:
# For each split, intersect the customer ids seen in the *_x frame with those
# seen in the matching *_y frame and print how many customers are in both.
# The trailing "orig=… / new=…" notes are the counts the author recorded for
# two versions of the toy dataset.

# SET1: train_x vs train_y
train_x_customer_ids_set = set(dataset.train_x.customer_id)
customers_at_y = train_x_customer_ids_set & set(dataset.train_y.customer_id)
print("customers_at_y for SET1: ", len(customers_at_y))    # orig=316, new=5512

# SET2: val_x vs val_y
val_x_customer_ids_set = set(dataset.val_x.customer_id)
customers_at_val_y = val_x_customer_ids_set & set(dataset.val_y.customer_id)
print("customers at val_y for SET2 ", len(customers_at_val_y))  # orig= 107, new=1838


# SET3: test_x vs test_y
test_x_customer_ids_set = set(dataset.test_x.customer_id)
customers_at_test_y = test_x_customer_ids_set & set(dataset.test_y.customer_id)
print("customers at test_y for SET3 ", len(customers_at_test_y))  # orig=119, new=1876

customers_at_y for SET1:  316
customers at val_y for SET2  107
customers at test_y for SET3  119
