Updated: Jul 18th, 2022

In [1]:
from hmcollab import datasets
from hmcollab import articles
from hmcollab import transactions
from hmcollab import models

from hmcollab.three_part_dataset import ThreePartDataset
from hmcollab.splitter import CustomerPortion

In [2]:
import math
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from datetime import datetime, timedelta

In [3]:
from display_data import display_article, display_articles


## Toy dataset

In [4]:
# Build the dataset with toy=True and report the size of each of its parts.
toy = datasets.HMDataset(toy=True)
print('Transactions shape: ', toy.transactions.shape)
print('Unique customers: ', toy.transactions.customer_id.unique().shape)
# Attributes are looked up lazily, one per print, in the original order.
for label, split_name in (
    ('train_x shape: ', 'train_x'),
    ('train_y shape: ', 'train_y'),
    ('test_x shape: ', 'test_x'),
    ('test_y shape: ', 'test_y'),
):
    print(label, getattr(toy, split_name).shape)

# Target is built from the full transactions table; report its three parts.
toy_target = datasets.Target(toy.transactions)
for label, part_name in (
    ('transactions_x: ', 'transactions_x'),
    ('transactions_y: ', 'transactions_y'),
    ('Relevant set: ', 'relevant_set'),
):
    print(label, getattr(toy_target, part_name).shape)

Transactions shape:  (252406, 5)
Unique customers:  (10899,)
train_x shape:  (197978, 5)
train_y shape:  (1603, 5)
test_x shape:  (52393, 5)
test_y shape:  (432, 5)
transactions_x:  (250371, 5)
transactions_y:  (2035, 5)
Relevant set:  (578, 2)


In [5]:
# Seeded generator: the customer sample drawn from it below is repeatable.
r = np.random.RandomState(seed=42)

In [6]:
# Distinct customer ids present in the toy transactions (the sampling pool).
all_customer_ids = toy.transactions["customer_id"].unique()

In [7]:
# Draw 10 distinct ids (replace=False) from the pool using the seeded generator.
customers_ids = r.choice(a=all_customer_ids, replace=False, size=10)

In [8]:
# Display the 10 sampled customer ids.
customers_ids

array(['d6a040a94e5f709a87845824c22fed05f14939741bce5cb5c57c29ffdea494c5',
       '8257c6a4a65795661ff0fef326a4813334e04a0f2a3ebc2fe59c694a57ed8c63',
       'd188dc011b022bb698630dac924b74f77a7a4045cc9a55ae0b4b27c209dde831',
       'b2234985ffbc93a8c980bd1dca6c60671b1a9614a484a0993a46b019416e11fe',
       '54e19f35be61f4d1c1168d74bfc0d63c872ce254b26af03a8d24dd1f0c8c5e02',
       '684a5c9ae9a70b4301e264f46658cba04ae58e4b27634b80aa039b9f169ddc02',
       '78a79ed428c7363d8e92ce175fc76fbf0c6d54182ccb2115035246ae7af9296a',
       '400377ab04aca437c40a635c5ab27005f2da452c800f3085fd8d293a769193cd',
       '20994b8c02cfab2de8a25e6b68b22e4bb72a3384f545a4f7fe7a75cfc3d86133',
       '79da90917ddfb06c4fd022e3549c095a03e3a120aa028b01700e5b6c68fc9e10'],
      dtype=object)

In [9]:
# Manually keep only the customer rows whose id is in the sample.
is_sampled_customer = toy.customers["customer_id"].isin(customers_ids)
customers = toy.customers.loc[is_sampled_customer, :]

In [10]:
# Sanity check: 10 ids were sampled, so 10 rows are expected here.
customers.shape

(10, 7)

In [11]:
# Manually keep only the transactions made by the sampled customers.
# NOTE(review): this rebinds `transactions`, which the first cell imported as a
# module (`from hmcollab import transactions`); the module is unreachable under
# that name after this cell. Consider a distinct name such as
# `transactions_subset` (later cells that read `transactions` must change too).
made_by_sampled_customer = toy.transactions["customer_id"].isin(customers_ids)
transactions = toy.transactions.loc[made_by_sampled_customer, :]

In [12]:
# Number of transaction rows belonging to the sampled customers.
transactions.shape

(348, 5)

In [13]:
# Distinct article ids appearing in the sampled customers' transactions.
article_ids = transactions["article_id"].unique()

In [14]:
# Display the distinct article ids.
# NOTE(review): this dumps the whole array (306 entries per the shape cell
# below); a slice such as article_ids[:10] would keep the output short.
article_ids

array(['0504154015', '0504154016', '0620081006', '0589222001',
       '0681815001', '0573085005', '0537346026', '0620081008',
       '0633150001', '0428291001', '0646429003', '0539723001',
       '0539723005', '0539723006', '0624151009', '0620337050',
       '0624674011', '0643350005', '0569933002', '0633609002',
       '0633607002', '0661647001', '0685054001', '0620625001',
       '0657510001', '0649702008', '0639452005', '0677838002',
       '0701102001', '0706536001', '0716258002', '0685689003',
       '0685689001', '0723716001', '0186264014', '0186262001',
       '0739618001', '0708352005', '0706016002', '0663568004',
       '0719201001', '0708352001', '0706016003', '0630675001',
       '0703559002', '0651905008', '0674492002', '0591631013',
       '0588849001', '0692226003', '0565379023', '0729928001',
       '0565379022', '0681376001', '0706016006', '0706016001',
       '0674606017', '0697054015', '0674606001', '0580770001',
       '0640716001', '0160442007', '0725293001', '03728

In [15]:
# Count of distinct article ids in the sampled customers' transactions.
article_ids.shape

(306,)

In [16]:
# Manually keep only the article rows referenced by the sampled transactions.
# NOTE(review): this rebinds `articles`, which the first cell imported as a
# module (`from hmcollab import articles`); the module is unreachable under
# that name after this cell. Consider a distinct name such as
# `articles_subset` (later cells that read `articles` must change too).
is_referenced_article = toy.articles["article_id"].isin(article_ids)
articles = toy.articles.loc[is_referenced_article, :]

In [17]:
# Row count should equal the number of distinct article ids (306) if every
# id is present in toy.articles.
articles.shape

(306, 25)

In [18]:
# Build a CustomerPortion (from hmcollab.splitter) out of the sampled ids.
# Presumably it selects the part of a dataset that belongs to these
# customers -- TODO confirm against hmcollab.splitter.
portion = CustomerPortion(customers_ids)

In [19]:
# Apply the portion to the toy dataset. The result exposes .customers,
# .transactions and .articles, which the next cells compare against the
# manually filtered frames above.
ds = portion.split(toy)

In [20]:
# Should match the manually filtered `customers` frame: (10, 7).
ds.customers.shape

(10, 7)

In [21]:
# Should match the manually filtered `transactions` frame: (348, 5).
ds.transactions.shape

(348, 5)

In [22]:
# Should match the manually filtered `articles` frame: (306, 25).
ds.articles.shape

(306, 25)