In [1]:
import pandas as pd
import datetime

# Load the three raw tables. `t_dat` is parsed as a datetime column so the
# date filters in later cells can compare it against pd.Timestamp values.
customers = pd.read_csv('data/customers.csv')
articles = pd.read_csv('data/articles.csv')
transactions = pd.read_csv('data/transactions.csv', parse_dates=['t_dat'])

### Train Test Split

In [2]:
# Cut-off dates for the split:
#   * everything strictly after `start_test` is the test set,
#   * everything up to and including `start_test` is the full train set,
#   * the 1-year train set additionally requires t_dat strictly after `start_train`.
start_test = datetime.date(2020, 9, 15)
start_train = datetime.date(2019, 9, 22)

transactions_test = transactions.loc[transactions['t_dat'].gt(pd.Timestamp(start_test))]
transactions_train = transactions.loc[transactions['t_dat'].le(pd.Timestamp(start_test))]
# The 1-year window is the part of the full train set after `start_train`.
transactions_train_1year = transactions_train.loc[
    transactions_train['t_dat'].gt(pd.Timestamp(start_train))
]

#### Check how many unique customers and articles appear in each split

In [3]:
# Distinct ids seen in the transactions vs. distinct ids in the master tables.
print(transactions['customer_id'].nunique(), customers['customer_id'].nunique())
print(transactions['article_id'].nunique(), articles['article_id'].nunique())

1362281 1371980
104547 105542


- A customer who has never purchased anything before may still purchase something in the coming week.
- Can we really make predictions for customers we have not seen before?
- Can we really make predictions for articles we have not seen before?

In [4]:
# Full train set: distinct customers / articles in (all data, train, test).
print(*[frame.customer_id.nunique() for frame in (transactions, transactions_train, transactions_test)])
print(*[frame.article_id.nunique() for frame in (transactions, transactions_train, transactions_test)])

1362281 1356709 68984
104547 103880 17986


In [5]:
# 1-year train set: distinct customers / articles in (all data, 1-year train, test).
print(*[frame.customer_id.nunique() for frame in (transactions, transactions_train_1year, transactions_test)])
print(*[frame.article_id.nunique() for frame in (transactions, transactions_train_1year, transactions_test)])

1362281 986582 68984
104547 70221 17986


In [16]:
# Customers that appear in the test set but in neither train variant
# (i.e. cold-start customers for each choice of training window).
customers_train = set(transactions_train['customer_id'].unique())
customers_train_1year = set(transactions_train_1year['customer_id'].unique())
customers_test = set(transactions_test['customer_id'].unique())

print(
    len(customers_test.difference(customers_train_1year)),
    len(customers_test.difference(customers_train)),
)

7738 5572


In [15]:
# Articles that appear in the test set but in neither train variant
# (i.e. cold-start articles for each choice of training window).
articles_train = set(transactions_train['article_id'].unique())
articles_train_1year = set(transactions_train_1year['article_id'].unique())
articles_test = set(transactions_test['article_id'].unique())

print(
    len(articles_test.difference(articles_train_1year)),
    len(articles_test.difference(articles_train)),
)

685 667


- In the end, whether we use one year or all of the transaction data, the cold-start problem remains.
- However, using all transactions gives information about roughly 2k more test customers (7738 − 5572 = 2166) who resumed purchasing after a gap of more than one year.

### Cross-Validation Split (4 Folds)

In [2]:
# Fold 1
# Base (training) window, both ends inclusive.
start_base = datetime.date(2019, 9, 22)
end_base = datetime.date(2020, 9, 7)
# Test window, both ends inclusive. It must start right after the base
# window ends: a 2019 start date would make the "test" split contain the
# whole base window and leak all training rows into the evaluation data.
start_test = datetime.date(2020, 9, 8)
end_test = datetime.date(2020, 9, 15)

# Guard against overlapping or inverted windows.
assert start_base <= end_base < start_test <= end_test

cond_base = (transactions.t_dat >= pd.Timestamp(start_base)) & (transactions.t_dat <= pd.Timestamp(end_base))
base = transactions[cond_base]

cond_test = (transactions.t_dat >= pd.Timestamp(start_test)) & (transactions.t_dat <= pd.Timestamp(end_test))
test = transactions[cond_test]

In [6]:
# Persist fold 1 to disk (without the index column), then release both
# frames so they do not stay in kernel memory.
base.to_csv(path_or_buf='data/folds/u1.base', index=False)
test.to_csv(path_or_buf='data/folds/u1.test', index=False)
del base, test

In [7]:
# Fold 2
# Base (training) window, both ends inclusive.
start_base = datetime.date(2019, 9, 22)
end_base = datetime.date(2020, 8, 30)
# Test window, both ends inclusive. It must lie after the base window:
# a 2019 start date would make the "test" split contain the whole base
# window and leak all training rows into the evaluation data.
# NOTE(review): 2020-08-31 falls in neither window — confirm whether the
# one-day gap between base and test is intentional.
start_test = datetime.date(2020, 9, 1)
end_test = datetime.date(2020, 9, 7)

# Guard against overlapping or inverted windows.
assert start_base <= end_base < start_test <= end_test

cond_base = (transactions.t_dat >= pd.Timestamp(start_base)) & (transactions.t_dat <= pd.Timestamp(end_base))
base = transactions[cond_base]

cond_test = (transactions.t_dat >= pd.Timestamp(start_test)) & (transactions.t_dat <= pd.Timestamp(end_test))
test = transactions[cond_test]

In [8]:
# Persist fold 2 to disk (without the index column), then release both
# frames so they do not stay in kernel memory.
base.to_csv(path_or_buf='data/folds/u2.base', index=False)
test.to_csv(path_or_buf='data/folds/u2.test', index=False)
del base, test

In [9]:
# Fold 3
# Base (training) window, both ends inclusive.
start_base = datetime.date(2018, 9, 22)
end_base = datetime.date(2019, 9, 7)
# Test window, both ends inclusive. It must start right after the base
# window ends: a 2018 start date would make the "test" split contain the
# whole base window and leak all training rows into the evaluation data.
start_test = datetime.date(2019, 9, 8)
end_test = datetime.date(2019, 9, 15)

# Guard against overlapping or inverted windows.
assert start_base <= end_base < start_test <= end_test

cond_base = (transactions.t_dat >= pd.Timestamp(start_base)) & (transactions.t_dat <= pd.Timestamp(end_base))
base = transactions[cond_base]

cond_test = (transactions.t_dat >= pd.Timestamp(start_test)) & (transactions.t_dat <= pd.Timestamp(end_test))
test = transactions[cond_test]

In [10]:
# Persist fold 3 to disk (without the index column), then release both
# frames so they do not stay in kernel memory.
base.to_csv(path_or_buf='data/folds/u3.base', index=False)
test.to_csv(path_or_buf='data/folds/u3.test', index=False)
del base, test

In [11]:
# Fold 4
# Base (training) window, both ends inclusive.
start_base = datetime.date(2018, 9, 22)
end_base = datetime.date(2019, 8, 30)
# Test window, both ends inclusive. It must lie after the base window:
# a 2018 start date would make the "test" split contain the whole base
# window and leak all training rows into the evaluation data.
# NOTE(review): 2019-08-31 falls in neither window — confirm whether the
# one-day gap between base and test is intentional.
start_test = datetime.date(2019, 9, 1)
end_test = datetime.date(2019, 9, 7)

# Guard against overlapping or inverted windows.
assert start_base <= end_base < start_test <= end_test

cond_base = (transactions.t_dat >= pd.Timestamp(start_base)) & (transactions.t_dat <= pd.Timestamp(end_base))
base = transactions[cond_base]

cond_test = (transactions.t_dat >= pd.Timestamp(start_test)) & (transactions.t_dat <= pd.Timestamp(end_test))
test = transactions[cond_test]

In [12]:
# Persist fold 4 to disk (without the index column), then release both
# frames so they do not stay in kernel memory.
base.to_csv(path_or_buf='data/folds/u4.base', index=False)
test.to_csv(path_or_buf='data/folds/u4.test', index=False)
del base, test