In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
from average_precision import apk

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of each string are used; each one is parsed
    by `hex_id_to_int`.
    """
    trailing_hex = series.str[-16:]
    return trailing_hex.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    Strings shorter than 16 characters are parsed in full.
    """
    # NOTE: the parameter name shadows the builtin `str`; it is kept so that
    # keyword callers keep working.
    last_digits = str[-16:]
    return int(last_digits, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers.

    Any leading zeros of string ids are lost in the numeric form.
    """
    as_int32 = series.astype(dtype='int32')
    return as_int32

def article_id_int_to_str(series):
    """Turn integer article ids back into strings with one leading '0'.

    Exactly one zero is prepended, whatever the number of digits.
    """
    digits = series.astype('str')
    return '0' + digits

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    `fit` records, per column, the values that occur more than
    `min_examples` times (most frequent first, as returned by
    `value_counts`). `transform` maps each value to its position in that
    list; values not in the list, including missing values, get code -1.

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if its count in the fitted data
        is strictly greater than this threshold.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column, filled in by `fit`.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of every column of `X`.

        `y` is accepted and ignored so the transformer also works when a
        target is passed along, e.g. by `fit_transform(X, y)`.
        Returns `self`.
        """
        # Build a fresh list so that calling `fit` again replaces the
        # previously learned categories instead of appending to them
        # (which would misalign `self.categories[i]` with column i).
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a new DataFrame of category codes with `X`'s column names.

        Columns are matched to the fitted categories by position. The
        result is built from plain code arrays, so it carries a fresh
        default index rather than the index of `X`.
        """
        data = {
            X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i in range(X.shape[1])
        }
        return pd.DataFrame(data=data)

In [2]:
from average_precision import apk

def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean of `apk(gt, preds, k=12)` over paired predictions and ground truths.

    The two sequences are paired positionally with `zip`, so the shorter
    one determines how many pairs are scored.
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    scores = [apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)]
    return np.mean(scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    Both files are read, their `prediction` columns are split on
    whitespace, and the mean of `apk(gt, pred, k=12)` over the row pairs is
    returned. When `skip_cust_with_no_purchases` is true, rows whose
    ground-truth split is an empty list are left out of the mean.

    NOTE(review): rows are paired by position, not by customer id, so this
    assumes both files list customers in the same order - confirm.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    row_pairs = zip(submission.prediction.str.split(), ground_truth.prediction.str.split())

    # An empty split result is the marker for "no purchases in the validation week".
    scores = [
        apk(gt, pred, k=12)
        for pred, gt in row_pairs
        if not (skip_cust_with_no_purchases and (gt == []))
    ]
    return np.mean(scores)

In [3]:
# When True, the loading cell reads the sampled parquet files instead of the full data.
DRY_RUN = True

In [5]:
%%time

import pandas as pd

if DRY_RUN:
    # Dry run: read the sampled extracts; `sample` is part of the file names.
    # NOTE(review): the sampled articles file is called `articles_train_sample_...`
    # while the full one is `articles.parquet` - confirm the file name.
    sample = 0.05
    transactions = pd.read_parquet(f'storage/transactions_train_sample_{sample}.parquet')
    customers = pd.read_parquet(f'storage/customers_sample_{sample}.parquet')
    articles = pd.read_parquet(f'storage/articles_train_sample_{sample}.parquet')
else:
    # Full run: read the complete tables.
    transactions = pd.read_parquet('storage/transactions_train.parquet')
    customers = pd.read_parquet('storage/customers.parquet')
    articles = pd.read_parquet('storage/articles.parquet')

CPU times: user 106 ms, sys: 74.4 ms, total: 180 ms
Wall time: 268 ms


In [6]:
# Week number used as the held-out week: the latest week present in `transactions`.
# NOTE(review): this is the last observed week, not the week after it. Later cells
# relabel each customer's most recent purchases to `test_week` and split on it, so
# real purchases of that week end up in the test frame next to the candidates -
# confirm this is the intended validation setup (as opposed to `max() + 1`).
test_week = transactions.week.max()

In [7]:
# Keep only the 10 most recent weeks of transactions.
# `.copy()` makes this an independent frame: a later cell adds a `purchased`
# column to it, which on a boolean-mask slice raises SettingWithCopyWarning.
transactions = transactions[transactions.week > transactions.week.max() - 10].copy()

Generating candidates

In [10]:
%%time

# For every customer, the distinct weeks in which they have a transaction.
# `unique` keeps order of first appearance and does not sort.
# NOTE(review): the shifting loop below treats neighbouring entries as consecutive
# shopping weeks, presumably relying on `transactions` being in date order - confirm.
c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 1.13 s, sys: 64.8 ms, total: 1.2 s
Wall time: 1.14 s


In [12]:
%%time

# customer id -> {shopping week -> the customer's next shopping week};
# the customer's final shopping week maps to `test_week`.
c2weeks2shifted_weeks = {}

for customer, visited in c2weeks.items():
    # Pair every week with the one that follows it in the array.
    next_week_of = dict(zip(visited[:-1], visited[1:]))
    next_week_of[visited[-1]] = test_week
    c2weeks2shifted_weeks[customer] = next_week_of

CPU times: user 20.7 ms, sys: 5.06 ms, total: 25.8 ms
Wall time: 25.4 ms


In [13]:
# Candidate rows start as a full copy of the recent transactions; their `week`
# column is replaced with the shifted week in the next cell.
candidates_last_purchase = transactions.copy()

In [14]:
%%time

# For every transaction row, look up the week that follows it for that customer.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

# The list is in row order of `transactions`, of which the candidates frame is a copy.
candidates_last_purchase['week'] = weeks

CPU times: user 247 ms, sys: 2.53 ms, total: 250 ms
Wall time: 249 ms


Bestsellers candidates

In [16]:
# Average selling price of every article within each week.
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [17]:
# Per week: count sales per article, dense-rank the counts (ties share a rank,
# highest count gets rank 1) and keep the first 12 entries of each week.
# `value_counts` lists the most sold articles first, so these are the top sellers.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
)

In [18]:
# Attach the weekly mean price to every bestseller entry.
bestsellers_previous_week = pd.merge(
    sales,
    mean_price,
    on=['week', 'article_id'],
).reset_index()
# Move every entry one week forward, so week w holds the bestsellers of week w - 1.
bestsellers_previous_week['week'] = bestsellers_previous_week['week'] + 1

In [19]:
# One row per (week, customer): the first transaction of each group, without
# the article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [20]:
# Give every (week, customer) row all bestseller entries stored for that week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

In [21]:
# One row per customer (their first row in `unique_transactions`), relabelled
# to the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

In [22]:
# Bestseller candidates for the test week, one set per known customer.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week, on='week'
)

In [23]:
# Stack the training-week and test-week bestseller candidates; the rank column
# is dropped here and merged back onto the combined data in a later cell.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

Combining transactions and candidates / negative examples

In [25]:
# Real transactions are the positive examples. The column is added after
# `candidates_last_purchase` was copied from `transactions`, so the candidate
# frames have no `purchased` value and get NaN when concatenated below.
transactions['purchased'] = 1

In [26]:
# Positives (real transactions) followed by both candidate sets.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Candidate rows carry no label yet, so mark them as negatives. Assign the
# result back: `data.purchased.fillna(0, inplace=True)` works on a temporary
# Series and does not update `data` under pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)

# Share of positive rows in the combined data.
data.purchased.mean()

0.14591981693831024

In [28]:
# Add the bestseller rank of each (week, article) pair; rows without a match get NaN.
data = data.merge(
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left',
)

In [29]:
# Drop the earliest week in the data.
# NOTE: re-running this cell on its own removes one more week each time.
data = data[data.week != data.week.min()].copy()
# Articles that are not bestsellers get the placeholder rank 999. Assign the
# result back: the chained `data.bestseller_rank.fillna(999, inplace=True)`
# does not update `data` under pandas Copy-on-Write.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)

In [30]:
# Attach article attributes, then customer attributes; left joins keep every row.
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [31]:
# Order rows by week and customer so each (week, customer) group is contiguous,
# then renumber the index.
data = (
    data
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)

In [32]:
# Every week except the test week is used for training.
train = data.loc[data.week != test_week]
# Test-week rows, with repeated (customer, article, sales channel) rows collapsed.
test = (
    data.loc[data.week == test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [33]:
# Number of rows in each (week, customer_id) group of `train`, as a plain array.
# groupby returns the groups in sorted key order, the same order `data` was sorted in.
# NOTE(review): presumably passed as group sizes to a ranking model - confirm.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [34]:
# Columns selected as model features, in this order.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
]

In [35]:
%%time

# Feature matrix and label for training.
train_X, train_y = train[columns_to_use], train['purchased']

# Feature matrix for the test week; no label is extracted here.
test_X = test[columns_to_use]

CPU times: user 18 ms, sys: 4.91 ms, total: 22.9 ms
Wall time: 22.4 ms


In [38]:
# Inspect the last-purchase candidates; `week` holds the shifted week here.
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
1444395,2020-07-15,7678388473497046,697564010,0.010153,2,97
1444396,2020-07-15,7678388473497046,820960001,0.013542,2,97
1444397,2020-07-15,48202911737860740,674180009,0.018627,2,104
1444398,2020-07-15,73103731992333692,599580083,0.016932,2,104
1444399,2020-07-15,100550013135125347,640021012,0.050831,2,96
...,...,...,...,...,...,...
1581620,2020-09-22,18426621781275797575,788575004,0.042356,2,104
1581621,2020-09-22,18426621781275797575,914441003,0.033881,2,104
1581622,2020-09-22,18426621781275797575,896848001,0.030492,2,104
1581623,2020-09-22,18440902715633436014,918894002,0.016932,1,104


In [39]:
# Inspect the bestseller candidates (training weeks plus the test week).
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,2622543914066014,2,96,372860002,0.013377
1,2020-07-22,2622543914066014,2,96,760084003,0.025407
2,2020-07-22,2622543914066014,2,96,866731001,0.025078
3,2020-07-22,2622543914066014,2,96,610776002,0.008309
4,2020-07-22,2622543914066014,2,96,866383006,0.023912
...,...,...,...,...,...,...
261055,2020-09-22,18360336663135014098,1,104,906352001,0.058940
261056,2020-09-22,18360336663135014098,1,104,706016002,0.033168
261057,2020-09-22,18360336663135014098,1,104,673677002,0.024991
261058,2020-09-22,18360336663135014098,1,104,751471001,0.033638
