In [1]:
%run helper_functions.ipynb

In [2]:
DRY_RUN = True

In [66]:
import pandas as pd

import warnings

warnings.filterwarnings('ignore')

In [4]:
%%time

if not DRY_RUN:
    transactions = pd.read_parquet('data/transactions_train.parquet')
    customers = pd.read_parquet('data/customers.parquet')
    articles = pd.read_parquet('data/articles.parquet')
else:
    sample = 0.05
    transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
    customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
    articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: total: 188 ms
Wall time: 521 ms


In [5]:
test_week = transactions.week.max()

In [6]:
transactions = transactions[transactions.week > transactions.week.max() - 10]

### Generating Candidates

#### Last purchase candidates

In [7]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 1 s
Wall time: 1.13 s


In [8]:
c2weeks

customer_id
1728846800780188                                   [96]
3030245210403714                                   [98]
3294889982371161                                  [102]
3725161734497764                                  [103]
4125270926874550                                  [100]
                                     ...               
18441816628229149275                           [97, 99]
18442606110317830291    [95, 97, 98, 99, 100, 102, 103]
18442720731457033890                               [97]
18444338227048911684                              [103]
18446566209623725451                               [99]
Name: week, Length: 21962, dtype: object

In [9]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: total: 62.5 ms
Wall time: 90.2 ms


In [10]:
c2weeks2shifted_weeks

{1728846800780188: {96: 104},
 3030245210403714: {98: 104},
 3294889982371161: {102: 104},
 3725161734497764: {103: 104},
 4125270926874550: {100: 104},
 5140543703930900: {98: 104},
 6611639188934298: {103: 104, 104: 104},
 7165979654983104: {98: 99, 99: 100, 100: 103, 103: 104},
 7678388473497046: {95: 97, 97: 102, 102: 104},
 7694319197314000: {96: 97, 97: 104, 104: 104},
 8245923130635254: {96: 98, 98: 103, 103: 104, 104: 104},
 8566549790898038: {96: 100, 100: 103, 103: 104},
 9853153815325591: {96: 100, 100: 104},
 10733759044223883: {99: 100, 100: 102, 102: 103, 103: 104},
 10815953703051191: {96: 104},
 11802190299861252: {104: 104},
 12584931330163351: {97: 98, 98: 99, 99: 104},
 12881854106531921: {96: 99, 99: 104},
 13104946915081562: {95: 104},
 13238763363016949: {97: 98, 98: 99, 99: 104},
 13718556659542304: {95: 103, 103: 104},
 13816484732521112: {104: 104},
 15820554573873772: {101: 104, 104: 104},
 16171113462012840: {97: 104},
 17948564163813484: {101: 102, 102: 103,

In [11]:
candidates_last_purchase = transactions.copy()

In [12]:
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
1463301,2020-07-15,7678388473497046,697564010,0.010153,2,95
1463302,2020-07-15,7678388473497046,820960001,0.013542,2,95
1463303,2020-07-15,13104946915081562,866307001,0.027102,2,95
1463304,2020-07-15,28516591585814015,826508007,0.016932,1,95
1463305,2020-07-15,28516591585814015,826508011,0.016932,1,95
...,...,...,...,...,...,...
1601580,2020-09-22,18373835195283945075,574109043,0.016610,2,104
1601581,2020-09-22,18373835195283945075,902388002,0.014763,2,104
1601582,2020-09-22,18403657855694067653,877369001,0.016932,2,104
1601583,2020-09-22,18403657855694067653,897423002,0.011847,2,104


In [13]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: total: 672 ms
Wall time: 1.06 s


In [14]:
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
1463301,2020-07-15,7678388473497046,697564010,0.010153,2,97
1463302,2020-07-15,7678388473497046,820960001,0.013542,2,97
1463303,2020-07-15,13104946915081562,866307001,0.027102,2,104
1463304,2020-07-15,28516591585814015,826508007,0.016932,1,104
1463305,2020-07-15,28516591585814015,826508011,0.016932,1,104
...,...,...,...,...,...,...
1601580,2020-09-22,18373835195283945075,574109043,0.016610,2,104
1601581,2020-09-22,18373835195283945075,902388002,0.014763,2,104
1601582,2020-09-22,18403657855694067653,877369001,0.016932,2,104
1601583,2020-09-22,18403657855694067653,897423002,0.011847,2,104


#### Bestseller candidates

In [15]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [16]:
mean_price

week  article_id
95    111565001     0.006339
      111586001     0.013542
      126589007     0.001678
      129085001     0.016932
      129085026     0.016932
                      ...   
104   949198001     0.025407
      949551001     0.033415
      949551002     0.032017
      952938001     0.050831
      953763001     0.022017
Name: price, Length: 56958, dtype: float32

In [17]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank')

In [18]:
sales

week  article_id
95    760084003     1.0
      824490001     2.0
      776237011     3.0
      817354001     3.0
      866731001     4.0
                   ... 
104   762846006     7.0
      924243002     8.0
      934835001     8.0
      714790020     9.0
      788575004     9.0
Name: bestseller_rank, Length: 120, dtype: float64

In [43]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [44]:
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1.0,0.025137
1,96,824490001,2.0,0.031745
2,96,776237011,3.0,0.021660
3,96,817354001,3.0,0.022569
4,96,866731001,4.0,0.025368
...,...,...,...,...
115,105,762846006,7.0,0.024835
116,105,924243002,8.0,0.041443
117,105,934835001,8.0,0.024719
118,105,714790020,9.0,0.050078


In [45]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [46]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
1463301,2020-07-15,7678388473497046,2,95
1463303,2020-07-15,13104946915081562,2,95
1463304,2020-07-15,28516591585814015,1,95
1463306,2020-07-15,35335654790712518,1,95
1463310,2020-07-15,43500335720796308,2,95
...,...,...,...,...
1601567,2020-09-22,18314827227264852684,2,104
1601569,2020-09-22,18326362629821347858,2,104
1601576,2020-09-22,18353914612896054363,2,104
1601579,2020-09-22,18373835195283945075,2,104


In [47]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [48]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-22,62294610371300175,2,96,760084003,1.0,0.025137
1,2020-07-22,62294610371300175,2,96,824490001,2.0,0.031745
2,2020-07-22,62294610371300175,2,96,776237011,3.0,0.021660
3,2020-07-22,62294610371300175,2,96,817354001,3.0,0.022569
4,2020-07-22,62294610371300175,2,96,866731001,4.0,0.025368
...,...,...,...,...,...,...,...
409183,2020-09-22,18403657855694067653,2,104,896169005,7.0,0.050785
409184,2020-09-22,18403657855694067653,2,104,918522001,7.0,0.041187
409185,2020-09-22,18403657855694067653,2,104,863646001,8.0,0.033610
409186,2020-09-22,18403657855694067653,2,104,924243001,9.0,0.041906


In [50]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [51]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,7678388473497046,2,104
1,2020-07-15,13104946915081562,2,104
2,2020-07-15,28516591585814015,1,104
3,2020-07-15,35335654790712518,1,104
4,2020-07-15,43500335720796308,2,104
...,...,...,...,...
21957,2020-09-22,18297284838138651866,2,104
21958,2020-09-22,18314827227264852684,2,104
21959,2020-09-22,18326362629821347858,2,104
21960,2020-09-22,18353914612896054363,2,104


In [52]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [53]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [55]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,62294610371300175,2,96,760084003,0.025137
1,2020-07-22,62294610371300175,2,96,824490001,0.031745
2,2020-07-22,62294610371300175,2,96,776237011,0.021660
3,2020-07-22,62294610371300175,2,96,817354001,0.022569
4,2020-07-22,62294610371300175,2,96,866731001,0.025368
...,...,...,...,...,...,...
263539,2020-09-22,18403657855694067653,2,104,896169005,0.050785
263540,2020-09-22,18403657855694067653,2,104,918522001,0.041187
263541,2020-09-22,18403657855694067653,2,104,863646001,0.033610
263542,2020-09-22,18403657855694067653,2,104,924243001,0.041906


### Combining transactions and candidates / negative examples

In [60]:
transactions['purchased'] = 1

In [67]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

data.purchased.mean()

0.14566944064047194

In [68]:
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
1463301,2020-07-15,7678388473497046,697564010,0.010153,2,95,1.0
1463302,2020-07-15,7678388473497046,820960001,0.013542,2,95,1.0
1463303,2020-07-15,13104946915081562,866307001,0.027102,2,95,1.0
1463304,2020-07-15,28516591585814015,826508007,0.016932,1,95,1.0
1463305,2020-07-15,28516591585814015,826508011,0.016932,1,95,1.0


#### Add bestsellers information

In [70]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [71]:
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank
0,2020-07-15,7678388473497046,697564010,0.010153,2,95,1.0,
1,2020-07-15,7678388473497046,820960001,0.013542,2,95,1.0,
2,2020-07-15,13104946915081562,866307001,0.027102,2,95,1.0,
3,2020-07-15,28516591585814015,826508007,0.016932,1,95,1.0,
4,2020-07-15,28516591585814015,826508011,0.016932,1,95,1.0,


In [72]:
data = data[data.week != data.week.min()].copy()
data.bestseller_rank.fillna(999, inplace=True)

In [75]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [77]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [79]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [81]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [85]:
train_baskets

array([19, 17, 14, ..., 13, 16, 22], dtype=int64)

In [86]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [87]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 15.6 ms
Wall time: 35.9 ms


### Model Ranking

In [89]:
from lightgbm.sklearn import LGBMRanker

In [90]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [91]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.110138
[LightGBM] [Debug] init for col-wise cost 0.000024 seconds, init for row-wise cost 0.022012 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009810 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1146
[LightGBM] [Info] Number of data points in the train set: 534496, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: total: 734 ms
Wall time: 632 ms


In [99]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9986311941967358
age 0.0005194018524527628
postal_code 0.00029404455460752747
article_id 0.0002579473638869779
Active 8.503137188676565e-05
club_member_status 8.371546269738538e-05
colour_group_code 7.112834672383289e-05
fashion_news_frequency 3.57014504553535e-05
product_type_no 2.1835400553532715e-05
FN 0.0
garment_group_no 0.0
section_no 0.0
index_code 0.0
department_no 0.0
perceived_colour_master_id 0.0
perceived_colour_value_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


### Calculate prediction

In [100]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = ['0' + str(a_id) for a_id in \
                             bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()]

CPU times: total: 0 ns
Wall time: 0 ns


In [127]:
pd.DataFrame({'customer_id': c_id2predicted_article_ids.keys(), 'prediction': c_id2predicted_article_ids.values()})

Unnamed: 0,customer_id,prediction
0,1728846800780188,"[301656013, 767799016, 878510004, 768759007, 8..."
1,3030245210403714,"[896506001, 850917001, 158340001, 751471001, 9..."
2,3294889982371161,"[706271030, 733067001, 850917001, 158340001, 7..."
3,3725161734497764,"[876009005, 835542003, 850917001, 158340001, 7..."
4,4125270926874550,"[685814033, 850917001, 158340001, 751471001, 9..."
...,...,...
21957,18441816628229149275,"[902229003, 905811001, 903735002, 857097001, 7..."
21958,18442606110317830291,"[728876011, 850917001, 158340001, 751471001, 9..."
21959,18442720731457033890,"[917434002, 872278001, 850917001, 158340001, 7..."
21960,18444338227048911684,"[896161001, 929165001, 936057001, 899282001, 9..."
