In [1]:
import pandas as pd

In [2]:
# Load the three input tables from the local parquet files.
# The paths are read in the order listed, matching the names on the left.
transactions, customers, articles = map(
    pd.read_parquet,
    (
        'data/transactions_train.parquet',
        'data/customers.parquet',
        'data/articles.parquet',
    ),
)

In [3]:
# Preview the first two transaction rows to check which columns are available.
transactions.head(2)

Unnamed: 0_level_0,t_dat,customer_id,article_id,price,sales_channel_id,week
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
25784,2018-09-20,1728846800780188,519773001,0.028458,2,0
25785,2018-09-20,1728846800780188,578472001,0.032525,2,0


In [4]:
# Number of most recent weeks (before the held-out week) kept for training.
N_TRAIN_WEEKS = 10

# The latest week in the data is held out; it is never used as training input.
test_week = transactions.week.max()

# Latest week that remains once the held-out week is removed.
last_train_week = transactions.week[transactions.week < test_week].max()

# Keep only the N_TRAIN_WEEKS weeks that precede the held-out week.
# .copy() makes the result an independent frame, so columns added to
# `transactions` later do not write into a slice of the full table.
transactions = transactions[
    (transactions.week < test_week)
    & (transactions.week > last_train_week - N_TRAIN_WEEKS)
].copy()

## Generating Candidates

### Last Purchase Candidates

In [5]:
# For every customer, collect the distinct weeks in which they have transactions.
# Result: a Series indexed by customer_id whose values are arrays of week numbers.
c2weeks = transactions.groupby('customer_id')['week'].unique()

In [6]:
# customer_id -> {week with a purchase -> the customer's next purchase week}.
# The last purchase week of each customer maps to test_week.
c2weeks2shifted_weeks = {}

for customer, purchase_weeks in c2weeks.items():
    # Pair every week with its successor; the final week is paired with test_week.
    following_weeks = list(purchase_weeks[1:]) + [test_week]
    c2weeks2shifted_weeks[customer] = dict(zip(purchase_weeks, following_weeks))

In [7]:
# Look up, row by row, the week each transaction should be moved to:
# the same customer's next purchase week (or test_week for their last one).
shifted_weeks = [
    c2weeks2shifted_weeks[customer][purchase_week]
    for customer, purchase_week in zip(transactions['customer_id'], transactions['week'])
]

# Candidates are the customer's own earlier purchases, re-labelled with the shifted week.
candidates_last_purchase = transactions.copy()
candidates_last_purchase['week'] = shifted_weeks

### Bestseller Candidates

In [8]:
# Average price of each article within each week
# (a Series indexed by (week, article_id)).
mean_price = transactions.groupby(['week', 'article_id'])['price'].mean()

In [9]:
# How many times each article was bought in each week.
weekly_article_counts = transactions.groupby('week')['article_id'].value_counts()

# Dense rank within each week: the highest count gets rank 1, ties share a rank.
weekly_article_ranks = weekly_article_counts.groupby('week').rank(method='dense', ascending=False)

# Keep the first 12 rows of every week and store the rank compactly as int8.
sales = (
    weekly_article_ranks
    .groupby('week')
    .head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [10]:
# Attach the weekly mean price to each bestseller, then shift the week forward
# by one so that each row describes the bestsellers of the week *before* `week`.
bestsellers_previous_week = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda frame: frame['week'] + 1)
)

In [11]:
# One row per (week, customer): the first transaction of that customer in that week.
first_weekly_rows = transactions.groupby(['week', 'customer_id']).head(1)

# Article-specific columns are dropped; the remaining columns identify the customer-week.
unique_transactions = first_weekly_rows.drop(columns=['article_id', 'price']).copy()

In [12]:
# Pair every customer-week with the bestsellers recorded for that week
# (an inner join on `week`, so each customer-week gets one row per bestseller).
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')

In [13]:
# Preview the bestseller table (week, article, rank, mean price).
bestsellers_previous_week.head(2)

Unnamed: 0,week,article_id,bestseller_rank,price
0,95,806388001,1,0.013301
1,95,730683021,2,0.025643


In [14]:
# Preview the bestseller candidates generated for the training weeks.
candidates_bestsellers.head(2)

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-15,272412481300040,1,95,806388001,1,0.013301
1,2020-07-15,272412481300040,1,95,730683021,2,0.025643


In [15]:
# One row per customer seen in the training window, re-labelled as belonging to
# test_week so that bestseller candidates can be generated for the held-out week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=test_week)
)

In [16]:
# Bestseller candidates for the held-out week: every known customer is paired
# with the bestsellers recorded for test_week.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week, on='week'
)

In [17]:
# Preview the bestseller candidates generated for the held-out week.
candidates_bestsellers_test_week.head(2)

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-08,857913002275398,1,104,909370001,1,0.032947
1,2020-07-08,857913002275398,1,104,865799006,2,0.03334


In [18]:
# Stack training-week and test-week bestseller candidates into one frame.
# bestseller_rank is removed here; it is merged back in from
# bestsellers_previous_week once all candidate sources are combined.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

### Combining transactions and candidates / Negative Examples

In [19]:
# Real transactions are the positive examples.
transactions['purchased'] = 1

# Stack positives with both candidate sets. Rows that come from a frame without
# a 'purchased' column get NaN in that column after the concat.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])

# Label every row without a purchase flag as a negative example.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on `data.purchased` acts on an intermediate object, triggers a pandas
# FutureWarning, and stops updating `data` altogether from pandas 3.0.
data['purchased'] = data['purchased'].fillna(0)

# Share of positive rows among all examples.
data.purchased.mean()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.purchased.fillna(0, inplace=True)


0.14683737277719777

In [20]:
# Preview the combined positives + candidates frame.
data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94,1.0
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94,1.0


In [21]:
# Keep one row per (customer, article, week). The first occurrence wins, and the
# real transactions were placed first in the concat, so an actual purchase takes
# precedence over a candidate row for the same key.
data = data.drop_duplicates(subset=['customer_id', 'article_id', 'week'], keep='first')

In [22]:
# Preview the frame after de-duplication.
data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94,1.0
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94,1.0


### Add bestseller information

In [23]:
# Attach the bestseller rank to every row. The left join keeps all rows;
# articles that are not bestsellers for that week get NaN.
bestseller_ranks = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(bestseller_ranks, on=['week', 'article_id'], how='left')

In [24]:
# Preview the frame with the new bestseller_rank column.
data.head(2)

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank
0,2020-07-08,857913002275398,599580068,0.008458,1,94,1.0,
1,2020-07-08,857913002275398,776237011,0.025407,1,94,1.0,


In [25]:
# Drop the earliest week. Presumably this is because bestseller ranks are shifted
# forward by one week, so the first week has no previous-week information.
# Rows without a rank (non-bestsellers) get the sentinel value 999.
# fillna is called on the frame with a {column: value} mapping and returns a new
# frame. The chained `data.bestseller_rank.fillna(..., inplace=True)` form emits
# a pandas FutureWarning and stops modifying `data` from pandas 3.0.
data = data[data.week != data.week.min()].fillna({'bestseller_rank': 999})

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data.bestseller_rank.fillna(999, inplace=True)


In [26]:
# Enrich every row with article attributes and then customer attributes.
# Both are left joins, so no example row is dropped.
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [27]:
# Order rows by (week, customer_id) so that all rows of one customer-week are
# contiguous, then renumber the index from 0.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [28]:
# Rows of the held-out week form the test set; everything else is training data.
is_test_week = data.week == test_week

train = data[~is_test_week]
# Within the test week, keep one row per (customer, article, sales channel).
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [29]:
# Number of rows in each (week, customer_id) group, as a plain array.
# These are passed as the ranking groups when fitting the model, so they rely on
# `train` already being sorted by (week, customer_id): the group sizes must line
# up with consecutive runs of rows.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [30]:
# Feature columns fed to the ranker. The order matters: feature importances are
# mapped back to names by position in this list.
columns_to_use = [
    'article_id',
    # presumably article attributes (joined from `articles`) -- verify against its schema
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    # presumably customer attributes (joined from `customers`) -- verify against its schema
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    # engineered in this notebook
    'bestseller_rank',
]

In [31]:
# Feature matrix and target for training; `purchased` is the label.
train_X, train_y = train[columns_to_use], train['purchased']

# Feature matrix for the held-out week (no label is extracted for it here).
test_X = test[columns_to_use]

In [32]:
# Preview the training feature matrix.
train_X.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
0,762846001,259,1010016,10,3,9,1515,0,1,11,1010,1,1,0,1,21,57896,999.0
1,829308001,273,1010016,9,4,5,8310,9,26,5,1005,1,1,0,1,21,57896,999.0
2,730683036,273,1010014,9,4,5,8310,9,26,5,1005,1,1,0,1,21,57896,999.0
3,851094001,253,1010016,52,2,4,8316,9,26,5,1005,1,1,0,1,21,57896,999.0
4,757303012,306,1010016,9,4,5,8316,9,26,5,1005,1,1,0,1,21,57896,999.0


### Model Training

In [33]:
from lightgbm.sklearn import LGBMRanker



In [34]:
# Hyper-parameters of the LightGBM ranking model, collected in one place.
ranker_params = {
    "objective": "lambdarank",   # ranking objective
    "metric": "ndcg",
    "boosting_type": "dart",
    "n_estimators": 1,           # a single boosting round
    "importance_type": 'gain',   # what feature_importances_ reports
    "verbose": 10,               # verbose LightGBM logging
}

ranker = LGBMRanker(**ranker_params)

In [35]:
# Train the ranker. `group` gives the number of consecutive rows that belong to
# each ranking group (one group per customer-week, taken from train_baskets),
# so the rows of train_X must be in the same order the groups were counted in.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.844955
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.151118
[LightGBM] [Debug] init for col-wise cost 0.105019 seconds, init for row-wise cost 0.153106 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.175493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1149
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10


In [36]:
# Print every feature with its share of the total importance, most important first.
importances = ranker.feature_importances_
total_importance = importances.sum()

for feature_idx in importances.argsort()[::-1]:
    print(columns_to_use[feature_idx], importances[feature_idx] / total_importance)

bestseller_rank 0.999066481760161
article_id 0.00028298358632459234
age 0.0002444961714550688
garment_group_no 7.157281908016381e-05
club_member_status 6.724645263245181e-05
postal_code 6.510081422729875e-05
product_type_no 6.100419025057176e-05
colour_group_code 5.175429599815084e-05
department_no 3.6910397657910145e-05
Active 2.4511482560733915e-05
graphical_appearance_no 1.4670190618276406e-05
perceived_colour_value_id 1.3267839033840464e-05
fashion_news_frequency 0.0
FN 0.0
section_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
index_group_no 0.0


### Calculate Prediction

In [37]:
# Score every candidate row of the held-out week.
test['preds'] = ranker.predict(test_X)

# customer_id -> list of candidate article ids, best score first.
# Sorting happens before grouping; groupby keeps the row order inside each group.
ranked_test = test.sort_values(['customer_id', 'preds'], ascending=False)
c_id2predicted_article_ids = (
    ranked_test
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)

# Article ids of the bestsellers recorded for the latest week in the bestseller table.
is_latest_week = bestsellers_previous_week.week == bestsellers_previous_week.week.max()
bestsellers_last_week = bestsellers_previous_week[is_latest_week]['article_id'].tolist()