# Candidate Generation and LGBM

In [38]:
import pandas as pd
import numpy as np
from average_precision import apk, mapk

In [44]:
# Helper functions for improving memory
# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def hex_id_to_int(str):
    return int(str[-16:], 16)

In [2]:
transactions = pd.read_parquet('data/transactions_train.parquet')
customers = pd.read_parquet('data/customers.parquet')
articles = pd.read_parquet('data/articles.parquet')

## 1. Define training and testing weeks

In [3]:
# We're testing for week 105. The week after the final week given in the dataset
test_week = transactions.week.max() + 1 

# only uses the last 10 weeks of transactions
transactions = transactions[transactions.week > transactions.week.max() - 10]

In [4]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95


## 2. Generating candidates

#### Last purchase candidates - I'm unsure about the purpose of this section. I believe it's trying to identify the articles a customer has purchased previously or in the last week. 

For each customer, list the weeks when they made purchases

In [5]:
c2weeks = transactions.groupby('customer_id')['week'].unique()
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

Another representation of c2weeks (weeks when customers made purchases) <br>
Form {'customer_id': dict_of_shifted_weeks}. I'm not yet sure why they want data in this form

In [6]:
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week
    
c2weeks2shifted_weeks[272412481300040]

{95: 96, 96: 103, 103: 105}

Shifts the 'week' recorded for each transaction. If the first purchase was made in week 95 and the next purchase was in week 97, records week 97. I don't know why though

In [7]:
candidates_last_purchase = transactions.copy()

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    # print(i, c_id, week)
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    # print(c2weeks2shifted_weeks[c_id][week])

candidates_last_purchase.week=weeks
candidates_last_purchase

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
...,...,...,...,...,...,...
31774722,2020-09-22,18439937050817258297,891591003,0.084729,2,105
31774723,2020-09-22,18439937050817258297,869706005,0.084729,2,105
31779097,2020-09-22,18440902715633436014,918894002,0.016932,1,105
31779098,2020-09-22,18440902715633436014,761269001,0.016932,1,105


### Bestsellers candidates - identify the top 12 bestselling articles per week

Average price of each article in a given week


In [8]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()
mean_price

week  article_id
95    108775015     0.004729
      108775044     0.008458
      110065001     0.006085
      110065002     0.006085
      111565001     0.004288
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 196880, dtype: float32

Top 12 bestselling items per week

In [9]:
# 1. number of times each article was sold in a given week
# 2. rank so most popular item is at the top ( == 1)
# 3. For each week get the top 12 items based on rank
sales = transactions\
    .groupby('week')['article_id'].value_counts()\
    .groupby('week').rank(method='dense', ascending=False)\
    .groupby('week').head(12).rename('bestseller_rank').astype('int8') 
sales

week  article_id
95    760084003      1
      866731001      2
      600886001      3
      706016001      4
      372860002      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 120, dtype: int8

Merges the two previous dataframes to get the bestsellers and prices of the articles (per week) from the previous week 

In [10]:
# the same article may appear multiple times
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


Each record represents a distinct (customer, week) tuple. <br>
Thus, if a customer made multiple transactions in the same week only one row is reported. This table is used in the next step.

In [11]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
29030503,2020-07-15,272412481300040,1,95
29064059,2020-07-15,1456826891333599,1,95
29067103,2020-07-15,2133687643102426,2,95
29027487,2020-07-15,6010692573790711,1,95
29046403,2020-07-15,6171059100114610,2,95
...,...,...,...,...
31760188,2020-09-22,18435221511488011015,1,104
31782234,2020-09-22,18436859303155335645,1,104
31787251,2020-09-22,18437941771381362708,2,104
31776022,2020-09-22,18438270306572912089,1,104


Merges the bestseller information (week, article, bestseller rank, and price) with the customer who purchased that item.

In [12]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-22,200292573348128,2,96,760084003,1,0.025094
1,2020-07-22,200292573348128,2,96,866731001,2,0.024919
2,2020-07-22,200292573348128,2,96,600886001,3,0.022980
3,2020-07-22,200292573348128,2,96,706016001,4,0.033197
4,2020-07-22,200292573348128,2,96,372860002,5,0.013193
...,...,...,...,...,...,...,...
8141191,2020-09-22,18440902715633436014,1,104,918292001,8,0.041424
8141192,2020-09-22,18440902715633436014,1,104,762846027,9,0.025104
8141193,2020-09-22,18440902715633436014,1,104,809238005,10,0.041656
8141194,2020-09-22,18440902715633436014,1,104,673677002,11,0.024925


In [13]:
# Transactions for the test set (from last week of data)
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

# Combine with bestsellers info
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-22,200292573348128,2,96,760084003,1,0.025094
1,2020-07-22,200292573348128,2,96,866731001,2,0.024919
2,2020-07-22,200292573348128,2,96,600886001,3,0.022980
3,2020-07-22,200292573348128,2,96,706016001,4,0.033197
4,2020-07-22,200292573348128,2,96,372860002,5,0.013193
...,...,...,...,...,...,...,...
8141191,2020-09-22,18440902715633436014,1,104,918292001,8,0.041424
8141192,2020-09-22,18440902715633436014,1,104,762846027,9,0.025104
8141193,2020-09-22,18440902715633436014,1,104,809238005,10,0.041656
8141194,2020-09-22,18440902715633436014,1,104,673677002,11,0.024925


In [14]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


## 3. Combining transactions and candidates / negative examples

Label all transactions across all weeks as 1 (positive samples)

Comment from Pawel in Kaggle discussion here: https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/307288
In my setting I propose to convert the problem to a table where you have a list of item candidates and you mark the sales as either 0 or 1 depending whether the item was sold in the next week. Imagine you want to create negative samples as a list of 1000 most popular items in the last week. Some of those items will be bought by the customers. So the table you need to create is:
customer_id
label = whether it was bought or not
article_id from the list of 1000 most popular items
I'm not saying this is the best approach here but it is more or less what I'm doing. This is a technique where you create a set of explicit negative items. There are also techniques for implicit recommendations but all of them assume some sort of a strategy to generate negative samples.

In [15]:
transactions['purchased'] = 1 
transactions.shape

(2762872, 7)

In [16]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True) # Label all candidate generated transactions as 0

# Drops duplicates. Retains the transactions (positive labels). If duplicated, will drop the duplicates in the negative samples
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True) 
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
...,...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0
5248376,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0
5248377,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0
5248378,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0


### Add bestseller information

In [17]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [18]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [19]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [20]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

## 4. Train/ Test data

In [21]:
train = data[data.week != test_week]
train.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-26,28847241659200,887770001,0.016932,1,96,1.0,999.0,887770,727,...,10,1010,6,3692,1,1,0,1,21,57896
1,2020-07-18,28847241659200,762846001,0.025407,1,96,0.0,999.0,762846,472,...,7,1010,6,492,1,1,0,1,21,57896
2,2020-07-18,28847241659200,829308001,0.033881,1,96,0.0,999.0,829308,11402,...,21,1005,0,9082,1,1,0,1,21,57896
3,2020-07-26,28847241659200,760084003,0.025094,1,96,0.0,1.0,760084,1134,...,1,1009,5,847,1,1,0,1,21,57896
4,2020-07-26,28847241659200,866731001,0.024919,1,96,0.0,2.0,866731,3609,...,21,1005,0,3130,1,1,0,1,21,57896


In [22]:
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()
test.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
11381612,2020-09-03,28847241659200,925246001,0.128797,2,105,0.0,999.0,925246,25454,...,40,1007,9,27855,1,1,0,1,21,57896
11381613,2020-07-18,28847241659200,924243001,0.041535,1,105,0.0,1.0,924243,19190,...,0,1003,3,13007,1,1,0,1,21,57896
11381614,2020-07-18,28847241659200,924243002,0.041877,1,105,0.0,2.0,924243,19190,...,0,1003,3,13007,1,1,0,1,21,57896
11381615,2020-07-18,28847241659200,918522001,0.041435,1,105,0.0,3.0,918522,26372,...,0,1003,3,28633,1,1,0,1,21,57896
11381616,2020-07-18,28847241659200,923758001,0.033462,1,105,0.0,4.0,923758,19359,...,0,1010,6,27869,1,1,0,1,21,57896


In [23]:
# Not yet sure what these are
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values
train_baskets

array([15, 23, 16, ..., 14, 19, 16])

Only use a subset of columns specified here as input into the LGBM model

In [24]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

### Model input

In [25]:
train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [26]:
train_X.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,bestseller_rank
0,887770001,253,1010016,9,4,5,1510,0,1,6,1010,1,1,0,1,21,57896,999.0
1,762846001,259,1010016,10,3,9,1515,0,1,11,1010,1,1,0,1,21,57896,999.0
2,829308001,273,1010016,9,4,5,8310,9,26,5,1005,1,1,0,1,21,57896,999.0
3,760084003,272,1010016,9,4,5,1747,1,2,53,1009,1,1,0,1,21,57896,1.0
4,866731001,273,1010016,9,4,5,8310,9,26,5,1005,1,1,0,1,21,57896,2.0


## 5. Model training

In [27]:
from lightgbm.sklearn import LGBMRanker

In [28]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [29]:
# not sure what group = train_baskets is for
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153099
[LightGBM] [Debug] init for col-wise cost 0.198651 seconds, init for row-wise cost 0.357031 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12


In [30]:
# Obtain feature importances
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805519216203
age 0.00024136038957903926
article_id 0.00017160828400263902
garment_group_no 0.0001448188543340445
department_no 9.637421875769266e-05
product_type_no 9.014783292439592e-05
section_no 7.067204716548531e-05
postal_code 6.792197441369627e-05
club_member_status 6.519780240033951e-05
colour_group_code 5.358754121027148e-05
perceived_colour_value_id 1.775913359216025e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


## 6. Calculate predictions

In [31]:
# Predict on test_X
# Add the predictions to the test data (with customer and article id)
test['preds'] = ranker.predict(test_X)
test.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,product_code,prod_name,...,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,preds
11381612,2020-09-03,28847241659200,925246001,0.128797,2,105,0.0,999.0,925246,25454,...,1007,9,27855,1,1,0,1,21,57896,0.18302
11381613,2020-07-18,28847241659200,924243001,0.041535,1,105,0.0,1.0,924243,19190,...,1003,3,13007,1,1,0,1,21,57896,-0.168727
11381614,2020-07-18,28847241659200,924243002,0.041877,1,105,0.0,2.0,924243,19190,...,1003,3,13007,1,1,0,1,21,57896,-0.187652
11381615,2020-07-18,28847241659200,918522001,0.041435,1,105,0.0,3.0,918522,26372,...,1003,3,28633,1,1,0,1,21,57896,-0.187652
11381616,2020-07-18,28847241659200,923758001,0.033462,1,105,0.0,4.0,923758,19359,...,1010,6,27869,1,1,0,1,21,57896,-0.196152


In [32]:
# Obtain the products for each customer, based on highest predictions.
c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

c_id2predicted_article_ids[28847241659200] # predicted articles for person 28847241659200

[925246001,
 924243001,
 924243002,
 918522001,
 915529003,
 915529005,
 866731001,
 909370001,
 751471001,
 448509014,
 714790020,
 923758001,
 762846027]

In [33]:
# last weeks best sellers will be used in case fewer than 12 recommendations for a customer are made
bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

## 7. Evaluate results

Mileva has to continue from here. Still figuring out how this code is working.

In [34]:
val_week_purchases_by_cust = pd.read_pickle('data/val_week_purchases_by_cust.pkl')

In [39]:
apks = []

# If the ranking algorithm proposes fewer than 12 items. Pad with the top best sellers from the previous week
for c_id, gt in val_week_purchases_by_cust.items():
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    apks.append(apk(gt, pred[:12], 12))

np.mean(apks)

0.9245292813613925

## 8. Create submission

In [40]:
sub = pd.read_csv('data/sample_submission.csv')

In [45]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 2.84 s, sys: 267 ms, total: 3.11 s
Wall time: 3.23 s


In [46]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [None]:
# sub_name = 'basic_model_submission'
# sub.to_csv(f'data/subs/{sub_name}.csv.gz', index=False)

In [None]:
# !kaggle competitions submit -c h-and-m-personalized-fashion-recommendations -f 'data/subs/{sub_name}.csv.gz' -m {sub_name}