# Prepare baselines

In [1]:
import os
import sys
import gc
import logging
from datetime import datetime, timedelta
from itertools import combinations
from collections import Counter

from typing import List

import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import memory_profiler
from tqdm import tqdm

from metrics import mapk, mapk_drop_empty_actual

# Limits for DataFrame previews rendered in the notebook.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names, so use whichever spelling this install provides
# and fall back to the default style if neither exists.
_poster_style = next(
    (name for name in ('seaborn-v0_8-poster', 'seaborn-poster')
     if name in plt.style.available),
    'default',
)
plt.style.use(_poster_style)

# Root logger at DEBUG level: debug messages from third-party libraries
# (e.g. matplotlib) are shown as well.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

%matplotlib inline
%load_ext memory_profiler

gc.enable()

2022-05-07 11:17:21,990 - DEBUG - Loaded backend module://matplotlib_inline.backend_inline version unknown.


## Calculate a gender feature from all transactions

In [3]:
%%time
# Cleaned transaction log, stored in feather format.
dir_path = '../input_data/processed_data/'
file_name = 'trans_clean_all.feather'

# Build the path with os.path.join instead of string concatenation.
trans = pd.read_feather(os.path.join(dir_path, file_name))
trans.head()

CPU times: user 570 ms, sys: 525 ms, total: 1.09 s
Wall time: 363 ms


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,cnt_articles
0,2018-09-20,2,541518023,304915,2,1
1,2018-09-20,2,663713001,508305,2,1
2,2018-09-20,7,505221001,203220,2,1
3,2018-09-20,7,505221004,152372,2,1
4,2018-09-20,7,685687001,169322,2,1


In [7]:
# Number of distinct customers that appear in the transaction log.
trans['customer_id'].nunique()

1362281

In [24]:
# Load the cleaned customer table. All columns are kept: the column subset
# below is disabled.
customers_data = pd.read_feather('../input_data/processed_data/customers_clean.feather')
#customers_data_feat = [
#    'customer_id',
#    'FN',
#    'Active',
#    'age',
#    'postal_code'
#]
#customers_data = customers_data[customers_data_feat]
customers_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,0,0,0,1,1,49,0
1,1,0,0,1,1,25,1
2,2,0,0,1,1,24,2
3,3,0,0,1,1,54,3
4,4,1,1,1,2,52,4


In [4]:
# Article id plus the categorical article attributes used in this notebook.
article_cat_features = [
    'article_id',
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_group_no',
    'section_no',
    'garment_group_no'
]

# Read only the needed columns: the text features are never loaded, instead
# of being read into memory and dropped afterwards.
articles_data = pd.read_feather(
    '../input_data/processed_data/articles_clean.feather',
    columns=article_cat_features,
)
articles_data.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,1676,1,16,1002
1,108775044,108775,253,1010016,10,3,9,1676,1,16,1002
2,108775051,108775,253,1010017,11,1,9,1676,1,16,1002
3,110065001,110065,306,1010016,9,4,5,1339,1,61,1017
4,110065002,110065,306,1010016,10,3,9,1339,1,61,1017


In [11]:
# Index group name -> index_group_no code:
# Baby/Children     4
# Ladieswear        1
# Divided           2
# Menswear          3
# Sport             26
# Number of articles in each index group.
articles_data['index_group_no'].value_counts()

1     39737
4     34711
2     15149
3     12553
26     3392
Name: index_group_no, dtype: int64

In [5]:
# Attach each article's index group to its transactions.
# The guard makes the cell safe to re-run: merging a second time would create
# 'index_group_no_x' / 'index_group_no_y' instead of a single column.
if 'index_group_no' not in trans.columns:
    trans = trans.merge(
        articles_data[['article_id', 'index_group_no']],
        how='left',
        on=['article_id'],
        # Fail loudly if article_id is not unique in articles_data, which
        # would silently duplicate transaction rows.
        validate='many_to_one',
    )

In [8]:
# Number of transactions per index group.
trans['index_group_no'].value_counts()

1     18495735
2      6440803
3      1608839
26     1148763
4      1119279
Name: index_group_no, dtype: int64

In [9]:
# Size of the Ladieswear (1) / Menswear (3) subset. Only the shape is shown,
# so the notebook does not render a 20M-row frame.
trans[trans['index_group_no'].isin([1, 3])].shape

(20104574, 7)

In [6]:
# For every customer, collect the index groups of their Ladieswear (1) and
# Menswear (3) purchases into one list (one entry per transaction row).
customer_index_group = (
    trans.loc[trans['index_group_no'].isin([1, 3])]
    .groupby(['customer_id'], as_index=False)
    .agg({'index_group_no': list})
)

In [16]:
# Gender proxy: the index group (1 = Ladieswear, 3 = Menswear) the customer
# bought from most often. Counter comes from the notebook's import cell, so it
# is not re-imported here.
# On a tie, most_common() returns the group that appears first in the list.
customer_index_group['gender_calc'] = customer_index_group['index_group_no']\
    .apply(lambda x: Counter(x).most_common(1)[0][0])

In [17]:
customer_index_group.head()

Unnamed: 0,customer_id,index_group_no,gender_calc
0,0,"[3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
1,1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,2,"[1, 1, 1, 1, 1, 1, 3, 3, 1, 1, 3, 3, 1]",1
3,4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",1
4,6,"[1, 1]",1


In [18]:
# Number of customers per inferred gender group (1 = Ladieswear, 3 = Menswear).
customer_index_group['gender_calc'].value_counts()

1    1166337
3      70269
Name: gender_calc, dtype: int64

In [25]:
# Add the gender proxy to the customer table.
# Dropping a possibly existing 'gender_calc' column first makes the cell safe
# to re-run: otherwise a second merge would create 'gender_calc_x' /
# 'gender_calc_y' and the fillna line below would raise a KeyError.
customers_data = customers_data\
    .drop(columns=['gender_calc'], errors='ignore')\
    .merge(customer_index_group[['customer_id', 'gender_calc']],
           how='left', on='customer_id')
# Customers without a match in customer_index_group get NaN from the left
# join; encode them as 0.
customers_data['gender_calc'] = customers_data['gender_calc'].fillna(0).astype('int8')

In [26]:
# 0 marks customers with no row in customer_index_group (NaN filled with 0).
customers_data['gender_calc'].value_counts()

1    1166337
0     135374
3      70269
Name: gender_calc, dtype: int64

In [27]:
customers_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,gender_calc
0,0,0,0,1,1,49,0,1
1,1,0,0,1,1,25,1,1
2,2,0,0,1,1,24,2,1
3,3,0,0,1,1,54,3,0
4,4,1,1,1,2,52,4,1


In [28]:
# Save the customer table, now including the gender_calc column, under a new file name.
customers_data.to_feather('../input_data/processed_data/customers_clean2.feather')

## Validation strategy: compute metrics over several folds
**Test and validation periods**

- `test` 2020-09-16 - 2020-09-22
- `test` 2020-09-09 - 2020-09-15
- `test` 2020-09-02 - 2020-09-08

- `test` 2019-09-24 - 2019-09-30 (or 23 - 29 or 22 - 28)

### utils

In [2]:
from utils_validation import (
    concat_items,
    get_dates,
    get_sub_df,
    get_sub_df_personal_cold,
    get_test_target
)

### Frequent pairs utils

In [3]:
from utils_frequent_pairs import (
    get_frequent_pairs,
    collect_freq_items,
    concat_pred_freq_items
)

### popular in categories

In [7]:
from utils_category_top_items import (
    get_category_top_items_time_decay,
    concat_pred_cat_items,
    category_top_items_predict
)

### predicts

In [4]:
from utils_heuristics import (
    popular_predict,
    popular_predict2,
    popular_time_decay_predict,
    personal_popular_predict,
    personal_recent_predict,
    personal_time_decay_predict,
    personal_trending_time_decay_predict,
    personal_history_time_decay_predict,
    gender_age_personal_cold_start_predict
)

### full script

In [5]:
%%time
# Cleaned transaction log, stored in feather format.
dir_path = '../input_data/processed_data/'
file_name = 'trans_clean_all.feather'

# Build the path with os.path.join instead of string concatenation.
trans = pd.read_feather(os.path.join(dir_path, file_name))
trans.head()

CPU times: user 556 ms, sys: 511 ms, total: 1.07 s
Wall time: 280 ms


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,cnt_articles
0,2018-09-20,2,541518023,304915,2,1
1,2018-09-20,2,663713001,508305,2,1
2,2018-09-20,7,505221001,203220,2,1
3,2018-09-20,7,505221004,152372,2,1
4,2018-09-20,7,685687001,169322,2,1


In [6]:
# Article id plus the categorical article attributes used in this notebook.
article_cat_features = [
    'article_id',
    'product_code',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_group_no',
    'section_no',
    'garment_group_no'
]

# Read only the needed columns: the text features are never loaded, instead
# of being read into memory and dropped afterwards.
articles_data = pd.read_feather(
    '../input_data/processed_data/articles_clean.feather',
    columns=article_cat_features,
)
articles_data.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,1676,1,16,1002
1,108775044,108775,253,1010016,10,3,9,1676,1,16,1002
2,108775051,108775,253,1010017,11,1,9,1676,1,16,1002
3,110065001,110065,306,1010016,9,4,5,1339,1,61,1017
4,110065002,110065,306,1010016,10,3,9,1339,1,61,1017


In [7]:
# Customer table saved earlier in this notebook; it includes the gender_calc column.
customers_data = pd.read_feather('../input_data/processed_data/customers_clean2.feather')
# Column subset is disabled: all columns are kept.
#customers_data_feat = [
#    'customer_id',
#    'FN',
#    'Active',
#    'age',
#    'postal_code'
#]
#customers_data = customers_data[customers_data_feat]
def age_bin_util(x):
    """Map an age to the label of its age bucket.

    Buckets and labels: below 25 -> 18, 25-34 -> 25, 35-44 -> 35,
    45-54 -> 45, 55 and above -> 55.

    A value that compares false against every bound (e.g. NaN) falls through
    and yields None.
    """
    if x < 25:
        return 18
    # Check bounds from the highest down; the first one reached is the label.
    for lower_bound in (55, 45, 35, 25):
        if x >= lower_bound:
            return lower_bound
    return None

# Bucket every customer's age. The function is passed directly; a wrapping
# lambda adds nothing.
customers_data['age_bin'] = customers_data['age'].apply(age_bin_util)
customers_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,gender_calc,age_bin
0,0,0,0,1,1,49,0,1,45
1,1,0,0,1,1,25,1,1,25
2,2,0,0,1,1,24,2,1,18
3,3,0,0,1,1,54,3,0,45
4,4,1,1,1,2,52,4,1,45


In [8]:
# Feather file passed to the get_sub_df_* helpers as the submission template.
SUB_PATH = '../input_data/processed_data/sub_user_ind.feather'
#ARTICLE_DATA_PATH = '../input_data/processed_data/articles_clean.feather'
# Cutoff handed to every predictor and used as k in the MAP@k metric.
CUTOFF = 12

In [10]:
%%time
%%memit
# Rolling validation: score the selected heuristic on NUM_FOLDS consecutive
# test windows and average MAP@K (customers with empty actuals dropped).
NUM_FOLDS = 4
NUM_DAYS_TRAIN = 30 # 734 days in trans
NUM_DAYS_TEST = 7
TEST_START_DAY = '2020-08-26'
# 2019 year
#TEST_START_DAY = '2019-09-02'
#ARTICLE_CATEGORY = 'index_group_no'

# Passed as the last argument to the time-decay predictors below.
PERSONAL_GROUPBY = 'price_weight'
#PERSONAL_GROUPBY = 'item_weight'


test_start_date = datetime.strptime(TEST_START_DAY, '%Y-%m-%d')
# get initial dates of the first fold
test_start_date, test_end_date, \
    train_end_date, train_start_date = get_dates(
    test_start_date,
    NUM_DAYS_TEST,
    NUM_DAYS_TRAIN
)
#mapk_arr = []
# one metric value per fold
mapk_dea_arr = []

for fold_num in range(NUM_FOLDS):
    logging.info('FOLD %s STARTED', fold_num + 1)
    logging.info('test_start_date = %s', test_start_date.date())
    logging.info('test_end_date = %s', test_end_date.date())
    logging.info('train_start_date = %s', train_start_date.date())
    logging.info('train_end_date = %s', train_end_date.date())    
    
    # get target dict
    target = get_test_target(trans, test_start_date, test_end_date)
    logging.info('len(target) = %s', len(target))
    
    # get popular predictions - cnt unq users
    #popular_articles = popular_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('len(popular_articles) = %s', len(popular_articles))
    
    # get popular predictions - cnt items
    #preds = popular_predict2(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('len(preds) = %s', len(preds))    
    
    # get popular predictions - time weighted items
    popular_articles = popular_time_decay_predict(trans, 
                                                  train_start_date, 
                                                  train_end_date, 
                                                  CUTOFF,
                                                  PERSONAL_GROUPBY)
    logging.info('len(popular_articles) = %s', len(popular_articles))
    
    # get personal popular items
    #preds = personal_popular_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('preds.shape = %s', preds.shape)
    
    # get personal recent items
    #preds = personal_recent_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('preds.shape = %s', preds.shape)

    # get personal time weighted items
    preds = personal_time_decay_predict(trans, 
                                        train_start_date, 
                                        train_end_date, 
                                        CUTOFF,
                                        PERSONAL_GROUPBY)
    logging.info('preds.shape = %s', preds.shape)
    
    # get personal time weighted items from all history
    #preds = personal_history_time_decay_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('preds.shape = %s', preds.shape)
    
    # get personal TRENDING time weighted items
    #preds = personal_trending_time_decay_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('preds.shape = %s', preds.shape)    

    # get frequent pairs of items
    freq_pairs = get_frequent_pairs(trans, train_start_date, train_end_date, CUTOFF)
    logging.info('len(freq_pairs) = %s', len(freq_pairs))
    
    # add frequent pairs to personal items
    preds['pred'] = preds['pred'].apply(lambda x: concat_pred_freq_items(x, freq_pairs, CUTOFF))
    
    # collect frequent pairs for personal items (AND NOT JOIN THEM)
    #preds['pred'] = preds['pred'].apply(lambda x: collect_freq_items(x, freq_pairs, CUTOFF))    
    
    # get popular items in categories
    #category_top_items, article_category = get_category_top_items_time_decay(
    #    trans, train_start_date, train_end_date,
    #    ARTICLE_DATA_PATH, ARTICLE_CATEGORY, CUTOFF
    #)
    # add popular items in categories to personal items
    #preds['pred'] = preds['pred'].apply(lambda x: concat_pred_cat_items(x, 
    #                                                                    article_category, 
    #                                                                    category_top_items,
    #                                                                    CUTOFF))
    # collect popular items in categories for personal items (AND NOT JOIN THEM)
    #preds['pred'] = preds['pred'].apply(lambda x: category_top_items_predict(x, 
    #                                                                    article_category, 
    #                                                                    category_top_items,
    #                                                                    CUTOFF)) 
    
    # popular in gender 
    #preds_personal_cold = gender_personal_cold_start_predict(
    #    trans,
    #    train_start_date, 
    #    train_end_date,
    #    customers_data,
    #    articles_data,
    #    CUTOFF
    #)
    # cold-start predictions; the global popular_articles list is passed in as well
    preds_personal_cold = gender_age_personal_cold_start_predict(
    #preds_personal_cold = customer_personal_cold_start_predict(
        trans,
        train_start_date, 
        train_end_date,
        customers_data,
        articles_data,
        popular_articles,
        CUTOFF
    )
    #logging.info('preds_personal_cold.shape = %s', preds_personal_cold.shape)
      
    # get submission df
    #sub_df = get_sub_df_popular(SUB_PATH, target, preds)
    #sub_df = get_sub_df(SUB_PATH, target, preds, popular_articles, CUTOFF)
    sub_df = get_sub_df_personal_cold(SUB_PATH, target, preds, preds_personal_cold, CUTOFF)
    logging.info('sub_df.shape = %s', sub_df.shape)
    
    # compute metrics
    #metric1 = mapk(sub_df['target'].tolist(), 
    #               sub_df['pred'].tolist(),
    #               k=CUTOFF)
    metric2 = mapk_drop_empty_actual(
                     sub_df['target'].tolist(), 
                     sub_df['pred'].tolist(),
                     k=CUTOFF)
    #logging.info('mapk = %s', metric1)
    logging.info('mapk_drop_empty_actual = %s',metric2)
    #mapk_arr.append(metric1)
    mapk_dea_arr.append(metric2)
    
    # get dates of next fold: move the test start forward by NUM_DAYS_TEST days
    # and recompute the other three dates from it
    test_start_date = test_start_date + timedelta(days=NUM_DAYS_TEST)
    test_start_date, test_end_date, \
        train_end_date, train_start_date = get_dates(
        test_start_date,
        NUM_DAYS_TEST,
        NUM_DAYS_TRAIN
    )
    logging.info('FOLD %s DONE', fold_num + 1)
    
#logging.info('MAP@K AVG = %s', np.mean(mapk_arr))
# average of the per-fold metric values
logging.info('MAP@K drop empty actuals AVG = %s', np.mean(mapk_dea_arr))

2022-05-07 11:22:23,507 - INFO - FOLD 1 STARTED
2022-05-07 11:22:23,509 - INFO - test_start_date = 2020-08-26
2022-05-07 11:22:23,509 - INFO - test_end_date = 2020-09-01
2022-05-07 11:22:23,510 - INFO - train_start_date = 2020-07-26
2022-05-07 11:22:23,510 - INFO - train_end_date = 2020-08-25
2022-05-07 11:22:24,785 - INFO - len(target) = 80253
2022-05-07 11:22:31,826 - INFO - len(popular_articles) = 12
2022-05-07 11:22:43,312 - INFO - preds.shape = (252749, 2)
100%|█████████████████████████████| 1963796/1963796 [00:02<00:00, 747898.58it/s]
100%|██████████████████████████████████| 29363/29363 [00:00<00:00, 63982.93it/s]
2022-05-07 11:22:52,139 - INFO - len(freq_pairs) = 29363
2022-05-07 11:23:18,944 - INFO - sub_df.shape = (1371980, 4)
2022-05-07 11:23:19,126 - INFO - mapk_drop_empty_actual = 0.023309331980796426
2022-05-07 11:23:19,126 - INFO - FOLD 1 DONE
2022-05-07 11:23:19,126 - INFO - FOLD 2 STARTED
2022-05-07 11:23:19,127 - INFO - test_start_date = 2020-09-02
2022-05-07 11:23:19,

peak memory: 2470.99 MiB, increment: 252.54 MiB
CPU times: user 3min 33s, sys: 771 ms, total: 3min 34s
Wall time: 3min 34s


**personal trending time weight + frequent pairs**
- NUM_DAYS_TRAIN = 30 groupby=price MAP@K = 0.02396
- NUM_DAYS_TRAIN = 30 groupby=item_weight MAP@K = 0.02457
- NUM_DAYS_TRAIN = 30 groupby=price_weight MAP@K = 0.02571

**personal time weight all history + frequent pairs + Active gender age bin personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02356


**personal time weight + frequent pairs + Active gender age bin personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02617


**personal time weight + frequent pairs + FN gender age bin personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02612


**personal time weight + frequent pairs + gender age bin personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02616
- NUM_DAYS_TRAIN = 60 MAP@K = 0.02537
- NUM_DAYS_TRAIN = 30 groupby=price_weight (personal) MAP@K = 0.02730
- NUM_DAYS_TRAIN = 30 groupby=price_weight (pers + pop) MAP@K = 0.02737
- NUM_DAYS_TRAIN = 30 groupby=price_weight (pers + pop + gender_age) MAP@K = 0.02671


**personal time weight + frequent pairs + gender personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02497
- NUM_DAYS_TRAIN = 30 FIX freq items for->while MAP@K = 0.02497
- NUM_DAYS_TRAIN = 30 divided -> female MAP@K = 0.02493
- NUM_DAYS_TRAIN = 30 divided -> male MAP@K = 0.02456
- NUM_DAYS_TRAIN = 30 baby -> female MAP@K = 0.02497


**personal time weight + gender personal cold (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02367


**personal time weight + common popular time weight (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.02314


**gender popular time weight (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K = 0.0074


**personal trending time weight (2020)**
- NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02229
- NUM_DAYS_TRAIN = 60 MAP@K DEA AVG = 0.02209


**personal trending time weight (2020) +  frequent pairs**
- quot_fill_na = 0 NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02346
- quot_fill_na = 1 NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02369

**personal trending time weight (2019) +  frequent pairs**
- quot_fill_na = 1 NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02202

**personal time weight + popular in category (time weight) 2020**
- category=index_code,         NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02289
- category=department_no,      NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02306
- category=product_type_no,    NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02273
- category=product_group_name, NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02277
- category=index_group_no,     NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02299
- category=garment_group_no,   NUM_DAYS_TRAIN = 14 MAP@K DEA AVG = 0.02221
- category=garment_group_no,   NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.02272


**popular in category (time weight) 2020**
- category=index_code,         NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.0065
- category=department_no,      NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.0070
- category=index_group_no,     NUM_DAYS_TRAIN = 30 MAP@K DEA AVG = 0.0068

**frequent pairs for personal time weight  2020**
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.01006

**personal time weight + frequent pairs 2020**
- NUM_DAYS_TRAIN = 180 MAP@K DEA AVG = 0.02283
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02387
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02422
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.02368
- NUM_DAYS_TRAIN = 7   MAP@K DEA AVG = 0.02146

**personal time weight + frequent pairs 2019**
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02250

**personal time weight (weeks) 2020**
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02241
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02244
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.02198

**personal time weight (days) 2020**
- NUM_DAYS_TRAIN = 180 MAP@K DEA AVG = 0.02239
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02284
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02280
- NUM_DAYS_TRAIN = 21  MAP@K DEA AVG = 0.02268
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.02222
- NUM_DAYS_TRAIN = 7   MAP@K DEA AVG = 0.02017

**personal time weight (days) 2019**
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02124
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02122
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.02002

**personal recent 2019**
- NUM_DAYS_TRAIN = 750 MAP@K DEA AVG = 0.02053
- NUM_DAYS_TRAIN = 370 MAP@K DEA AVG = 0.02053
- NUM_DAYS_TRAIN = 180 MAP@K DEA AVG = 0.02008
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02003
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02043
- NUM_DAYS_TRAIN = 21  MAP@K DEA AVG = 0.02036
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.01953
- NUM_DAYS_TRAIN = 7   MAP@K DEA AVG = 0.01751

**personal recent 2020**
- NUM_DAYS_TRAIN = 750 MAP@K DEA AVG = 0.02182
- NUM_DAYS_TRAIN = 370 MAP@K DEA AVG = 0.02189
- NUM_DAYS_TRAIN = 180 MAP@K DEA AVG = 0.02137
- NUM_DAYS_TRAIN = 60  MAP@K DEA AVG = 0.02182
- NUM_DAYS_TRAIN = 30  MAP@K DEA AVG = 0.02185
- NUM_DAYS_TRAIN = 21  MAP@K DEA AVG = 0.02187
- NUM_DAYS_TRAIN = 14  MAP@K DEA AVG = 0.02168
- NUM_DAYS_TRAIN = 7   MAP@K DEA AVG = 0.02058

**personal popular**
- NUM_DAYS_TRAIN = 21 MAP@K DEA AVG = 0.020053
- NUM_DAYS_TRAIN = 14 MAP@K DEA AVG = 0.020542
- NUM_DAYS_TRAIN = 7  MAP@K DEA AVG = 0.020101

**common popular**
- NUM_DAYS_TRAIN = 15 MAP@K AVG = 0.000350
- NUM_DAYS_TRAIN = 30 MAP@K AVG = 0.000283
- NUM_DAYS_TRAIN = 60 MAP@K AVG = 0.000229
- NUM_DAYS_TRAIN =370 MAP@K AVG = 0.0002026

## Make submission for public test


### script for personal popular, recent, time weighted

In [9]:
%%time
# Settings for the submission run.
NUM_DAYS_TRAIN = 30
NUM_DAYS_TEST = 7
CUTOFF = 12
PERSONAL_GROUPBY = 'price_weight'

# Derive the train/test window boundaries from the first test day.
(
    test_start_date,
    test_end_date,
    train_end_date,
    train_start_date,
) = get_dates(
    datetime.strptime('2020-09-23', '%Y-%m-%d'),
    NUM_DAYS_TEST,
    NUM_DAYS_TRAIN,
)

# Time-decayed popular articles; handed to the cold-start predictor below.
# (Alternative that was tried: popular_predict.)
popular_articles = popular_time_decay_predict(
    trans, train_start_date, train_end_date, CUTOFF, PERSONAL_GROUPBY
)

# Cold-start predictions from gender/age groups.
# (Alternative that was tried: gender_personal_cold_start_predict.)
preds_personal_cold = gender_age_personal_cold_start_predict(
    trans, train_start_date, train_end_date,
    customers_data, articles_data, popular_articles, CUTOFF
)
logging.info('preds_personal_cold.shape = %s', preds_personal_cold.shape)

# Personal time-decayed predictions.
# (Alternatives that were tried: personal_popular_predict,
# personal_recent_predict, personal_trending_time_decay_predict.)
preds = personal_time_decay_predict(
    trans, train_start_date, train_end_date, CUTOFF, PERSONAL_GROUPBY
)
logging.info('preds.shape = %s', preds.shape)

# Frequent item pairs from the same training window.
freq_pairs = get_frequent_pairs(trans, train_start_date, train_end_date, CUTOFF)
logging.info('len(freq_pairs) = %s', len(freq_pairs))

# Extend every personal prediction list with frequent-pair items.
preds['pred'] = preds['pred'].apply(
    lambda items: concat_pred_freq_items(items, freq_pairs, CUTOFF)
)

2022-05-07 00:30:27,355 - INFO - preds_personal_cold.shape = (1371980, 2)
2022-05-07 00:30:37,017 - INFO - preds.shape = (250619, 2)
100%|█████████████████████████████| 1639181/1639181 [00:02<00:00, 759731.64it/s]
100%|██████████████████████████████████| 28341/28341 [00:00<00:00, 76344.31it/s]
2022-05-07 00:30:45,634 - INFO - len(freq_pairs) = 28341


CPU times: user 38.8 s, sys: 439 ms, total: 39.2 s
Wall time: 39.2 s


In [10]:
%%time
SUB_PATH = '../input_data/processed_data/sub_user_ind.feather'

# NOTE(review): pickle.load can execute arbitrary code from the file. Only
# load this mapping if it was produced by our own preprocessing.
with open('../input_data/processed_data/ind_user.pcl', 'rb') as f:
    ind_user = pickle.load(f)

sub = pd.read_feather(SUB_PATH)
sub = sub.merge(preds, how='left', on='customer_id')
# Customers without personal predictions get NaN from the left join;
# fillna('') followed by list() turns those into empty lists.
sub['prediction'] = sub['pred'].fillna('').apply(list)

# join common cold start
#sub['prediction'] = sub['prediction'].apply(lambda x: concat_items(x, popular_articles, CUTOFF))

# join personal cold start
sub = sub.merge(preds_personal_cold, how='left', on=['customer_id'])

# Read the two row values by column label. Positional access (x[0], x[1]) on a
# label-indexed row is deprecated in pandas 2.x.
sub['prediction'] = sub[['prediction', 'personal_cold_start']]\
                    .apply(lambda row: concat_items(row['prediction'],
                                                    row['personal_cold_start'],
                                                    CUTOFF),
                           axis=1)

# Translate the customer index back through the ind_user lookup.
sub['customer_id'] = sub['customer_id'].apply(lambda x: ind_user[x])
# NOTE(review): prefixing a single '0' assumes every article id has exactly
# nine digits as an integer - confirm against the articles file.
sub['prediction'] = sub['prediction'].apply(lambda x: ' '.join(['0' + str(pred) for pred in x]))

sub.head()

CPU times: user 14.8 s, sys: 220 ms, total: 15 s
Wall time: 15 s


Unnamed: 0,customer_id,pred,prediction,personal_cold_start
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[568601043, 858856005, 779781015, 762846031, 5...",0568601043 0858856005 0779781015 0762846031 05...,"[751471001, 751471043, 915529003, 928206001, 8..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,0909370001 0924243001 0863583001 0915529003 01...,"[909370001, 924243001, 863583001, 915529003, 1..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[794321007, 794321011, 805000001, 805000007, 7...",0794321007 0794321011 0805000001 0805000007 07...,"[918522001, 915526001, 751471001, 915529003, 9..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,0751471001 0924243001 0448509014 0918522001 09...,"[751471001, 924243001, 448509014, 918522001, 9..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,,0751471001 0751471043 0915529003 0928206001 08...,"[751471001, 751471043, 915529003, 928206001, 8..."


In [11]:
# Write only the customer id and the space-separated prediction string, without the index.
sub[['customer_id', 'prediction']].to_csv('../submissions/sub_40.csv', index=False)