# Use implicit knn model 

In [1]:
import gc
import logging
import pickle
from typing import Tuple, Dict, List, Any
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import implicit
from scipy.sparse.linalg import norm
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import memory_profiler
from tqdm import tqdm

from metrics import mapk, mapk_drop_empty_actual

# Show more rows/columns than the pandas defaults when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so prefer the new name when it is available and
# fall back to the original one on older installations.
_poster_style = 'seaborn-v0_8-poster'
if _poster_style not in plt.style.available:
    _poster_style = 'seaborn-poster'
plt.style.use(_poster_style)


# Root logger at DEBUG with timestamped messages; the helper modules used below log
# their progress through `logging`, which is what produces the cell outputs.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

%matplotlib inline
%load_ext memory_profiler

gc.enable()

2022-05-07 14:58:05,511 - DEBUG - Loaded backend module://matplotlib_inline.backend_inline version unknown.


In [2]:
from utils_validation import (
    concat_items,
    get_dates,
    get_sub_df,
    get_test_target,
    get_sub_df_personal_cold
)
from utils_frequent_pairs import (
    get_frequent_pairs,
    collect_freq_items,
    concat_pred_freq_items
)
from utils_heuristics import (
    popular_time_decay_predict,
    personal_time_decay_predict,
    gender_age_personal_cold_start_predict
)

from utils_models import (
    get_user_item_matrix,
    build_lookup_array
)

from utils_implicit_knn import (
    batch_array_sort,
    implicit_fit_predict
)

In [3]:
def get_test_target(data: pd.DataFrame, 
                    test_start_date_: datetime, 
                    test_end_date_: datetime) -> Dict[int, set]:
    """Collect the articles each customer bought inside the test window.

    NOTE: this definition replaces the `get_test_target` imported from
    `utils_validation` earlier in the notebook for every cell that runs after it.

    Args:
        data: transactions with at least the columns `t_dat`, `customer_id`
            and `article_id`.
        test_start_date_: first day of the test window (inclusive).
        test_end_date_: last day of the test window (inclusive).

    Returns:
        Mapping customer_id -> set of distinct article_id values bought in the
        window. Customers without a transaction in the window are absent; an
        empty window yields an empty dict.
    """
    in_window = (data['t_dat'] >= test_start_date_) & (data['t_dat'] <= test_end_date_)
    window = data[in_window]
    if window.empty:
        # Nothing to aggregate - return early instead of aggregating an empty groupby.
        return {}
    # One set of articles per customer; the Series index (customer_id) becomes the dict key.
    return window.groupby('customer_id')['article_id'].agg(set).to_dict()

## Validation

In [3]:
%%time
# Location of the pre-cleaned transaction log (feather format).
dir_path = '../input_data/processed_data/'
file_name = 'trans_clean_all.feather'

# Load the whole transaction table; the validation loop below reads from `trans`.
trans = pd.read_feather(dir_path + file_name)
# Preview the first rows as the cell output.
trans.head()

CPU times: user 571 ms, sys: 496 ms, total: 1.07 s
Wall time: 292 ms


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,cnt_articles
0,2018-09-20,2,541518023,304915,2,1
1,2018-09-20,2,663713001,508305,2,1
2,2018-09-20,7,505221001,203220,2,1
3,2018-09-20,7,505221004,152372,2,1
4,2018-09-20,7,685687001,169322,2,1


In [4]:
# Article identifier plus the coded article attributes that are kept.
article_cat_features = [
    'article_id', 'product_code', 'product_type_no',
    'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id',
    'department_no', 'index_group_no', 'section_no',
    'garment_group_no',
]

# Read the cleaned articles table and keep only the columns listed above
# (this drops the text features).
articles_data = pd.read_feather(
    '../input_data/processed_data/articles_clean.feather'
)[article_cat_features]
articles_data.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,1676,1,16,1002
1,108775044,108775,253,1010016,10,3,9,1676,1,16,1002
2,108775051,108775,253,1010017,11,1,9,1676,1,16,1002
3,110065001,110065,306,1010016,9,4,5,1339,1,61,1017
4,110065002,110065,306,1010016,10,3,9,1339,1,61,1017


In [5]:
customers_data = pd.read_feather('../input_data/processed_data/customers_clean2.feather')
#customers_data_feat = [
#    'customer_id',
#    'FN',
#    'Active',
#    'age',
#    'postal_code'
#]
#customers_data = customers_data[customers_data_feat]
def age_bin_util(x):
    """Return the age-bucket label for age `x`.

    Buckets: below 25 -> 18, [25, 35) -> 25, [35, 45) -> 35, [45, 55) -> 45,
    55 and above -> 55. A value that satisfies none of the comparisons
    (e.g. NaN) yields None.
    """
    # (exclusive upper bound, label), checked in ascending order.
    buckets = ((25, 18), (35, 25), (45, 35), (55, 45))
    for upper_bound, label in buckets:
        if x < upper_bound:
            return label
    if x >= 55:
        return 55
    return None

customers_data['age_bin'] = customers_data['age'].apply(lambda x: age_bin_util(x))
customers_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,gender_calc,age_bin
0,0,0,0,1,1,49,0,1,45
1,1,0,0,1,1,25,1,1,25
2,2,0,0,1,1,24,2,1,18
3,3,0,0,1,1,54,3,0,45
4,4,1,1,1,2,52,4,1,45


In [6]:
# Feather file passed to get_sub_df_personal_cold in the validation loop below.
SUB_PATH = '../input_data/processed_data/sub_user_ind.feather'
# Path of the cleaned articles table (not referenced by the cells visible in this notebook).
ARTICLE_DATA_PATH = '../input_data/processed_data/articles_clean.feather'
# Number of recommended articles per customer; also used as k in MAP@K.
CUTOFF = 12

In [7]:
%%time
%%memit
# Rolling-window validation: for each fold, build predictions from the train window,
# score them with MAP@K on the following test window, then shift the windows forward
# by NUM_DAYS_TEST days.
NUM_FOLDS = 4
NUM_DAYS_TRAIN = 30 # 734 days in trans
NUM_DAYS_TEST = 7

# First day of the first fold's test window.
TEST_START_DAY = '2020-08-26'
# 2019 year
#TEST_START_DAY = '2019-09-02'

# Forwarded as the last argument of popular_time_decay_predict.
PERSONAL_GROUPBY = 'price_weight'

test_start_date = datetime.strptime(TEST_START_DAY, '%Y-%m-%d')
# get init dates
# get_dates returns (test_start, test_end, train_end, train_start); in the logged run
# the train window ends the day before the test window starts.
test_start_date, test_end_date, \
    train_end_date, train_start_date = get_dates(
    test_start_date,
    NUM_DAYS_TEST,
    NUM_DAYS_TRAIN
)
# Per-fold metric values, averaged after the loop.
mapk_dea_arr = []

# implicit params
# All of these are passed through to implicit_fit_predict (utils_implicit_knn).
model_type = 'tfidf'
model_params = {'K': 1000, 'num_threads': 8}
SIMILARITY_TYPE = 'u2u'
USER_ITEM_VALUES = 'price_weight'
# Presumably minimum-activity thresholds for filtering users/items (the run log shows
# len(users_to_drop) = len(items_to_drop) = 0 with both set to 0) - TODO confirm in
# utils_implicit_knn. Presumably abbreviated as UCUI / ICUU in the notes below.
USER_CNT_UNQ_ITEMS = 0
ITEM_CNT_UNQ_USERS = 0


for fold_num in range(NUM_FOLDS):
    logging.info('FOLD %s STARTED', fold_num + 1)
    logging.info('test_start_date = %s', test_start_date.date())
    logging.info('test_end_date = %s', test_end_date.date())
    logging.info('train_start_date = %s', train_start_date.date())
    logging.info('train_end_date = %s', train_end_date.date())    
    
    # get target dict
    target = get_test_target(trans, test_start_date, test_end_date)
    logging.info('len(target) = %s', len(target))
        
    # get popular predictions - time weighted items
    popular_articles = popular_time_decay_predict(trans, 
                                                  train_start_date, 
                                                  train_end_date, 
                                                  CUTOFF,
                                                  PERSONAL_GROUPBY)
    logging.info('len(popular_articles) = %s', len(popular_articles))

    # get personal time weighted items
    #preds = personal_time_decay_predict(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('preds.shape = %s', preds.shape)  

    # get implicit nn predict
    # The logged shape is (n_users, 2), i.e. one row per user seen in the train window.
    preds = implicit_fit_predict(
        trans,
        train_start_date, 
        train_end_date,
        USER_ITEM_VALUES,
        model_type,
        model_params,
        SIMILARITY_TYPE,
        CUTOFF,
        USER_CNT_UNQ_ITEMS,
        ITEM_CNT_UNQ_USERS)
    logging.info('preds.shape = %s', preds.shape) 
    
    # get frequent pairs of items
    #freq_pairs = get_frequent_pairs(trans, train_start_date, train_end_date, CUTOFF)
    #logging.info('len(freq_pairs) = %s', len(freq_pairs))
    
    # add frequent pairs to personal items
    #preds['pred'] = preds['pred'].apply(lambda x: concat_pred_freq_items(x, freq_pairs, CUTOFF)) 
    
    # Cold-start predictions built from customer and article attributes plus the
    # popular list; used by get_sub_df_personal_cold together with `preds`.
    preds_personal_cold = gender_age_personal_cold_start_predict(
        trans,
        train_start_date, 
        train_end_date,
        customers_data,
        articles_data,
        popular_articles,
        CUTOFF
    )
    
    
    #sub_df = get_sub_df(SUB_PATH, target, preds, popular_articles, CUTOFF)
    # Frame with (at least) 'target' and 'pred' columns, which are scored below.
    sub_df = get_sub_df_personal_cold(SUB_PATH, target, preds, preds_personal_cold, CUTOFF)
    logging.info('sub_df.shape = %s', sub_df.shape)
    
    # MAP@K variant from the local `metrics` module; judging by its name it skips
    # customers with an empty target - TODO confirm in metrics.py.
    metric2 = mapk_drop_empty_actual(
                     sub_df['target'].tolist(), 
                     sub_df['pred'].tolist(),
                     k=CUTOFF)
    #logging.info('mapk = %s', metric1)
    logging.info('mapk_drop_empty_actual = %s',metric2)
    #mapk_arr.append(metric1)
    mapk_dea_arr.append(metric2)
    
    # get dates of next fold
    # Shift the test window forward by one test period and recompute all four dates.
    test_start_date = test_start_date + timedelta(days=NUM_DAYS_TEST)
    test_start_date, test_end_date, \
        train_end_date, train_start_date = get_dates(
        test_start_date,
        NUM_DAYS_TEST,
        NUM_DAYS_TRAIN
    )
    logging.info('FOLD %s DONE', fold_num + 1)
    
#logging.info('MAP@K AVG = %s', np.mean(mapk_arr))
# Unweighted mean of the per-fold scores.
logging.info('MAP@K drop empty actuals AVG = %s', np.mean(mapk_dea_arr))

2022-05-07 14:42:44,540 - INFO - FOLD 1 STARTED
2022-05-07 14:42:44,541 - INFO - test_start_date = 2020-08-26
2022-05-07 14:42:44,542 - INFO - test_end_date = 2020-09-01
2022-05-07 14:42:44,542 - INFO - train_start_date = 2020-07-26
2022-05-07 14:42:44,543 - INFO - train_end_date = 2020-08-25
2022-05-07 14:42:45,829 - INFO - len(target) = 80253
2022-05-07 14:42:52,776 - INFO - len(popular_articles) = 12
2022-05-07 14:42:59,527 - INFO - Trans data collected
2022-05-07 14:42:59,814 - INFO - user_item_log.shape = (1109446, 3)
2022-05-07 14:42:59,838 - INFO - len(items_to_drop) = 0
2022-05-07 14:42:59,864 - INFO - len(users_to_drop) = 0
2022-05-07 14:42:59,882 - INFO - user_item_log_filtered.shape = (1109446, 3)
2022-05-07 14:43:00,616 - INFO - len(user_item_log_filtered) = 1109446
2022-05-07 14:43:01,430 - INFO - len(user_index) = 252749
2022-05-07 14:43:01,430 - INFO - len(item_index) = 30213
2022-05-07 14:43:01,431 - INFO - len(index_item) = 30213
2022-05-07 14:43:01,431 - INFO - len(in

  0%|          | 0/252749 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 26/26 [02:02<00:00,  4.71s/it]
2022-05-07 14:45:52,188 - INFO - preds.shape = (252749, 2)
2022-05-07 14:46:18,186 - INFO - sub_df.shape = (1371980, 4)
2022-05-07 14:46:18,366 - INFO - mapk_drop_empty_actual = 0.02318119401720179
2022-05-07 14:46:18,366 - INFO - FOLD 1 DONE
2022-05-07 14:46:18,367 - INFO - FOLD 2 STARTED
2022-05-07 14:46:18,367 - INFO - test_start_date = 2020-09-02
2022-05-07 14:46:18,367 - INFO - test_end_date = 2020-09-08
2022-05-07 14:46:18,368 - INFO - train_start_date = 2020-08-02
2022-05-07 14:46:18,368 - INFO - train_end_date = 2020-09-01
2022-05-07 14:46:19,719 - INFO - len(target) = 75822
2022-05-07 14:46:26,558 - INFO - len(popular_articles) = 12
2022-05-07 14:46:33,368 - INFO - Trans data collected
2022-05-07 14:46:33,718 - INFO - user_item_log.shape = (1092867, 3)
2022-05-07 14:46:33,753 - INFO - len(items_to_drop) = 0
2022-05-07 14:46:33,782 - INFO - len(users_to_drop) = 0
2022-05-07 14:46:33,804 - INFO - us

  0%|          | 0/254366 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 26/26 [01:57<00:00,  4.50s/it]
2022-05-07 14:49:30,203 - INFO - preds.shape = (254366, 2)
2022-05-07 14:49:52,913 - INFO - sub_df.shape = (1371980, 4)
2022-05-07 14:49:53,058 - INFO - mapk_drop_empty_actual = 0.027116809209861457
2022-05-07 14:49:53,059 - INFO - FOLD 2 DONE
2022-05-07 14:49:53,059 - INFO - FOLD 3 STARTED
2022-05-07 14:49:53,059 - INFO - test_start_date = 2020-09-09
2022-05-07 14:49:53,059 - INFO - test_end_date = 2020-09-15
2022-05-07 14:49:53,060 - INFO - train_start_date = 2020-08-09
2022-05-07 14:49:53,060 - INFO - train_end_date = 2020-09-08
2022-05-07 14:49:54,058 - INFO - len(target) = 72019
2022-05-07 14:49:59,451 - INFO - len(popular_articles) = 12
2022-05-07 14:50:05,206 - INFO - Trans data collected
2022-05-07 14:50:05,456 - INFO - user_item_log.shape = (1042056, 3)
2022-05-07 14:50:05,476 - INFO - len(items_to_drop) = 0
2022-05-07 14:50:05,500 - INFO - len(users_to_drop) = 0
2022-05-07 14:50:05,516 - INFO - u

  0%|          | 0/250098 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 26/26 [01:52<00:00,  4.34s/it]
2022-05-07 14:52:43,457 - INFO - preds.shape = (250098, 2)
2022-05-07 14:53:06,500 - INFO - sub_df.shape = (1371980, 4)
2022-05-07 14:53:06,646 - INFO - mapk_drop_empty_actual = 0.028294468695102094
2022-05-07 14:53:06,646 - INFO - FOLD 3 DONE
2022-05-07 14:53:06,647 - INFO - FOLD 4 STARTED
2022-05-07 14:53:06,647 - INFO - test_start_date = 2020-09-16
2022-05-07 14:53:06,647 - INFO - test_end_date = 2020-09-22
2022-05-07 14:53:06,648 - INFO - train_start_date = 2020-08-16
2022-05-07 14:53:06,648 - INFO - train_end_date = 2020-09-15
2022-05-07 14:53:07,627 - INFO - len(target) = 68984
2022-05-07 14:53:13,219 - INFO - len(popular_articles) = 12
2022-05-07 14:53:18,862 - INFO - Trans data collected
2022-05-07 14:53:19,108 - INFO - user_item_log.shape = (1018346, 3)
2022-05-07 14:53:19,127 - INFO - len(items_to_drop) = 0
2022-05-07 14:53:19,150 - INFO - len(users_to_drop) = 0
2022-05-07 14:53:19,166 - INFO - u

  0%|          | 0/249511 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 25/25 [01:52<00:00,  4.50s/it]
2022-05-07 14:55:58,325 - INFO - preds.shape = (249511, 2)
2022-05-07 14:56:21,007 - INFO - sub_df.shape = (1371980, 4)
2022-05-07 14:56:21,151 - INFO - mapk_drop_empty_actual = 0.0299640551657206
2022-05-07 14:56:21,152 - INFO - FOLD 4 DONE
2022-05-07 14:56:21,152 - INFO - MAP@K drop empty actuals AVG = 0.027139131771971484


peak memory: 14424.75 MiB, increment: 13544.13 MiB
CPU times: user 13min 57s, sys: 1min 27s, total: 15min 24s
Wall time: 13min 36s


**implicit nn, i2i tfidf**
- NUM_DAYS_TRAIN=30, tfidf, K=30, item_weight, pop price_weight MAP@K = 0.02403
- NUM_DAYS_TRAIN=30, tfidf, K=30, item_weight, pop price_weight + personal cold gender_age MAP@K = 0.02557
- NUM_DAYS_TRAIN=30, tfidf, K=30, price_weight, pop price_weight + personal cold gender_age MAP@K = 0.02632

**implicit nn 2020, u2u, after target fix**

- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02543
- NUM_DAYS_TRAIN=60, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02582
- NUM_DAYS_TRAIN=30, tfidf, K=1000, price_weight, pop price_weight + personal cold gender_age MAP@K = 0.02713

**implicit nn 2020, u2u**
- NUM_DAYS_TRAIN=30, tfidf, K= 5, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02070
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02117
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02156
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=1, ICUU=5 MAP@K DEA AVG = 0.01948
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=2, ICUU=5 MAP@K DEA AVG = 0.01692
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02167
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=0, ICUU=1 MAP@K DEA AVG = 0.02166
- NUM_DAYS_TRAIN=30, tfidf, K=50, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02212
- NUM_DAYS_TRAIN=30, tfidf, K=100, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02290


- NUM_DAYS_TRAIN=30, tfidf, K=500, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02487
- NUM_DAYS_TRAIN=30, tfidf, K=800, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02505
- NUM_DAYS_TRAIN=30, tfidf, K=900, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02507
- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02508
- NUM_DAYS_TRAIN=30, tfidf, K=1050, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02507
- NUM_DAYS_TRAIN=30, tfidf, K=1125, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02505
- NUM_DAYS_TRAIN=30, tfidf, K=1250, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02504
- NUM_DAYS_TRAIN=30, tfidf, K=1500, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02503
- NUM_DAYS_TRAIN=30, tfidf, K=2000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02494


- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=1 MAP@K DEA AVG = 0.02507
- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=3 MAP@K DEA AVG = 0.02502
- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=7 MAP@K DEA AVG = 0.02492
- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=1, ICUU=0 MAP@K DEA AVG = 0.02197
- NUM_DAYS_TRAIN=40, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02522
- NUM_DAYS_TRAIN=50, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02536
- NUM_DAYS_TRAIN=60, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02545
- NUM_DAYS_TRAIN=70, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02540
- NUM_DAYS_TRAIN=90, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02540


- NUM_DAYS_TRAIN=30, cos, K=100, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02288


**implicit nn 2019, u2u**

- NUM_DAYS_TRAIN=30, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02301
- NUM_DAYS_TRAIN=60, tfidf, K=1000, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02317

**implicit nn 2020**
- NUM_DAYS_TRAIN=60, cos, K=5,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01725
- NUM_DAYS_TRAIN=30, cos, K=5,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01875
- NUM_DAYS_TRAIN=14, cos, K=5,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01889


- NUM_DAYS_TRAIN=30, cos, K=2,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01852
- NUM_DAYS_TRAIN=30, cos, K=4,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01876
- NUM_DAYS_TRAIN=30, cos, K=7,  cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01873
- NUM_DAYS_TRAIN=30, cos, K=12, cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01872
- NUM_DAYS_TRAIN=30, cos, K=25, cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01867
- NUM_DAYS_TRAIN=30, cos, K=50, cnt_articles, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.01859


- NUM_DAYS_TRAIN=30, cos, K=2,    item_weight, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.02017
- NUM_DAYS_TRAIN=30, cos, K=4,    item_weight, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.02013
- NUM_DAYS_TRAIN=30, cos, K=5,    item_weight, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.02012
- NUM_DAYS_TRAIN=30, cos, K=7,    item_weight, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.02009
- NUM_DAYS_TRAIN=30, cos, K=12,   item_weight, UCUI=1, ICUU=1 MAP@K DEA AVG = 0.02004


- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02266
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=1, ICUU=0 MAP@K DEA AVG = 0.02006
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=0, ICUU=1 MAP@K DEA AVG = 0.02274
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=0, ICUU=2 MAP@K DEA AVG = 0.02276
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02277
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=0, ICUU=10 MAP@K DEA AVG = 0.02268
- NUM_DAYS_TRAIN=30, cos, K=4, item_weight, UCUI=2, ICUU=5 MAP@K DEA AVG = 0.01729
- NUM_DAYS_TRAIN=30, cos, K=7, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02276
- NUM_DAYS_TRAIN=30, cos, K=5, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02278
- NUM_DAYS_TRAIN=30, cos, K=12, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02270


- NUM_DAYS_TRAIN=30, tfidf, K=5, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02390
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02394
- NUM_DAYS_TRAIN=30, tfidf, K=15, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02388
- NUM_DAYS_TRAIN=30, tfidf, K=20, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02389
- NUM_DAYS_TRAIN=30, tfidf, K=500, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02345


- NUM_DAYS_TRAIN=21, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02374
- NUM_DAYS_TRAIN=45, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02394
- NUM_DAYS_TRAIN=60, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02393
- NUM_DAYS_TRAIN=60, tfidf, K=20, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02391
- NUM_DAYS_TRAIN=120, tfidf, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02364


- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=0 MAP@K DEA AVG = 0.02359
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=1, ICUU=0 MAP@K DEA AVG = 0.02078
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=1 MAP@K DEA AVG = 0.02378
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=3 MAP@K DEA AVG = 0.02387
- NUM_DAYS_TRAIN=30, tfidf, K=10, item_weight, UCUI=0, ICUU=10 MAP@K DEA AVG = 0.02388


- NUM_DAYS_TRAIN=30, bm25, K=2, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02204
- NUM_DAYS_TRAIN=30, bm25, K=5, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02210
- NUM_DAYS_TRAIN=30, bm25, K=10, item_weight, UCUI=0, ICUU=5 MAP@K DEA AVG = 0.02209
- NUM_DAYS_TRAIN=30, bm25, K=5, item_weight, UCUI=0, ICUU=10 MAP@K DEA AVG = 0.02237

## Make submission for public test

In [3]:
%%time
# Location of the pre-cleaned transaction log (feather format).
dir_path = '../input_data/processed_data/'
file_name = 'trans_clean_all.feather'

# Load the whole transaction table; the submission cells below read from `trans`.
trans = pd.read_feather(dir_path + file_name)
# Preview the first rows as the cell output.
trans.head()

CPU times: user 551 ms, sys: 523 ms, total: 1.07 s
Wall time: 295 ms


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,cnt_articles
0,2018-09-20,2,541518023,304915,2,1
1,2018-09-20,2,663713001,508305,2,1
2,2018-09-20,7,505221001,203220,2,1
3,2018-09-20,7,505221004,152372,2,1
4,2018-09-20,7,685687001,169322,2,1


In [4]:
# Article identifier plus the coded article attributes that are kept.
article_cat_features = [
    'article_id', 'product_code', 'product_type_no',
    'graphical_appearance_no', 'colour_group_code',
    'perceived_colour_value_id', 'perceived_colour_master_id',
    'department_no', 'index_group_no', 'section_no',
    'garment_group_no',
]

# Read the cleaned articles table and keep only the columns listed above
# (this drops the text features).
articles_data = pd.read_feather(
    '../input_data/processed_data/articles_clean.feather'
)[article_cat_features]
articles_data.head()

Unnamed: 0,article_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_group_no,section_no,garment_group_no
0,108775015,108775,253,1010016,9,4,5,1676,1,16,1002
1,108775044,108775,253,1010016,10,3,9,1676,1,16,1002
2,108775051,108775,253,1010017,11,1,9,1676,1,16,1002
3,110065001,110065,306,1010016,9,4,5,1339,1,61,1017
4,110065002,110065,306,1010016,10,3,9,1339,1,61,1017


In [5]:
customers_data = pd.read_feather('../input_data/processed_data/customers_clean2.feather')
#customers_data_feat = [
#    'customer_id',
#    'FN',
#    'Active',
#    'age',
#    'postal_code'
#]
#customers_data = customers_data[customers_data_feat]
def age_bin_util(x):
    """Return the age-bucket label for age `x`.

    Buckets: below 25 -> 18, [25, 35) -> 25, [35, 45) -> 35, [45, 55) -> 45,
    55 and above -> 55. A value that satisfies none of the comparisons
    (e.g. NaN) yields None.
    """
    # (exclusive upper bound, label), checked in ascending order.
    buckets = ((25, 18), (35, 25), (45, 35), (55, 45))
    for upper_bound, label in buckets:
        if x < upper_bound:
            return label
    if x >= 55:
        return 55
    return None

customers_data['age_bin'] = customers_data['age'].apply(lambda x: age_bin_util(x))
customers_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,gender_calc,age_bin
0,0,0,0,1,1,49,0,1,45
1,1,0,0,1,1,25,1,1,25
2,2,0,0,1,1,24,2,1,18
3,3,0,0,1,1,54,3,0,45
4,4,1,1,1,2,52,4,1,45


In [6]:
%%time
# Build the predictions for the public-test submission with the same configuration
# as the validation run above, but with the windows derived from 2020-09-23.
NUM_DAYS_TRAIN = 30
NUM_DAYS_TEST = 7
# Number of recommended articles per customer.
CUTOFF = 12
# Forwarded as the last argument of popular_time_decay_predict.
PERSONAL_GROUPBY = 'price_weight'

# 2020-09-23 is the day after the last validation fold's test_end_date in the log above.
test_start_date = datetime.strptime('2020-09-23', '%Y-%m-%d')
# get_dates returns (test_start, test_end, train_end, train_start).
test_start_date, test_end_date, \
    train_end_date, train_start_date = get_dates(
    test_start_date,
    NUM_DAYS_TEST,
    NUM_DAYS_TRAIN
)

# implicit params
# All of these are passed through to implicit_fit_predict (utils_implicit_knn).
model_type = 'tfidf'
model_params = {'K': 1000, 'num_threads': 8}
SIMILARITY_TYPE = 'u2u'
USER_ITEM_VALUES = 'price_weight'
# Presumably minimum-activity thresholds for filtering users/items - TODO confirm
# in utils_implicit_knn.
USER_CNT_UNQ_ITEMS = 0
ITEM_CNT_UNQ_USERS = 0

#popular_articles = popular_predict(trans, train_start_date, train_end_date, CUTOFF)
# Time-decayed popular articles; fed into the cold-start predictor below.
popular_articles = popular_time_decay_predict(trans, 
                                              train_start_date, 
                                              train_end_date, 
                                              CUTOFF,
                                              PERSONAL_GROUPBY)
logging.info('len(popular_articles) = %s', len(popular_articles))

# Cold-start predictions; the logged shape (1371980, 2) has the same row count as the
# submission frames logged during validation.
preds_personal_cold = gender_age_personal_cold_start_predict(
    trans, 
    train_start_date, 
    train_end_date,
    customers_data,
    articles_data,
    popular_articles,
    CUTOFF
)
logging.info('preds_personal_cold.shape = %s', preds_personal_cold.shape)

# get implicit nn predict
# The logged shape is (n_users, 2), i.e. one row per user seen in the train window.
preds = implicit_fit_predict(
    trans,
    train_start_date, 
    train_end_date,
    USER_ITEM_VALUES,
    model_type,
    model_params,
    SIMILARITY_TYPE,
    CUTOFF,
    USER_CNT_UNQ_ITEMS,
    ITEM_CNT_UNQ_USERS)
logging.info('preds.shape = %s', preds.shape)

# get frequent pairs of items
#freq_pairs = get_frequent_pairs(trans, train_start_date, train_end_date, CUTOFF)
#logging.info('len(freq_pairs) = %s', len(freq_pairs))

# add frequent pairs to personal items
#preds['pred'] = preds['pred'].apply(lambda x: concat_pred_freq_items(x, freq_pairs, CUTOFF))

2022-05-07 14:58:25,123 - INFO - len(popular_articles) = 12
2022-05-07 14:58:38,247 - INFO - preds_personal_cold.shape = (1371980, 2)
2022-05-07 14:58:44,652 - INFO - Trans data collected
2022-05-07 14:58:44,901 - INFO - user_item_log.shape = (1020512, 3)
2022-05-07 14:58:44,920 - INFO - len(items_to_drop) = 0
2022-05-07 14:58:44,943 - INFO - len(users_to_drop) = 0
2022-05-07 14:58:44,959 - INFO - user_item_log_filtered.shape = (1020512, 3)
2022-05-07 14:58:45,647 - INFO - len(user_item_log_filtered) = 1020512
2022-05-07 14:58:46,384 - INFO - len(user_index) = 250619
2022-05-07 14:58:46,384 - INFO - len(item_index) = 29237
2022-05-07 14:58:46,384 - INFO - len(index_item) = 29237
2022-05-07 14:58:46,385 - INFO - len(index_user) = 250619
2022-05-07 14:58:46,385 - INFO - user_items.shape = (250619, 29237)


  0%|          | 0/250619 [00:00<?, ?it/s]

100%|███████████████████████████████████████████| 26/26 [01:52<00:00,  4.34s/it]
2022-05-07 15:01:31,726 - INFO - preds.shape = (250619, 2)


CPU times: user 3min 27s, sys: 22 s, total: 3min 49s
Wall time: 3min 12s


In [7]:
# Feather file with the submission template that the next cell reads and fills in.
SUB_PATH = '../input_data/processed_data/sub_user_ind.feather'
# Path of the cleaned articles table (not referenced by the cells visible in this notebook).
ARTICLE_DATA_PATH = '../input_data/processed_data/articles_clean.feather'
# Number of recommended articles per customer.
CUTOFF = 12

In [8]:
%%time
# Lookup from the integer customer index used throughout the notebook back to the
# original customer id string.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file - only
# load artefacts produced by this project's own preprocessing.
with open('../input_data/processed_data/ind_user.pcl', 'rb') as f:
    ind_user = pickle.load(f)

sub = pd.read_feather(SUB_PATH)
sub = sub.merge(preds, how='left', on='customer_id')
# Customers without a kNN prediction get NaN from the left join; fillna('') followed
# by list() turns those into empty lists while leaving real predictions unchanged.
sub['prediction'] = sub['pred'].fillna('').apply(list)
# join common cold start
#sub['prediction'] = sub['prediction'].apply(lambda x: concat_items(x, popular_articles, CUTOFF))

# join personal cold start
sub = sub.merge(preds_personal_cold, how='left', on=['customer_id'])

# Combine the kNN predictions with the cold-start list via concat_items and CUTOFF.
# Columns are addressed by label: integer keys on a label-indexed row Series are
# deprecated positional access in recent pandas.
sub['prediction'] = sub[['prediction', 'personal_cold_start']]\
                    .apply(lambda row: concat_items(row['prediction'],
                                                    row['personal_cold_start'],
                                                    CUTOFF), axis=1)

sub['customer_id'] = sub['customer_id'].apply(lambda x: ind_user[x])
# Article ids are written as 10-character zero-padded strings; zfill gives the same
# result as prefixing '0' for 9-digit ids and stays correct for any other length.
sub['prediction'] = sub['prediction'].apply(
    lambda x: ' '.join(str(int(pred)).zfill(10) for pred in x)
)

sub.head()

CPU times: user 13.1 s, sys: 287 ms, total: 13.3 s
Wall time: 13.3 s


Unnamed: 0,customer_id,pred,prediction,personal_cold_start
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[568601043.0, 779781015.0, 858856005.0, 851400...",0568601043 0779781015 0858856005 0851400020 05...,"[751471001, 751471043, 915529003, 928206001, 8..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,0909370001 0924243001 0863583001 0915529003 01...,"[909370001, 924243001, 863583001, 915529003, 1..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[794321007.0, 805000001.0, 794321011.0, 765743...",0794321007 0805000001 0794321011 0765743007 08...,"[918522001, 915526001, 751471001, 915529003, 9..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,0924243001 0918522001 0714790020 0918292001 09...,"[924243001, 918522001, 714790020, 918292001, 9..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,,0751471001 0751471043 0915529003 0928206001 08...,"[751471001, 751471043, 915529003, 928206001, 8..."


In [9]:
# Write only the two submission columns to CSV, without the DataFrame index.
sub[['customer_id', 'prediction']].to_csv('../submissions/sub_43.csv', index=False)