In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# sys.path.append("/content/drive/MyDrive/HM/") # path to the `src`` folder
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Register tqdm's `progress_apply` on pandas objects; later cells rely on it.
tqdm.pandas()

In [3]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

In [4]:
# Registers tqdm's `progress_apply` on pandas objects. The setup cell above already
# made this call, so the repeat is harmless.
tqdm.pandas()

In [5]:
# Relative root folders. `data_dir` is the base of every read and write below;
# `model_dir` is not used by the cells visible in this notebook.
data_dir = Path("data")
model_dir = Path("models")

Prepare data: encoding ids and preprocessing

In [6]:
# Install kagglehub into the kernel's environment. No version is pinned, so the
# installed release may differ between runs.
%pip install kagglehub

Note: you may need to restart the kernel to use updated packages.


In [7]:
import kagglehub

# Download the latest version of the dataset; `path` is the local directory
# kagglehub reports for it (printed below).
path = kagglehub.dataset_download("thangndk67/h-and-m-csv-dataset")

print("Path to dataset files:", path)

Path to dataset files: /home/codespace/.cache/kagglehub/datasets/thangndk67/h-and-m-csv-dataset/versions/3


In [8]:
# Project data helper rooted at `data_dir`; used below to load the encoded dataset.
dh = DataHelper(data_dir)

In [11]:
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

In [9]:
# Load the dataset stored under the name "encoded_full" from the kagglehub download directory.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# `<path>/user.pqt`, so the download directory does not appear to have the layout
# `load_data` expects — confirm the correct `load_path` before relying on `data`.
data = dh.load_data(name="encoded_full", load_path=path)

FileNotFoundError: [Errno 2] No such file or directory: '/home/codespace/.cache/kagglehub/datasets/thangndk67/h-and-m-csv-dataset/versions/3/user.pqt'

### Blend

In [None]:
# Validation predictions for the "large" candidate set: two LightGBM models and the NN.
pred1_lgb_rank = pd.read_parquet(data_dir / "processed" / "large_rank_valid.pqt")
pred1_lgb_binary = pd.read_parquet(data_dir / "processed" / "large_binary_valid.pqt")
# The NN file calls its item column `article_id`; rename it to `prediction` so it
# can be joined with the LightGBM frames on (`customer_id`, `prediction`).
pred1_nn = (
    pd.read_parquet(data_dir / "external" / "large_nn_valid.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Validation predictions for the "small" candidate set: two LightGBM models and the NN.
pred2_lgb_rank = pd.read_parquet(data_dir / "processed" / "small_rank_valid.pqt")
pred2_lgb_binary = pd.read_parquet(data_dir / "processed" / "small_binary_valid.pqt")
# The NN file calls its item column `article_id`; rename it to `prediction` so it
# can be joined with the LightGBM frames on (`customer_id`, `prediction`).
pred2_nn = (
    pd.read_parquet(data_dir / "external" / "small_nn_valid.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Add the NN score to the binary LightGBM score of each (customer, candidate) pair.
# After the merge, `prob_x` is the LightGBM score and `prob_y` the NN score.
# The merge is a left join, so a pair the NN did not score gets a NaN `prob_y`.
# Count that as "no NN contribution" (0); otherwise the sum is NaN and the later
# `sort_values` on `prob` pushes the candidate to the very end of its list,
# whatever its LightGBM score was.
pred1_lgb_binary = pred1_lgb_binary.merge(pred1_nn, on=['customer_id','prediction'], how='left')
pred1_lgb_binary['prob'] = pred1_lgb_binary['prob_x'] + pred1_lgb_binary['prob_y'].fillna(0)

pred2_lgb_binary = pred2_lgb_binary.merge(pred2_nn, on=['customer_id','prediction'], how='left')
pred2_lgb_binary['prob'] = pred2_lgb_binary['prob_x'] + pred2_lgb_binary['prob_y'].fillna(0)

In [None]:
# Order every candidate table by descending score (with a fresh 0..n-1 index) so
# the per-customer lists built in the next step come out best-first.
pred1_lgb_rank = (
    pred1_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
)
pred1_lgb_binary = (
    pred1_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
)
pred2_lgb_rank = (
    pred2_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
)
pred2_lgb_binary = (
    pred2_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Collapse each table to one row per customer whose `prediction` cell is the list
# of that customer's candidates, in the row order of the table.
pred1_lgb_rank = (
    pred1_lgb_rank
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred1_lgb_binary = (
    pred1_lgb_binary
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred2_lgb_rank = (
    pred2_lgb_rank
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred2_lgb_binary = (
    pred2_lgb_binary
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)

In [None]:
# Give each list column a model-specific name so the four tables can be joined
# side by side on `customer_id`.
pred1_lgb_rank = pred1_lgb_rank.rename(columns={'prediction': 'large_rank'})
pred1_lgb_binary = pred1_lgb_binary.rename(columns={'prediction': 'large_binary'})
pred2_lgb_rank = pred2_lgb_rank.rename(columns={'prediction': 'small_rank'})
pred2_lgb_binary = pred2_lgb_binary.rename(columns={'prediction': 'small_binary'})

In [None]:
# Join the four list tables on customer. These are inner joins (the default), so
# only customers present in all four tables remain.
pred = pred1_lgb_rank.merge(pred1_lgb_binary, on=['customer_id'])
pred = pred.merge(pred2_lgb_rank, on=['customer_id'])
pred = pred.merge(pred2_lgb_binary, on=['customer_id'])

In [None]:
def cust_blend(dt, W = [1,1,1,1]):
    """Blend one customer's four candidate lists into a single top-12 list.

    Every item earns ``W[m] / rank`` from each model ``m`` that lists it, where
    ``rank`` is the item's 1-based position in that model's list. Items are then
    ordered by their summed score, highest first.

    Args:
        dt: Mapping-like row with the keys ``large_rank``, ``large_binary``,
            ``small_rank`` and ``small_binary``; each value is an ordered
            iterable of items.
        W: One weight per model, in the key order given above.

    Returns:
        A list of at most 12 items, best first. Items with equal scores keep the
        order in which they were first encountered.
    """
    model_keys = ('large_rank', 'large_binary', 'small_rank', 'small_binary')
    ranked_lists = [dt[key] for key in model_keys]

    # Accumulate the rank-discounted, model-weighted score of every item.
    scores = {}
    for model_idx, candidates in enumerate(ranked_lists):
        for rank, item in enumerate(candidates, start=1):
            scores[item] = scores.get(item, 0) + W[model_idx] / rank

    # Highest score first; the sort is stable, so ties stay in first-seen order.
    ordered = sorted(scores, key=lambda item: -scores[item])
    return ordered[:12]

In [None]:
# Blend the four lists of every customer row by row. Weights follow the order used
# inside `cust_blend` (large_rank, large_binary, small_rank, small_binary), so the
# two binary lists count 1.3x and the two rank lists 1.0x.
pred['prediction'] = pred.progress_apply(cust_blend, W = [1.0, 1.3, 1.0, 1.3], axis=1)

100%|██████████| 68984/68984 [00:07<00:00, 9609.50it/s]


In [None]:
# Attach the blended predictions to the week-1 labels. The left join keeps every
# labelled customer, including any that have no row in `pred`.
label = (
    pd.read_parquet(data_dir / "processed" / "pivot" / "week1_label.pqt")
    .merge(pred, on="customer_id", how="left")
)

In [None]:
# Score the blended lists against the labels with the project's `map_at_k` helper
# (presumably mean average precision at k=12 — confirm in `src.data.metrics`).
map_at_k(label["article_id"], label["prediction"], k=12)

# Score recorded from an earlier run: 0.032249032703345615

0.033772875345730695

### Test

In [None]:
# Test-time predictions for the "large" candidate set: two LightGBM models and the NN.
pred1_lgb_rank = pd.read_parquet(data_dir / "processed" / "large_rank_test.pqt")
pred1_lgb_binary = pd.read_parquet(data_dir / "processed" / "large_binary_test.pqt")
# The NN file calls its item column `article_id`; rename it to `prediction` to
# match the LightGBM frames.
pred1_nn = (
    pd.read_parquet(data_dir / "processed" / "large_nn_test.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Sort each table by descending score, then keep the first row of every
# (customer, candidate) pair, i.e. the highest-scoring row for that pair.
pred1_lgb_rank = (
    pred1_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred1_lgb_binary = (
    pred1_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred1_nn = (
    pred1_nn
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

In [None]:
# Sort both tables by the same key and renumber them, so that row i of one table
# is meant to describe the same (customer, candidate) pair as row i of the other.
pred1_lgb_binary = pred1_lgb_binary.sort_values(by=['customer_id','prediction']).reset_index(drop=True)
pred1_nn = pred1_nn.sort_values(by=['customer_id','prediction']).reset_index(drop=True)

# The sum below is purely positional. If the two tables do not contain exactly the
# same pairs, scores would be added to the wrong candidates (or turn into NaN)
# without any error, so verify the keys column by column and fail loudly otherwise.
for key_col in ('customer_id', 'prediction'):
    if not np.array_equal(pred1_lgb_binary[key_col].to_numpy(), pred1_nn[key_col].to_numpy()):
        raise ValueError(
            f"pred1_lgb_binary and pred1_nn differ in '{key_col}'; "
            "their scores cannot be added row by row"
        )

# * not using merge here to avoid memory error
pred1_lgb_binary['prob2'] = pred1_lgb_binary['prob'] + pred1_nn['prob']
# Re-order by the blended score so the per-customer lists come out best-first.
pred1_lgb_binary = pred1_lgb_binary.sort_values(by='prob2', ascending=False).reset_index(drop=True)

In [None]:
# One row per customer, with `prediction` holding that customer's candidates as a
# list in the current row order; a tqdm progress bar is shown while grouping.
pred1_lgb_rank = (
    pred1_lgb_rank
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)
pred1_lgb_binary = (
    pred1_lgb_binary
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)

100%|██████████| 1303117/1303117 [01:02<00:00, 20846.60it/s]
100%|██████████| 1303117/1303117 [01:09<00:00, 18805.83it/s]


In [None]:
# Save the per-customer candidate lists of the "large" pipeline as parquet files.
# NOTE(review): these are written to `data_dir` itself under `*_new` names, whereas
# the inputs were read from `data_dir/"processed"` — confirm the location is intended.
pred1_lgb_rank.to_parquet(data_dir/"large_rank_test_new.pqt")
pred1_lgb_binary.to_parquet(data_dir/"large_binary_test_new.pqt")

In [None]:
# * ------------------------------------------------------------

In [None]:
# Test-time predictions for the "small" candidate set: two LightGBM models and the NN.
pred2_lgb_rank = pd.read_parquet(data_dir / "processed" / "small_rank_test.pqt")
pred2_lgb_binary = pd.read_parquet(data_dir / "processed" / "small_binary_test.pqt")
# The NN file calls its item column `article_id`; rename it to `prediction` to
# match the LightGBM frames.
pred2_nn = (
    pd.read_parquet(data_dir / "processed" / "small_nn_test.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Sort each table by descending score, then keep the first row of every
# (customer, candidate) pair, i.e. the highest-scoring row for that pair.
pred2_lgb_rank = (
    pred2_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred2_lgb_binary = (
    pred2_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred2_nn = (
    pred2_nn
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

In [None]:
# Sort both tables by the same key and renumber them, so that row i of one table
# is meant to describe the same (customer, candidate) pair as row i of the other.
pred2_lgb_binary = pred2_lgb_binary.sort_values(by=['customer_id','prediction']).reset_index(drop=True)
pred2_nn = pred2_nn.sort_values(by=['customer_id','prediction']).reset_index(drop=True)

# The sum below is purely positional (no merge is performed). If the two tables do
# not contain exactly the same pairs, scores would be added to the wrong candidates
# (or turn into NaN) without any error, so verify the keys and fail loudly otherwise.
for key_col in ('customer_id', 'prediction'):
    if not np.array_equal(pred2_lgb_binary[key_col].to_numpy(), pred2_nn[key_col].to_numpy()):
        raise ValueError(
            f"pred2_lgb_binary and pred2_nn differ in '{key_col}'; "
            "their scores cannot be added row by row"
        )

pred2_lgb_binary['prob2'] = pred2_lgb_binary['prob'] + pred2_nn['prob']
# Re-order by the blended score so the per-customer lists come out best-first.
pred2_lgb_binary = pred2_lgb_binary.sort_values(by='prob2', ascending=False).reset_index(drop=True)

In [None]:
# One row per customer, with `prediction` holding that customer's candidates as a
# list in the current row order; a tqdm progress bar is shown while grouping.
pred2_lgb_rank = (
    pred2_lgb_rank
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)
pred2_lgb_binary = (
    pred2_lgb_binary
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)

100%|██████████| 1371980/1371980 [01:06<00:00, 20647.26it/s]
100%|██████████| 1371980/1371980 [01:05<00:00, 20995.72it/s]


In [None]:
# Sanity check: (number of customers, 2 columns) after grouping.
pred2_lgb_rank.shape

(1371980, 2)

In [None]:
# pred2_lgb_rank.to_parquet(data_dir/"small_rank_test.pqt")
# pred2_lgb_binary.to_parquet(data_dir/"small_binary_test.pqt")

In [None]:
# ----------------------------------------------------------

In [None]:
# Give each list column a model-specific name so the four tables can be joined
# side by side on `customer_id`.
pred1_lgb_rank = pred1_lgb_rank.rename(columns={'prediction': 'large_rank'})
pred1_lgb_binary = pred1_lgb_binary.rename(columns={'prediction': 'large_binary'})
pred2_lgb_rank = pred2_lgb_rank.rename(columns={'prediction': 'small_rank'})
pred2_lgb_binary = pred2_lgb_binary.rename(columns={'prediction': 'small_binary'})

In [None]:
# Start from the "small" rank table and left-join the other three, so every
# customer in that table is kept even when missing from another table.
pred = (
    pred2_lgb_rank
    .merge(pred2_lgb_binary, on=['customer_id'], how='left')
    .merge(pred1_lgb_rank, on=['customer_id'], how='left')
    .merge(pred1_lgb_binary, on=['customer_id'], how='left')
)

In [None]:
# Customers absent from the "large" tables come out of the left joins with NaN in
# these columns; replace such cells with an empty list so `cust_blend` can iterate
# over them. The check needs only the cell itself, so it is applied to the column
# rather than row-wise over the whole frame, which avoids building a row object
# for every record while yielding the same values.
for f in ['large_rank','large_binary']:
    pred[f] = pred[f].progress_apply(lambda v: v if not pd.isna(np.array(v)).any() else [])

100%|██████████| 1371980/1371980 [00:43<00:00, 31292.08it/s]
100%|██████████| 1371980/1371980 [00:43<00:00, 31513.76it/s]


In [None]:
def cust_blend(dt, W = (1.0, 1.0, 1.0, 1.0)):
    """Blend one customer's four candidate lists into a single top-12 list.

    Every item earns ``W[m] / rank`` from each model ``m`` that lists it, where
    ``rank`` is the item's 1-based position in that model's list. Items are then
    ordered by their summed score, highest first.

    Args:
        dt: Mapping-like row with the keys ``large_rank``, ``large_binary``,
            ``small_rank`` and ``small_binary``; each value is an ordered
            iterable of items (possibly empty).
        W: One weight per model, in the key order given above. The default gives
            all four models equal weight; the previous two-element default raised
            IndexError as soon as either "small" list was non-empty.

    Returns:
        A list of at most 12 items, best first. Items with equal scores keep the
        order in which they were first encountered.
    """
    # Model lists in the same order as the weights in W.
    REC = [
        dt['large_rank'],
        dt['large_binary'],
        dt['small_rank'],
        dt['small_binary'],
    ]

    # Sum, per item, the model weight discounted by the item's 1-based rank.
    res = {}
    for M, recs in enumerate(REC):
        for n, v in enumerate(recs):
            res[v] = res.get(v, 0) + W[M] / (n + 1)

    # Highest score first; sorted() is stable, so ties keep first-seen order.
    ranked = sorted(res, key=lambda item: -res[item])

    # Return the top 12 items only
    return ranked[:12]

In [None]:
# Blend the four lists of every customer row by row. Weights follow the order used
# inside `cust_blend` (large_rank, large_binary, small_rank, small_binary), so the
# two binary lists count 1.3x and the two rank lists 1.0x.
pred['prediction'] = pred.progress_apply(cust_blend, W = [1.0, 1.3, 1.0, 1.3], axis=1) # , 1.0, 1.2

In [None]:
# Lookup tables from encoded index to original id, for users and for items
# (going by the file names). Each file is opened in a `with` block so its handle
# is closed as soon as loading finishes, instead of being left open.
# NOTE: unpickling can execute arbitrary code; only load files created by this project.
with open(data_dir/"index_id_map/user_index2id.pkl", "rb") as fin:
    idx2uid = pickle.load(fin)
with open(data_dir/"index_id_map/item_index2id.pkl", "rb") as fin:
    idx2iid = pickle.load(fin)

In [None]:
def parse(x):
    """Turn a list of item indices into a space-separated string of article ids.

    Each index is looked up in the module-level ``idx2iid`` mapping, converted to
    a string and prefixed with a single ``'0'``. Only the first 12 ids are kept.

    Args:
        x: Iterable of item indices that are keys of ``idx2iid``.

    Returns:
        The (at most 12) prefixed ids joined by single spaces; ``''`` for empty input.
    """
    padded_ids = []
    for item_idx in x:
        padded_ids.append('0' + str(idx2iid[item_idx]))
    return ' '.join(padded_ids[:12])

In [None]:
# Convert every blended list of item indices into its submission string.
pred['prediction'] = pred['prediction'].progress_apply(parse)

100%|██████████| 1371980/1371980 [00:28<00:00, 48095.38it/s] 


In [None]:
# Original user id -> encoded index (going by the file name). The file is opened in
# a `with` block so its handle is closed right after loading.
# NOTE: unpickling can execute arbitrary code; only load files created by this project.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as fin:
    uid2idx = pickle.load(fin)
# Read the sample submission and map its customer ids through `uid2idx` so they
# can be joined with `pred` on `customer_id`.
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [None]:
# Replace the template's `prediction` column with the blended one. The left join
# keeps every row of the template, in the template's order.
submission = (
    submission
    .drop(columns=['prediction'])
    .merge(pred, on='customer_id', how='left')
)
# Map customer ids back through `idx2uid` for the output file.
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [None]:
# Keep only the two submission columns; the merge above also brought in the
# per-model list columns from `pred`.
submission = submission[['customer_id', 'prediction']]

In [None]:
# Write the submission to the current working directory, without the index column.
submission.to_csv('large_recall_binary.csv', index=False)

In [None]:
# Preview the first rows of the submission to eyeball the id and prediction format.
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0568601043 0751471043 0751471001 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0448509014 0918292001 0762846027 0706016003 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0486639003 0160442010 0918292001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0918522001 0762846031 0762846027 0918292001 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0730683050 0751471043 0896152001 08...
