In [None]:
# Mount Google Drive through the Google Colab helper so that files stored in Drive are accessible under /content/drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [None]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Make the project's `src` package importable from the mounted Drive.
sys.path.append("/content/drive/MyDrive/HM-new/") # path to the `src` folder
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Register tqdm's pandas integration, which provides `progress_apply`.
tqdm.pandas()

In [None]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

In [None]:
# NOTE(review): `tqdm.pandas()` was already called in the notebook-configuration cell above; this repeat looks redundant.
tqdm.pandas()

In [None]:
# Project folders on the mounted Drive: one for data, one for models.
# NOTE(review): `model_dir` is not referenced by any other cell visible in this notebook.
data_dir = Path("/content/drive/MyDrive/HM-new/data/")
model_dir = Path("/content/drive/MyDrive/HM-new/models/")

In [None]:
# Project data helper, pointed at the data directory.
dh = DataHelper(data_dir)

In [None]:
# data = dh.preprocess_data(save=True, name="encoded_full")   # run only once, processed data will be saved

In [None]:
# Load the dataset stored under the name "encoded_full" (see the commented-out preprocessing call above).
# NOTE(review): `data` is not used by any later cell visible here — confirm this load is still needed.
data = dh.load_data(name="encoded_full")

In [None]:
# Read the stored classifier predictions from the processed-data folder.
# Later cells rely on its `customer_id`, `prediction` and `prob` columns.
pred_lgb_classifier = pd.read_parquet(data_dir/"processed"/"lgb_classifier_test.pqt")

In [None]:
# Order all rows from the highest to the lowest `prob` and renumber the index from 0
pred_lgb_classifier = pred_lgb_classifier.sort_values(
    'prob', ascending=False, ignore_index=True
)

In [None]:
# Keep a single row per (customer_id, prediction) pair: the first one encountered
pred_lgb_classifier = pred_lgb_classifier.drop_duplicates(
    subset=['customer_id', 'prediction'], keep='first'
)

In [None]:
# For each customer_id, collect its first 12 predictions (in current row order) into a list
pred_lgb_classifier = (
    pred_lgb_classifier
    .groupby('customer_id')['prediction']
    .progress_apply(lambda preds: preds.head(12).tolist())
    .reset_index()
)

100%|██████████| 1371980/1371980 [01:51<00:00, 12324.77it/s]


In [None]:
# Load the index -> id lookup tables for users and items.
# The context managers close each pickle file as soon as it has been read.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(data_dir/"index_id_map/user_index2id.pkl", "rb") as fh:
    idx2uid = pickle.load(fh)
with open(data_dir/"index_id_map/item_index2id.pkl", "rb") as fh:
    idx2iid = pickle.load(fh)

In [None]:
# Inspect the per-customer prediction lists (one row per customer_id).
pred_lgb_classifier

Unnamed: 0,customer_id,prediction
0,1,"[16004, 67523, 15989, 67544, 42627, 17156, 171..."
1,2,"[3092, 104073, 104046, 56695, 77255, 67523, 10..."
2,3,"[78504, 81241, 103797, 56695, 104073, 94657, 1..."
3,4,"[67523, 42627, 104073, 104046, 56695, 104555, ..."
4,5,"[101367, 101368, 77916, 77909, 60764, 67544, 1..."
...,...,...
1371975,1371976,"[81225, 13339, 25069, 64257, 4746, 86099, 1040..."
1371976,1371977,"[71108, 53893, 71102, 71111, 71104, 104046, 30..."
1371977,1371978,"[71108, 71111, 99370, 32743, 42627, 3092, 1040..."
1371978,1371979,"[56695, 3521, 66519, 3092, 13339, 77255, 10407..."


In [None]:
# Check the column dtypes before converting the prediction lists to strings.
pred_lgb_classifier.dtypes

customer_id     int64
prediction     object
dtype: object

In [None]:
def parse(x, id_map=None, k=12):
    """Convert item indices into a space-separated string of item ids.

    Args:
        x: Iterable of item indices (keys of the lookup table).
        id_map: Mapping from item index to item id. When omitted, the
            notebook-level ``idx2iid`` table is used.
        k: Maximum number of items to keep; only the first ``k`` entries
            of ``x`` are converted.

    Returns:
        The ids of the first ``k`` items, each passed through ``str`` and
        joined with single spaces. An empty input gives an empty string.

    Raises:
        KeyError: If one of the kept indices is missing from the mapping.
    """
    from itertools import islice

    if id_map is None:
        # Resolved at call time from the notebook namespace.
        id_map = idx2iid
    # Truncate before the lookup so only the items that are kept get converted.
    return ' '.join(str(id_map[i]) for i in islice(x, k))

In [None]:
# Turn each list of item indices into the space-separated id string produced by `parse`
pred_lgb_classifier['prediction'] = pred_lgb_classifier['prediction'].progress_apply(parse)

100%|██████████| 1371980/1371980 [00:07<00:00, 194017.62it/s]


In [None]:
# Load the customer id -> index lookup table; the context manager closes the file after reading.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as fh:
    uid2idx = pickle.load(fh)
# Start from the sample submission and encode its customer ids as indices,
# so they can be joined with the predictions on `customer_id`.
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [None]:
# Swap the `prediction` column that came with the sample file for the model's predictions:
# drop it, then left-join on the customer index so every sample-submission row is kept.
submission = (
    submission
    .drop(columns='prediction')
    .merge(pred_lgb_classifier, on='customer_id', how='left')
)
# Translate the customer indices back into customer ids
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [None]:
# Keep only the two submission columns, in this order.
submission = submission[['customer_id', 'prediction']]

In [None]:
# Write the submission file without the index column.
# The path is relative, so the file lands in the current working directory, not in `data_dir`.
# NOTE(review): the article ids shown by `submission.head()` below have 9 digits; if the target
# format expects zero-padded ids, pad them before writing — TODO confirm against the sample file.
submission.to_csv('lgb_classifier.csv', index=False)

In [None]:
# Preview the first rows of the submission.
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,568601006 751471001 568597006 751471043 673677...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,448509014 918522001 918292001 714790020 788575...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007 805000001 915529003 714790020 918522...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,751471001 673677002 918522001 918292001 714790...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,896152001 896152002 791587015 791587001 730683...


In [None]:
# Build a human-readable view of the submission: one list of product names per customer.
#
# NOTE(review): `articles` is not defined in any cell visible in this notebook. It must be
# created (with `article_id` and `prod_name` columns) before this cell runs — TODO add the
# cell that loads it, otherwise a fresh "Restart & Run All" stops here with a NameError.
#
# All work happens on derived frames: `submission` keeps its space-separated string format
# and `articles` is left untouched, so this cell can be re-run safely.

# Split each prediction string into a list of article ids and give every id its own row.
# `.str.split()` leaves missing predictions as NaN instead of raising.
submission_exploded = (
    submission
    .assign(prediction=submission['prediction'].str.split())
    .explode('prediction')
    .rename(columns={'prediction': 'article_id'})
    .astype({'article_id': str})
)

# Name lookup with the join key cast to str so it matches the exploded ids
article_names = articles[['article_id', 'prod_name']].astype({'article_id': str})

# Merge to get product names; the left join keeps ids that have no match (prod_name is NaN)
result = pd.merge(submission_exploded, article_names, on='article_id', how='left')

# Aggregate back to one list of product names per customer
result = result.groupby('customer_id')['prod_name'].agg(list).reset_index()

# Save or view the result
print(result.head())
# result.to_csv('lgb_classifier_final_submission.csv', index=False)

                                         customer_id  \
0  00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...   
1  0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...   
2  000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...   
3  00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...   
4  00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...   

                                           prod_name  
0  [Mariette Blazer, Pluto RW slacks (1), Hayes s...  
1  [Perrie Slim Mom Denim TRS, Jackie cable vest,...  
2  [SULIMA  jkt, Norway hood jacket, Liliana, Mom...  
3  [Pluto RW slacks (1), Henry polo. (1), Jackie ...  
4  [Amelie, Amelie, Speedy conscious tee, Speedy ...  


In [None]:
# Rich-display preview of the per-customer product-name lists.
result.head()

Unnamed: 0,customer_id,prod_name
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[Mariette Blazer, Pluto RW slacks (1), Hayes s..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[Perrie Slim Mom Denim TRS, Jackie cable vest,..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[SULIMA jkt, Norway hood jacket, Liliana, Mom..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[Pluto RW slacks (1), Henry polo. (1), Jackie ..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[Amelie, Amelie, Speedy conscious tee, Speedy ..."
