In [1]:
VER = 6

import pandas as pd, numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import os, sys, pickle, glob, gc
from collections import Counter
import cudf, itertools
print('We will use RAPIDS version',cudf.__version__)

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

from pandarallel import pandarallel

pandarallel.initialize(nb_workers=4, progress_bar=True, use_memory_fs=False)

import xgboost as xgb
from sklearn.model_selection import GroupKFold

from pyarrow.parquet import ParquetFile
import pyarrow as pa 

from catboost import CatBoostRanker, Pool

We will use RAPIDS version 22.10.00a+392.g1558403753
INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
def remove_negative_session(df,target='label'):
    true_df = df.groupby('session')[target].agg('sum') > 0
    session = pd.DataFrame(true_df[true_df]).reset_index()['session']
    df = df.merge(session, how = 'inner', on = 'session')
    return df

# Training

In [3]:
type_labels = {'clicks':0, 'carts':1, 'orders':2}

In [4]:
CANDIDATE_COUNT = 100

In [5]:
RUN_FOR = "kaggle" # "kaggle"

In [6]:
train_sessions = np.load("./splitted_raw_data/val_sessions_for_train.npy", allow_pickle=True)

In [7]:
import json

with open("./models/model_iters.json", "r") as read_file:
    model_iters = json.load(read_file)

In [8]:
model_iters

{'./models/XGB_100candidates_fold0_clicks.xgb': 900,
 './models/XGB_100candidates_fold1_clicks.xgb': 900,
 './models/XGB_100candidates_fold2_clicks.xgb': 900,
 './models/XGB_100candidates_fold3_clicks.xgb': 900,
 './models/XGB_100candidates_fold4_clicks.xgb': 900,
 './models/XGB_100candidates_fold0_carts.xgb': 400,
 './models/XGB_100candidates_fold1_carts.xgb': 400,
 './models/XGB_100candidates_fold2_carts.xgb': 400,
 './models/XGB_100candidates_fold3_carts.xgb': 400,
 './models/XGB_100candidates_fold4_carts.xgb': 400,
 './models/XGB_100candidates_fold0_orders.xgb': 400,
 './models/XGB_100candidates_fold1_orders.xgb': 400,
 './models/XGB_100candidates_fold2_orders.xgb': 400,
 './models/XGB_100candidates_fold3_orders.xgb': 400,
 './models/XGB_100candidates_fold4_orders.xgb': 400}

In [9]:
subs = []

for type_str in tqdm(list(type_labels.keys())):
        
    batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))

    model_paths = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.xgb"))
#     model_paths = sorted(glob.glob(f"./models/CB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.cb"))

    all_predictions = []
    
    for batch in tqdm(batches):
        whole_df = pd.read_parquet(batch)#.drop(labels=["candidate_rank"], axis=1)
#         whole_df.drop(columns=dropcols, axis=1, inplace=True)

        #whole_df = whole_df.drop(labels=[col for col in whole_df.columns if col[:4]=="type"], axis=1)

        if RUN_FOR == "local":
#             whole_df = remove_negative_session(whole_df).reset_index(drop=True)
            whole_df = whole_df[~whole_df.session.isin(train_sessions)].reset_index(drop=True)
        
        print(f"Processing {len(whole_df)} rows...")

        CHUNK_SIZE = 1_500_000
        
        for chunk_num in range(len(whole_df) // CHUNK_SIZE + 1):
            start_index = chunk_num*CHUNK_SIZE
            end_index = min(chunk_num*CHUNK_SIZE + CHUNK_SIZE, len(whole_df))
            print(start_index, end_index)
            chunk_df = whole_df.iloc[start_index:end_index]#.drop(dropcols, 1)

            FEATURES = chunk_df.columns[2 : -1]
            dtest = xgb.DMatrix(data=chunk_df[FEATURES])
    #         dtest = Pool(
    #             data=whole_df[FEATURES]
    #         )

            preds = [] #np.zeros(len(chunk_df))

            for model_path in model_paths:
    #             target_it = int(model_path.split("_it")[-1].split(".xgb")[0])
    #             model = CatBoostRanker()
    #             model.load_model(model_path)

                model = xgb.Booster()
                model.load_model(model_path)
                model.set_param({'predictor': 'gpu_predictor'})

                preds.append(model.predict(dtest#,
#                                        iteration_range = (0, model_iters[model_path])
                                      ))
            preds = np.mean(preds, axis=0)
            
            predictions = chunk_df[['session','aid']].copy()
            predictions['pred'] = preds
            all_predictions.append(predictions)
        
    all_predictions = pd.concat(all_predictions, ignore_index=True)
    
    all_predictions = all_predictions.sort_values(['session','pred'],
                                                  ascending=[True,False]).reset_index(drop=True)
    
    all_predictions.to_parquet(f"../raw_data/soft_scores/{RUN_FOR}_{type_str}_soft_scores.parquet")
        
    all_predictions['n'] = all_predictions.groupby('session').aid.cumcount().astype('int8')
    all_predictions = all_predictions.loc[all_predictions.n<20]

    sub = all_predictions.groupby('session').aid.apply(list)
    sub = sub.to_frame().reset_index()
    sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))
    sub.columns = ['session_type','labels']
    sub.session_type = sub.session_type.astype('str') + '_' + type_str

    subs.append(sub)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3

  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3

  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


  0%|          | 0/17 [00:00<?, ?it/s]

Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3000000 4500000
4500000 6000000
6000000 7500000
7500000 9000000
9000000 10000000
Processing 10000000 rows...
0 1500000
1500000 3000000
3

  sub.item = sub.aid.apply(lambda x: " ".join(map(str,x)))


In [10]:
chunk_num

6

## Local Score

In [11]:
final_sub = pd.concat(subs, ignore_index=True)
final_sub.sort_values(by="session_type", ascending=True).reset_index(drop=True)

if RUN_FOR == "local":
    # COMPUTE METRIC
    score = 0
    weights = {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}
    for t in [
        'clicks',
        'carts',
        'orders'
    ]:
        sub = final_sub.loc[final_sub.session_type.str.contains(t)].copy()
        sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    #     sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])
        test_labels = pd.read_parquet('./splitted_raw_data/val_labels.parquet')
        test_labels = test_labels[~test_labels.session.isin(train_sessions)].reset_index(drop=True)
        test_labels = test_labels.loc[test_labels['type']==t]
        test_labels = test_labels.merge(sub, how='left', on=['session'])
        test_labels['labels'] = test_labels['labels'].fillna("").apply(list)
        test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
        test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
        recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
        score += weights[t]*recall
        print(f'{t} recall =',recall)

    print('=============')
    print('Overall Recall =',score)
    print('=============')

elif RUN_FOR == "kaggle":
    final_sub["labels"] = final_sub.labels.apply(lambda x: " ".join([str(elm) for elm in x]))
#     final_sub.to_csv("submission.csv", index=False)
    final_sub.to_csv("submission.csv.gz", index=False, compression='gzip')

In [12]:
whole_df.session.nunique()

100000

In [13]:
score

NameError: name 'score' is not defined

In [None]:
test_labels.ground_truth

In [None]:
feat_importances = pd.Series(model.feature_importances_, index=whole_df[FEATURES].columns)

In [None]:
type_str = "clicks"

batches = sorted(glob.glob(f"./candidated_features/{RUN_FOR}_{type_str}_all_data_{CANDIDATE_COUNT}candidates_p*.pqt"))
for batch in tqdm(batches):
    whole_df = pd.read_parquet(batch)
    if RUN_FOR == "local":
        whole_df = remove_negative_session(whole_df).reset_index(drop=True)
        whole_df = whole_df[~whole_df.session.isin(train_sessions)].reset_index(drop=True)
        
    FEATURES = whole_df.columns[2 : -1]
    dtest = xgb.DMatrix(data=whole_df[FEATURES])
    break

In [None]:
model_path = sorted(glob.glob(f"./models/XGB_{CANDIDATE_COUNT}candidates_fold*_{type_str}.xgb"))[0]
model = xgb.Booster()
model.load_model(model_path)

model.predict(dtest)

imp_df = pd.Series(model.get_score(importance_type='gain'))
imp_df.nlargest(30).plot(kind='barh')

In [None]:
del whole_df, dtest, model

In [None]:
score