In [1]:
import pandas as pd
# pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score
from scipy.sparse import csr_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import pickle
import time

In [2]:
# Read the pickled dataset from disk, then display its dimensions.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by a trusted source.
with open('../data/top10_categories.pickle', "rb") as pickle_handle:
    df = pickle.load(pickle_handle)
df.shape

(3295481, 31926)

In [3]:
# Read the orders table (its first CSV column becomes the index) and keep
# only the two temporal columns: order_dow and order_hour_of_day.
df_temporal = (
    pd.read_csv('../data/orders.csv', index_col=0)
    .loc[:, ['order_dow', 'order_hour_of_day']]
)

  mask |= (ar1 == a)


In [4]:
# One-hot encode the two temporal columns.
# Each entry is (source column, prefix of the generated columns, number of categories).
# The category list is pinned explicitly so that generated column k always
# corresponds to value k, rather than to whichever values happen to occur in
# the data; a value outside the expected range makes the encoder raise.
for source_col, prefix, n_categories in [('order_dow', 'order_dow', 7),
                                         ('order_hour_of_day', 'order_hod', 24)]:
    # dtype=np.int8 makes the encoder emit int8 directly, so no dense float64
    # array has to be built and cast down afterwards
    enc = OneHotEncoder(categories=[list(range(n_categories))], dtype=np.int8)
    encoded = enc.fit_transform(df_temporal[[source_col]].to_numpy())
    df_temporal[[f"{prefix}_{x}" for x in range(n_categories)]] = encoded.toarray()

# keep only the generated one-hot columns
df_temporal = df_temporal.drop(columns=['order_dow', 'order_hour_of_day'])
df_temporal.head()

Unnamed: 0_level_0,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6,order_hod_0,order_hod_1,order_hod_2,...,order_hod_14,order_hod_15,order_hod_16,order_hod_17,order_hod_18,order_hod_19,order_hod_20,order_hod_21,order_hod_22,order_hod_23
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2539329,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2398795,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
473747,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2254736,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431534,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Re-pack the one-hot frame into sparse columns, keeping its index and column labels.
df_temporal = pd.DataFrame.sparse.from_spmatrix(
    csr_matrix(df_temporal.values),
    columns=df_temporal.columns,
    index=df_temporal.index,
)

# Attach the one-hot temporal columns to the main frame; DataFrame.join
# aligns the two frames on their index.
df = df.join(df_temporal)
df.shape

(3295481, 31957)

# Test Run

# Helper functions

In [6]:
def partition_dataset(df,
                      label_col,
                      features=None,
                      test_size=0.2,
                      valid_size=0.2,
                      random_state=None,
                      three_way=False
                     ):
    """Split a sparse DataFrame into stratified train / (validation) / test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the label column. The feature
        columns are read through the ``.sparse`` accessor, so they must be
        sparse columns.
    label_col : str
        Name of the column used as the target; also used for stratification.
    features : list, optional
        Names of the feature columns. Defaults to every column of ``df``
        except ``label_col``.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split`` as the size of the test set.
    valid_size : float or int, default 0.2
        Only used when ``three_way`` is True. Forwarded to
        ``train_test_split`` as the size of the validation set, which is
        carved out of the data remaining after the test set was removed
        (so a float is a fraction of that remainder, not of the full data).
    random_state : int, optional
        Seed. For a two-way split it is passed straight to
        ``train_test_split``; for a three-way split it seeds a NumPy
        generator from which one seed per split is drawn.
    three_way : bool, default False
        When True, also produce a validation set.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` when ``three_way`` is False,
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` otherwise.
        The ``X_*`` parts are CSR matrices, the ``y_*`` parts NumPy arrays.
    """
    if features is None:
        features = [col for col in df.columns if col != label_col]
    X = csr_matrix(df[features].sparse.to_coo())
    y = df[label_col].to_numpy()

    if three_way:
        # derive an independent seed for each of the two splits
        rng = np.random.default_rng(random_state)
        seeds = rng.integers(10000, size=2)
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=test_size,
                                                            random_state=seeds[0],
                                                            stratify=y
                                                           )
        # the validation split must honour valid_size (it previously reused
        # test_size, which left the valid_size argument without any effect)
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                              test_size=valid_size,
                                                              random_state=seeds[1],
                                                              stratify=y_train
                                                             )
        return X_train, X_valid, X_test, y_train, y_valid, y_test
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=test_size,
                                                            random_state=random_state,
                                                            stratify=y
                                                           )
        return X_train, X_test, y_train, y_test

In [7]:
def classification_scores(true, pred):
    """Return ``[accuracy, precision, recall, f1]`` for a set of predictions.

    Both arguments are forwarded unchanged to the scikit-learn metric
    functions, ground truth first and predictions second.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return [metric(true, pred) for metric in metric_fns]

## Estimating n_estimators

In [8]:
version = 3

# features
col_dow = [f"order_dow_{x}" for x in range(7)]
col_hod = [f"order_hod_{x}" for x in range(24)]
use_dow = False
use_hod = False

# get top product_ids
# df also carries the one-hot temporal columns that were joined onto it; they
# are features, never prediction targets, so they are removed from the
# candidates before ranking the columns by their totals (otherwise a frequent
# day or hour could be selected as a "product" label).
column_totals = df.sum(axis=0).drop(labels=col_dow + col_hod, errors='ignore')
labels = column_totals.sort_values(ascending=False)[:10].index.tolist()
print(labels)

# xgb params
xgb_params = {'learning_rate': 0.05,
              'colsample_bytree': 0.8,
              'subsample': 0.8,
              'objective': 'binary:logistic',
              'eval_metric': 'logloss'
             }

# positive class weight multiplier - multiply weight after balanced
pos_mult = 0.8

['24852', '13176', '21137', '21903', '47209', '47766', '47626', '16797', '26209', '27845']


In [9]:
# LOOP THROUGH TOP 10 PRODUCTS
# For every label: build the feature set, make a stratified three-way split,
# train a booster with early stopping on the validation set, save it, and
# record validation / test scores. The scores are written to a CSV at the end.

# Wall-clock timer: time.process_time() reports the CPU time consumed by the
# process (and excludes time spent sleeping), which is not the elapsed time
# that is printed below.
tic = time.perf_counter()

valid_scores_all = []
test_scores_all = []

for label in labels:

    print(f"training label: {label}...")

    # DATA SETUP

    # every column except the current label and any disabled temporal columns
    excluded = {label}
    if not use_dow: excluded.update(col_dow)
    if not use_hod: excluded.update(col_hod)
    features = [col for col in df.columns if col not in excluded]

    # setup train, validation and test set
    df_xgb = df[features + [label]]
    X_train, X_valid, X_test, y_train, y_valid, y_test = partition_dataset(df_xgb,
                                                                           label,
                                                                           three_way=True,
                                                                           random_state=0)

    # PARAMS SETUP
    # ratio of negatives to positives in the training set, scaled by pos_mult
    xgb_params['scale_pos_weight'] = len(y_train[y_train == 0])/len(y_train[y_train == 1]) * pos_mult


    # TRAINING

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # the last entry of the watchlist ('valid') is the one early stopping monitors
    watchlist = [(dtrain,'train'), (dvalid,'valid')]

    model = xgb.train(xgb_params,
                      dtrain,
                      num_boost_round=20000,
                      evals=watchlist,
                      early_stopping_rounds=20,
                      verbose_eval=100)

    # save model
    model.save_model(f"../models/xgb_{label}_v{version}.model")

    # prediction: threshold the predicted probabilities at 0.5
    # NOTE(review): with early stopping, xgb.train returns the booster from the
    # last round rather than the best one; whether predict() restricts itself
    # to best_iteration depends on the installed xgboost version - confirm.
    pred_valid = (model.predict(dvalid) > 0.5).astype(int)
    valid_scores_all.append(classification_scores(y_valid, pred_valid))
    pred_test = (model.predict(dtest) > 0.5).astype(int)
    test_scores_all.append(classification_scores(y_test, pred_test))


# final processing of scores: one row per label, validation columns then test columns
scores_all = np.concatenate([valid_scores_all, test_scores_all], axis=1)
df_scores = pd.DataFrame(scores_all,
                         columns=['valid_accuracy', 'valid_precision', 'valid_recall', 'valid_f1',
                                  'test_accuracy', 'test_precision', 'test_recall', 'test_f1'],
                         index=labels
                        )
df_scores.to_csv(f"../results/xgb_product_v{version}.csv")

toc = time.perf_counter()
print(f"elapsed_time: {toc - tic}")

training label: 24852...
[0]	train-logloss:0.68462	valid-logloss:0.68431
[100]	train-logloss:0.53390	valid-logloss:0.53441
[200]	train-logloss:0.52083	valid-logloss:0.52197
[300]	train-logloss:0.51377	valid-logloss:0.51507
[400]	train-logloss:0.50884	valid-logloss:0.51068
[500]	train-logloss:0.50476	valid-logloss:0.50719
[600]	train-logloss:0.50168	valid-logloss:0.50438
[700]	train-logloss:0.49891	valid-logloss:0.50198
[800]	train-logloss:0.49664	valid-logloss:0.49997
[900]	train-logloss:0.49463	valid-logloss:0.49832
[1000]	train-logloss:0.49267	valid-logloss:0.49674
[1100]	train-logloss:0.49103	valid-logloss:0.49534
[1200]	train-logloss:0.48937	valid-logloss:0.49412
[1300]	train-logloss:0.48816	valid-logloss:0.49307
[1400]	train-logloss:0.48660	valid-logloss:0.49198
[1500]	train-logloss:0.48532	valid-logloss:0.49096
[1600]	train-logloss:0.48426	valid-logloss:0.49006
[1700]	train-logloss:0.48312	valid-logloss:0.48927
[1800]	train-logloss:0.48197	valid-logloss:0.48843
[1900]	train-loglo

[3200]	train-logloss:0.41675	valid-logloss:0.42332
[3300]	train-logloss:0.41589	valid-logloss:0.42259
[3400]	train-logloss:0.41502	valid-logloss:0.42187
[3500]	train-logloss:0.41407	valid-logloss:0.42109
[3600]	train-logloss:0.41310	valid-logloss:0.42030
[3700]	train-logloss:0.41230	valid-logloss:0.41964
[3800]	train-logloss:0.41149	valid-logloss:0.41895
[3900]	train-logloss:0.41065	valid-logloss:0.41828
[4000]	train-logloss:0.40987	valid-logloss:0.41764
[4100]	train-logloss:0.40894	valid-logloss:0.41698
[4200]	train-logloss:0.40812	valid-logloss:0.41630
[4300]	train-logloss:0.40736	valid-logloss:0.41567
[4328]	train-logloss:0.40716	valid-logloss:0.41552
training label: 47626...
[0]	train-logloss:0.68154	valid-logloss:0.68272
[100]	train-logloss:0.49412	valid-logloss:0.49495
[200]	train-logloss:0.48037	valid-logloss:0.48129
[300]	train-logloss:0.47325	valid-logloss:0.47497
[400]	train-logloss:0.46875	valid-logloss:0.47036
[500]	train-logloss:0.46485	valid-logloss:0.46696
[600]	train-lo