In [1]:
import pandas as pd
# pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score
from scipy.sparse import csr_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import pickle
import time

In [2]:
# load dataset
with open('../data/top500_products.pickle', "rb") as input_file:
    df = pickle.load(input_file)
df.shape

(2922905, 500)

In [3]:
# load data containing order_dow and hod
df_temporal = pd.read_csv('../data/orders.csv', index_col=0)
df_temporal = df_temporal[['order_dow', 'order_hour_of_day']]

  mask |= (ar1 == a)


In [4]:
# dow one hot
enc = OneHotEncoder()
df_temporal[[f"order_dow_{x}" for x in range(7)]] = enc.fit_transform(df_temporal.loc[:, 'order_dow'].values.reshape((-1, 1))).toarray().astype(np.int8)

# hod one hot
enc = OneHotEncoder()
df_temporal[[f"order_hod_{x}" for x in range(24)]] = enc.fit_transform(df_temporal.loc[:, 'order_hour_of_day'].values.reshape((-1, 1))).toarray().astype(np.int8)

df_temporal = df_temporal[[col for col in df_temporal.columns if col not in ['order_dow', 'order_hour_of_day']]]
df_temporal.head()

Unnamed: 0_level_0,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6,order_hod_0,order_hod_1,order_hod_2,...,order_hod_14,order_hod_15,order_hod_16,order_hod_17,order_hod_18,order_hod_19,order_hod_20,order_hod_21,order_hod_22,order_hod_23
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2539329,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2398795,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
473747,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2254736,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431534,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# convert df_temporal to sparse
df_temporal = pd.DataFrame.sparse.from_spmatrix(csr_matrix(df_temporal.values),
                                                index=df_temporal.index,
                                                columns=df_temporal.columns)

# join dataframes without the original dow and hod
df = df.join(df_temporal)
df.shape

(2922905, 531)

## Helper Function

In [6]:
def partition_dataset(df,
                      label_col,
                      features=None,
                      test_size=0.2,
                      valid_size=0.2,
                      random_state=None,
                      three_way=False
                     ):
    if features is None:
        features = [col for col in df.columns if col != label_col]
    X = csr_matrix(df[features].sparse.to_coo())
    y = df[label_col].to_numpy()
        
    if three_way:
        rng = np.random.default_rng(random_state)
        seeds = rng.integers(10000, size=2)
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size, 
                                                            random_state=seeds[0], 
                                                            stratify=y
                                                           )
        X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                              test_size=test_size, 
                                                              random_state=seeds[1], 
                                                              stratify=y_train
                                                             )
        return X_train, X_valid, X_test, y_train, y_valid, y_test
    else:
        X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                            test_size=test_size, 
                                                            random_state=random_state, 
                                                            stratify=y
                                                           )
        return X_train, X_test, y_train, y_test

In [7]:
def classification_scores(true, pred):
    scores = []
    scores.append(accuracy_score(true, pred))
    scores.append(precision_score(true, pred))
    scores.append(recall_score(true, pred))
    scores.append(f1_score(true, pred))
    return scores

# Test Run

## XGB runs

In [15]:
version = 1

# features
col_dow = [f"order_dow_{x}" for x in range(7)]
col_hod = [f"order_hod_{x}" for x in range(24)]
use_dow = True
use_hod = True

# xgb params
xgb_params = {'learning_rate': 0.1,
              'colsample_bytree': 1,
              'subsample': 1,
              'objective': 'binary:logistic',
              'eval_metric': 'logloss'
             }

# positive class weight multiplier - multiply weight after balanced
pos_mult = 1

In [16]:
# LOOP THROUGH TOP ALL PRODUCTS

tic = time.process_time()

valid_scores_all = []
test_scores_all = []
feature_importances_df = pd.DataFrame()

predicted_products = df.iloc[:, :500].columns.tolist()

for i, label in enumerate(predicted_products):
    
    print(f"training label: {label}...({i}/{len(predicted_products)})")

    # DATA SETUP

    features = ([x for x in df.columns if x != label])
    if not use_dow: features = [f for f in features if f not in col_dow]
    if not use_hod: features = [f for f in features if f not in col_hod]

    # setup train and test set
    df_xgb = df[features + [label]]
    X_train, X_valid, y_train, y_valid = partition_dataset(df_xgb, 
                                                           label,
                                                           three_way=False,
                                                           random_state=0)

    # PARAMS SETUP
    xgb_params['scale_pos_weight'] = len(y_train[y_train == 0])/len(y_train[y_train == 1]) * pos_mult
        

    # TRAINING
    
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(dtrain,'train'), (dvalid,'valid')]

    model = xgb.train(xgb_params,
                      dtrain,
                      num_boost_round=100,
                      evals=watchlist,
                      early_stopping_rounds=20,
                      verbose_eval=100)
    
    # save model
    model.save_model(f"../models/xgb_{label}_v{version}_fi.model")
    
    # feature importance
    feature_importances = {}
    fi_raw = model.get_score(importance_type='gain')
    for i, col in enumerate(features):
        try:
            feature_importances[col] = fi_raw[f"f{i}"]
        except:
            feature_importances[col] = np.nan
    feature_importances_df = feature_importances_df.append(feature_importances, ignore_index=True)
        
    # prediction
    pred_valid = (model.predict(dvalid) > 0.5).astype(int)
    valid_scores_all.append(classification_scores(y_valid, pred_valid))

    
# feature importances indexing
feature_importances_df = feature_importances_df.set_index(pd.Index(predicted_products), 'predicted_product')
feature_importances_df.to_csv(f"../results/xgb_feature_importances_v{version}_top500.csv")

# final processing of scores
df_scores = pd.DataFrame(valid_scores_all,
                         columns=['valid_accuracy', 'valid_precision', 'valid_recall', 'valid_f1'],
                         index=predicted_products
                        )
df_scores.to_csv(f"../results/xgb_product_v{version}_top500.csv")

toc = time.process_time()
print(f"elapsed_time: {toc - tic}")

training label: 24852...(0/500)
[0]	train-logloss:0.68047	valid-logloss:0.68093
[99]	train-logloss:0.58845	valid-logloss:0.58976
training label: 13176...(1/500)
[0]	train-logloss:0.67849	valid-logloss:0.67808
[99]	train-logloss:0.57423	valid-logloss:0.57570
training label: 21137...(2/500)
[0]	train-logloss:0.68364	valid-logloss:0.68359
[99]	train-logloss:0.58883	valid-logloss:0.59043
training label: 21903...(3/500)
[0]	train-logloss:0.68620	valid-logloss:0.68627
[99]	train-logloss:0.59857	valid-logloss:0.59911
training label: 47209...(4/500)
[0]	train-logloss:0.68068	valid-logloss:0.68037
[99]	train-logloss:0.55320	valid-logloss:0.55383
training label: 47766...(5/500)
[0]	train-logloss:0.68047	valid-logloss:0.68109
[99]	train-logloss:0.56767	valid-logloss:0.56841
training label: 47626...(6/500)
[0]	train-logloss:0.68021	valid-logloss:0.68155
[99]	train-logloss:0.57120	valid-logloss:0.57102
training label: 16797...(7/500)
[0]	train-logloss:0.68187	valid-logloss:0.68222
[99]	train-loglos

[0]	train-logloss:0.68530	valid-logloss:0.68484
[99]	train-logloss:0.57457	valid-logloss:0.57430
training label: 35221...(64/500)
[0]	train-logloss:0.67042	valid-logloss:0.67098
[99]	train-logloss:0.50432	valid-logloss:0.50460
training label: 28842...(65/500)
[0]	train-logloss:0.67245	valid-logloss:0.67219
[99]	train-logloss:0.48136	valid-logloss:0.48197
training label: 33731...(66/500)
[0]	train-logloss:0.68424	valid-logloss:0.68553
[99]	train-logloss:0.56760	valid-logloss:0.56835
training label: 8424...(67/500)
[0]	train-logloss:0.68070	valid-logloss:0.68113
[99]	train-logloss:0.54895	valid-logloss:0.54961
training label: 27521...(68/500)
[0]	train-logloss:0.68024	valid-logloss:0.68161
[99]	train-logloss:0.53845	valid-logloss:0.53968
training label: 33198...(69/500)
[0]	train-logloss:0.69048	valid-logloss:0.69060
[99]	train-logloss:0.63987	valid-logloss:0.64017
training label: 8174...(70/500)
[0]	train-logloss:0.68018	valid-logloss:0.68091
[99]	train-logloss:0.55889	valid-logloss:0.5

[0]	train-logloss:0.68028	valid-logloss:0.68151
[99]	train-logloss:0.53471	valid-logloss:0.53537
training label: 41844...(127/500)
[0]	train-logloss:0.68796	valid-logloss:0.68926
[99]	train-logloss:0.61555	valid-logloss:0.61592
training label: 43789...(128/500)
[0]	train-logloss:0.68126	valid-logloss:0.68158
[99]	train-logloss:0.53886	valid-logloss:0.53929
training label: 46676...(129/500)
[0]	train-logloss:0.68667	valid-logloss:0.68600
[99]	train-logloss:0.59458	valid-logloss:0.59484
training label: 31343...(130/500)
[0]	train-logloss:0.68707	valid-logloss:0.68702
[99]	train-logloss:0.58503	valid-logloss:0.58642
training label: 42701...(131/500)
[0]	train-logloss:0.68583	valid-logloss:0.68524
[99]	train-logloss:0.54880	valid-logloss:0.55030
training label: 11777...(132/500)
[0]	train-logloss:0.67884	valid-logloss:0.67975
[99]	train-logloss:0.55334	valid-logloss:0.55368
training label: 38293...(133/500)
[0]	train-logloss:0.68407	valid-logloss:0.68273
[99]	train-logloss:0.55726	valid-lo

training label: 2078...(189/500)
[0]	train-logloss:0.68636	valid-logloss:0.68650
[99]	train-logloss:0.59879	valid-logloss:0.59947
training label: 19048...(190/500)
[0]	train-logloss:0.68198	valid-logloss:0.68066
[99]	train-logloss:0.53954	valid-logloss:0.54037
training label: 32864...(191/500)
[0]	train-logloss:0.68836	valid-logloss:0.68854
[99]	train-logloss:0.60885	valid-logloss:0.60995
training label: 33787...(192/500)
[0]	train-logloss:0.64739	valid-logloss:0.64660
[99]	train-logloss:0.31709	valid-logloss:0.31749
training label: 13984...(193/500)
[0]	train-logloss:0.67461	valid-logloss:0.67457
[99]	train-logloss:0.49717	valid-logloss:0.49818
training label: 41290...(194/500)
[0]	train-logloss:0.68769	valid-logloss:0.68923
[99]	train-logloss:0.62045	valid-logloss:0.62101
training label: 8859...(195/500)
[0]	train-logloss:0.69031	valid-logloss:0.69090
[99]	train-logloss:0.63341	valid-logloss:0.63426
training label: 23165...(196/500)
[0]	train-logloss:0.67586	valid-logloss:0.67595
[99

[99]	train-logloss:0.50194	valid-logloss:0.50278
training label: 23288...(252/500)
[0]	train-logloss:0.69146	valid-logloss:0.69047
[99]	train-logloss:0.63724	valid-logloss:0.63824
training label: 39984...(253/500)
[0]	train-logloss:0.67575	valid-logloss:0.67571
[99]	train-logloss:0.48910	valid-logloss:0.48911
training label: 21376...(254/500)
[0]	train-logloss:0.68712	valid-logloss:0.68698
[99]	train-logloss:0.57884	valid-logloss:0.57993
training label: 46654...(255/500)
[0]	train-logloss:0.68778	valid-logloss:0.68892
[99]	train-logloss:0.62359	valid-logloss:0.62397
training label: 14084...(256/500)
[0]	train-logloss:0.68694	valid-logloss:0.68683
[99]	train-logloss:0.59392	valid-logloss:0.59463
training label: 32734...(257/500)
[0]	train-logloss:0.68553	valid-logloss:0.68433
[99]	train-logloss:0.55862	valid-logloss:0.56021
training label: 37766...(258/500)
[0]	train-logloss:0.68483	valid-logloss:0.68343
[99]	train-logloss:0.57721	valid-logloss:0.57841
training label: 12206...(259/500)


[99]	train-logloss:0.62162	valid-logloss:0.62175
training label: 44449...(315/500)
[0]	train-logloss:0.67997	valid-logloss:0.68092
[99]	train-logloss:0.52876	valid-logloss:0.52906
training label: 42445...(316/500)
[0]	train-logloss:0.68166	valid-logloss:0.68108
[99]	train-logloss:0.52222	valid-logloss:0.52405
training label: 36550...(317/500)
[0]	train-logloss:0.68430	valid-logloss:0.68472
[99]	train-logloss:0.55360	valid-logloss:0.55456
training label: 17706...(318/500)
[0]	train-logloss:0.68384	valid-logloss:0.68255
[99]	train-logloss:0.54835	valid-logloss:0.54932
training label: 19019...(319/500)
[0]	train-logloss:0.68716	valid-logloss:0.68655
[99]	train-logloss:0.57983	valid-logloss:0.58043
training label: 20082...(320/500)
[0]	train-logloss:0.68457	valid-logloss:0.68493
[99]	train-logloss:0.56575	valid-logloss:0.56654
training label: 42625...(321/500)
[0]	train-logloss:0.67427	valid-logloss:0.67482
[99]	train-logloss:0.48357	valid-logloss:0.48476
training label: 21295...(322/500)


[99]	train-logloss:0.42667	valid-logloss:0.42753
training label: 32691...(378/500)
[0]	train-logloss:0.68949	valid-logloss:0.68819
[99]	train-logloss:0.61935	valid-logloss:0.61939
training label: 35898...(379/500)
[0]	train-logloss:0.69133	valid-logloss:0.69088
[99]	train-logloss:0.63324	valid-logloss:0.63392
training label: 10070...(380/500)
[0]	train-logloss:0.68341	valid-logloss:0.68496
[99]	train-logloss:0.56554	valid-logloss:0.56588
training label: 36735...(381/500)
[0]	train-logloss:0.68221	valid-logloss:0.68279
[99]	train-logloss:0.55215	valid-logloss:0.55272
training label: 34262...(382/500)
[0]	train-logloss:0.68829	valid-logloss:0.68824
[99]	train-logloss:0.59693	valid-logloss:0.59726
training label: 4421...(383/500)
[0]	train-logloss:0.68712	valid-logloss:0.68628
[99]	train-logloss:0.57283	valid-logloss:0.57305
training label: 37029...(384/500)
[0]	train-logloss:0.68479	valid-logloss:0.68419
[99]	train-logloss:0.55871	valid-logloss:0.55939
training label: 18918...(385/500)
[

[0]	train-logloss:0.67230	valid-logloss:0.67338
[99]	train-logloss:0.46478	valid-logloss:0.46554
training label: 31066...(441/500)
[0]	train-logloss:0.68986	valid-logloss:0.69143
[99]	train-logloss:0.64703	valid-logloss:0.64741
training label: 26800...(442/500)
[0]	train-logloss:0.68122	valid-logloss:0.68058
[99]	train-logloss:0.54506	valid-logloss:0.54632
training label: 9825...(443/500)
[0]	train-logloss:0.68190	valid-logloss:0.68325
[99]	train-logloss:0.53920	valid-logloss:0.53976
training label: 27796...(444/500)
[0]	train-logloss:0.68361	valid-logloss:0.68246
[99]	train-logloss:0.53831	valid-logloss:0.53809
training label: 11068...(445/500)
[0]	train-logloss:0.68398	valid-logloss:0.68477
[99]	train-logloss:0.55594	valid-logloss:0.55538
training label: 5322...(446/500)
[0]	train-logloss:0.68766	valid-logloss:0.68744
[99]	train-logloss:0.58515	valid-logloss:0.58601
training label: 7969...(447/500)
[0]	train-logloss:0.67433	valid-logloss:0.67591
[99]	train-logloss:0.50785	valid-loglo

  feature_importances_df = feature_importances_df.set_index(pd.Index(predicted_products), 'predicted_product')


elapsed_time: 224888.625
