In [1]:
import pandas as pd
# pd.set_option('display.max_columns', None)
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, recall_score, precision_score
from scipy.sparse import csr_matrix
import xgboost as xgb
from xgboost import XGBClassifier
import pickle
import time

In [2]:
# Load the pickled dataset into `df` and show its dimensions.
# NOTE: pickle.load can execute arbitrary code - only point this at trusted files.
with open('../data/top500_products.pickle', "rb") as pickle_fh:
    df = pickle.load(pickle_fh)
df.shape

(2922905, 500)

In [3]:
# Read the orders table (first CSV column becomes the index) and keep only the
# day-of-week and hour-of-day columns.
df_temporal = pd.read_csv('../data/orders.csv', index_col=0)[['order_dow', 'order_hour_of_day']]

  mask |= (ar1 == a)


In [4]:
# One-hot encode each temporal column into int8 indicator columns named
# "<prefix>_<level>". A fresh encoder is fitted per column: dow first, then hod.
for raw_col, n_levels in (('order_dow', 7), ('order_hour_of_day', 24)):
    enc = OneHotEncoder()
    raw_values = df_temporal.loc[:, raw_col].values.reshape((-1, 1))
    if raw_col == 'order_dow':
        indicator_cols = [f"order_dow_{x}" for x in range(n_levels)]
    else:
        indicator_cols = [f"order_hod_{x}" for x in range(n_levels)]
    df_temporal[indicator_cols] = enc.fit_transform(raw_values).toarray().astype(np.int8)

# Drop the raw columns by re-selecting everything else, then preview the result.
kept_cols = [col for col in df_temporal.columns if col not in ['order_dow', 'order_hour_of_day']]
df_temporal = df_temporal[kept_cols]
df_temporal.head()

Unnamed: 0_level_0,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6,order_hod_0,order_hod_1,order_hod_2,...,order_hod_14,order_hod_15,order_hod_16,order_hod_17,order_hod_18,order_hod_19,order_hod_20,order_hod_21,order_hod_22,order_hod_23
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2539329,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2398795,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
473747,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2254736,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
431534,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [5]:
# Re-pack the dense indicator values as a sparse-backed DataFrame, keeping the
# same index and column labels.
temporal_csr = csr_matrix(df_temporal.values)
df_temporal = pd.DataFrame.sparse.from_spmatrix(
    temporal_csr,
    index=df_temporal.index,
    columns=df_temporal.columns,
)

# Attach the indicator columns to the product frame (DataFrame.join aligns on
# the index), then show the new dimensions.
df = df.join(df_temporal)
df.shape

(2922905, 531)

## Helper Functions

In [6]:
def partition_dataset(df,
                      label_col,
                      features=None,
                      test_size=0.2,
                      valid_size=0.2,
                      random_state=None,
                      three_way=False
                     ):
    """Split a sparse-backed DataFrame into stratified train/(valid)/test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. The selected feature columns must be sparse-backed,
        because they are converted through the ``.sparse.to_coo()`` accessor.
    label_col : hashable
        Name of the target column; it is also used for stratification.
    features : list, optional
        Feature column names. Defaults to every column except ``label_col``.
    test_size : float or int
        ``test_size`` passed to ``train_test_split`` for the held-out test set.
    valid_size : float or int
        ``test_size`` passed to the second ``train_test_split`` call, i.e. the
        validation share of the data that remains after the test set has been
        removed. Only used when ``three_way`` is True.
    random_state : int or None
        Seed. In the two-way case it is passed straight to
        ``train_test_split``; in the three-way case it seeds a NumPy generator
        that draws one independent seed per split.
    three_way : bool
        If True return train/valid/test, otherwise train/test.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` when ``three_way`` is False,
        ``(X_train, X_valid, X_test, y_train, y_valid, y_test)`` otherwise.
        The ``X_*`` parts are CSR matrices, the ``y_*`` parts NumPy arrays.
    """
    if features is None:
        features = [col for col in df.columns if col != label_col]
    X = csr_matrix(df[features].sparse.to_coo())
    y = df[label_col].to_numpy()

    if not three_way:
        X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                            test_size=test_size,
                                                            random_state=random_state,
                                                            stratify=y
                                                           )
        return X_train, X_test, y_train, y_test

    # Draw one seed per split so the two splits are reproducible but not
    # driven by the same random state.
    rng = np.random.default_rng(random_state)
    seeds = rng.integers(10000, size=2)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=seeds[0],
                                                        stratify=y
                                                       )
    # The validation split must honour `valid_size`; it was previously sized
    # with `test_size`, which silently ignored the caller's argument.
    X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train,
                                                          test_size=valid_size,
                                                          random_state=seeds[1],
                                                          stratify=y_train
                                                         )
    return X_train, X_valid, X_test, y_train, y_valid, y_test

In [7]:
def classification_scores(true, pred):
    """Return ``[accuracy, precision, recall, f1]`` for the given labels.

    Each metric is called as ``metric(true, pred)`` with its default
    arguments, and the results are returned in that fixed order as a list.
    """
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    return [metric(true, pred) for metric in metric_fns]

# Test Run

## XGB runs

In [8]:
# Run identifier; it is embedded in the saved model and result file names.
version = 1

# Temporal feature switches and the column names they control.
use_dow = False
use_hod = False
col_dow = [f"order_dow_{x}" for x in range(7)]
col_hod = [f"order_hod_{x}" for x in range(24)]

# Base XGBoost parameters shared by every per-product model.
xgb_params = dict(
    learning_rate=0.1,
    colsample_bytree=1,
    subsample=1,
    objective='binary:logistic',
    eval_metric='logloss',
)

# Multiplier applied to the positive-class weight after class balancing.
pos_mult = 1

In [13]:
# LOOP THROUGH THE TOP PRODUCTS
# Train one binary XGBoost model per product column. Every other column of
# `df` is a candidate feature; the dow/hod indicators are included only when
# `use_dow` / `use_hod` are set.

# perf_counter measures wall-clock time. process_time would report CPU time of
# the process, which is not what "elapsed_time" below is meant to show.
tic = time.perf_counter()

valid_scores_all = []
test_scores_all = []  # never filled in this cell; kept in case other cells use it
feature_importance_records = []  # one {feature: gain} dict per trained product

# The product columns are the first 500 columns of df (the temporal indicator
# columns were joined after them). Slicing the column index avoids slicing
# the whole frame just to read labels.
predicted_products = df.columns[:500].tolist()

for i, label in enumerate(predicted_products):

    print(f"training label: {label}...({i}/{len(predicted_products)})")

    # DATA SETUP

    features = [x for x in df.columns if x != label]
    if not use_dow:
        features = [f for f in features if f not in col_dow]
    if not use_hod:
        features = [f for f in features if f not in col_hod]

    # setup train and validation set (two-way stratified split, fixed seed)
    df_xgb = df[features + [label]]
    X_train, X_valid, y_train, y_valid = partition_dataset(df_xgb,
                                                           label,
                                                           three_way=False,
                                                           random_state=0)

    # PARAMS SETUP
    # Balance the classes via negative/positive ratio, scaled by pos_mult.
    # Note: this updates the shared xgb_params dict on every iteration.
    xgb_params['scale_pos_weight'] = len(y_train[y_train == 0])/len(y_train[y_train == 1]) * pos_mult

    # TRAINING

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    watchlist = [(dtrain,'train'), (dvalid,'valid')]

    model = xgb.train(xgb_params,
                      dtrain,
                      num_boost_round=100,
                      evals=watchlist,
                      early_stopping_rounds=20,
                      verbose_eval=100)

    # save model
    model.save_model(f"../models/xgb_{label}_v{version}_fi.model")

    # feature importance
    # The booster's scores are looked up as "f<column position>"; features
    # missing from the returned dict are recorded as NaN. A separate index
    # name is used so the outer loop counter `i` is not overwritten.
    fi_raw = model.get_score(importance_type='gain')
    feature_importances = {col: fi_raw.get(f"f{col_idx}", np.nan)
                           for col_idx, col in enumerate(features)}
    feature_importance_records.append(feature_importances)

    # prediction (threshold the predicted probability at 0.5)
    pred_valid = (model.predict(dvalid) > 0.5).astype(int)
    valid_scores_all.append(classification_scores(y_valid, pred_valid))


# feature importances: build the frame once (DataFrame.append in a loop is
# quadratic and no longer exists in pandas 2.x), one row per predicted product.
feature_importances_df = pd.DataFrame(feature_importance_records)
# Name the index through pd.Index; the second positional argument of
# set_index is `drop`, not a name.
feature_importances_df = feature_importances_df.set_index(
    pd.Index(predicted_products, name='predicted_product')
)
feature_importances_df.to_csv(f"../results/xgb_feature_importances_v{version}_top500.csv")

# final processing of scores
df_scores = pd.DataFrame(valid_scores_all,
                         columns=['valid_accuracy', 'valid_precision', 'valid_recall', 'valid_f1'],
                         index=predicted_products
                        )
df_scores.to_csv(f"../results/xgb_product_v{version}_top500.csv")

toc = time.perf_counter()
print(f"elapsed_time: {toc - tic}")

training label: 24852...(0/100)
[0]	train-logloss:0.68047	valid-logloss:0.68093
[99]	train-logloss:0.58845	valid-logloss:0.58976
training label: 13176...(1/100)
[0]	train-logloss:0.67849	valid-logloss:0.67808
[99]	train-logloss:0.57423	valid-logloss:0.57570
training label: 21137...(2/100)
[0]	train-logloss:0.68364	valid-logloss:0.68359
[99]	train-logloss:0.58883	valid-logloss:0.59043
training label: 21903...(3/100)
[0]	train-logloss:0.68620	valid-logloss:0.68627
[99]	train-logloss:0.59857	valid-logloss:0.59911
training label: 47209...(4/100)
[0]	train-logloss:0.68068	valid-logloss:0.68037
[99]	train-logloss:0.55320	valid-logloss:0.55383
training label: 47766...(5/100)
[0]	train-logloss:0.68047	valid-logloss:0.68109
[99]	train-logloss:0.56767	valid-logloss:0.56841
training label: 47626...(6/100)
[0]	train-logloss:0.68021	valid-logloss:0.68155
[99]	train-logloss:0.57120	valid-logloss:0.57102
training label: 16797...(7/100)
[0]	train-logloss:0.68187	valid-logloss:0.68222
[99]	train-loglos

[0]	train-logloss:0.68530	valid-logloss:0.68484
[99]	train-logloss:0.57457	valid-logloss:0.57430
training label: 35221...(64/100)
[0]	train-logloss:0.67042	valid-logloss:0.67098
[99]	train-logloss:0.50432	valid-logloss:0.50460
training label: 28842...(65/100)
[0]	train-logloss:0.67245	valid-logloss:0.67219
[99]	train-logloss:0.48136	valid-logloss:0.48197
training label: 33731...(66/100)
[0]	train-logloss:0.68424	valid-logloss:0.68553
[99]	train-logloss:0.56760	valid-logloss:0.56835
training label: 8424...(67/100)
[0]	train-logloss:0.68070	valid-logloss:0.68113
[99]	train-logloss:0.54895	valid-logloss:0.54961
training label: 27521...(68/100)
[0]	train-logloss:0.68024	valid-logloss:0.68161
[99]	train-logloss:0.53845	valid-logloss:0.53968
training label: 33198...(69/100)
[0]	train-logloss:0.69048	valid-logloss:0.69060
[99]	train-logloss:0.63987	valid-logloss:0.64017
training label: 8174...(70/100)
[0]	train-logloss:0.68018	valid-logloss:0.68091
[99]	train-logloss:0.55889	valid-logloss:0.5

  feature_importances_df = feature_importances_df.set_index(pd.Index(predicted_products), 'predicted_product')
