In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Quick-run switch: when True, a later cell replaces `df` with the 10% side of
# a train_test_split (random_state=0) — presumably to speed up experimentation.
test = False

In [3]:
# Read the cleaned dataset from disk and preview its first rows
dataset_path = '../data/dataset_clean.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,order_id,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,category_9,...,category_13_count,category_14_count,category_15_count,category_16_count,category_17_count,category_18_count,category_19_count,category_20_count,category_21_count,order_dow
0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,4
1,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5
2,3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,5
3,4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,1
4,5,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,2.0,0.0,0.0,3.0,3.0,0.0,4.0,1.0,0.0,6


In [4]:
if test:
    # Keep only the 10% side of the split as a small working sample.
    # `stratify` must be array-like (the column's values), not a column name,
    # so pass the order_dow column itself to preserve the day-of-week mix.
    _, df = train_test_split(df,
                             test_size=0.1,
                             random_state=0,
                             stratify=df['order_dow'])

In [5]:
# One-hot encode the day of week. Column names are built from the categories
# the encoder actually found, so each name always matches its encoded column
# (and the number of names always matches the number of encoded columns).
enc = OneHotEncoder()
dow_encoded = enc.fit_transform(df[['order_dow']].to_numpy()).toarray()
dow_columns = [f"order_dow_{int(day)}" for day in enc.categories_[0]]
df[dow_columns] = dow_encoded
df.head()

Unnamed: 0,order_id,category_1,category_2,category_3,category_4,category_5,category_6,category_7,category_8,category_9,...,category_20_count,category_21_count,order_dow,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6
0,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,5,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,4,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,...,1.0,0.0,6,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [6]:
def partition_dataset(df,
                      label_col,
                      features=None,
                      partition_on=None,
                      test_size=0.2,
                      random_state=None
                     ):
    """Split a DataFrame into train/test features and integer labels.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the label column.
    label_col : name of the column used as the target.
    features : columns to use as inputs; when None, every column except
        ``label_col`` is used.
    partition_on : column selector (as accepted by ``df[...]``) whose values
        are handed to ``train_test_split`` as ``stratify``; None disables
        stratification.
    test_size, random_state : forwarded unchanged to ``train_test_split``.

    Returns
    -------
    (X_train, X_test, y_train, y_test) where the ``y`` parts are integer
    column vectors of shape (n, 1).
    """
    feature_cols = features
    if feature_cols is None:
        feature_cols = [name for name in df.columns if name != label_col]

    inputs = df[feature_cols]
    # Labels are cast to int element-wise and shaped as a column vector
    targets = df[label_col].apply(int).values.reshape((-1, 1))
    strata = None if partition_on is None else df[partition_on]

    train_X, test_X, train_y, test_y = train_test_split(
        inputs,
        targets,
        test_size=test_size,
        random_state=random_state,
        stratify=strata,
    )
    return train_X, test_X, train_y, test_y

In [7]:
# cross-validated XGBoost training; CV folds are stratified by label
def train_xgb_cv_classifier(X, y,
                            test_size=0.2,
                            n_folds=5,
                            xgb_params=None,
                            n_estimators=100,
                            random_state=None
                           ):
    """Train an XGBoost classifier with stratified K-fold CV and return mean F1.

    Parameters
    ----------
    X : DataFrame of features (rows are selected positionally with ``.iloc``).
    y : binary labels (0/1); a 1-D array, an (n, 1) column vector or a Series.
    test_size : unused; kept so existing callers keep working.
    n_folds : number of stratified folds.
    xgb_params : optional dict of extra ``XGBClassifier`` keyword arguments.
        Values given here take precedence over the defaults built below
        (``n_estimators``, ``use_label_encoder``, ``scale_pos_weight``).
    n_estimators : number of boosting rounds.
    random_state : seed for the fold shuffling.

    Returns
    -------
    Mean of the per-fold F1 scores computed on the validation folds.
    """
    # Flatten to a 1-D array so fold indexing is positional whatever the
    # caller passed (column vector, Series with arbitrary index, list).
    y = np.asarray(y).ravel()
    # Copy so the caller's dict is never shared or mutated
    extra_params = dict(xgb_params) if xgb_params else {}

    scores = []
    # CV fold
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for train_index, valid_index in skf.split(X, y):
        X_train, X_valid = X.iloc[train_index, :], X.iloc[valid_index, :]
        y_train, y_valid = y[train_index], y[valid_index]

        # Class-imbalance weight = negatives / positives in the training fold;
        # fall back to 1.0 (no reweighting) when the fold has no positives.
        n_pos = int((y_train == 1).sum())
        n_neg = int((y_train == 0).sum())
        model_params = {
            'n_estimators': n_estimators,
            'use_label_encoder': False,
            'scale_pos_weight': n_neg / n_pos if n_pos else 1.0,
        }
        # Caller-supplied values override the defaults instead of raising
        # "got multiple values for keyword argument".
        model_params.update(extra_params)

        # xgb training
        model = XGBClassifier(**model_params)
        model.fit(X_train, y_train)

        # prediction on validation set
        pred = model.predict(X_valid)
        scores.append(f1_score(y_valid, pred))

    return np.array(scores).mean()

In [11]:
# TEST with indicator variables
labels = [f"category_{x}" for x in [4, 16, 7, 19, 1]]
col_dow = [f"order_dow_{x}" for x in range(7)]
scores_all_labels = []
for label in labels:
    print(label)
    # Fixed seeds make the hold-out split and the CV folds (and therefore
    # the reported scores) reproducible from one run to the next.
    X_train, X_test, y_train, y_test = partition_dataset(df, label, random_state=0)
    # Inputs: presence indicators of every other category plus day-of-week dummies
    feature_cols = [f"category_{x}" for x in range(1, 22) if f"category_{x}" != label] + col_dow
    scores_all_labels.append(
        train_xgb_cv_classifier(X_train[feature_cols], y_train,
                                n_estimators=10,
                                xgb_params={'eval_metric': 'logloss'},
                                random_state=0
                               )
    )

category_4
category_16
category_7
category_19
category_1


In [12]:
# Mean CV F1 per label; the index is the category number parsed from `labels`
# so it cannot drift out of sync with the list the scores were computed for.
pd.Series(scores_all_labels, index=[int(name.rsplit("_", 1)[-1]) for name in labels])

4     0.726672
16    0.713484
7     0.559684
19    0.576549
1     0.555610
dtype: float64

In [17]:
# TEST with counts
labels = [f"category_{x}" for x in [4, 16, 7, 19, 1]]
col_dow = [f"order_dow_{x}" for x in range(7)]
scores_all_labels_count = []
for label in labels:
    print(label)
    # Fixed seeds make the hold-out split and the CV folds (and therefore
    # the reported scores) reproducible from one run to the next.
    X_train, X_test, y_train, y_test = partition_dataset(df, label, random_state=0)
    # Inputs: count columns of every other category (the label's own count
    # column is excluded) plus day-of-week dummies
    feature_cols = [f"category_{x}_count" for x in range(1, 22) if f"category_{x}" != label] + col_dow
    scores_all_labels_count.append(
        train_xgb_cv_classifier(X_train[feature_cols], y_train,
                                n_estimators=10,
                                xgb_params={'eval_metric': 'logloss'},
                                random_state=0
                               )
    )

category_4
category_16
category_7
category_19
category_1


In [18]:
# Mean CV F1 per label; the index is the category number parsed from `labels`
# so it cannot drift out of sync with the list the scores were computed for.
pd.Series(scores_all_labels_count, index=[int(name.rsplit("_", 1)[-1]) for name in labels])

4     0.720810
16    0.715178
7     0.559195
19    0.581903
1     0.555199
dtype: float64