In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Development switch: when True, the loading cell reads the small sample CSV
# ('dataset_clean_small.csv') instead of the full 'dataset_clean.csv'.
test = True

In [3]:
# load dataset: the `test` flag selects the reduced CSV, otherwise the full one is read
dataset_path = '../data/dataset_clean_small.csv' if test else '../data/dataset_clean.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,order_id,1,2,3,4,5,6,7,8,9,...,18,19,20,21,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,634660,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,122910,prior,17,1,13,7.0
1,3321607,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,109779,prior,5,5,15,30.0
2,28237,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,190896,prior,1,0,15,
3,145898,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,18948,prior,7,1,8,4.0
4,1301473,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,171826,prior,2,1,10,30.0


In [4]:
# dow one hot
# The categories are pinned to the seven weekday codes 0..6 so the encoder always
# emits exactly 7 columns. Letting it infer them from the data would produce fewer
# columns (and make the assignment below fail) whenever a sample misses a weekday.
# A value outside 0..6 still raises, because handle_unknown defaults to 'error'.
enc = OneHotEncoder(categories=[list(range(7))])
df[[f"order_dow_{x}" for x in range(7)]] = enc.fit_transform(df[['order_dow']].to_numpy()).toarray()
df.head()

Unnamed: 0,order_id,1,2,3,4,5,6,7,8,9,...,order_dow,order_hour_of_day,days_since_prior_order,order_dow_0,order_dow_1,order_dow_2,order_dow_3,order_dow_4,order_dow_5,order_dow_6
0,634660,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1,13,7.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,3321607,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,5,15,30.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,28237,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0,15,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,145898,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,1,8,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1301473,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1,10,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def partition_dataset(df,
                      label_col,
                      features=None,
                      partition_on=None,
                      test_size=0.2,
                      random_state=None
                     ):
    """Split a frame into train/test features and integer labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table holding both the features and the label column.
    label_col : str
        Name of the column used as the target; its values are cast with ``int``.
    features : list of str, optional
        Columns used as features. When omitted, every column except
        ``label_col`` is used.
    partition_on : column name or list of column names, optional
        Column(s) of ``df`` handed to ``train_test_split`` as ``stratify``.
        When omitted, no stratification is requested.
    test_size, random_state
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the label arrays are column
        vectors of shape ``(n, 1)``.
    """
    feature_names = features
    if feature_names is None:
        feature_names = [name for name in df.columns if name != label_col]

    inputs = df[feature_names]
    # labels are kept as a single-column 2-D array
    targets = df[label_col].apply(int).values.reshape((-1, 1))
    strata = None if partition_on is None else df[partition_on]

    return tuple(
        train_test_split(inputs, targets,
                         test_size=test_size,
                         random_state=random_state,
                         stratify=strata)
    )

In [21]:
# cross-validated XGBoost training (stratified K-fold on the label)
def train_xgb_cv_classifier(X, y,
                            test_size=0.2,
                            n_folds=5,
                            xgb_params=None,
                            n_estimators=100,
                            random_state=None
                           ):
    """Return the mean validation F1 of an XGBClassifier over stratified K folds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; fold rows are selected positionally with ``.iloc``.
    y : array-like
        Labels, either 1-D or a single column of shape ``(n, 1)``. A pandas
        Series is converted to an array first so that fold indices are always
        positional.
    test_size : float
        Unused; kept only so existing calls keep working.
    n_folds : int
        Number of stratified folds.
    xgb_params : dict, optional
        Extra keyword arguments for ``XGBClassifier``. ``None`` means none.
        The dict is copied, never modified.
    n_estimators : int
        Number of boosting rounds per fold.
    random_state : int, optional
        Seeds the fold shuffling and, unless ``xgb_params`` already carries its
        own ``random_state``, the model as well.

    Returns
    -------
    float
        Mean of the per-fold ``f1_score`` values.
    """
    # Copy so neither the caller's dict nor a shared default is ever mutated.
    model_kwargs = dict(xgb_params) if xgb_params else {}
    # An explicit random_state inside xgb_params wins over the function argument.
    model_kwargs.setdefault('random_state', random_state)

    # Plain ndarray -> positional indexing; flatten a single-column 2-D target.
    targets = np.asarray(y)
    if targets.ndim == 2 and targets.shape[1] == 1:
        targets = targets.ravel()

    scores = []
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    for train_index, valid_index in skf.split(X, targets):
        X_train, X_valid = X.iloc[train_index, :], X.iloc[valid_index, :]
        y_train, y_valid = targets[train_index], targets[valid_index]

        # NOTE: class imbalance is not corrected here; a `scale_pos_weight`
        # can be supplied through `xgb_params` if needed.
        model = XGBClassifier(n_estimators=n_estimators,
                              **model_kwargs,
                              use_label_encoder=False
                             )
        model.fit(X_train, y_train)

        # score the fold on its validation part
        pred = model.predict(X_valid)
        scores.append(f1_score(y_valid, pred))

    return np.array(scores).mean()

In [22]:
# TEST: mean cross-validated F1 for a handful of label columns
SEED = 42  # fixed seed so the hold-out split and the CV folds can be reproduced
labels = [str(x) for x in [4, 16, 7, 19, 1]]
col_dow = [f"order_dow_{x}" for x in range(7)]
numbered_cols = [str(x) for x in range(1, 22)]  # the columns named '1'..'21'
scores_all_labels = []
for label in labels:
    # features: every other numbered column plus the one-hot weekday columns
    feature_cols = [col for col in numbered_cols if col != label] + col_dow
    # NOTE: X_test / y_test are held out here but not evaluated in this cell
    X_train, X_test, y_train, y_test = partition_dataset(df, label, random_state=SEED)
    scores_all_labels.append(
        train_xgb_cv_classifier(X_train[feature_cols], y_train,
                                n_estimators=10,
                                xgb_params={'eval_metric': 'logloss'},
                                random_state=SEED
                               )
    )

In [23]:
# Index by the evaluated labels themselves so scores and labels cannot drift apart
pd.Series(scores_all_labels, index=[int(label) for label in labels])

4     0.863243
16    0.810095
7     0.483970
19    0.508828
1     0.415020
dtype: float64