# Model

In [1]:
import pandas as pd
from constants import *
from helpers import *
import numpy as np
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
# Load the pickled train and test feature frames in one step. TRAIN and TEST
# are path constants (presumably supplied by the star-import from `constants`
# -- confirm).
train, test = map(pd.read_pickle, (TRAIN, TEST))

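Create a 70/30 train/validation split of the training data. The split is made by user, so all rows for a given user fall on the same side. Then move a 20% sample of the remaining training users into a small training set to speed up testing: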

In [3]:
# Sample 30% of the distinct users (not rows) for validation, so that every
# row belonging to a sampled user ends up in the validation set.
val_set = (train[["user_id"]]
           .drop_duplicates()
           .assign(val_flag = True)
           .sample(frac = .3, random_state = 152)
           .set_index("user_id"))
# Left-join the flag back onto every row; rows of unsampled users get NaN,
# which notnull() turns into False.
val_bool = train[["user_id"]].join(val_set, on = "user_id")[["val_flag"]].notnull()
# Assign through .loc on the frame itself. The chained form
# `train.eval_set[mask] = ...` writes into an intermediate Series, which
# pandas does not guarantee to propagate back to `train` (and which never
# propagates under copy-on-write).
train.loc[val_bool.val_flag.values, "eval_set"] = "val"

In [4]:
# From the users still labelled "train", sample 20% into a small training set
# used for quick experiments.
train_set_small = (train[train.eval_set == "train"][["user_id"]]
                   .drop_duplicates()
                   .assign(small_flag = True)
                   .sample(frac = .2, random_state = 152)
                   .set_index("user_id"))
# Left-join the flag back onto every row; rows of unsampled users get NaN,
# which notnull() turns into False.
small_bool = train[["user_id"]].join(train_set_small, on = "user_id")[["small_flag"]].notnull()
# Assign through .loc on the frame itself. The chained form
# `train.eval_set[mask] = ...` writes into an intermediate Series, which
# pandas does not guarantee to propagate back to `train` (and which never
# propagates under copy-on-write).
train.loc[small_bool.small_flag.values, "eval_set"] = "train_small"

In [5]:
# Sanity check: number of distinct users assigned to each eval_set label.
train.groupby("eval_set")["user_id"].nunique().to_frame()

Unnamed: 0_level_0,user_id
eval_set,Unnamed: 1_level_1
train,73477
train_small,18369
val,39363


### Model Training

In [3]:
def buildDMatrix(X, y = None):
    """Wrap a feature frame, and optionally its labels, in an ``xgb.DMatrix``.

    The columns "user_id", "order_id", "product_id", "eval_set" and "ordered"
    are removed when present; every remaining column is passed to XGBoost as
    a feature under its column name.

    Parameters
    ----------
    X : DataFrame holding the feature columns (plus any of the columns above).
    y : optional object exposing ``.values``; used as the label vector when
        given. When omitted the DMatrix is built without labels.

    Returns
    -------
    xgb.DMatrix
    """
    non_feature_cols = ["user_id", "order_id", "product_id", "eval_set", "ordered"]
    # Only drop what is actually there, so frames lacking some of these
    # columns are accepted as well.
    present = [col for col in non_feature_cols if col in X.columns]
    features = X.drop(present, axis = 1)

    dmatrix_kwargs = {"feature_names": features.columns.values}
    if y is not None:
        dmatrix_kwargs["label"] = y.values
    return xgb.DMatrix(features.values, **dmatrix_kwargs)

In [4]:
def trainModel(X, y, X_test, y_test = None, model_params = {}, num_boost_round = 80, ev = True):
    """Fit an XGBoost model and optionally score it on train and test data.

    Parameters
    ----------
    X, y : training frame and labels, forwarded to ``buildDMatrix``.
    X_test : test frame, or None to skip test prediction entirely.
    y_test : test labels, or None when they are unknown.
    model_params : parameter dict handed to ``xgb.train``; this function only
        reads it.
    num_boost_round : number of boosting rounds handed to ``xgb.train``.
    ev : when True, call ``evaluate`` on the training predictions and, if
        test labels are available, on the test predictions.

    Returns
    -------
    dict with keys
        "model"            -- the trained booster
        "train_f1_scores"  -- result of ``evaluate`` on train, or None
        "test_f1_scores"   -- result of ``evaluate`` on test, or None
        "test_predictions" -- predictions for ``X_test``, or None when
                              ``X_test`` is None

    NOTE(review): ``evaluate`` is defined outside this block (presumably in
    ``helpers``); judging by the result keys it returns F1 scores -- confirm.
    """
    has_test = X_test is not None

    # Build DMatrices
    print("Building dmatrices...")
    train_dm = buildDMatrix(X, y)
    test_dm = buildDMatrix(X_test, y_test) if has_test else None

    # Fit model
    print("Fitting model...")
    model = xgb.train(model_params, train_dm, num_boost_round = num_boost_round)

    # Get training predictions
    print("Getting training predictions...")
    train_predictions = model.predict(train_dm)

    # Get test predictions. Bound to None up front so the return statement
    # below is valid when no test frame was supplied (previously a NameError).
    test_predictions = None
    if has_test:
        print("Getting test predictions...")
        test_predictions = model.predict(test_dm)

    train_scores = None
    test_scores = None
    if ev:
        # Evaluate on train
        print("Evaluating on training set...")
        train_scores = evaluate(X, y, train_predictions)

        # Evaluate on test only when there is a test frame AND its labels
        # are known.
        if has_test and y_test is not None:
            print("Evaluating on test set...")
            test_scores = evaluate(X_test, y_test, test_predictions)

    print("Done.")
    return {
        "model": model,
        "train_f1_scores": train_scores,
        "test_f1_scores": test_scores,
        "test_predictions": test_predictions
    }

In [5]:
# XGBoost parameters passed to xgb.train through trainModel's `model_params`.
# Short descriptions follow the XGBoost parameter documentation.
params = {
  "objective": "reg:logistic",   # logistic objective; predictions are probabilities
  # NOTE(review): trainModel calls xgb.train without an `evals` list, so this
  # metric does not appear to be reported anywhere in this notebook -- confirm.
  "eval_metric": "logloss",
  "eta": 0.03,                   # step-size shrinkage (learning rate)
  "max_depth": 6,                # maximum tree depth
  "min_child_weight": 10,        # minimum sum of instance weight needed in a child
  "gamma": 0.70,                 # minimum loss reduction required to make a split
  "subsample": 0.76,             # fraction of rows sampled per tree
  "colsample_bytree": 0.95,      # fraction of columns sampled per tree
  "alpha": 3e-05,                # L1 regularisation on weights
  "lambda": 10                   # L2 regularisation on weights
}

Local Validation:

In [9]:
# X_train = train[train.eval_set == "train_small"]
# y_train = train.ordered[train.eval_set == "train_small"]
# X_test = train[train.eval_set == "val"]
# y_test = train.ordered[train.eval_set == "val"]
# model_train = trainModel(
#     X_train, y_train, X_test, y_test, params, 70)

In [10]:
# pd.DataFrame({
#     "Threshold": model_train["train_f1_scores"].keys(),
#     "Training F1": model_train["train_f1_scores"].values(),
#     "Test F1": model_train["test_f1_scores"].values()
# }).set_index("Threshold").sort_index()

In [11]:
# test_scores = evaluate(
#     X_test, y_test, model_train["test_predictions"])

In [12]:
# xgb.plot_importance(model_train["model"])

In [13]:
# train_preds = model_train["model"].predict(buildDMatrix(X_train))
# X_train.assign(training_preds = train_preds).to_pickle(THRESHOLD_TRAINING)

Build Submission:

In [6]:
# Submission run: fit on the whole `train` frame and predict the `test` frame.
# No test labels are supplied, so y_test is None.
X_train, y_train = train, train.ordered
X_test, y_test = test, None

In [15]:
# Train the submission model: 250 boosting rounds, with evaluation switched
# off because y_test is None for the submission run.
model_train = trainModel(
    X_train,
    y_train,
    X_test,
    y_test = y_test,
    model_params = params,
    num_boost_round = 250,
    ev = False)

Building dmatrices...
Fitting model...
Getting training predictions...
Getting test predictions...
Done.


In [16]:
# Persist the fitted booster (the "model" entry of trainModel's result) so it
# can be reloaded from disk later without retraining.
model_train["model"].save_model("./data/full250.model")

In [7]:
# Reload the booster from the file written by save_model. From here on the
# notebook uses `model` (loaded from disk) rather than `model_train`.
model = xgb.Booster()
model.load_model("./data/full250.model")

In [8]:
# Preview the first rows of the test frame that is scored below.
X_test.head()

Unnamed: 0,user_id,order_id,product_id,eval_set,u_days_since_prior_order_mean,u_days_since_prior_order_median,u_total_orders_max,u_prev_order_dow,u_prev_days_since_prior_order,u_prev_order_size,...,o_same_dow,u_products_ratio,up_days_between_orders_diff_last,up_days_between_orders_diff_mean,up_days_between_orders_diff_median,up_order_number_gap,up_orders_since_last_order,up_reorder_rate,up_reorder_rate_since_first,ordered
766,15,2161313,196,test,10.809524,8.0,22,1,14,2,...,True,0.180556,-7.0,-3.5,-3.5,7,0,0.227273,0.571429,0
959,19,1735923,196,test,9.5,7.5,9,5,8,21,...,False,0.651961,,,,5,2,0.333333,0.285714,0
1792,31,280888,196,test,4.894737,6.0,20,0,0,5,...,False,0.635452,,,,7,3,0.1,0.1,0
5750,98,139487,196,test,20.307692,24.0,14,5,8,2,...,False,0.54,22.0,7.333333,0.0,11,0,0.5,0.545455,0
18638,290,3116687,196,test,7.14,7.0,51,6,7,19,...,True,0.344654,,,,19,23,0.039216,0.02381,0


In [9]:
# Score the test frame with the reloaded booster; buildDMatrix is called
# without labels here.
test_preds = model.predict(buildDMatrix(X_test))

Without Dynamic Threshold:

In [17]:
# pred_str = binaryPredictionToString(X_test, test_preds, .191)
# pred_str.to_csv("./data/new_features_tuning.csv")

With Dynamic Threshold:

In [18]:
# test_threshold = pd.read_pickle(TEST_THRESHOLD)

In [19]:
# test_full = (X_test
#              .assign(pred = test_preds)
#              .set_index("user_id")
#              .join(test_threshold))

In [20]:
# pred_str = binaryPredictionToString(test_full, test_full.pred.values, 
#                                     thr = (test_full.test_preds.values + 3*.19)/4, #shrinkage
#                                     dynamic = True)
# pred_str.to_csv("./data/new_features2_tuning_w_threshold_shrunk3.csv")

With Basket Size Prediction:

In [10]:
# Load the basket-size predictions. BASKET_SIZE is a path constant (presumably
# supplied by the star-import from `constants` -- confirm).
# NOTE(review): assumed to be indexed by user_id, since it is joined onto a
# user_id index in the next cell -- confirm.
size_pred = pd.read_pickle(BASKET_SIZE)

In [11]:
# Attach the model score to each test row as `pred`, re-index by user_id, and
# join the columns of size_pred on that index (DataFrame.join defaults to a
# left join, so every test row is kept).
test_full = (X_test
             .assign(pred = test_preds)
             .set_index("user_id")
             .join(size_pred))

In [17]:
# Turn the scores into the submission table and write it to CSV.
# binaryPredictionToString is defined outside this notebook (presumably in
# `helpers`); with basket = True it is given the joined
# `reorder_size_prediction` column as `thr`.
# NOTE(review): the commented-out dynamic-threshold variant passes `.values`
# for `thr`, whereas a Series is passed here -- confirm the helper handles both.
pred_str = binaryPredictionToString(test_full, test_full.pred.values, 
                                    thr = test_full.reorder_size_prediction,
                                    basket = True)
pred_str.to_csv("./data/new_features2_tuning_w_basket_size.csv")