# Threshold Model

In [33]:
import pandas as pd
import numpy as np
import xgboost as xgb
from constants import *
from helpers import evaluate
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load both pickled frames in one pass; the path constants are not defined in
# this notebook (presumably they come from the star-import of `constants`).
train, test = (pd.read_pickle(path) for path in (THRESHOLD_TRAINING, TEST))

### Getting the Training Labels

For each user in the training set, I want to find the threshold that would have given the best F1. If a user reordered no items, I'll set the threshold just above the max predicted probability (max + 0.000001). If a user reordered all items, I'll set it to the min predicted probability.

In [3]:
def getBestThreshold(s):
    '''
    Function to be passed to agg to find the best F1 threshold.

    Parameters
    ----------
    s : iterable of (ordered, probability) pairs
        One pair per candidate item, where `ordered` is the 0/1 (or bool)
        label and `probability` is the predicted probability. The pairs must
        already be sorted by descending probability. Any iterable works: a
        pandas Series of tuples (what groupby().agg passes in), a list, or a
        tuple.

    Returns
    -------
    One of:
      * the minimum probability, when every item was reordered;
      * the maximum probability + 0.000001, when nothing was reordered;
      * otherwise the probability of the k-th pair, where k is the cut
        position whose top-k prefix has the highest F1 (first such k on ties).

    Raises
    ------
    ValueError
        If `s` contains no pairs.
    '''
    pairs = list(s)
    n_items = len(pairs)
    if n_items == 0:
        raise ValueError("getBestThreshold() requires at least one (ordered, probability) pair")

    # Cast labels to float so the division below is a true division even for
    # integer/bool labels (and under Python 2).
    ordered = np.array([e[0] for e in pairs], dtype=float)
    training_prob = [e[1] for e in pairs]

    p = ordered.sum()  # total number of positives

    if p == n_items:  # case where everything was reordered
        return min(training_prob)
    if p == 0:  # case where nothing was reordered
        return max(training_prob) + .000001

    # True positives when the top-k pairs are kept, for k = 1..n_items.
    tp = ordered.cumsum()
    k = np.arange(1, n_items + 1)

    # F1 = 2*TP / (2*TP + FP + FN) with FP = k - TP and FN = p - TP, which
    # simplifies to 2*TP / (k + p). The denominator is always > 0 here, so no
    # NaN can appear (the precision/recall form yields 0/0 whenever TP == 0).
    f1 = 2 * tp / (k + p)

    return training_prob[int(f1.argmax())]

In [4]:
# Per-user best threshold: pair every row's label with its predicted
# probability, order each user's pairs from highest to lowest probability
# (getBestThreshold assumes that order), then reduce each user to one value.
best_threshold = (train
                  # list() materialises the pairs: under Python 3 a bare zip is a
                  # lazy iterator without len(), which cannot be assigned as a column.
                  .assign(ordered_threshold = list(zip(train.ordered, train.training_preds)))
                  .sort_values(by = ["user_id", "training_preds"], ascending = False)
                  # drop=True: the old index is not needed, and re-inserting it as a
                  # column could collide with an existing column name.
                  .reset_index(drop = True)
                  .groupby("user_id")
                  .agg({
                      "ordered_threshold": getBestThreshold
                  }))

In [5]:
# Preview the first rows of the per-user best thresholds.
best_threshold.head()

Unnamed: 0_level_0,ordered_threshold
user_id,Unnamed: 1_level_1
1,0.17801
2,0.129763
5,0.144712
7,0.221792
8,0.326707


In [9]:
# Summary statistics of the per-user best thresholds.
best_threshold.describe()

Unnamed: 0,ordered_threshold
count,131209.0
mean,0.265351
std,0.175073
min,0.003286
25%,0.13277
50%,0.221749
75%,0.36239
max,0.919504


In [6]:
# Index the training rows by user_id and join best_threshold onto that index,
# so every row carries the ordered_threshold of its user.
threshold_train = train.set_index("user_id").join(best_threshold)

Using a perfect threshold for each user on the training predictions, the max possible training F1 for my predictions is as follows:

In [10]:
# Score the training predictions using each row's per-user threshold.
evaluate(
    threshold_train,
    threshold_train["ordered"].values,
    threshold_train["training_preds"].values,
    thr=threshold_train["ordered_threshold"].values,
    dynamic=True,
)

{'dynamic': 0.58749444821651953}

### Model

In [15]:
# Columns used as model inputs for the threshold regressor.
user_feature_list = [
    "u_days_since_prior_order_mean",
    "u_days_since_prior_order_median",
    "u_total_orders_max",
    "u_basket_size",
    "u_reorder_rate_after_first_order",
    "u_products_nunique",
    "u_products_count",
    "u_total_reorders_sum",
    "o_days_since_prior_order",
    "u_products_ratio",
]

In [74]:
# Build one feature row per user, indexed by user_id.
# De-duplicate on user_id *before* moving it into the index:
# DataFrame.drop_duplicates() ignores the index, so calling it after
# set_index() would collapse distinct users whose feature values happen to be
# identical, dropping them from training and from the saved test predictions.
# NOTE(review): assumes the listed features are constant within a user, so the
# first row per user is representative — confirm against the feature pipeline.
X_train = train.drop_duplicates(subset = "user_id").set_index("user_id")[user_feature_list]
# Regression target: each user's best threshold, aligned to X_train by user_id.
y_train = X_train.join(best_threshold)["ordered_threshold"]
X_test = test.drop_duplicates(subset = "user_id").set_index("user_id")[user_feature_list]

In [75]:
# Wrap the feature matrices for xgboost; only the training matrix carries labels.
train_dm = xgb.DMatrix(
    X_train.values,
    label=y_train.values,
    feature_names=X_train.columns,
)
test_dm = xgb.DMatrix(
    X_test.values,
    feature_names=X_test.columns,
)

In [68]:
# Booster settings passed to xgb.train.
params = {
    "objective": "reg:linear",  # regression on the per-user threshold
    "eval_metric": "rmse",
    "eta": 0.07,                # learning rate
    "max_depth": 6,
    "alpha": 2e-5,              # L1 regularisation
    "lambda": 2,                # L2 regularisation
}

In [72]:
# cv = xgb.cv(params, train_dm, 85)
# cv.tail(10)

In [73]:
# Fit the threshold regressor for 85 boosting rounds.
model = xgb.train(params, train_dm, num_boost_round=85)

In [76]:
# Predict a threshold for every row of the test feature matrix.
test_preds = model.predict(test_dm)

In [83]:
# Persist only the predicted thresholds, keeping X_test's index as the key.
(X_test
 .assign(test_preds=test_preds)
 .loc[:, ["test_preds"]]
 .to_pickle(TEST_THRESHOLD))