### 1. Imports
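* **read_pickle**: project helper (imported from `code.lw_pickle`) that loads an object from the given pickle file; used below to load the two fitted models.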

In [267]:
# Janky xgboost fix
# Workaround: set KMP_DUPLICATE_LIB_OK in the process environment before
# xgboost is imported in the next cell.
# NOTE(review): presumably this silences the "duplicate OpenMP runtime" abort
# seen when several libraries each bundle their own OpenMP runtime -- confirm
# it is still needed in the current environment.
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [268]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb

from code.lw_pickle import read_pickle

In [269]:
# Test rows: read the CSV and discard its leading 'Unnamed: 0' column.
df = pd.read_csv('ec2/test_df.csv').drop('Unnamed: 0', axis=1)

# Previously saved models, loaded through the project's pickle helper.
# NOTE(review): read_pickle presumably wraps pickle.load; unpickling can run
# arbitrary code, so only point this at files you produced yourself.
lr = read_pickle('ec2/model_lr.pk')
gbm = read_pickle('ec2/model_gbm.pk')



In [270]:
# Sanity check: (rows, columns) of the loaded test frame.
df.shape

(856961, 21)

In [271]:
def adjust_threshold(proba, threshold=.5):
    """Turn two-column class probabilities into 0/1 predictions.

    Parameters
    ----------
    proba : 2-D array
        Indexed as ``proba[:, 1]``; column 1 is compared against the
        threshold (the notebook passes ``predict_proba`` output here).
    threshold : float, default .5
        A row is labelled 1 when its column-1 value is >= this cut-off.

    Returns
    -------
    1-D integer array with one 0/1 label per input row.
    """
    positive_scores = proba[:, 1]
    meets_cutoff = positive_scores >= threshold
    return meets_cutoff.astype(int)

In [272]:
# Column roles shared by every scoring cell below.
group_col = 'user_id'  # key identifying which user a row belongs to
y_col = 'in_cart'      # ground-truth label handed to the metric functions

# Model input columns. Keep this order: the XGBoost cell passes
# df[x_cols].values, so features are matched by position, not by name.
x_cols = [
    'percent_in_user_orders',
    'percent_in_all_orders',
    'in_last_cart',
    'in_last_five',
    'total_user_orders',
    'mean_orders_between',
    'mean_days_between',
    'orders_since_newest',
    'days_since_newest',
    'product_reorder_proba',
    'user_reorder_proba',
    'mean_cart_size',
    'mean_cart_percentile',
    'mean_hour_of_week',
    'newest_cart_size',
    'newest_hour_of_week',
    'cart_size_difference',
    'hour_of_week_difference',
]

## Total F1 Score

### Logistic Regression

In [273]:
# Score every test row with the logistic regression model.
lr_proba = lr.predict_proba(df[x_cols])

# Default 0.5 cut-off: print F1, then precision, then recall.
lr_pred = adjust_threshold(lr_proba)
for metric in (f1_score, precision_score, recall_score):
    print(metric(df[y_col], lr_pred))

# F1 once more with the cut-off lowered to .17.
# NOTE(review): how .17 was chosen is not shown in this notebook.
lr_adj = adjust_threshold(lr_proba, .17)
print(f1_score(df[y_col], lr_adj))

0.31109302956333057
0.634192192856353
0.20609480001437452
0.44570121194858175


### XGBoost

In [274]:
# Score every test row with the XGBoost model. It is fed the raw array
# (.values), so the column order of x_cols is what identifies each feature.
gbm_proba = gbm.predict_proba(df[x_cols].values)

# Default 0.5 cut-off: print F1, then precision, then recall.
gbm_pred = adjust_threshold(gbm_proba)
for metric in (f1_score, precision_score, recall_score):
    print(metric(df[y_col], gbm_pred))

# F1 once more with the cut-off lowered to .21.
# NOTE(review): how .21 was chosen is not shown in this notebook.
gbm_adj = adjust_threshold(gbm_proba, .21)
print(f1_score(df[y_col], gbm_adj))

0.33374309086889237
0.637595458538893
0.22602747930666858
0.45937386165863925


## Per-User Average F1 Score

In [286]:
# Per-user evaluation: compute F1 separately for each user's rows, then
# average those per-user scores for each model.
# The cut-offs here (.16 / .17) differ from the ones used for the total F1
# above. NOTE(review): presumably re-tuned for this metric -- confirm.
lr_adj = adjust_threshold(lr_proba, .16)
gbm_adj = adjust_threshold(gbm_proba, .17)

# Fresh positional index so the prediction arrays line up row-for-row.
score_df = df[[group_col, y_col]].reset_index(drop=True)
score_df['lr_adj'] = lr_adj
score_df['gbm_adj'] = gbm_adj
# Distinct users in order of first appearance (reused by later cells).
users = score_df[group_col].unique()

lr_f1 = []
gbm_f1 = []
# One grouped pass instead of re-filtering the whole frame for every user.
# sort=False yields groups in first-appearance order, i.e. the same order as
# `users`, so lr_f1 / gbm_f1 stay aligned with it.
for user, sdf in score_df.groupby(group_col, sort=False):
    lr_f1.append(f1_score(sdf[y_col], sdf['lr_adj']))
    gbm_f1.append(f1_score(sdf[y_col], sdf['gbm_adj']))

print(np.mean(lr_f1))
print(np.mean(gbm_f1))

0.3653402554718635
0.36717204855346675


### Compare Features

In [305]:
# One row per model input, pairing the logistic regression coefficient
# (signed and absolute) with the XGBoost feature importance.
# Note: the 'xgb_coef' column holds gbm.feature_importances_, not coefficients.
lr_weights = lr.coef_[0]
feature_df = pd.DataFrame({
    'feature': x_cols,
    'lr_coef': lr_weights.tolist(),
    'lr_abs_coef': np.abs(lr_weights).tolist(),
    'xgb_coef': gbm.feature_importances_.tolist(),
})

In [307]:
# Export the feature comparison table to the tableau/ folder.
# The DataFrame index is written as well (to_csv default), giving an unnamed
# first column in the file.
feature_df.to_csv('tableau/features.csv')

In [309]:
# Export the per-user XGBoost F1 scores (one row per user, in the order the
# scores were appended) to the tableau/ folder.
pd.DataFrame(gbm_f1).to_csv('tableau/hist.csv')

### User Example

In [317]:
# Collect the users whose per-user XGBoost F1 equals 0.8, to pick a
# worked example from.
opt = []
# Grouped pass (first-appearance order) instead of re-filtering the whole
# frame once per user.
for user, sdf in score_df.groupby(group_col, sort=False):
    user_f1 = f1_score(sdf[y_col], sdf['gbm_adj'])
    # F1 is a computed float, so an exact `== .8` can miss users whose score
    # comes out as e.g. 0.8000000000000002; compare with a tolerance instead.
    if np.isclose(user_f1, .8):
        opt.append(user)

In [319]:
# Show the first ten candidate user ids.
opt[:10]

[165457, 131275, 55976, 149166, 200230, 10673, 33350, 136527, 184394, 40141]

In [326]:
# Worked example for a single user (id 55976, one of the ids listed in `opt`).
# Filter the frame once and derive both lists from the same rows.
example_rows = df[df['user_id'] == 55976]
pid = example_rows['product_id'].tolist()
prob_list = gbm.predict_proba(example_rows[x_cols].values)[:, 1].tolist()

In [329]:
# Pair each of the example user's products with its column-1 probability
# from the XGBoost model.
p_df = pd.DataFrame({
    'product_id': pid,
    'prob': prob_list,
})

In [331]:
# Preview the first rows of the product/probability table.
p_df.head()

Unnamed: 0,product_id,prob
0,4913,0.036324
1,12341,0.065523
2,12899,0.041584
3,22046,0.642222
4,8859,0.066379


In [332]:
# Product lookup table (id, name, aisle, department), read from a path
# relative to the notebook's working directory.
products_df = pd.read_csv('../data/common/products.csv')
# Preview the first rows.
products_df.head()

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


In [333]:
# Attach product name / aisle / department to each scored product.
# how='left' keeps every scored product even if it has no match in products_df.
# Merging from just the two base columns keeps this cell idempotent: running
# it again does not pile up suffixed (_x / _y) duplicate columns.
p_df = p_df[['product_id', 'prob']].merge(products_df, how='left', on='product_id')

In [336]:
# Display the example user's products ordered from highest to lowest
# predicted probability (returns a sorted copy; p_df itself is unchanged).
p_df.sort_values(by='prob', ascending=False)

Unnamed: 0,product_id,prob,product_name,aisle_id,department_id
3,22046,0.642222,Frozen Whole Strawberries,116,1
12,5258,0.205645,Sparkling Water,115,7
5,13176,0.124902,Bag of Organic Bananas,24,4
6,30827,0.101226,Seedless Cucumbers,32,4
10,26348,0.073586,Mixed Fruit Fruit Snacks,50,19
4,8859,0.066379,Natural Spring Water,115,7
1,12341,0.065523,Hass Avocados,32,4
17,27885,0.06538,Malbec,28,5
11,33768,0.049367,Sinfully Sweet Campari Tomatoes,83,4
14,33065,0.048476,Cabernet Sauvignon,28,5
