In [None]:
import datetime as dt
import gc
import os
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import scipy
import sklearn
import sklearn.metrics
import sklearn.model_selection
import xgboost as xgb
from optuna.samplers import TPESampler

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes — confirm it is still wanted.
pd.options.mode.chained_assignment = None  # default='warn'

## <center>Instacart Optuna Hyperparameter Optimization</center>

This notebook accompanies the other Instacart notebooks in the repo. It focuses on optimizing the hyperparameters of the model from the XGBoost notebook. Note that because we experiment with an increased model depth here, running this notebook in Colab will exceed the memory limits.

#### Making Changes to the Pipeline

- If making changes to the pipeline, set the `all_data` variable to `False` (in the cell below that defines it).
- With over 3 million rows, using the whole dataset takes far too long when testing.

#### Using F1-Score

- F1 score is a useful metric of model performance because it balances precision and recall. That matters here: more than 80% of products are not reordered, so raw accuracy would be misleading.

#### Memory Limits

- Be very conscious of memory limits. I delete dataframes as I go to stay within Colab/Kaggle limits.
- Given the size of the dataset these limits are reached very easily

## Loading Data

In [None]:
# Input directory and the strings pandas should read as missing values.
PATH = "../input/simplifiedinstacartdata/"
missing_value_formats = ["n.a.","?","NA","n/a", "na", "--","-"]

# Read every entry of PATH into a module-level DataFrame named after the file:
# the text before the first '.', with double underscores collapsed to one.
for file in os.listdir(PATH):
    dfname = file.partition('.')[0].replace("__", "_")
    globals()[dfname] = pd.read_csv(os.path.join(PATH, file), na_values=missing_value_formats)
    print(dfname)

#### Creating Features on Timestamp

In [None]:
# Parse the timestamps and sort each user's orders chronologically so that the
# per-user cumcount/shift below see the orders in time order.
orders['timestamp'] = pd.to_datetime(orders['timestamp'])
orders = orders.sort_values(["user_id", "timestamp"])

# 0-based position of each order within its user's history.
orders["order_number"] = orders.groupby('user_id').cumcount()
orders["order_dow"] = orders["timestamp"].dt.dayofweek
orders["order_hour_of_day"] = orders["timestamp"].dt.hour

# Days (rounded to whole days) since the same user's previous order. A user's
# first order has no predecessor, so its missing value is replaced by 0.
# The filled result is assigned back to the column: calling
# fillna(inplace=True) on a selected column is a chained in-place operation
# that does not update `orders` under pandas Copy-on-Write.
orders['days_since_prior_order'] = (
    (orders["timestamp"] - orders.groupby('user_id')['timestamp'].shift(1))
    .dt.round('1d')
    .dt.days
    .fillna(0.0)
)
orders = orders.drop(columns=["timestamp"])
orders.head()

#### Creating Prior + Train Set

In [None]:
# Attach the user and the order position to every order line, then sort once so
# each user's purchases run from their earliest order to their latest.
all_order_products = pd.merge(
    all_order_products,
    orders[["order_id", "user_id", "order_number"]],
    on="order_id",
).sort_values(["user_id", "order_number"])

# Creating 'reordered' column: with the rows in the order above, a
# (user, product) pair is flagged on every occurrence after its first.
all_order_products['reordered'] = all_order_products.duplicated(subset=['user_id', 'product_id'])

# Creating Training and Prior Sets: each user's highest order_number is the
# training order, everything before it is prior history. The mask is computed
# once (with the string alias 'max' rather than the builtin) and reused.
is_last_order = (
    all_order_products.groupby('user_id')['order_number'].transform('max')
    == all_order_products['order_number']
)
order_products_prior = all_order_products[~is_last_order]
order_products_train = all_order_products[is_last_order]

del all_order_products, is_last_order

In [None]:
# Switch between the full dataset and a quick sample.
# Keep all_data = True for real runs; set it to False while testing, because
# using all the data takes a long time to train. With False, only the first
# 100 distinct user_ids in `orders` are kept.
all_data = True
if not all_data:
    ids = orders["user_id"].unique()[:100]
    orders = orders.loc[orders["user_id"].isin(ids)]

# Prior-only frame: order metadata joined to the products of the prior orders.
# Columns that exist on both sides come back from the right side with a '_y'
# suffix; those duplicates are dropped.
op = pd.merge(orders, order_products_prior, on='order_id', how='inner', suffixes=('', '_y'))
op = op.drop(columns=op.filter(regex='_y$').columns.tolist())
op.head()

# Creating features related to the users, i.e. using user_id

In [None]:
# Per-user maximum order_number over the prior orders, stored as u_num_of_orders.
users = (
    op.groupby('user_id')['order_number']
    .max()
    .rename('u_num_of_orders')
    .reset_index()
)
users.head()

In [None]:
# Average number of products bought by the user in each purchase.

# 1. Number of product rows in each (user, order).
total_prd_per_order = (
    op.groupby(['user_id', 'order_id'])['product_id']
    .count()
    .to_frame('total_products_per_order')
    .reset_index()
)

# 2. Mean of those per-order counts for each user.
avg_products = (
    total_prd_per_order.groupby('user_id')['total_products_per_order']
    .mean()
    .to_frame('u_avg_prd')
    .reset_index()
)

# The per-order counts are no longer needed. They are deleted before the
# preview so that head() is the cell's last expression and is displayed.
del total_prd_per_order
avg_products.head()

In [None]:
# Day of week on which each user placed the most prior orders; if several days
# tie, the first value returned by mode() is kept.
dow = (
    op.groupby('user_id')['order_dow']
    .agg(lambda days: days.mode().iloc[0])
    .to_frame(name='dow_most_orders_u')
    .reset_index()
)
dow.head()

In [None]:
# Hour of day at which each user placed the most prior orders; if several
# hours tie, the first value returned by mode() is kept.
hod = (
    op.groupby('user_id')['order_hour_of_day']
    .agg(lambda hours: hours.mode().iloc[0])
    .to_frame(name='hod_most_orders_u')
    .reset_index()
)
hod.head()

In [None]:
# Combine the user-level features into `users`, one left join per feature
# table, all keyed on user_id.
users = (
    users
    .merge(avg_products, on='user_id', how='left')  # 1. average products per order
    .merge(dow, on='user_id', how='left')           # 2. most frequent day of week
    .merge(hod, on='user_id', how='left')           # 3. most frequent hour of day
)

# The individual feature tables are no longer needed.
del avg_products, dow, hod
users.head()

# Creating features related to the products using product_id.

In [None]:
# Number of times each product was purchased (count of prior order rows).
prd = (
    op.groupby('product_id')['order_id']
    .count()
    .rename('prd_count_p')
    .reset_index()
)
prd.head()

In [None]:
# Product reorder ratio: mean of the `reordered` flag over each product's prior order rows.
reorder_p = (
    op.groupby('product_id')['reordered']
    .mean()
    .rename('p_reordered_ratio')
    .reset_index()
)
reorder_p.head()

In [None]:
# Add the reorder ratio to the product feature table (left join on product_id).
prd = pd.merge(prd, reorder_p, on='product_id', how='left')

# reorder_p is now part of prd and can be released.
del reorder_p
prd.head()

# Creating user-product features.

In [None]:
# Number of prior order rows for each (user, product) pair, i.e. how many times
# the user bought that product.
uxp = (
    op.groupby(['user_id', 'product_id'])['order_id']
    .count()
    .rename('uxp_times_bought')
    .reset_index()
)
uxp.head()

In [None]:
# Mean of the `reordered` flag over each (user, product) pair's prior order rows.
reorder_uxp = (
    op.groupby(['user_id', 'product_id'])['reordered']
    .mean()
    .rename('uxp_reordered_ratio')
    .reset_index()
)
reorder_uxp.head()

In [None]:
# Combine both user-product feature tables (left join on the pair key).
uxp = pd.merge(uxp, reorder_uxp, on=['user_id', 'product_id'], how='left')

# reorder_uxp is now part of uxp and can be released.
del reorder_uxp
uxp.head()

# Merging all the features into data DF.

In [None]:
# Build the modelling frame from the user-product pairs:
#   1. add the user-level features (left join on user_id),
#   2. add the product-level features (left join on product_id),
#   3. attach the order_id(s) from order_products_train for each user
#      (default inner join on user_id).
data = (
    uxp
    .merge(users, on='user_id', how='left')
    .merge(prd, on='product_id', how='left')
    .merge(order_products_train[["user_id", "order_id"]].drop_duplicates(), on='user_id')
)

# The separate feature tables are no longer needed.
del users, prd, uxp

data.head()

# Creating Training + Validation

In [None]:
# Bring the `reordered` flag from order_products_train onto each
# (product_id, order_id) row. It is a left join, so rows with no match in the
# training order keep a missing value.
data_train = pd.merge(
    data,
    order_products_train[['product_id', 'order_id', 'reordered']],
    on=['product_id', 'order_id'],
    how='left',
)
del data

data_train.head()

In [None]:
# Rows with no match in the training order carry NaN in `reordered`; label them
# 0 and make the column integer. The result is assigned back to the column:
# fillna(inplace=True) on a selected column is a chained in-place operation
# that does not update `data_train` under pandas Copy-on-Write.
data_train['reordered'] = data_train['reordered'].fillna(0.0).astype(int)

# order_id is not needed for training.
data_train = data_train.drop(columns=['order_id'])
data_train.head()

In [None]:
# Release the frames that are no longer needed to keep memory usage down.
del order_products_prior, order_products_train, orders

In [None]:
# Add each product's aisle_id and department_id (left join on product_id), then
# index the rows by the (user_id, product_id) pair.
data_train = (
    data_train
    .merge(products[['product_id', 'aisle_id', 'department_id']], on='product_id', how='left')
    .set_index(['user_id', 'product_id'])
)

In [None]:
# Hold out 10% of the rows for validation. random_state is fixed so the split,
# and therefore every F1 score measured on it, is the same on each run.
train, valid = sklearn.model_selection.train_test_split(
    data_train, test_size=0.1, random_state=42
)
del data_train
train.head()

In [None]:
# Training features and labels.
X_train, y_train = train.drop(columns='reordered'), train['reordered']

# Validation features and labels. The labels must come from `valid` so they
# line up row-for-row with X_valid.
X_valid, y_valid = valid.drop(columns='reordered'), valid['reordered']

# Testing Hyperparameters w/ Optuna

In [None]:
def create_model(trial):
    """Build an unfitted XGBoost classifier from hyperparameters sampled by Optuna.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``max_depth``, ``min_child_weight``,
        ``sub_sample`` and ``colsample_bytree``.

    Returns
    -------
    xgb.XGBClassifier
        Unfitted binary-logistic classifier configured with the sampled values.
    """
    max_depth = trial.suggest_int('max_depth', 3, 8)
    # The range 0.5-1 is fractional, so it has to be sampled as a float;
    # suggest_int cannot represent it.
    min_child_weight = trial.suggest_float('min_child_weight', 0.5, 1.0)
    # The Optuna parameter keeps its original name 'sub_sample' so the keys of
    # study.best_params do not change; XGBoost's own keyword is `subsample`.
    sub_sample = trial.suggest_float('sub_sample', 0.10, 0.8)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.50, 1.0)

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        # XGBoost's log-loss metric is called 'logloss' and is set through eval_metric.
        eval_metric='logloss',
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        subsample=sub_sample,
        colsample_bytree=colsample_bytree,
    )

    return model

def objective(trial):
    """Optuna objective: fit a sampled model and return its validation F1 score.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial passed on to ``create_model`` to sample the hyperparameters.

    Returns
    -------
    float
        F1 score of the fitted model's predictions on ``X_valid`` against ``y_valid``.
    """
    model = create_model(trial)
    model.fit(X_train, y_train)
    # f1_score expects (y_true, y_pred) with class labels, so use predict()
    # on the validation features rather than predicted probabilities.
    score = sklearn.metrics.f1_score(y_valid, model.predict(X_valid))
    return score

In [None]:
# Fixed seed for the TPE sampler, defined here because nothing earlier in the
# notebook defines `seed`.
seed = 42
sampler = TPESampler(seed=seed)

# Maximize the validation F1 score returned by `objective`.
study = optuna.create_study(direction="maximize", sampler=sampler)
# NOTE(review): with n_trials=1 only one sampled configuration is evaluated;
# raise it for an actual search.
study.optimize(objective, n_trials=1)
params = study.best_params #getting best params from study
print(params)