# Machine Learning Engineer Capstone Project
## Starbucks Challenge
### by Mark-Danney Oonk

---

This notebook describes:
- The training and refinement of the model
- Justification of the model against the benchmark
- Discussion of outcomes and possible improvements

Running the code in this notebook results in:
- model/???.??? with the trained neural network that is able to make offer recommendations based on profile data

In [84]:
# imports

import joblib
import keras_tuner as kt
import logging
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import MinMaxScaler

# constants
SEED = 42
ACCOUNT_PROPERTIES = ['age', 'income', 'gender_F', 'gender_M', 'gender_O', 'gender_U', 'account_age']
OFFER_CHANNELS = ['channels_email', 'channels_mobile', 'channels_social', 'channels_web']
OFFER_TYPES = ['offer_type_bogo', 'offer_type_discount', 'offer_type_informational']
OFFER_PARAMETERS = ['reward', 'difficulty', 'duration']

In [76]:
# load scaler and data

scaler: MinMaxScaler = joblib.load('./model/scaler.gz')
SCALED_COLS = joblib.load('./model/SCALED_COLS')

successes: pd.DataFrame = pd.read_csv('./data/successes.csv')

successes, successes_test = train_test_split(successes, test_size=0.2)

display(successes)

Unnamed: 0,reward,difficulty,duration,channels_email,channels_mobile,channels_social,channels_web,offer_type_bogo,offer_type_discount,offer_type_informational,successful,amount,age,income,gender_F,gender_M,gender_O,gender_U,account_age
15281,0.0,0.00,0.000000,1,1,1,0,0.0,0.0,1.0,1,0.000000,0.530120,0.077778,1.0,0.0,0.0,0.0,0.158530
47159,0.2,0.50,0.571429,1,1,0,1,0.0,1.0,0.0,1,0.320557,0.240964,0.433333,1.0,0.0,0.0,0.0,0.069117
9824,0.2,0.50,1.000000,1,1,1,1,0.0,1.0,0.0,1,0.109794,0.084337,0.033333,0.0,1.0,0.0,0.0,0.185409
21605,0.2,0.50,1.000000,1,1,1,1,0.0,1.0,0.0,1,0.443070,0.421687,0.900000,0.0,1.0,0.0,0.0,0.370269
62131,0.2,0.50,1.000000,1,1,1,1,0.0,1.0,0.0,0,0.020159,0.048193,0.033333,0.0,1.0,0.0,0.0,0.052660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56242,0.5,0.25,0.571429,1,1,0,1,1.0,0.0,0.0,0,0.000000,0.445783,0.144444,0.0,1.0,0.0,0.0,0.009325
8178,0.3,0.35,0.571429,1,1,1,1,0.0,1.0,0.0,1,0.813982,0.301205,0.733333,1.0,0.0,0.0,0.0,0.856829
54817,0.0,0.00,0.000000,1,1,1,0,0.0,0.0,1.0,1,0.019381,0.542169,0.100000,0.0,1.0,0.0,0.0,0.547998
16702,1.0,0.50,0.285714,1,1,1,1,1.0,0.0,0.0,0,0.000000,0.566265,0.666667,0.0,1.0,0.0,0.0,0.364783


In [77]:
# Build data for offer success prediction

successes_X = successes[ACCOUNT_PROPERTIES + OFFER_CHANNELS + OFFER_TYPES + OFFER_PARAMETERS]
successes_y = successes[['successful', 'amount']]

successes_test_X = successes[ACCOUNT_PROPERTIES + OFFER_CHANNELS + OFFER_TYPES + OFFER_PARAMETERS]
successes_test_y = successes[['successful', 'amount']]

display(successes_X)
display(successes_y)

Unnamed: 0,age,income,gender_F,gender_M,gender_O,gender_U,account_age,channels_email,channels_mobile,channels_social,channels_web,offer_type_bogo,offer_type_discount,offer_type_informational,reward,difficulty,duration
15281,0.530120,0.077778,1.0,0.0,0.0,0.0,0.158530,1,1,1,0,0.0,0.0,1.0,0.0,0.00,0.000000
47159,0.240964,0.433333,1.0,0.0,0.0,0.0,0.069117,1,1,0,1,0.0,1.0,0.0,0.2,0.50,0.571429
9824,0.084337,0.033333,0.0,1.0,0.0,0.0,0.185409,1,1,1,1,0.0,1.0,0.0,0.2,0.50,1.000000
21605,0.421687,0.900000,0.0,1.0,0.0,0.0,0.370269,1,1,1,1,0.0,1.0,0.0,0.2,0.50,1.000000
62131,0.048193,0.033333,0.0,1.0,0.0,0.0,0.052660,1,1,1,1,0.0,1.0,0.0,0.2,0.50,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56242,0.445783,0.144444,0.0,1.0,0.0,0.0,0.009325,1,1,0,1,1.0,0.0,0.0,0.5,0.25,0.571429
8178,0.301205,0.733333,1.0,0.0,0.0,0.0,0.856829,1,1,1,1,0.0,1.0,0.0,0.3,0.35,0.571429
54817,0.542169,0.100000,0.0,1.0,0.0,0.0,0.547998,1,1,1,0,0.0,0.0,1.0,0.0,0.00,0.000000
16702,0.566265,0.666667,0.0,1.0,0.0,0.0,0.364783,1,1,1,1,1.0,0.0,0.0,1.0,0.50,0.285714


Unnamed: 0,successful,amount
15281,1,0.000000
47159,1,0.320557
9824,1,0.109794
21605,1,0.443070
62131,0,0.020159
...,...,...
56242,0,0.000000
8178,1,0.813982
54817,1,0.019381
16702,0,0.000000


In [78]:
# Build data for offer proposal and amount spent prediction

# Get best offer for money spent per account
idx = successes.groupby(ACCOUNT_PROPERTIES, sort=False)['amount'].transform(max) == successes['amount']
top_spending = successes[idx].groupby(ACCOUNT_PROPERTIES, sort=False).first().reset_index()

top_spending_X = top_spending[ACCOUNT_PROPERTIES]
top_spending_y = top_spending[OFFER_CHANNELS + OFFER_TYPES + OFFER_PARAMETERS + ['amount']]

idx_train = successes_test.groupby(ACCOUNT_PROPERTIES, sort=False)['amount'].transform(max) == successes_test['amount']
top_spending_test = successes_test[idx_train].groupby(ACCOUNT_PROPERTIES, sort=False).first().reset_index()

top_spending_test_X = top_spending_test[ACCOUNT_PROPERTIES]
top_spending_test_y = top_spending_test[OFFER_CHANNELS + OFFER_TYPES + OFFER_PARAMETERS + ['amount']]

display(top_spending.head())
display(top_spending.describe())

Unnamed: 0,age,income,gender_F,gender_M,gender_O,gender_U,account_age,reward,difficulty,duration,channels_email,channels_mobile,channels_social,channels_web,offer_type_bogo,offer_type_discount,offer_type_informational,successful,amount
0,0.240964,0.433333,1.0,0.0,0.0,0.0,0.069117,0.2,0.5,0.571429,1,1,0,1,0.0,1.0,0.0,1,0.320557
1,0.421687,0.9,0.0,1.0,0.0,0.0,0.370269,0.2,0.5,1.0,1,1,1,1,0.0,1.0,0.0,1,0.44307
2,0.710843,0.6,0.0,1.0,0.0,0.0,0.499726,0.3,0.35,0.571429,1,1,1,1,0.0,1.0,0.0,1,0.103046
3,0.493976,0.133333,0.0,1.0,0.0,0.0,0.622052,0.0,0.0,0.0,1,1,1,0,0.0,0.0,1.0,1,0.075273
4,0.253012,0.622222,0.0,1.0,0.0,0.0,0.268239,0.0,0.0,0.0,1,1,1,0,0.0,0.0,1.0,1,0.661706


Unnamed: 0,age,income,gender_F,gender_M,gender_O,gender_U,account_age,reward,difficulty,duration,channels_email,channels_mobile,channels_social,channels_web,offer_type_bogo,offer_type_discount,offer_type_informational,successful,amount
count,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0,15597.0
mean,0.438676,0.393474,0.387703,0.537924,0.013592,0.060781,0.288323,0.451074,0.389325,0.515731,1.0,0.929987,0.716805,0.773995,0.423158,0.407771,0.169071,0.691992,0.181627
std,0.202936,0.232716,0.487242,0.498576,0.115795,0.238936,0.22893,0.35614,0.247034,0.316066,0.0,0.255178,0.450565,0.418256,0.494076,0.491436,0.374827,0.461685,0.170283
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.313253,0.222222,0.0,0.0,0.0,0.0,0.115195,0.2,0.25,0.285714,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.061429
50%,0.438476,0.393389,0.0,1.0,0.0,0.0,0.199122,0.3,0.5,0.571429,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.147257
75%,0.578313,0.533333,1.0,1.0,0.0,0.0,0.439934,1.0,0.5,0.571429,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.249697
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [36]:
# Create a scaler just for amount, so we can invert scale that

amount_col = SCALED_COLS.index('amount')
scaler_amount = MinMaxScaler()
scaler_amount.min_, scaler_amount.scale_ = scaler.min_[amount_col], scaler.scale_[amount_col]

In [37]:
logger = tf.get_logger()
logger.setLevel(logging.ERROR)
print('Using:')
print('\t\u2022 TensorFlow version:', tf.__version__)
print('\t\u2022 tf.keras version:', tf.keras.__version__)
print('\t\u2022 Running on GPU' if tf.test.is_gpu_available() else '\t\u2022 GPU device not found. Running on CPU')

Using:
	• TensorFlow version: 2.6.0
	• tf.keras version: 2.6.0
	• Running on GPU


## Constructing the datasets

Since the amount of rows is relatively small, we'll use a K-Fold cross-validation method to split the dataset.

In [38]:
# F1 score calculation from https://datascience.stackexchange.com/a/45166

from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [52]:
# Create hyperband parameter tuning parameters

# Hyperband tuning doesn't support batch size tuning out of the box, so make it
class HyperbandWithBatchTuning(kt.Hyperband):
    def run_trial(self, trial, *args, **kwargs):
        kwargs['batch_size'] = trial.hyperparameters.Int('batch_size', 16, 64, step=16)
        return super(HyperbandWithBatchTuning, self).run_trial(trial, *args, **kwargs)

def success_model_builder(hp: kt.HyperParameters):
    hp_dropout = hp.Float('dropout', min_value=0.2, max_value=0.4, step=0.1)
    hp_units = hp.Int('units', min_value=64, max_value=128, step=32)
    hp_units_second = hp.Int('units_2', min_value=64, max_value=128, step=32)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential()
    model.add(Input(shape=successes_X.shape))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(units=hp_units, activation='relu'))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(units=hp_units_second, activation='relu'))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(successes_y.shape[1], activation='softmax'))

    model.compile(optimizer=keras.optimizers.Adadelta(learning_rate=hp_learning_rate),
                loss=keras.losses.MeanSquaredLogarithmicError(),
                metrics=['acc', f1_m, precision_m, recall_m])
    return model

def spending_model_builder(hp: kt.HyperParameters):
    hp_dropout = hp.Float('dropout', min_value=0.2, max_value=0.4, step=0.1)
    hp_units = hp.Int('units', min_value=32, max_value=128, step=32)
    hp_units_second = hp.Int('units_2', min_value=32, max_value=128, step=32)
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    model = Sequential()
    model.add(Input(shape=top_spending_X.shape))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(units=hp_units, activation='relu'))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(units=hp_units_second, activation='relu'))
    model.add(Dropout(rate=hp_dropout, seed=SEED))
    model.add(Dense(top_spending_y.shape[1], activation='softmax'))

    model.compile(optimizer=keras.optimizers.Adadelta(learning_rate=hp_learning_rate),
                loss=keras.losses.MeanSquaredLogarithmicError(),
                metrics=['acc', f1_m, precision_m, recall_m])
    return model

In [53]:
tuner_successes = HyperbandWithBatchTuning(success_model_builder,
                     objective='val_acc',
                     max_epochs=15,
                     factor=3,
                     directory='model',
                     project_name='starbucks_successes',
                     seed=SEED)

tuner_spendings = HyperbandWithBatchTuning(spending_model_builder,
                     objective='val_acc',
                     max_epochs=15,
                     factor=3,
                     directory='model',
                     project_name='starbucks_spending',
                     seed=SEED)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

First, we find the optimal training parameters

In [50]:
tuner_successes.search(successes_X, successes_y, epochs=50, validation_split=0.2, callbacks=[stop_early])

best_hps_successes=tuner_successes.get_best_hyperparameters()[0]

print(best_hps_successes.values)

Trial 31 Complete [00h 04m 21s]
val_acc: 0.8635314702987671

Best val_acc So Far: 0.8635314702987671
Total elapsed time: 00h 26m 50s
{'dropout': 0.30000000000000004, 'units': 64, 'units_2': 96, 'learning_rate': 0.001, 'tuner/epochs': 2, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0, 'batch_size': 16}


In [54]:
tuner_spendings.search(top_spending_X, top_spending_y, epochs=50, validation_split=0.2, callbacks=[stop_early])

best_hps_spendings=tuner_spendings.get_best_hyperparameters()[0]
model_spendings = tuner_spendings.hypermodel.build(best_hps_spendings)

print(best_hps_spendings.values)

Trial 31 Complete [00h 00m 28s]
val_acc: 1.0

Best val_acc So Far: 1.0
Total elapsed time: 00h 07m 39s
{'dropout': 0.2, 'units': 96, 'units_2': 64, 'learning_rate': 0.01, 'batch_size': 48, 'tuner/epochs': 15, 'tuner/initial_epoch': 0, 'tuner/bracket': 0, 'tuner/round': 0}


In [71]:
save_dir = './model/starbucks_successes/saved'
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

for fold_no, (train, val) in enumerate(kfold.split(successes_X, successes_y)):
    model_successes = tuner_successes.hypermodel.build(best_hps_successes)
    
    history = model_successes.fit(x=successes_X.iloc[train], y=successes_y.iloc[train], epochs=3)
    scores = model_successes.evaluate(successes_X.iloc[val], successes_y.iloc[val])
    filename = f'{save_dir}/model_success_fold{fold_no}'

    print(f'Scores for fold {fold_no}:')
    for i, score in enumerate(scores):
         print(f'{model_successes.metrics_names[i]} of {score}')
         filename += f'__{model_successes.metrics_names[i]}_{score}'
    model_successes.save(filename+'.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3
Scores for fold 0:
loss of 0.33721545338630676
acc of 0.8263205289840698
f1_m of 0.603198230266571
precision_m of 0.4509325325489044
recall_m of 0.9351516962051392
Epoch 1/3
Epoch 2/3
Epoch 3/3
Scores for fold 1:
loss of 0.37154117226600647
acc of 0.8038396239280701
f1_m of 0.5542852282524109
precision_m of 0.4154166579246521
recall_m of 0.8506759405136108
Epoch 1/3
Epoch 2/3
Epoch 3/3
Scores for fold 2:
loss of 0.3305387496948242
acc of 0.8540734052658081
f1_m of 0.6275783777236938
precision_m of 0.47259917855262756
recall_m of 0.9561967253684998
Epoch 1/3
Epoch 2/3
Epoch 3/3
Scores for fold 3:
loss of 0.3387490212917328
acc of 0.8552671074867249
f1_m of 0.6341372728347778
precision_m of 0.4779960513114929
recall_m of 0.9637618064880371
Epoch 1/3
Epoch 2/3
Epoch 3/3
Scores for fold 4:
loss of 0.29743045568466187
acc of 0.853973925113678
f1_m of 0.6359875202178955
precision_m of 0.4789881110191345
recall_m of 0.9699750542640686


In [75]:
custom_objects = {'f1_m': f1_m, 'precision_m': precision_m, 'recall_m': recall_m}
with keras.utils.custom_object_scope(custom_objects):
    model_successes = keras.models.load_model('./model/starbucks_successes/saved/model_success_fold3__loss_0.3387490212917328__acc_0.8552671074867249__f1_m_0.6341372728347778__precision_m_0.4779960513114929__recall_m_0.9637618064880371.h5')

model_successes.summary()

Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout_108 (Dropout)        (None, 50265, 17)         0         
_________________________________________________________________
dense_108 (Dense)            (None, 50265, 64)         1152      
_________________________________________________________________
dropout_109 (Dropout)        (None, 50265, 64)         0         
_________________________________________________________________
dense_109 (Dense)            (None, 50265, 96)         6240      
_________________________________________________________________
dropout_110 (Dropout)        (None, 50265, 96)         0         
_________________________________________________________________
dense_110 (Dense)            (None, 50265, 2)          194       
Total params: 7,586
Trainable params: 7,586
Non-trainable params: 0
___________________________________________________

In [82]:
scores = model_successes.evaluate(successes_test_X, successes_test_y)
print(f'Scores for testset:')
for i, score in enumerate(scores):
        print(f'{model_successes.metrics_names[i]} of {score}')

Scores for testset:
loss of 0.33673176169395447
acc of 0.8547497987747192
f1_m of 0.6333511471748352
precision_m of 0.47682446241378784
recall_m of 0.9655615091323853
