In [145]:
import json
import math
import os
import pprint

import boto3
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sagemaker
import seaborn as sns
%matplotlib inline

from sagemaker.estimator import Estimator
from sagemaker.sklearn.estimator import SKLearn
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Default size applied to every matplotlib figure created in this notebook
mpl.rcParams['figure.figsize'] = (15.0, 6.0)
# Show up to 30 columns when displaying a DataFrame. The fully qualified option
# name is used because abbreviated names are resolved by pattern matching and
# fail once more than one option matches the pattern.
pd.set_option('display.max_columns', 30)

In [82]:
# Look up the execution role and open the SageMaker session used by the cells below
role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.Session()

## Upload Training Data To S3

In [83]:
# Load the preprocessed data, discard the 'person' and 'offer' columns
# and expose the target column 'offer_completed' under the name 'label'
data = (
    pd.read_csv('preprocessed_data.csv')
    .drop(columns=['person', 'offer'])
    .rename(columns={'offer_completed': 'label'})
)
data.head()

Unnamed: 0,label,age,income,reward,difficulty,duration,days_since_registration,web,email,mobile,social,gender_M,gender_O,offer_type_discount
0,1,33.0,72000.0,5,5,5,461,1,1,1,1,1,0,0
1,1,33.0,72000.0,2,10,10,461,1,1,1,1,1,0,1
2,0,,,5,5,5,92,1,1,1,1,0,0,0
3,1,40.0,57000.0,5,20,10,198,1,1,0,0,0,1,1
4,1,40.0,57000.0,3,7,7,198,1,1,1,1,0,1,1


In [84]:
# Hold out 30% of the rows as test set; the fixed random_state keeps the split reproducible
train, test = train_test_split(data, random_state=0, test_size=0.3)
for split_label, split_frame in {"train shape: ": train, "test shape: ": test}.items():
    print(split_label, split_frame.shape)

train shape:  (27878, 14)
test shape:  (11948, 14)


In [89]:
# Write both splits to local CSV files and upload them to the session's default S3 bucket
data_dir = 'data'
bucket = sagemaker_session.default_bucket()
prefix = 'sagemaker/starbucks_rewards'

# to_csv does not create missing directories, so make sure the target folder
# exists (needed when the notebook runs in a fresh working directory)
os.makedirs(data_dir, exist_ok=True)

train_path = os.path.join(data_dir, 'train.csv')
test_path = os.path.join(data_dir, 'test.csv')

# Files are written without header and index: raw values only, label in the first column
train.to_csv(train_path, header=False, index=False)
test.to_csv(test_path, header=False, index=False)

train_location = sagemaker_session.upload_data(train_path, key_prefix=prefix)
test_location = sagemaker_session.upload_data(test_path, key_prefix=prefix)

In [91]:
# Check: Has the upload been successful?
# Only keys below our prefix are listed, so unrelated objects in the bucket do
# not clutter the check. The response carries no 'Contents' entry when nothing
# matches, hence the .get() with an empty default instead of direct indexing.
s3_client = boto3.client('s3')
response = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix)
for obj in response.get('Contents', []):
    print(obj['Key'])

sagemaker/starbucks_rewards/test.csv
sagemaker/starbucks_rewards/train.csv


## Train Estimator

In [122]:
# Print the training script (the file the estimator below uses as entry point) with syntax highlighting
!pygmentize src/train.py

[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mimport[39;49;00m [04m[36mnumpy[39;49;00m [34mas[39;49;00m [04m[36mnp[39;49;00m

[34mimport[39;49;00m [04m[36mjoblib[39;49;00m

[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mpreprocessing[39;49;00m [34mimport[39;49;00m StandardScaler
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mimpute[39;49;00m [34mimport[39;49;00m SimpleImputer
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mensemble[39;49;00m [34mimport[39;49;00m RandomForestClassifier
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mpipeline[39;49;00m [34mimport[39;49;00m Pipeline
[34mfrom[39;49;00m [04m[36msklearn[39;49;00m[04m[36m.[39;49;00m[04m[36mmodel_selection[39

In [123]:
# S3 location below our prefix where the training job stores its output
s3_output_path = "s3://{}/{}/output".format(bucket, prefix)

# Configure the scikit-learn estimator: what to run, in which container, on which hardware
estimator = SKLearn(
    entry_point='train.py',
    source_dir='src',
    framework_version='0.23-1',
    py_version='py3',
    instance_type='ml.c4.xlarge',
    instance_count=1,
    output_path=s3_output_path,
    role=role,
    sagemaker_session=sagemaker_session,
)

In [124]:
%%time

# Train estimator on S3 training data
estimator.fit({'train': train_location})

2021-02-27 12:39:15 Starting - Starting the training job...
2021-02-27 12:39:17 Starting - Launching requested ML instancesProfilerReport-1614429555: InProgress
......
2021-02-27 12:40:31 Starting - Preparing the instances for training......
2021-02-27 12:41:39 Downloading - Downloading input data...
2021-02-27 12:42:12 Training - Downloading the training image..[34m2021-02-27 12:42:28,923 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2021-02-27 12:42:28,925 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-27 12:42:28,935 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m

2021-02-27 12:42:32 Training - Training image download completed. Training in progress.[34m2021-02-27 12:42:36,573 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-27 12:42:36,586 sagemaker-training-toolkit INFO     No GPUs detected (nor

## Deploy Model

In [125]:
%%time

# Deploy model and assign to variable for making predictions
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium'
)

-------------------!CPU times: user 341 ms, sys: 15.6 ms, total: 356 ms
Wall time: 9min 32s


## Evaluate Model

In [126]:
# The first column of the test split is the target, all remaining columns are features
y_test, X_test = test.iloc[:, 0], test.iloc[:, 1:]

In [128]:
# Send the test features to the deployed endpoint and show the returned scores
pred_proba = predictor.predict(data=X_test)
pred_proba

array([0.42470315, 0.64550168, 0.02014354, ..., 0.062982  , 0.23421987,
       0.96312812])

In [166]:
# Area under the ROC curve of the endpoint's scores against the true test labels
# (roc_auc_score comes from sklearn.metrics and must be imported in the import cell)
roc_auc_score(y_true=y_test, y_score=pred_proba)

0.8308796388934323

## Application: Decide Which Offer A Customer Should Get

In [137]:
# Example customers used to demonstrate the offer selection below;
# the second one has no age, gender and income (NaN)
customers = [
    {
        'id': '0009655768c64bdeb2e877511632db8f',
        'age': 33,
        'gender': 'M',
        'income': 72000,
        'days_since_registration': 461,
    },
    {
        'id': '00116118485d4dfda04fdbaba9a87b5c',
        'age': np.nan,
        'gender': np.nan,
        'income': np.nan,
        'days_since_registration': 92,
    },
    {
        'id': '0011e0d4e6b944f998e987f904e8c1e5',
        'age': 40,
        'gender': 'O',
        'income': 57000,
        'days_since_registration': 198,
    },
]

In [129]:
# Example offers to choose from: two 'bogo' offers and one 'discount' offer,
# each with its channel flags, reward, difficulty and duration
offers = [
    {
        'id': 'ae264e3637204a6fb9bb56bc8210ddfd',
        'type': 'bogo',
        'web': 1,
        'email': 1,
        'social': 1,
        'mobile': 0,
        'reward': 10,
        'difficulty': 10,
        'duration': 7,
    },
    {
        'id': 'f19421c1d4aa40978ebb69ca19b0e20d',
        'type': 'bogo',
        'web': 1,
        'email': 1,
        'social': 1,
        'mobile': 1,
        'reward': 5,
        'difficulty': 5,
        'duration': 5,
    },
    {
        'id': '0b1e1539f2cc45b7b9fa7c272da2e1d7',
        'type': 'discount',
        'web': 1,
        'email': 1,
        'social': 0,
        'mobile': 0,
        'reward': 5,
        'difficulty': 20,
        'duration': 10,
    },
]

In [159]:
def _build_feature_row(customer, offer):
    """Builds one feature row for a customer/offer pair in the column order of the training data."""
    return [
        customer['age'], customer['income'], offer['reward'], offer['difficulty'], offer['duration'],
        customer['days_since_registration'], offer['web'], offer['email'], offer['mobile'], offer['social'],
        # One-hot flags; a missing gender (NaN) matches neither value and yields 0, 0
        (1 if customer['gender'] == 'M' else 0),
        (1 if customer['gender'] == 'O' else 0),
        (1 if offer['type'] == 'discount' else 0),
    ]


def get_success_probabilities(customers, offers, endpoint=None):
    """Calculates for each customer the success probabilities of various offers.

    All customer/offer combinations are scored with a single request to the
    endpoint instead of one request per combination.

    Args:
        customers: Sequence of dicts with the keys 'id', 'age', 'income',
            'gender' and 'days_since_registration'.
        offers: Sequence of dicts with the keys 'id', 'type', 'reward',
            'difficulty', 'duration', 'web', 'email', 'mobile' and 'social'.
        endpoint: Object with a ``predict(rows)`` method that returns one
            probability per feature row. Defaults to the notebook-level
            ``predictor``.

    Returns:
        dict: ``{customer id: {offer id: probability}}`` with plain floats.
        Every customer gets an entry, which is empty when there are no offers.

    Raises:
        ValueError: If the endpoint returns a different number of predictions
            than feature rows were sent.
    """
    customers = list(customers)
    offers = list(offers)

    probas = {customer['id']: {} for customer in customers}
    pairs = [(customer, offer) for customer in customers for offer in offers]
    if not pairs:
        # Nothing to score: skip the request entirely
        return probas

    if endpoint is None:
        # Fall back to the endpoint deployed earlier in the notebook
        endpoint = predictor

    rows = [_build_feature_row(customer, offer) for customer, offer in pairs]
    # Flatten so that both (n,) and (n, 1) shaped responses are accepted
    preds = np.asarray(endpoint.predict(rows), dtype=float).reshape(-1)
    if len(preds) != len(pairs):
        raise ValueError(
            "Expected {} predictions but received {}".format(len(pairs), len(preds))
        )

    for (customer, offer), pred in zip(pairs, preds):
        probas[customer['id']][offer['id']] = float(pred)

    return probas

def get_best_offer_for_customer(probas):
    """Infers best offer for customer from success probabilities.

    Args:
        probas: dict of the form ``{customer id: {offer id: probability}}``.

    Returns:
        dict: ``{customer id: offer id}`` holding, per customer, the offer with
        the highest probability. On ties the offer listed first wins. The value
        is ``None`` when a customer has no offer with a usable (non-NaN)
        probability.
    """
    choices = {}
    for customer_id, offer_probas in probas.items():
        # NaN cannot be ranked, so such offers are never selected
        candidates = {
            offer_id: proba
            for offer_id, proba in offer_probas.items()
            if not math.isnan(proba)
        }
        # max() considers every candidate, so an offer is chosen even when all
        # probabilities are 0.0; default covers customers without candidates
        choices[customer_id] = max(candidates, key=candidates.get, default=None)

    return choices

In [160]:
# Score every customer/offer combination and print the resulting probabilities
probas = get_success_probabilities(customers, offers)
pprint.pprint(probas)

{'0009655768c64bdeb2e877511632db8f': {'0b1e1539f2cc45b7b9fa7c272da2e1d7': 0.9426955139133467,
                                      'ae264e3637204a6fb9bb56bc8210ddfd': 0.5648851131888629,
                                      'f19421c1d4aa40978ebb69ca19b0e20d': 0.9340771182491163},
 '00116118485d4dfda04fdbaba9a87b5c': {'0b1e1539f2cc45b7b9fa7c272da2e1d7': 0.053495241758556036,
                                      'ae264e3637204a6fb9bb56bc8210ddfd': 0.02977202504887236,
                                      'f19421c1d4aa40978ebb69ca19b0e20d': 0.10867103658455653},
 '0011e0d4e6b944f998e987f904e8c1e5': {'0b1e1539f2cc45b7b9fa7c272da2e1d7': 0.68744225190439,
                                      'ae264e3637204a6fb9bb56bc8210ddfd': 0.5578886429082346,
                                      'f19421c1d4aa40978ebb69ca19b0e20d': 0.6036967331975791}}


In [161]:
# Select, for every customer, the offer with the highest predicted success probability
choices = get_best_offer_for_customer(probas)
pprint.pprint(choices)

{'0009655768c64bdeb2e877511632db8f': '0b1e1539f2cc45b7b9fa7c272da2e1d7',
 '00116118485d4dfda04fdbaba9a87b5c': 'f19421c1d4aa40978ebb69ca19b0e20d',
 '0011e0d4e6b944f998e987f904e8c1e5': '0b1e1539f2cc45b7b9fa7c272da2e1d7'}


## Delete Endpoint

In [164]:
# Remove the endpoint created by estimator.deploy(...) now that no more predictions are needed
predictor.delete_endpoint()