# Starbucks Capstone Challenge - Implementation

## Introduction

We are going to test the implementation workflow using our library. We'll try to simulate different scenarios for different users.

## Setup

In [1]:
import sys

# Editable-install the project in the parent directory using the interpreter
# that backs this kernel (sys.executable), so the package is installed into the
# same environment the notebook runs in. Presumably this is what makes the
# `sb_capstone` imports in the next cell resolvable — confirm against ../setup.
!{sys.executable} -m pip install -e ../ --quiet

In [2]:
import pandas as pd
import numpy as np

from sb_capstone.wrangling import (
    clean_transcript_group,
    clean_profile
)

from sb_capstone.shaping import (
    convert_for_receive_training,
    convert_for_select_training
)

from sb_capstone.experiment import(
    train_receive_offer,
    train_select_offer,
    receive_offer,
    select_offer
)

## Training

In [3]:
# Read the processed transcript groups from disk and hand the raw frame straight
# to the library's cleaning step, binding only the cleaned result.
transcript_group = (
    pd.read_csv("../data/processed/transcript_group.csv")
    .pipe(clean_transcript_group)
)

# Preview the first rows of the cleaned frame.
transcript_group.head()

Unnamed: 0,id,wave,diffs,received,viewed,completed,purchased,amount,reward,non_offer_amount,...,mobile,social,gender,age,generation,group,income,membership_year,membership_month,membership_day
0,1,2,48.0,True,True,False,False,0.0,0.0,0.0,...,True,False,U,,,,,2017,2,12
1,2,2,49.866737,False,False,False,False,0.0,0.0,0.0,...,False,False,F,55.0,boomers,middle_age,112000.0,2017,7,15
2,3,2,36.0,True,True,False,False,0.0,0.0,0.0,...,True,False,U,,,,,2018,7,12
3,4,2,26.0,True,True,True,True,19.67,0.0,29.72,...,True,True,F,75.0,silent,old,100000.0,2017,5,9
4,5,2,49.866737,False,False,False,True,0.0,0.0,3.5,...,False,False,U,,,,,2017,8,4


In [4]:
# Inspect column dtypes and non-null counts of the cleaned transcript groups.
transcript_group.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102000 entries, 0 to 101999
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   id                 102000 non-null  int64   
 1   wave               102000 non-null  int64   
 2   diffs              102000 non-null  float64 
 3   received           102000 non-null  bool    
 4   viewed             102000 non-null  bool    
 5   completed          102000 non-null  bool    
 6   purchased          102000 non-null  bool    
 7   amount             102000 non-null  float64 
 8   reward             102000 non-null  float64 
 9   non_offer_amount   102000 non-null  float64 
 10  mapped_offer       102000 non-null  category
 11  spendings          102000 non-null  float64 
 12  recommended_offer  102000 non-null  int64   
 13  offer_type         102000 non-null  category
 14  difficulty         102000 non-null  float64 
 15  duration           102000 non-null

In [5]:
# Shape the cleaned transcripts into the table that is passed to
# train_receive_offer further down.
# NOTE(review): the outputs show fewer rows here than in `transcript_group`
# (66501 vs 102000) — presumably rows without demographics are dropped; confirm
# in sb_capstone.shaping.
# NOTE: `data` is rebound to the select-offer training table in a later cell,
# so the training cells must be run in notebook order.
data = convert_for_receive_training(transcript_group)
data.head()

Unnamed: 0,purchased,gender,age,income,membership_year,membership_month,membership_day,gen_z,millenials,gen_x,boomers,silent,young,adult,middle_age,old
0,1.0,0.0,75.0,100000.0,2017.0,5.0,9.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1.0,1.0,65.0,53000.0,2018.0,2.0,9.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0.0,1.0,58.0,51000.0,2017.0,11.0,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,61.0,57000.0,2017.0,9.0,11.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,26.0,46000.0,2014.0,2.0,13.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Inspect dtypes and non-null counts of the receive-offer training table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66501 entries, 0 to 66500
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   purchased         66501 non-null  float64
 1   gender            66501 non-null  float64
 2   age               66501 non-null  float64
 3   income            66501 non-null  float64
 4   membership_year   66501 non-null  float64
 5   membership_month  66501 non-null  float64
 6   membership_day    66501 non-null  float64
 7   gen_z             66501 non-null  float64
 8   millenials        66501 non-null  float64
 9   gen_x             66501 non-null  float64
 10  boomers           66501 non-null  float64
 11  silent            66501 non-null  float64
 12  young             66501 non-null  float64
 13  adult             66501 non-null  float64
 14  middle_age        66501 non-null  float64
 15  old               66501 non-null  float64
dtypes: float64(16)
memory usage: 8.1 MB


In [7]:
# Train the receive-offer model on `data`, passing the path of the model file.
# As the output shows, the call returns a tuple of (that path, a nested dict of
# per-class precision/recall/f1-score/support plus accuracy and averages).
# NOTE(review): the dict has the layout of scikit-learn's
# classification_report(output_dict=True) and its support (16626) is about a
# quarter of the 66501 input rows — presumably a held-out split; confirm in
# sb_capstone.experiment.
train_receive_offer(data, "../models/receive_offer.pkl")

('../models/receive_offer.pkl',
 {'0.0': {'precision': 0.6625183648777115,
   'recall': 0.7742652257347743,
   'f1-score': 0.7140461997019375,
   'support': 9901},
  '1.0': {'precision': 0.5578635014836796,
   'recall': 0.41933085501858736,
   'f1-score': 0.47877758913412566,
   'support': 6725},
  'accuracy': 0.6306989053290027,
  'macro avg': {'precision': 0.6101909331806956,
   'recall': 0.5967980403766808,
   'f1-score': 0.5964118944180316,
   'support': 16626},
  'weighted avg': {'precision': 0.6201868385740387,
   'recall': 0.6306989053290027,
   'f1-score': 0.6188831174170503,
   'support': 16626}})

In [8]:
# Rebinds `data` (previously the receive-offer table) to the table that is
# passed to train_select_offer below.
# NOTE(review): columns "1".."10" look like one 0/1 indicator per offer id —
# TODO confirm in sb_capstone.shaping.
data = convert_for_select_training(transcript_group)
data.head()

Unnamed: 0,gender,age,income,membership_year,membership_month,membership_day,1,2,3,4,5,6,7,8,9,10
0,0,75.0,100000.0,2017,5,9,1,0,0,1,0,0,0,1,0,0
1,1,68.0,70000.0,2018,4,26,0,0,0,1,0,0,1,0,0,0
2,1,65.0,53000.0,2018,2,9,0,0,0,1,0,0,1,0,1,1
3,1,58.0,51000.0,2017,11,11,0,0,0,0,0,0,1,0,0,0
4,0,61.0,57000.0,2017,9,11,0,1,0,0,0,0,0,0,1,0


In [9]:
# Inspect dtypes and non-null counts of the select-offer training table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11587 entries, 0 to 11586
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            11587 non-null  int64  
 1   age               11587 non-null  float64
 2   income            11587 non-null  float64
 3   membership_year   11587 non-null  int64  
 4   membership_month  11587 non-null  int64  
 5   membership_day    11587 non-null  int64  
 6   1                 11587 non-null  uint8  
 7   2                 11587 non-null  uint8  
 8   3                 11587 non-null  uint8  
 9   4                 11587 non-null  uint8  
 10  5                 11587 non-null  uint8  
 11  6                 11587 non-null  uint8  
 12  7                 11587 non-null  uint8  
 13  8                 11587 non-null  uint8  
 14  9                 11587 non-null  uint8  
 15  10                11587 non-null  uint8  
dtypes: float64(2), int64(4), uint8(10)
memor

In [10]:
# Train the select-offer model on `data`, passing the path of the model file.
# The output is a tuple of (that path, a nested dict of per-class
# precision/recall/f1-score/support).
# NOTE(review): the class keys in the report start at '0' while the indicator
# columns of `data` are named 1..10 — presumably report index i maps to offer
# i + 1; verify before interpreting per-offer scores.
train_select_offer(data, "../models/select_offer.pkl")

('../models/select_offer.pkl',
 {'0': {'precision': 0.23136645962732919,
   'recall': 0.2547008547008547,
   'f1-score': 0.24247355573637103,
   'support': 585},
  '1': {'precision': 0.23007518796992482,
   'recall': 0.261986301369863,
   'f1-score': 0.24499599679743794,
   'support': 584},
  '2': {'precision': 0.12098765432098765,
   'recall': 0.1484848484848485,
   'f1-score': 0.13333333333333336,
   'support': 330},
  '3': {'precision': 0.1988527724665392,
   'recall': 0.22510822510822512,
   'f1-score': 0.2111675126903553,
   'support': 462},
  '4': {'precision': 0.11864406779661017,
   'recall': 0.14432989690721648,
   'f1-score': 0.13023255813953488,
   'support': 291},
  '5': {'precision': 0.30785123966942146,
   'recall': 0.3378684807256236,
   'f1-score': 0.3221621621621621,
   'support': 882},
  '6': {'precision': 0.3425925925925926,
   'recall': 0.3611713665943601,
   'f1-score': 0.3516367476240761,
   'support': 922},
  '7': {'precision': 0.25297619047619047,
   'recall': 0

## Implementation

Let us first load the existing customers and select a small subset of users (those with `id < 10`).

In [11]:
# Read the processed customer profiles and pass the raw frame straight through
# the library's cleaning step, binding only the cleaned result.
profile = (
    pd.read_csv("../data/processed/profile.csv")
    .pipe(clean_profile)
)

# Preview the first rows of the cleaned profiles.
profile.head()


Unnamed: 0,id,gender,age,income,became_member_on
0,1,,,,2017-02-12
1,2,F,55.0,112000.0,2017-07-15
2,3,,,,2018-07-12
3,4,F,75.0,100000.0,2017-05-09
4,5,,,,2017-08-04


In [12]:
# Ask whether each existing customer with id below 10 should receive an offer.
# With no default_value given, some rows come back blank in the output; for the
# ids visible in profile.head() (1, 3, 5) these are the profiles with missing
# gender/age/income — presumably the model cannot score them.
receive_offer(profile[profile.id < 10])

Unnamed: 0,id,receive_offer
0,1,
1,2,False
2,3,
3,4,True
4,5,
5,6,True
6,7,
7,8,
8,9,True


In [13]:
# Same subset as the previous cell, now with default_value=False: the rows that
# were blank before are reported as False in the output, while the other rows
# keep the same values.
receive_offer(profile[profile.id < 10], default_value=False)

Unnamed: 0,id,receive_offer
0,1,False
1,2,False
2,3,False
3,4,True
4,5,False
5,6,True
6,7,False
7,8,False
8,9,True


In [14]:
# Hand-written single-customer record that reuses the column layout of
# `profile`, cleaned with the same library step as the loaded profiles.
new_profile = pd.DataFrame(
    data=[[17001, "M", 41.0, 32000.0, "2018-12-01"]],
    columns=profile.columns,
).pipe(clean_profile)
new_profile

Unnamed: 0,id,gender,age,income,became_member_on
0,17001,M,41.0,32000.0,2018-12-01


Let's also test new profiles.

In [15]:
# Run the receive-offer decision on the hand-built profile instead of a row
# loaded from profile.csv.
receive_offer(new_profile)

Unnamed: 0,id,receive_offer
0,17001,False


In [16]:
# Recommend offers for the customers with id below 10, passing offers 6 and 7
# as default_offers. In the output, ids 1, 3, 5, 7 and 8 receive exactly
# [6, 7] — presumably the profiles the model cannot score fall back to the
# defaults.
# NOTE(review): the returned index restarts at 0 for those fallback rows, so it
# is not unique — looks like two frames concatenated without resetting the
# index; confirm in sb_capstone.experiment before indexing the result by label.
select_offer(profile[profile.id<10], default_offers=[6,7])

Unnamed: 0,id,recommended_offers
0,2,"[6, 7, 10]"
1,4,"[1, 4, 8]"
2,6,"[4, 7]"
3,9,"[4, 7, 9, 10]"
0,1,"[6, 7]"
1,3,"[6, 7]"
2,5,"[6, 7]"
3,7,"[6, 7]"
4,8,"[6, 7]"


In [17]:
# Only customer id 1 (missing gender/age/income in profile.head()) and no
# default_offers: the output shows an empty recommendation list for it.
select_offer(profile[profile.id<2])

Unnamed: 0,id,recommended_offers
0,1,[]


Let's use it with new profiles.

In [18]:
# Recommend offers for the hand-built profile instead of a row loaded from
# profile.csv.
select_offer(new_profile)

Unnamed: 0,id,recommended_offers
0,17001,[8]


# Conclusion

The `receive_offer` and `select_offer` methods, together with their configurable defaults, provide a flexible library that can be used to design the next experiments and gather new information and data.