# Starbucks Capstone Challenge - Data Preparation

## Introduction

## Setup

In [1]:
# Install the project one directory up in editable mode (-e); presumably this is
# what makes the `sb_capstone` imports in the next cell resolve — TODO confirm.
import sys

# Run pip through the interpreter at sys.executable rather than whichever `pip`
# is first on PATH; --quiet suppresses pip's normal output.
!{sys.executable} -m pip install -e ../ --quiet

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from sb_capstone.wrangling import (
    clean_portfolio,
    clean_profile,
    clean_transcript
)

from sb_capstone.shaping import (
    get_transcript_combined,
    get_transcript_group
)

In [3]:
# Read the portfolio CSV and hand it straight to the project's cleaning helper,
# then preview the first rows.
portfolio = pd.read_csv('../data/processed/portfolio.csv').pipe(clean_portfolio)
portfolio.head()

Unnamed: 0,id,offer_type,channels,reward,difficulty,duration
0,1,bogo,"[email, mobile, social]",10,10,7
1,2,bogo,"[web, email, mobile, social]",10,10,5
2,3,informational,"[web, email, mobile]",0,0,4
3,4,bogo,"[web, email, mobile]",5,5,7
4,5,discount,"[web, email]",5,20,10


In [4]:
# Print the column dtypes and non-null counts of the cleaned portfolio.
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   id          10 non-null     int64   
 1   offer_type  10 non-null     category
 2   channels    10 non-null     object  
 3   reward      10 non-null     int64   
 4   difficulty  10 non-null     int64   
 5   duration    10 non-null     int64   
dtypes: category(1), int64(4), object(1)
memory usage: 670.0+ bytes


In [5]:
# Read the profile CSV and hand it straight to the project's cleaning helper,
# then preview the first rows.
profile = pd.read_csv('../data/processed/profile.csv').pipe(clean_profile)
profile.head()

Unnamed: 0,id,gender,age,income,became_member_on
0,1,,,,2017-02-12
1,2,F,55.0,112000.0,2017-07-15
2,3,,,,2018-07-12
3,4,F,75.0,100000.0,2017-05-09
4,5,,,,2017-08-04


In [6]:
# Print the column dtypes and non-null counts of the cleaned profile.
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                17000 non-null  int64         
 1   gender            14825 non-null  category      
 2   age               14825 non-null  float64       
 3   income            14825 non-null  float64       
 4   became_member_on  17000 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(2), int64(1)
memory usage: 548.1 KB


In [7]:
# Read the transcript CSV and hand it straight to the project's cleaning helper,
# then preview the first rows.
transcript = pd.read_csv('../data/processed/transcript.csv').pipe(clean_transcript)
transcript.head()

Unnamed: 0,person_id,event,time,offer_id,amount,reward
0,4,offer_received,0,4,,
1,4,offer_viewed,6,4,,
2,4,transaction,132,0,19.89,
3,4,offer_completed,132,4,,5.0
4,4,transaction,144,0,17.78,


In [8]:
# Print the column dtypes and non-null counts of the cleaned transcript.
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   person_id  306534 non-null  int64   
 1   event      306534 non-null  category
 2   time       306534 non-null  int64   
 3   offer_id   306534 non-null  category
 4   amount     138953 non-null  float64 
 5   reward     33579 non-null   float64 
dtypes: category(2), float64(2), int64(2)
memory usage: 9.9 MB


## Data Transformation

To compute the success of each offer, we first need to simplify and compress the transcript. We summarize each customer's timeline by offer group and offer type, because duplicate offers can occur on a single customer's timeline. If the customer has not received any offer, the offer group and type are set to zero.

In addition, we need to flag whether each offer was completed. The challenge is that informational offers do not have a completion event. However, the data description presumes that customers are influenced by an informational offer during its validity period. Therefore, we can set the completion event for an informational offer if a transaction happens within its validity period.

In [9]:
# Left-join the offer attributes onto every transcript event. Both frames carry
# a `reward` column, so the suffixed results are renamed: the transcript's value
# stays `reward` and the portfolio's becomes `offer_reward`.
transcript_all = (
    transcript
    .merge(portfolio, left_on="offer_id", right_on="id", how="left")
    .rename(columns={"reward_x": "reward", "reward_y": "offer_reward"})
)

transcript_all.head()

Unnamed: 0,person_id,event,time,offer_id,amount,reward,id,offer_type,channels,offer_reward,difficulty,duration
0,4,offer_received,0,4,,,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0
1,4,offer_viewed,6,4,,,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0
2,4,transaction,132,0,19.89,,,,,,,
3,4,offer_completed,132,4,,5.0,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0
4,4,transaction,144,0,17.78,,,,,,,


In [10]:
# Run the project's combining step over the merged transcript, then show one
# customer's rows as a spot check.
# NOTE: this rebinds `transcript_all`, so re-running only this cell feeds the
# already-combined frame back into the helper.
transcript_all = get_transcript_combined(transcript_all)
transcript_all.loc[transcript_all["person_id"] == 4]

Unnamed: 0,person_id,event,time,mapped_offer,amount,reward,offer_id,offer_type,channels,offer_reward,difficulty,duration,wave,day,offer_group
0,4,offer_received,0,4,,,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0,1,1,1.0
1,4,offer_viewed,6,4,,,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0,1,1,1.0
2,4,transaction,132,0,19.89,,4.0,,,,,,1,6,1.0
3,4,offer_completed,132,4,,5.0,4.0,bogo,"[web, email, mobile]",5.0,5.0,7.0,1,6,1.0
4,4,transaction,144,0,17.78,,0.0,,,,,,1,7,-1.0
5,4,offer_received,168,8,,,8.0,informational,"[email, mobile, social]",0.0,0.0,3.0,2,8,2.0
6,4,offer_viewed,216,8,,,8.0,informational,"[email, mobile, social]",0.0,0.0,3.0,2,10,2.0
7,4,transaction,222,0,19.67,,8.0,,,,,,2,10,2.0
8,4,transaction,240,0,29.72,,0.0,,,,,,2,11,-2.0
9,4,transaction,378,0,23.93,,0.0,,,,,,3,16,-3.0


In [11]:
# Build the grouped transcript from the combined events and the customer
# profile, then show one customer's rows as a spot check.
transcript_group = get_transcript_group(transcript_all, profile)
transcript_group.loc[transcript_group["person_id"] == 4]

Unnamed: 0,id,gender,age,income,became_member_on,wave,person_id,offer_group,event,mapped_offer,amount,reward,offer_type,channels,offer_reward,difficulty,duration,non_offer_amount
3,4,F,75.0,100000.0,2017-05-09,2,4.0,2.0,"[offer_received, offer_viewed, offer_completed]",0.0,19.67,,informational,"[email, mobile, social]",0.0,0.0,3.0,29.72
34003,4,F,75.0,100000.0,2017-05-09,4,4.0,3.0,"[offer_received, offer_viewed, offer_completed]",1.0,0.0,10.0,bogo,"[email, mobile, social]",10.0,10.0,7.0,
51003,4,F,75.0,100000.0,2017-05-09,5,4.0,4.0,"[offer_received, offer_completed, offer_viewed]",0.0,21.72,5.0,bogo,"[web, email, mobile, social]",5.0,5.0,5.0,26.56
85003,4,F,75.0,100000.0,2017-05-09,1,4.0,1.0,"[offer_received, offer_viewed, offer_completed]",0.0,19.89,5.0,bogo,"[web, email, mobile]",5.0,5.0,7.0,17.78


In [12]:
# Show the grouped rows that belong to informational offers, to check how
# these offers were summarized (see the `event` and `amount` columns).
transcript_group[transcript_group.offer_type == "informational"]

Unnamed: 0,id,gender,age,income,became_member_on,wave,person_id,offer_group,event,mapped_offer,amount,reward,offer_type,channels,offer_reward,difficulty,duration,non_offer_amount
3,4,F,75.0,100000.0,2017-05-09,2,4.0,2.0,"[offer_received, offer_viewed, offer_completed]",0.0,19.67,,informational,"[email, mobile, social]",0.0,0.0,3.0,29.72
9,10,,,,2016-11-22,2,10.0,1.0,"[offer_received, offer_viewed]",8.0,0.00,,informational,"[email, mobile, social]",0.0,0.0,3.0,
22,23,F,78.0,71000.0,2017-06-16,2,23.0,2.0,"[offer_received, offer_viewed, offer_completed]",0.0,25.27,,informational,"[email, mobile, social]",0.0,0.0,3.0,
23,24,,,,2017-09-07,2,24.0,1.0,"[offer_received, offer_viewed, offer_completed]",0.0,2.31,,informational,"[web, email, mobile]",0.0,0.0,4.0,4.62
28,29,F,46.0,59000.0,2017-09-11,2,29.0,1.0,[offer_received],3.0,0.00,,informational,"[web, email, mobile]",0.0,0.0,4.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101974,16975,F,59.0,71000.0,2016-01-13,1,16975.0,1.0,"[offer_received, offer_viewed]",8.0,0.00,,informational,"[email, mobile, social]",0.0,0.0,3.0,
101975,16976,F,61.0,81000.0,2017-12-31,1,16976.0,1.0,"[offer_received, offer_viewed]",3.0,0.00,,informational,"[web, email, mobile]",0.0,0.0,4.0,
101977,16978,,,,2016-06-09,1,16978.0,1.0,"[offer_received, offer_completed, offer_viewed]",0.0,4.69,,informational,"[web, email, mobile]",0.0,0.0,4.0,2.38
101981,16982,M,84.0,93000.0,2016-06-29,1,16982.0,1.0,"[offer_received, offer_viewed, offer_completed]",0.0,25.07,,informational,"[email, mobile, social]",0.0,0.0,3.0,


In [13]:
# Write the combined transcript to CSV; index=False leaves the row index out of the file.
transcript_all.to_csv("../data/processed/transcript_all.csv", index=False)

In [None]:
# Write the grouped transcript to CSV; index=False leaves the row index out of the file.
transcript_group.to_csv("../data/processed/transcript_group.csv", index=False)