# Udacity Capstone Project
## Starbucks Promotional Offers
### April 2023

# Data Sets
The data for this project are contained in three files:

- **portfolio.json** - containing offer ids and meta data about each offer (duration, type, etc.)
- **profile.json** - demographic data for each customer
- **transcript.json** - records for transactions, offers received, offers viewed, and offers completed

## Schema details
**portfolio.json**
* id (string) - offer id
* offer_type (string) - type of offer ie BOGO, discount, informational
* difficulty (int) - minimum required spend to complete an offer
* reward (int) - reward given for completing an offer
* duration (int) - time for offer to be open, in days
* channels (list of strings)

**profile.json**
* age (int) - age of the customer 
* became_member_on (int) - date when customer created an app account
* gender (str) - gender of the customer (note some entries contain 'O' for other rather than M or F)
* id (str) - customer id
* income (float) - customer's income

**transcript.json**
* event (str) - record description (ie transaction, offer received, offer viewed, etc.)
* person (str) - customer id
* time (int) - time in hours since start of test. The data begins at time t=0
* value - (dict of strings) - either an offer id or transaction amount depending on the record

In [1]:
# import necessary libraries
import pandas as pd
import numpy as np
import math
import json
from sklearn import preprocessing
%matplotlib inline

In [41]:
# read in the json files
portfolio = pd.read_json('data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('data/profile.json', orient='records', lines=True)
transcript = pd.read_json('data/transcript.json', orient='records', lines=True)

## Data Exploration

### portfolio.json

In [3]:
# look at first few rows
portfolio.head()

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [4]:
# shape of portfolio data
print('Rows: {}'.format(portfolio.shape[0]))
print('Columns: {}'.format(portfolio.shape[1]))

Rows: 10
Columns: 6


In [5]:
# column information
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
 5   id          10 non-null     object
dtypes: int64(3), object(3)
memory usage: 608.0+ bytes


In [6]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [7]:
label_encoder = preprocessing.LabelEncoder()

In [8]:
profile['id'] = label_encoder.fit_transform(profile['id'])

In [9]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,6961,20170212,
1,F,55,398,20170715,112000.0
2,,118,3746,20180712,
3,F,75,7996,20170509,100000.0
4,,118,10735,20170804,


In [42]:
transcript.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [43]:
transcript.rename(columns = {'person' : 'customer_id'}, inplace = True)
transcript.head(2)

Unnamed: 0,customer_id,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0


In [44]:
# one-hot encode 'event' column
event_temp = pd.get_dummies(transcript['event'])
event_temp.drop(['transaction'], axis = 1, inplace = True)

# merge event_temp into transcript
transcript = pd.concat([transcript, event_temp], axis = 1)

In [45]:
transcript.head(2)

Unnamed: 0,customer_id,event,value,time,offer completed,offer received,offer viewed
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,0,1,0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,0,1,0


In [35]:
transcript['offer_id'] = transcript['value'].apply(lambda x: list(x.values())[0])
transcript.head(2)

Unnamed: 0,customer_id,event,value,time,offer completed,offer received,offer viewed,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,0,1,0,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,0,1,0,0b1e1539f2cc45b7b9fa7c272da2e1d7


In [37]:
transcript['amount'] = transcript.query('event == "transaction"')['value'].apply(lambda x: list(x.values())[0])
transcript.head(2)

Unnamed: 0,customer_id,event,value,time,offer completed,offer received,offer viewed,offer_id,amount
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,0,1,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,0,1,0,0b1e1539f2cc45b7b9fa7c272da2e1d7,


In [38]:
transcript.sample(5)

Unnamed: 0,customer_id,event,value,time,offer completed,offer received,offer viewed,offer_id,amount
86133,0b3878783eed4009993f706e51f19d9e,transaction,{'amount': 26.04},216,0,0,0,26.04,26.04
43165,a60dc180138d4278ae6c6eba2c364356,transaction,{'amount': 9.29},102,0,0,0,9.29,9.29
14840,1072640954974f2a9b4aff013d6c827f,offer viewed,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,0,0,1,fafdcd668e3743c1bb461111dcafc2a4,
159748,aecc4f164b69498fa32aa173636fecbe,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},408,0,1,0,4d5c57ea9a6940dd891ad53e9dbe8da0,
107281,ed79f35c799d4f829f864b86f5eb43f7,offer completed,{'offer_id': 'ae264e3637204a6fb9bb56bc8210ddfd...,312,1,0,0,ae264e3637204a6fb9bb56bc8210ddfd,


In [46]:
def extract_offer_id(value):
    '''
    extract_offer_id:
        - extract offer_id column from transcript dataframe value column
    
    IN:
        - transcript dataframe value colum
    
    OUT:
        - offer_id column extracted from value column where value key is 'offer id' or 'offer_id'
    '''
    if list(value.keys())[0] in ['offer id', 'offer_id']:
        return list(value.values())[0]
    
def extract_amount(value):
    '''
    extract_offer_id:
        - extract amount column from transcript dataframe value column
        
    IN:
        - transcript dataframe value colum
    
    OUT:
        - amount extracted from value column where value key is 'offer id' or 'offer_id'
    '''
    if list(value.keys())[0] in ['amount']:
        return list(value.values())[0]

In [47]:
transcript['offer_id'] = transcript.value.apply(extract_offer_id)
transcript['amount'] = transcript.value.apply(extract_amount)

In [48]:
transcript.sample(5)

Unnamed: 0,customer_id,event,value,time,offer completed,offer received,offer viewed,offer_id,amount
146540,9b6e529df1234889bb807759037a3f7d,transaction,{'amount': 2.17},390,0,0,0,,2.17
246455,43daf8fb68d147d38cb1eb669d7916c5,offer received,{'offer id': 'ae264e3637204a6fb9bb56bc8210ddfd'},576,0,1,0,ae264e3637204a6fb9bb56bc8210ddfd,
98305,d0ab05be7544435f935f13082b31ff0e,transaction,{'amount': 6.1},264,0,0,0,,6.1
265109,8e96eadadb8c46bebc04f2697f4367fa,transaction,{'amount': 1.79},582,0,0,0,,1.79
174614,21f58f38b6ac4f67acd2959072adaa7c,offer completed,{'offer_id': '9b98b8c7a33c4b65b9aebfe6a799e6d9...,426,1,0,0,9b98b8c7a33c4b65b9aebfe6a799e6d9,


In [None]:
names = { 'Amar', 'Hema', 'Balu' }
>>> lst = { key:value for key,value in marks.items() if key in names}

In [146]:
offer_vals = {'offer id', 'offer_id'}
transcript['offer_id'] = {key:value for key, value in transcript.value.items() if key in offer_vals}

ValueError: Length of values (0) does not match length of index (306534)