# Starbucks Capstone Challenge - Exploratory Data Analysis

## Setup

In [1]:
!pip install -e ../

Obtaining file:///C:/Users/netxph/Projects/sb-capstone
Installing collected packages: sb-capstone
  Attempting uninstall: sb-capstone
    Found existing installation: sb-capstone 0.1.0
    Uninstalling sb-capstone-0.1.0:
      Successfully uninstalled sb-capstone-0.1.0
  Running setup.py develop for sb-capstone
Successfully installed sb-capstone-0.1.0


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sb_capstone.wrangling import (
    clean_portfolio, 
    clean_profile, 
    ChannelType
)

In [3]:
# Read the processed offer portfolio, run it through clean_portfolio, and
# preview the first rows.
portfolio = pd.read_csv("../data/processed/portfolio.csv").pipe(clean_portfolio)
portfolio.head()

Unnamed: 0,id,offer_type,channels,reward,difficulty,duration
0,1,bogo,"[email, mobile, social]",10,10,7
1,2,bogo,"[web, email, mobile, social]",10,10,5
2,3,informational,"[web, email, mobile]",0,0,4
3,4,bogo,"[web, email, mobile]",5,5,7
4,5,discount,"[web, email]",5,20,10


In [4]:
# Summarize the cleaned portfolio: column names, non-null counts, and dtypes.
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   id          10 non-null     int64   
 1   offer_type  10 non-null     category
 2   channels    10 non-null     object  
 3   reward      10 non-null     int64   
 4   difficulty  10 non-null     int64   
 5   duration    10 non-null     int64   
dtypes: category(1), int64(4), object(1)
memory usage: 670.0+ bytes


In [5]:
# Read the processed member profiles, run them through clean_profile, and
# preview the first rows.
profile = pd.read_csv("../data/processed/profile.csv").pipe(clean_profile)
profile.head()

Unnamed: 0,id,gender,age,income,became_member_on
0,1,,,,2017-02-12
1,2,F,55.0,112000.0,2017-07-15
2,3,,,,2018-07-12
3,4,F,75.0,100000.0,2017-05-09
4,5,,,,2017-08-04


In [6]:
# Summarize the cleaned profiles: column names, non-null counts, and dtypes.
# The non-null counts are the quickest way to spot columns with missing data.
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   id                17000 non-null  int64         
 1   gender            14825 non-null  category      
 2   age               14825 non-null  float64       
 3   income            14825 non-null  float64       
 4   became_member_on  17000 non-null  datetime64[ns]
dtypes: category(1), datetime64[ns](1), float64(2), int64(1)
memory usage: 548.1 KB


In [7]:
# Load the processed transcript (one row per event) and preview the first rows.
# NOTE(review): unlike portfolio and profile, no clean_* helper is applied
# here — confirm the raw CSV dtypes are what later analysis expects.
transcript = pd.read_csv("../data/processed/transcript.csv")
transcript.head()

Unnamed: 0,person_id,event,time,offer_id,amount,reward
0,4,offer_received,0,4,,
1,4,offer_viewed,6,4,,
2,4,transaction,132,0,19.89,
3,4,offer_completed,132,4,,5.0
4,4,transaction,144,0,17.78,


## Data Analysis

* Descriptive
* Missing Values
* Duplicates

In [8]:
# Summary statistics for every portfolio column; include="all" also covers
# the non-numeric columns (offer_type, channels), not just the numeric ones.
portfolio.describe(include="all")

Unnamed: 0,id,offer_type,channels,reward,difficulty,duration
count,10.0,10,10,10.0,10.0,10.0
unique,,3,4,,,
top,,bogo,"[web, email, mobile, social]",,,
freq,,4,4,,,
mean,5.5,,,4.2,7.7,6.5
std,3.02765,,,3.583915,5.831905,2.321398
min,1.0,,,0.0,0.0,3.0
25%,3.25,,,2.0,5.0,5.0
50%,5.5,,,4.0,8.5,7.0
75%,7.75,,,5.0,10.0,7.0


In [9]:
# Number of offers per offer type.
portfolio["offer_type"].value_counts()

bogo             4
discount         4
informational    2
Name: offer_type, dtype: int64

In [10]:
# Number of offers delivered through each channel: flatten the per-offer
# channel lists to one row per (offer, channel), cast to the project's
# ChannelType dtype, then count occurrences.
(
    portfolio["channels"]
    .explode()
    .astype(ChannelType)
    .value_counts()
)

email     10
mobile     9
web        8
social     6
Name: channels, dtype: int64

In [11]:
# Number of channels used by each offer: one row per (offer, channel) after
# exploding, then count the channel rows within each offer id.
(
    portfolio
    .explode(column="channels")
    .groupby("id")["channels"]
    .count()
)

id
1     3
2     4
3     3
4     3
5     2
6     4
7     4
8     3
9     4
10    3
Name: channels, dtype: int64

In [12]:
# Summary statistics for every profile column. datetime_is_numeric=True makes
# became_member_on report mean/min/quantiles/max rather than unique/top/freq.
# NOTE(review): pandas 2.0 removed the datetime_is_numeric argument (datetimes
# are always summarized numerically there) — drop it when upgrading pandas.
profile.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,id,gender,age,income,became_member_on
count,17000.0,14825,14825.0,14825.0,17000
unique,,3,,,
top,,M,,,
freq,,8484,,,
mean,8500.5,,54.393524,65404.991568,2017-02-23 13:12:10.164706048
min,1.0,,18.0,30000.0,2013-07-29 00:00:00
25%,4250.75,,42.0,49000.0,2016-05-26 00:00:00
50%,8500.5,,55.0,64000.0,2017-08-02 00:00:00
75%,12750.25,,66.0,80000.0,2017-12-30 00:00:00
max,17000.0,,101.0,120000.0,2018-07-26 00:00:00


In [17]:
# Gender distribution, keeping missing values as their own NaN bucket.
profile["gender"].value_counts(dropna=False)

M      8484
F      6129
NaN    2175
O       212
Name: gender, dtype: int64

In [25]:
# Profiles for which gender, age and income are all missing at the same time;
# keep only the identifying columns and report how many there are.
null_profiles = profile.loc[
    profile[["gender", "age", "income"]].isna().all(axis=1),
    ["id", "gender", "became_member_on"],
]
len(null_profiles)

2175

In [26]:
# Summary statistics for the profiles with missing demographics. Without
# include="all" only the numeric/datetime columns are summarized, so the
# all-NaN gender column does not appear in the result.
# NOTE(review): pandas 2.0 removed the datetime_is_numeric argument (datetimes
# are always summarized numerically there) — drop it when upgrading pandas.
null_profiles.describe(datetime_is_numeric=True)

Unnamed: 0,id,became_member_on
count,2175.0,2175
mean,8517.805517,2017-03-29 19:53:42.620689920
min,1.0,2013-08-02 00:00:00
25%,4006.5,2016-07-04 12:00:00
50%,8542.0,2017-07-31 00:00:00
75%,12855.0,2017-12-26 00:00:00
max,16995.0,2018-07-26 00:00:00
std,4985.473324,


> NOTES

> Is it safe to delete the 2,175 profiles whose gender, age, and income are all missing?

> TODO: Check whether any transcript events reference profiles that have missing values.