<a href="https://colab.research.google.com/github/p33ves/CMPT726-ML-Project/blob/KH/Detecting_Potential_Passive_Customers_Data%20processing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Detecting Potential Passive Customers on the Starbucks Reward Datasets

References : 
- https://towardsdatascience.com/starbucks-offer-personalization-sending-the-right-offer-to-the-right-customer-14d4fbc20575
- https://github.com/lalago31/Data-Science-Portfolio/tree/master/Capstone%20-%20Starbucks%20Offer%20Personalization


In [None]:
import pandas as pd
import numpy as np
import math
import json

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

from datetime import datetime
from dateutil.relativedelta import relativedelta

Installing and importing plotly on Colab

In [None]:
!pip install plotly==4.12.0
import plotly.express as px

Collecting plotly==4.12.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/af86e9d9bf1a3e4f2dabebeabd02a32e8ddf671a5d072b3af2b011efea99/plotly-4.12.0-py2.py3-none-any.whl (13.1MB)
[K     |████████████████████████████████| 13.1MB 5.8MB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.12.0


Loading json datasets from Github repo

In [None]:
# Raw JSON-lines files hosted in the project repository (KH branch).
portfolio_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/portfolio.json'
profile_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/profile.json'
transcript_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/transcript.json'

# Each file holds one JSON record per line, hence orient='records' + lines=True.
# The files are read in the order portfolio -> profile -> transcript.
portfolio, profile, transcript = (
    pd.read_json(dataset_url, orient='records', lines=True)
    for dataset_url in (portfolio_url, profile_url, transcript_url)
)

### A. Portfolio Dataset

In [None]:
print(portfolio['offer_type'].value_counts())
portfolio.head(10)

discount         4
bogo             4
informational    2
Name: offer_type, dtype: int64


Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


In [None]:
# One-hot encode the list-valued 'channels' column: join each list into a
# comma-separated string, then let get_dummies split it into 0/1 columns.
medium = portfolio['channels'].str.join(',').str.get_dummies(',')

# Replace the original list column with the indicator columns (appended last).
portfolio = portfolio.drop(columns=['channels']).join(medium)

In [None]:
portfolio.head(10)

Unnamed: 0,reward,difficulty,duration,offer_type,id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [None]:
# Number of offers of each type that are delivered through each channel.
# The columns are selected with a list: selecting them with bare comma-separated
# keys (an implicit tuple) is deprecated and raises in recent pandas versions.
channel_columns = ['email', 'mobile', 'social', 'web']
portfolio_sum = portfolio.groupby('offer_type')[channel_columns].agg('sum')
portfolio_sum.head()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0_level_0,email,mobile,social,web
offer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bogo,4,4,3,3
discount,4,3,2,4
informational,2,2,1,1


In [None]:
# Turn the 'offer_type' index back into a regular column so it can be used
# as a plotting axis.
portfolio_sum = portfolio_sum.reset_index()
portfolio_sum.head()

Unnamed: 0,offer_type,email,mobile,social,web
0,bogo,4,4,3,3
1,discount,4,3,2,4
2,informational,2,2,1,1


Visualizing the offer type

In [None]:
offer_type_bar = px.bar(portfolio_sum, x='offer_type', y=['email', 'mobile', 'social', 'web'], barmode='group')
offer_type_bar.show()

### B. Profile Data

In [None]:
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [None]:
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


In [None]:
# Finding null data on the profile dataframe
profile.isnull().sum()

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [None]:
age_hist = px.histogram(profile, x='age', nbins=20, title='Age Distribution')
age_hist.show()

In [None]:
# Discard every customer row that has at least one missing field
# (per the null counts above, these are the rows lacking gender and income).
profile = profile.dropna()
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
5,M,68,e2127556f4f64592b11af22de27a7932,20180426,70000.0
8,M,65,389bc3fa690240e798340f5a15918d5c,20180209,53000.0
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,20171111,51000.0


In [None]:
profile.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 1 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               14825 non-null  int64  
 2   id                14825 non-null  object 
 3   became_member_on  14825 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 694.9+ KB


In [None]:
# Age distribution after dropping the profile rows that contained null values
# (gender and income were the only columns with nulls).
age_hist = px.histogram(profile, x='age', nbins=20, title='Age Distribution')
age_hist.show()

### C. Transcript Dataset

In [None]:
transcript.head(10)

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0
5,389bc3fa690240e798340f5a15918d5c,offer received,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},0
6,c4863c7985cf408faee930f111475da3,offer received,{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'},0
7,2eeac8d8feae4a8cad5a6af0499a211d,offer received,{'offer id': '3f207df678b143eea3cee63160fa8bed'},0
8,aa4862eba776480b8bb9c68455b8c2e1,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
9,31dda685af34476cad5bc968bdb01c53,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0


In [None]:
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   person  306534 non-null  object
 1   event   306534 non-null  object
 2   value   306534 non-null  object
 3   time    306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


In [None]:
# How many unique customer IDs appear in the transcript?
transcript['person'].unique().shape

(17000,)

In [None]:
transcript['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [None]:
# Collect every distinct key set used by the dicts stored in the 'value' column,
# in order of first appearance.
value_keys = []
for payload in transcript['value']:
    payload_keys = payload.keys()
    if payload_keys not in value_keys:
        value_keys.append(payload_keys)
print(value_keys)

[dict_keys(['offer id']), dict_keys(['amount']), dict_keys(['offer_id', 'reward'])]


If value column has 
- only 'offer id' -> offer viewed / offer received
- only 'amount' -> transaction
- 'offer_id' & 'reward' -> offer completed

In [None]:
# Expand the dict stored in each 'value' cell into one column per key
# ('offer id', 'amount', 'offer_id', 'reward'); keys missing from a row become NaN.
# Building the frame once from the list of dicts avoids constructing a separate
# Series for every row, which is what .apply(pd.Series) does and is very slow
# on a frame of this size.
parsed_value = pd.DataFrame(transcript['value'].tolist(), index=transcript.index)
transcript = pd.concat([transcript, parsed_value], axis=1)

In [None]:
transcript.head()

Unnamed: 0,person,event,value,time,offer id,amount,offer_id,reward
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,9b98b8c7a33c4b65b9aebfe6a799e6d9,,,
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,0b1e1539f2cc45b7b9fa7c272da2e1d7,,,
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0,2906b810c7d4411798c6938adc9daaa5,,,
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,fafdcd668e3743c1bb461111dcafc2a4,,,
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0,4d5c57ea9a6940dd891ad53e9dbe8da0,,,


In [None]:
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 8 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   person    306534 non-null  object 
 1   event     306534 non-null  object 
 2   value     306534 non-null  object 
 3   time      306534 non-null  int64  
 4   offer id  134002 non-null  object 
 5   amount    138953 non-null  float64
 6   offer_id  33579 non-null   object 
 7   reward    33579 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 18.7+ MB


In [None]:
# The offer identifier is stored under two different keys:
#   'offer id'  (with a space)       -> 'offer received' / 'offer viewed' events
#   'offer_id'  (with an underscore) -> 'offer completed' events
# Merge them into one column: keep 'offer id' where present, otherwise fall
# back to 'offer_id' (rows with neither, i.e. transactions, stay null).
combined_offer_id = transcript['offer id'].fillna(transcript['offer_id'])

# Replace both source columns with the single merged 'offer_id' column.
transcript = (
    transcript
    .drop(columns=['offer id', 'offer_id'])
    .assign(offer_id=combined_offer_id)
)

In [None]:
transcript.head()

Unnamed: 0,person,event,value,time,amount,reward,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0,,,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,,,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0,,,4d5c57ea9a6940dd891ad53e9dbe8da0


In [None]:
# Sanity check: which event types fall into each combination of populated columns.
has_offer_id = transcript['offer_id'].notnull()
has_reward = transcript['reward'].notnull()
has_amount = transcript['amount'].notnull()

event_groups = [
    ('Events where an offer is completed', has_offer_id & has_reward),
    # offer id present but no reward: covers both 'offer received' and 'offer viewed'
    ('Events where an offer is viewed', has_offer_id & ~has_reward),
    ('Events where transaction occured', has_amount),
]

for position, (heading, row_mask) in enumerate(event_groups):
    if position > 0:
        print('\n')  # blank separator between consecutive groups
    print(heading)
    print(transcript.loc[row_mask, 'event'].value_counts())

Events where an offer is completed
offer completed    33579
Name: event, dtype: int64


Events where an offer is viewed
offer received    76277
offer viewed      57725
Name: event, dtype: int64


Events where transaction occured
transaction    138953
Name: event, dtype: int64


In [None]:
# Give the portfolio key the same name as the transcript's offer column
# ('id' -> 'offer_id') so the two frames can be joined on it.
portfolio = portfolio.rename(columns={'id': 'offer_id'})
portfolio.head(10)

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1


In [None]:
# Attach the offer attributes (type, difficulty, duration, channels, ...) to each event.
# A left join keeps every transcript row in its original order; events without
# an offer_id (transactions) simply get nulls in the portfolio columns.
# Both frames have a 'reward' column, so the result carries 'reward_x' / 'reward_y'.
transcript_new = pd.merge(transcript, portfolio, on='offer_id', how='left')
transcript_new.head()

Unnamed: 0,person,event,value,time,amount,reward_x,offer_id,reward_y,difficulty,duration,offer_type,email,mobile,social,web
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0


In [None]:
transcript_new.groupby(['event','offer_type'])['offer_type'].count()

event            offer_type   
offer completed  bogo             15669
                 discount         17910
offer received   bogo             30499
                 discount         30543
                 informational    15235
offer viewed     bogo             25449
                 discount         21445
                 informational    10831
Name: offer_type, dtype: int64

In [None]:
# The raw 'value' dicts have already been expanded into columns, so discard them.
transcript_new = transcript_new.drop(columns=['value'])
transcript_new.head(10)

Unnamed: 0,person,event,time,amount,reward_x,offer_id,reward_y,difficulty,duration,offer_type,email,mobile,social,web
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
2,e2127556f4f64592b11af22de27a7932,offer received,0,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0
5,389bc3fa690240e798340f5a15918d5c,offer received,0,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0
6,c4863c7985cf408faee930f111475da3,offer received,0,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0
7,2eeac8d8feae4a8cad5a6af0499a211d,offer received,0,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
8,aa4862eba776480b8bb9c68455b8c2e1,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
9,31dda685af34476cad5bc968bdb01c53,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0


In [None]:
# Disambiguate the two reward columns produced by the merge:
#   reward_x came from the transcript  -> reward actually received
#   reward_y came from the portfolio   -> reward defined by the offer
reward_column_names = {'reward_x': 'reward_received', 'reward_y': 'reward_defined'}
transcript_new = transcript_new.rename(columns=reward_column_names)
transcript_new['reward_received'].value_counts()

5.0     12070
2.0      9334
10.0     7019
3.0      5156
Name: reward_received, dtype: int64

In [None]:
transcript_new.head()

Unnamed: 0,person,event,time,amount,reward_received,offer_id,reward_defined,difficulty,duration,offer_type,email,mobile,social,web
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,7.0,bogo,1.0,1.0,0.0,1.0
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0
2,e2127556f4f64592b11af22de27a7932,offer received,0,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,7.0,discount,1.0,1.0,0.0,1.0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0


In [None]:
transcript_new.loc[transcript_new['event']=='offer completed', 'person'].head(10)

12658    9fa9ae8f57894cc9a3b8a9bbe0fc1b2f
12672    fe97aa22dd3e48c8b143116a8403dd52
12679    629fc02d56414d91bca360decdfa9288
12692    676506bad68e4161b9bbaffeb039626b
12697    8f7dd3b2afe14c078eb4f6e6fe4ba97d
12717    227f2d69e46a4899b70d48182822cff6
12721    bb0f25e23a4c4de6a645527c275cd594
12744    d72d201be5794279aa716d8ad82b8d90
12764    73ffefd41e9a4ca3ab26b2b3697c6eb7
12767    3e621194f72e40d7a0b695ee9b7c38b7
Name: person, dtype: object

In [None]:
# Inspect the full event history of a single customer, in original row order.
is_sample_person = transcript_new['person'] == 'fe97aa22dd3e48c8b143116a8403dd52'
transcript_new[is_sample_person].sort_index().head(50)

Unnamed: 0,person,event,time,amount,reward_received,offer_id,reward_defined,difficulty,duration,offer_type,email,mobile,social,web
107,fe97aa22dd3e48c8b143116a8403dd52,offer received,0,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
12671,fe97aa22dd3e48c8b143116a8403dd52,transaction,0,18.97,,,,,,,,,,
12672,fe97aa22dd3e48c8b143116a8403dd52,offer completed,0,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
15578,fe97aa22dd3e48c8b143116a8403dd52,offer viewed,6,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0
24238,fe97aa22dd3e48c8b143116a8403dd52,transaction,30,13.9,,,,,,,,,,
38230,fe97aa22dd3e48c8b143116a8403dd52,transaction,84,16.99,,,,,,,,,,
39554,fe97aa22dd3e48c8b143116a8403dd52,transaction,90,13.96,,,,,,,,,,
53283,fe97aa22dd3e48c8b143116a8403dd52,offer received,168,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
79443,fe97aa22dd3e48c8b143116a8403dd52,offer viewed,198,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,4.0,informational,1.0,1.0,0.0,1.0
79444,fe97aa22dd3e48c8b143116a8403dd52,transaction,198,28.71,,,,,,,,,,


## Feature Engineering

To detect potential passive customers using supervised learning models,
we first need to define what a passive customer is, so that the training dataset can be labelled.

These are the features we could consider for the machine learning models.
 
1. Count of 'transactions' for each customer
2. Total amount(sum) of transaction for each customer
3. Ratio of total amount(sum) / count(transactions)
4. Count of 'offer viewed' for each customer
5. Count of 'offer completed' for each customer
6. became_member_on
7. Gender
8. Age
9. Income
10. Average difficulty score

In [None]:
# Minimum ratio of completed offers to transactions for a customer to be
# labelled "active". This labelling rule can be changed should we decide on
# a better way to classify customers.
ACTIVE_COMPLETION_RATIO = 0.3

# Helper column of ones: summing it counts rows.
transcript_new['count'] = 1

# Number of events of each type per customer (one row per person, one column per event).
# An event type a customer never had shows up as NaN rather than 0.
# aggfunc is given as the string 'sum' because passing np.sum is deprecated.
count_of_events = pd.pivot_table(
    transcript_new, values='count', index='person', columns='event', aggfunc='sum'
)

# Despite the column names these are plain ratios (not multiplied by 100).
count_of_events['completed/transaction percentage'] = (
    count_of_events['offer completed'] / count_of_events['transaction']
)
count_of_events['completed/viewed percentage'] = (
    count_of_events['offer completed'] / count_of_events['offer viewed']
)

# Everyone starts as "passive"; customers whose completed/transaction ratio reaches
# the threshold become "active". A NaN ratio never satisfies the comparison, so
# those customers stay "passive".
count_of_events['Customer Type'] = 'passive'
is_active = count_of_events['completed/transaction percentage'] >= ACTIVE_COMPLETION_RATIO
count_of_events.loc[is_active, 'Customer Type'] = 'active'
count_of_events.head()

event,offer completed,offer received,offer viewed,transaction,completed/transaction percentage,completed/viewed percentage,Customer Type
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0,0.375,0.75,active
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0,,,passive
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0,0.6,0.6,active
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0,0.375,1.0,active
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0,0.25,0.75,passive
003d66b6608740288d6cc97a6903f4f0,3.0,5.0,4.0,18.0,0.166667,0.75,passive
00426fe3ffde4c6b9cb9ad6d077a13ea,1.0,5.0,2.0,17.0,0.058824,0.5,passive
004b041fbfe44859945daa2c7f79ee64,2.0,3.0,2.0,6.0,0.333333,1.0,active
004c5799adbf42868b9cff0396190900,5.0,5.0,4.0,12.0,0.416667,1.25,active
005500a7188546ff8a767329a2f7c76a,1.0,5.0,3.0,4.0,0.25,0.333333,passive


In [None]:
# Total amount spent by each customer across all of their transactions.
transactions_amount = transcript_new.groupby('person')[['amount']].sum()
transactions_amount.head()

Unnamed: 0_level_0,amount
person,Unnamed: 1_level_1
0009655768c64bdeb2e877511632db8f,127.6
00116118485d4dfda04fdbaba9a87b5c,4.09
0011e0d4e6b944f998e987f904e8c1e5,79.46
0020c2b971eb4e9188eac86d93036a77,196.86
0020ccbbb6d84e358d3414a3ff76cffd,154.05
003d66b6608740288d6cc97a6903f4f0,48.34
00426fe3ffde4c6b9cb9ad6d077a13ea,68.51
004b041fbfe44859945daa2c7f79ee64,138.36
004c5799adbf42868b9cff0396190900,347.38
005500a7188546ff8a767329a2f7c76a,20.36


In [None]:
# One row per (person, offer) pair together with the offer's attributes.
offer_attribute_columns = [
    'person', 'count', 'offer_id', 'difficulty', 'duration',
    'offer_type', 'email', 'social', 'mobile', 'web',
]
distinct_offer_by_person = (
    transcript_new[offer_attribute_columns]
    .drop_duplicates()               # collapse received/viewed/completed rows of the same offer
    .sort_values(by=['person'])
    .dropna(subset=['offer_id'])     # transactions carry no offer
)
distinct_offer_by_person.head()

Unnamed: 0,person,count,offer_id,difficulty,duration,offer_type,email,social,mobile,web
55972,0009655768c64bdeb2e877511632db8f,1,5a8bc65990b245e5a138643cd4eb9837,0.0,3.0,informational,1.0,1.0,1.0,0.0
247879,0009655768c64bdeb2e877511632db8f,1,2906b810c7d4411798c6938adc9daaa5,10.0,7.0,discount,1.0,0.0,1.0,1.0
113605,0009655768c64bdeb2e877511632db8f,1,3f207df678b143eea3cee63160fa8bed,0.0,4.0,informational,1.0,0.0,1.0,1.0
204340,0009655768c64bdeb2e877511632db8f,1,fafdcd668e3743c1bb461111dcafc2a4,10.0,10.0,discount,1.0,1.0,1.0,1.0
153401,0009655768c64bdeb2e877511632db8f,1,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0
56475,00116118485d4dfda04fdbaba9a87b5c,1,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0
204643,0011e0d4e6b944f998e987f904e8c1e5,1,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,7.0,bogo,1.0,0.0,1.0,1.0
56298,0011e0d4e6b944f998e987f904e8c1e5,1,2298d6c36e964ae4a3e7e9706d1fb8c2,7.0,7.0,discount,1.0,1.0,1.0,1.0
153697,0011e0d4e6b944f998e987f904e8c1e5,1,0b1e1539f2cc45b7b9fa7c272da2e1d7,20.0,10.0,discount,1.0,0.0,0.0,1.0
113919,0011e0d4e6b944f998e987f904e8c1e5,1,5a8bc65990b245e5a138643cd4eb9837,0.0,3.0,informational,1.0,1.0,1.0,0.0


In [None]:
# Number of distinct offers of each type per customer.
# aggfunc is given as the string 'sum' because passing np.sum is deprecated.
# Offer types a customer never had are filled with 0 instead of NaN.
offer_type = pd.pivot_table(
    distinct_offer_by_person,
    values='count',
    index='person',
    columns='offer_type',
    aggfunc='sum',
).fillna(value=0)
offer_type.head()

offer_type,bogo,discount,informational
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0009655768c64bdeb2e877511632db8f,1.0,2.0,2.0
00116118485d4dfda04fdbaba9a87b5c,1.0,0.0,0.0
0011e0d4e6b944f998e987f904e8c1e5,1.0,2.0,2.0
0020c2b971eb4e9188eac86d93036a77,2.0,1.0,1.0
0020ccbbb6d84e358d3414a3ff76cffd,2.0,1.0,1.0


In [None]:
# Mean difficulty and mean duration over each customer's distinct offers.
avg_difficulty_duration = (
    distinct_offer_by_person
    .groupby('person')[['difficulty', 'duration']]
    .mean()
)
avg_difficulty_duration.head(10)

Unnamed: 0_level_0,difficulty,duration
person,Unnamed: 1_level_1,Unnamed: 2_level_1
0009655768c64bdeb2e877511632db8f,5.0,5.8
00116118485d4dfda04fdbaba9a87b5c,5.0,5.0
0011e0d4e6b944f998e987f904e8c1e5,6.4,6.2
0020c2b971eb4e9188eac86d93036a77,7.5,6.25
0020ccbbb6d84e358d3414a3ff76cffd,4.25,5.5
003d66b6608740288d6cc97a6903f4f0,7.5,6.75
00426fe3ffde4c6b9cb9ad6d077a13ea,10.0,7.5
004b041fbfe44859945daa2c7f79ee64,5.0,6.333333
004c5799adbf42868b9cff0396190900,8.333333,7.333333
005500a7188546ff8a767329a2f7c76a,8.333333,7.0


In [None]:
# For each customer, how many of their distinct offers used each delivery channel.
delivery_channels = ['email', 'social', 'mobile', 'web']
count_of_delivery = distinct_offer_by_person.groupby('person')[delivery_channels].sum()
count_of_delivery.head(10)

Unnamed: 0_level_0,email,social,mobile,web
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,5.0,3.0,5.0,4.0
00116118485d4dfda04fdbaba9a87b5c,1.0,1.0,1.0,1.0
0011e0d4e6b944f998e987f904e8c1e5,5.0,2.0,4.0,4.0
0020c2b971eb4e9188eac86d93036a77,4.0,4.0,4.0,2.0
0020ccbbb6d84e358d3414a3ff76cffd,4.0,3.0,4.0,3.0
003d66b6608740288d6cc97a6903f4f0,4.0,2.0,3.0,3.0
00426fe3ffde4c6b9cb9ad6d077a13ea,4.0,2.0,3.0,3.0
004b041fbfe44859945daa2c7f79ee64,3.0,2.0,3.0,3.0
004c5799adbf42868b9cff0396190900,3.0,3.0,3.0,2.0
005500a7188546ff8a767329a2f7c76a,3.0,1.0,3.0,2.0


In [None]:
# Assemble the per-customer feature table by left-joining every sub-dataset
# onto the event counts (all of them are indexed by 'person').
info = (
    count_of_events
    .merge(transactions_amount, how='left', on='person')
    # average spend per transaction, derived from the two columns joined so far
    .assign(**{'avg amount per transaction': lambda frame: frame['amount'] / frame['transaction']})
    .merge(offer_type, how='left', on='person')
    .merge(avg_difficulty_duration, how='left', on='person')
    .merge(count_of_delivery, how='left', on='person')
)
info.head(10)

Unnamed: 0_level_0,offer completed,offer received,offer viewed,transaction,completed/transaction percentage,completed/viewed percentage,Customer Type,amount,avg amount per transaction,bogo,discount,informational,difficulty,duration,email,social,mobile,web
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0,0.375,0.75,active,127.6,15.95,1.0,2.0,2.0,5.0,5.8,5.0,3.0,5.0,4.0
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0,,,passive,4.09,1.363333,1.0,0.0,0.0,5.0,5.0,1.0,1.0,1.0,1.0
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0,0.6,0.6,active,79.46,15.892,1.0,2.0,2.0,6.4,6.2,5.0,2.0,4.0,4.0
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0,0.375,1.0,active,196.86,24.6075,2.0,1.0,1.0,7.5,6.25,4.0,4.0,4.0,2.0
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0,0.25,0.75,passive,154.05,12.8375,2.0,1.0,1.0,4.25,5.5,4.0,3.0,4.0,3.0
003d66b6608740288d6cc97a6903f4f0,3.0,5.0,4.0,18.0,0.166667,0.75,passive,48.34,2.685556,0.0,2.0,2.0,7.5,6.75,4.0,2.0,3.0,3.0
00426fe3ffde4c6b9cb9ad6d077a13ea,1.0,5.0,2.0,17.0,0.058824,0.5,passive,68.51,4.03,0.0,3.0,1.0,10.0,7.5,4.0,2.0,3.0,3.0
004b041fbfe44859945daa2c7f79ee64,2.0,3.0,2.0,6.0,0.333333,1.0,active,138.36,23.06,1.0,1.0,1.0,5.0,6.333333,3.0,2.0,3.0,3.0
004c5799adbf42868b9cff0396190900,5.0,5.0,4.0,12.0,0.416667,1.25,active,347.38,28.948333,2.0,1.0,0.0,8.333333,7.333333,3.0,3.0,3.0,2.0
005500a7188546ff8a767329a2f7c76a,1.0,5.0,3.0,4.0,0.25,0.333333,passive,20.36,5.09,2.0,1.0,0.0,8.333333,7.0,3.0,1.0,3.0,2.0


In [None]:
# Combine the behavioural features with the demographic profile.
# The inner join keeps only customers present in both frames, i.e. customers
# whose profile rows were dropped earlier for null values are excluded.
full_dataset = pd.merge(info, profile, how='inner', left_on='person', right_on='id')
full_dataset.head()

Unnamed: 0,offer completed,offer received,offer viewed,transaction,completed/transaction percentage,completed/viewed percentage,Customer Type,amount,avg amount per transaction,bogo,discount,informational,difficulty,duration,email,social,mobile,web,gender,age,id,became_member_on,income
0,3.0,5.0,4.0,8.0,0.375,0.75,active,127.6,15.95,1.0,2.0,2.0,5.0,5.8,5.0,3.0,5.0,4.0,M,33,0009655768c64bdeb2e877511632db8f,20170421,72000.0
1,3.0,5.0,5.0,5.0,0.6,0.6,active,79.46,15.892,1.0,2.0,2.0,6.4,6.2,5.0,2.0,4.0,4.0,O,40,0011e0d4e6b944f998e987f904e8c1e5,20180109,57000.0
2,3.0,5.0,3.0,8.0,0.375,1.0,active,196.86,24.6075,2.0,1.0,1.0,7.5,6.25,4.0,4.0,4.0,2.0,F,59,0020c2b971eb4e9188eac86d93036a77,20160304,90000.0
3,3.0,4.0,4.0,12.0,0.25,0.75,passive,154.05,12.8375,2.0,1.0,1.0,4.25,5.5,4.0,3.0,4.0,3.0,F,24,0020ccbbb6d84e358d3414a3ff76cffd,20161111,60000.0
4,3.0,5.0,4.0,18.0,0.166667,0.75,passive,48.34,2.685556,0.0,2.0,2.0,7.5,6.75,4.0,2.0,3.0,3.0,F,26,003d66b6608740288d6cc97a6903f4f0,20170621,73000.0


In [None]:
# How long each person has been a member, in years.
#
# The tenure is measured against a fixed reference date instead of datetime.now(),
# so the feature does not change every time the notebook is re-run.
# NOTE(review): 2020-11-23 reproduces the values in the recorded output of this
# cell; adjust it if a different cut-off date is intended.
MEMBERSHIP_REFERENCE_DATE = pd.Timestamp('2020-11-23')

# Average Gregorian year length in days. This is the length numpy assigns to its
# 'Y' timedelta unit; it is spelled out here because dividing a timedelta column
# by np.timedelta64(1, 'Y') is rejected by recent pandas versions.
DAYS_PER_YEAR = 365.2425

# became_member_on is stored as an integer in YYYYMMDD form.
member_since = pd.to_datetime(full_dataset['became_member_on'], format='%Y%m%d')
full_dataset['membership_time'] = (
    (MEMBERSHIP_REFERENCE_DATE - member_since) / pd.Timedelta(days=DAYS_PER_YEAR)
)
full_dataset.head()

Unnamed: 0,offer completed,offer received,offer viewed,transaction,completed/transaction percentage,completed/viewed percentage,Customer Type,amount,avg amount per transaction,bogo,discount,informational,difficulty,duration,email,social,mobile,web,gender,age,id,became_member_on,income,membership_time
0,3.0,5.0,4.0,8.0,0.375,0.75,active,127.6,15.95,1.0,2.0,2.0,5.0,5.8,5.0,3.0,5.0,4.0,M,33,0009655768c64bdeb2e877511632db8f,20170421,72000.0,3.592265
1,3.0,5.0,5.0,5.0,0.6,0.6,active,79.46,15.892,1.0,2.0,2.0,6.4,6.2,5.0,2.0,4.0,4.0,O,40,0011e0d4e6b944f998e987f904e8c1e5,20180109,57000.0,2.872196
2,3.0,5.0,3.0,8.0,0.375,1.0,active,196.86,24.6075,2.0,1.0,1.0,7.5,6.25,4.0,4.0,4.0,2.0,F,59,0020c2b971eb4e9188eac86d93036a77,20160304,90000.0,4.723021
3,3.0,4.0,4.0,12.0,0.25,0.75,passive,154.05,12.8375,2.0,1.0,1.0,4.25,5.5,4.0,3.0,4.0,3.0,F,24,0020ccbbb6d84e358d3414a3ff76cffd,20161111,60000.0,4.033068
4,3.0,5.0,4.0,18.0,0.166667,0.75,passive,48.34,2.685556,0.0,2.0,2.0,7.5,6.75,4.0,2.0,3.0,3.0,F,26,003d66b6608740288d6cc97a6903f4f0,20170621,73000.0,3.425253
