<a href="https://colab.research.google.com/github/p33ves/CMPT726-ML-Project/blob/main/Datasets_Preprocessing_(Detecting_Potential_Passive_Customers_on_the_Starbucks_Reward).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Datasets Preprocessing (Detecting Potential Passive Customers on the Starbucks Reward)

## Importing Datasets

References : 
- https://towardsdatascience.com/starbucks-offer-personalization-sending-the-right-offer-to-the-right-customer-14d4fbc20575
- https://github.com/lalago31/Data-Science-Portfolio/tree/master/Capstone%20-%20Starbucks%20Offer%20Personalization


In [None]:
import pandas as pd
import numpy as np
import math
import json
import os, datetime

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

# from datetime import datetime
from dateutil.relativedelta import relativedelta

Installing and importing plotly on Colab

In [None]:
!pip install plotly==4.12.0
import plotly.express as px

Collecting plotly==4.12.0
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/af86e9d9bf1a3e4f2dabebeabd02a32e8ddf671a5d072b3af2b011efea99/plotly-4.12.0-py2.py3-none-any.whl (13.1MB)
[K     |████████████████████████████████| 13.1MB 326kB/s 
Installing collected packages: plotly
  Found existing installation: plotly 4.4.1
    Uninstalling plotly-4.4.1:
      Successfully uninstalled plotly-4.4.1
Successfully installed plotly-4.12.0


Loading json datasets from Github repo

In [None]:
def _read_json_lines(url):
    """Read a line-delimited JSON file (one record per line) into a DataFrame."""
    return pd.read_json(url, orient='records', lines=True)


# Locations of the three raw datasets in the project repository
portfolio_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/portfolio.json'
profile_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/profile.json'
transcript_url = 'https://raw.githubusercontent.com/p33ves/CMPT726-ML-Project/KH/org_datasets/transcript.json'

portfolio = _read_json_lines(portfolio_url)
profile = _read_json_lines(profile_url)
transcript = _read_json_lines(transcript_url)

## Data Cleaning Operations

### A. Portfolio Dataset

In [None]:
# How many offers of each type does the portfolio contain?
offer_type_counts = portfolio['offer_type'].value_counts()
print(offer_type_counts)

# Expose the offer identifier as 'offer_id', the column name used as the
# join key when the portfolio is merged with the transcript.
portfolio = portfolio.rename(columns={'id': 'offer_id'})
portfolio.head(10)

bogo             4
discount         4
informational    2
Name: offer_type, dtype: int64


Unnamed: 0,reward,channels,difficulty,duration,offer_type,offer_id
0,10,"[email, mobile, social]",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"[web, email, mobile, social]",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"[web, email, mobile]",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"[web, email, mobile]",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"[web, email]",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"[web, email, mobile, social]",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"[web, email, mobile, social]",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"[email, mobile, social]",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"[web, email, mobile, social]",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"[web, email, mobile]",10,7,discount,2906b810c7d4411798c6938adc9daaa5


Unpacking channels

In [None]:
# One 0/1 indicator column per channel: join each channel list into a
# comma-separated string, then split it back out into dummy columns.
medium = portfolio['channels'].str.join(',').str.get_dummies(sep=',')

portfolio = (
    pd.concat([portfolio, medium], axis='columns')   # append the indicator columns
    .drop(columns=['channels'])                      # the raw list column is no longer needed
    .assign(duration_in_hours=lambda df: df['duration'] * 24)  # duration * 24 -> hours
)
portfolio

Unnamed: 0,reward,difficulty,duration,offer_type,offer_id,email,mobile,social,web,duration_in_hours
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0,168
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1,120
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1,96
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1,168
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1,240
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1,168
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1,240
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0,72
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,120
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1,168


In [None]:
# Keep a single 'duration' column, now holding the value computed as duration * 24.
portfolio = (
    portfolio
    .drop(columns=['duration'])
    .rename(columns={'duration_in_hours': 'duration'})
)
portfolio

Unnamed: 0,reward,difficulty,offer_type,offer_id,email,mobile,social,web,duration
0,10,10,bogo,ae264e3637204a6fb9bb56bc8210ddfd,1,1,1,0,168
1,10,10,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1,1,120
2,0,0,informational,3f207df678b143eea3cee63160fa8bed,1,1,0,1,96
3,5,5,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0,1,168
4,5,20,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0,1,240
5,3,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1,1,168
6,2,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1,1,240
7,0,0,informational,5a8bc65990b245e5a138643cd4eb9837,1,1,1,0,72
8,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1,1,120
9,2,10,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0,1,168


Number of offers delivered through each channel, per offer type

In [None]:
# Number of offers of each type that use each delivery channel.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of keys is deprecated (and rejected by newer pandas versions).
channel_columns = ['email', 'mobile', 'social', 'web']
portfolio_sum = (
    portfolio
    .groupby('offer_type')[channel_columns]
    .agg('sum')
    .reset_index()   # bring 'offer_type' back as a regular column for plotting
)
portfolio_sum


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,offer_type,email,mobile,social,web
0,bogo,4,4,3,3
1,discount,4,3,2,4
2,informational,2,2,1,1


Visualizing the offer type

In [None]:
# Grouped bars: one cluster per offer type, one bar per delivery channel.
offer_type_bar = px.bar(
    portfolio_sum,
    x='offer_type',
    y=['email', 'mobile', 'social', 'web'],
    barmode='group',
)
offer_type_bar.show()

### B. Profile Dataset

In [None]:
# Preview the first rows of the raw profile data
profile.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [None]:
# Column dtypes and non-null counts of the profile data before cleaning
profile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


In [None]:
# Count the missing values in each column of the profile dataframe
profile.isna().sum()

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [None]:
# Age distribution before any rows are removed
age_hist = px.histogram(
    profile,
    x='age',
    nbins=20,
    title='Age Distribution',
)
age_hist.show()

In [None]:
profile = (
    profile
    .dropna(axis='index')                   # drop every row that has a missing value
    .rename(columns={'id': 'cust_id'})
    .assign(
        # 'became_member_on' arrives as an integer such as 20170715 -> parse as YYYYMMDD
        became_member_on=lambda df: pd.to_datetime(df['became_member_on'].astype(str), format='%Y%m%d'),
        # Whole days between the sign-up date and the moment this cell runs,
        # so the values depend on the date of execution.
        days_as_member=lambda df: (datetime.datetime.today() - df['became_member_on']).dt.days,
    )
)
profile.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member
8779,M,68,ade6c5f2bc8a4862978716b70e1b037b,2016-08-04,104000.0,1575
8778,M,75,25adc7b5ab704fcc9f8777ed914234ff,2017-08-09,92000.0,1205
14745,F,42,0b5dbd14f53748c0b91f7d72b4ee60e6,2017-04-16,62000.0,1320
9367,M,47,bc9320cc000741509180ead338d30ace,2018-04-18,46000.0,953
2473,M,94,fa5d0a39814a48b1a5886c9acdf95b92,2018-01-02,54000.0,1059
13402,M,44,b25250db635840a994a37dc1d7e7ab59,2017-01-16,77000.0,1410
514,M,73,af15533a5b1a42a98e767fc9103e325c,2018-04-14,76000.0,957
8726,F,46,e7378718486c40f2817ac5e6ea7e15d0,2017-11-13,50000.0,1109
5486,F,65,dd2f15d3224349b3a54cbb4a91e89e0c,2016-05-17,66000.0,1654
8528,F,49,d824bb703d434f5fa99b99cb0a85635a,2017-10-30,63000.0,1123


In [None]:
# Re-check dtypes and non-null counts after cleaning
profile.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 1 to 16999
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   gender            14825 non-null  object        
 1   age               14825 non-null  int64         
 2   cust_id           14825 non-null  object        
 3   became_member_on  14825 non-null  datetime64[ns]
 4   income            14825 non-null  float64       
 5   days_as_member    14825 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 810.7+ KB


In [None]:
# Age distribution after the rows containing missing values were dropped
age_hist = px.histogram(
    profile,
    x='age',
    nbins=30,
    title='Age Distribution',
)
age_hist.show()

### C. Transcript Dataset

In [None]:
# Preview the first rows of the raw transcript data
transcript.head(10)

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0
5,389bc3fa690240e798340f5a15918d5c,offer received,{'offer id': 'f19421c1d4aa40978ebb69ca19b0e20d'},0
6,c4863c7985cf408faee930f111475da3,offer received,{'offer id': '2298d6c36e964ae4a3e7e9706d1fb8c2'},0
7,2eeac8d8feae4a8cad5a6af0499a211d,offer received,{'offer id': '3f207df678b143eea3cee63160fa8bed'},0
8,aa4862eba776480b8bb9c68455b8c2e1,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
9,31dda685af34476cad5bc968bdb01c53,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0


In [None]:
# Align the customer key with the profile data ('cust_id') and give the
# time column its new name.
transcript = transcript.rename(
    columns={
        'person': 'cust_id',
        'time': 'hours_till_action',
    }
)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   cust_id            306534 non-null  object
 1   event              306534 non-null  object
 2   value              306534 non-null  object
 3   hours_till_action  306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


In [None]:
# How many unique customer IDs appear in the transcript?
transcript['cust_id'].unique().shape

(17000,)

In [None]:
# Number of transcript records per event type
transcript['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

In [None]:
# Collect every distinct set of keys found in the dicts of the 'value' column
value_keys = []
for value in transcript['value']:
    keys = value.keys()
    if keys not in value_keys:
        value_keys.append(keys)
print(value_keys)

[dict_keys(['offer id']), dict_keys(['amount']), dict_keys(['offer_id', 'reward'])]


The keys present in the `value` column depend on the event type:
- only 'offer id' -> offer viewed / offer received
- only 'amount' -> transaction
- 'offer_id' & 'reward' -> offer completed

In [None]:
# Expand the dicts stored in the 'value' column into individual columns.
# The frame is built once from the list of dicts instead of using
# `.apply(pd.Series)`, which creates one Series per row and is very slow on
# a few hundred thousand rows. Passing the original index keeps the rows
# aligned for the concat below.
parsed_value = pd.DataFrame(transcript['value'].tolist(), index=transcript.index)
transcript = pd.concat([transcript, parsed_value], axis=1)

In [None]:
# Inspect the columns added by expanding 'value'
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cust_id            306534 non-null  object 
 1   event              306534 non-null  object 
 2   value              306534 non-null  object 
 3   hours_till_action  306534 non-null  int64  
 4   offer id           134002 non-null  object 
 5   amount             138953 non-null  float64
 6   offer_id           33579 non-null   object 
 7   reward             33579 non-null   float64
dtypes: float64(2), int64(1), object(5)
memory usage: 18.7+ MB


In [None]:
# The offer identifier is stored under two different keys:
#   "offer id" : offer received / offer viewed events
#   "offer_id" : offer completed events
# Merge both into one column: take "offer id" and, where it is missing,
# fall back to "offer_id". The result is built under a temporary name so it
# ends up as the last column, then renamed to "offer_id".
transcript = (
    transcript
    .assign(offer_id_new=transcript['offer id'].fillna(transcript['offer_id']))
    .drop(columns=['offer id', 'offer_id'])          # the two source columns are now redundant
    .rename(columns={'offer_id_new': 'offer_id'})
    .drop(columns=['value'])                         # raw dicts are fully expanded, drop them
)

In [None]:
# Preview the transcript after consolidating the offer id columns
transcript.head()

Unnamed: 0,cust_id,event,hours_till_action,amount,reward,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,offer received,0,,,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,0,,,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,offer received,0,,,4d5c57ea9a6940dd891ad53e9dbe8da0


Stats of events

In [None]:
# Which kinds of value does each record carry?
has_offer = transcript['offer_id'].notnull()
has_reward = transcript['reward'].notnull()
has_amount = transcript['amount'].notnull()

# Offer id together with a reward
print('Events where an offer is completed')
print(transcript.loc[has_offer & has_reward, 'event'].value_counts())
print('\n')
# Offer id without a reward
print('Events where an offer is viewed')
print(transcript.loc[has_offer & ~has_reward, 'event'].value_counts())
print('\n')
# Records carrying an amount
print('Events where transaction occured')
print(transcript.loc[has_amount, 'event'].value_counts())

Events where an offer is completed
offer completed    33579
Name: event, dtype: int64


Events where an offer is viewed
offer received    76277
offer viewed      57725
Name: event, dtype: int64


Events where transaction occured
transaction    138953
Name: event, dtype: int64


### D. Merging all the datasets together

Joining Transcript and portfolio

In [None]:
# Attach the offer attributes (type, difficulty, channels, ...) to each
# transcript record. A left join keeps every transcript row, including
# those without an offer_id.
total = pd.merge(transcript, portfolio, how='left', on='offer_id')
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cust_id            306534 non-null  object 
 1   event              306534 non-null  object 
 2   hours_till_action  306534 non-null  int64  
 3   amount             138953 non-null  float64
 4   reward_x           33579 non-null   float64
 5   offer_id           167581 non-null  object 
 6   reward_y           167581 non-null  float64
 7   difficulty         167581 non-null  float64
 8   offer_type         167581 non-null  object 
 9   email              167581 non-null  float64
 10  mobile             167581 non-null  float64
 11  social             167581 non-null  float64
 12  web                167581 non-null  float64
 13  duration           167581 non-null  float64
dtypes: float64(9), int64(1), object(4)
memory usage: 35.1+ MB


Joining Transcript and profile

In [None]:
total = (
    total
    # Left join: keep every event even when the customer is not in `profile`
    .merge(profile, how='left', on='cust_id')
    # 'reward' exists in both transcript and portfolio, so the earlier merge
    # suffixed them: _x comes from the transcript, _y from the portfolio.
    .rename(columns={'reward_x': 'reward_received', 'reward_y': 'reward_defined'})
)
total.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
48043,5205c76a56b248a1ac86ed72090a29a0,transaction,132,30.99,,,,,,,,,,,F,50.0,2017-08-21,108000.0,1193.0
232420,07adfad3f0c54e738d60fd5a51285486,offer viewed,534,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,M,54.0,2018-07-02,37000.0,878.0
402,6878e89f853c4691b765ae71386dbe09,offer received,0,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,
286222,d7ed79bbb5c94f689098728c0ed047ab,transaction,636,16.64,,,,,,,,,,,M,69.0,2017-06-24,79000.0,1251.0
304104,2e6aab1dfef6443aa00ab9567c72e4b5,transaction,702,4.94,,,,,,,,,,,M,23.0,2013-10-24,53000.0,2590.0
146328,cfa607dcb90541e2b9a5c34a810fed05,transaction,390,4.33,,,,,,,,,,,M,63.0,2014-09-25,40000.0,2254.0
114782,27d73703045d4f63b2ce5aa79f3e9596,offer received,336,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,
8360,01873cc8de734961949af7c04b2e9872,offer received,0,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,informational,1.0,1.0,0.0,1.0,96.0,M,72.0,2018-01-22,45000.0,1039.0
23828,7c0407a5b3bb4ea39bf1291e59682da5,offer viewed,24,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,bogo,1.0,1.0,1.0,1.0,120.0,M,41.0,2016-02-16,71000.0,1745.0
132294,759ec3a510e84d9598cf238f719dbda5,transaction,348,17.9,,,,,,,,,,,M,55.0,2016-08-28,37000.0,1551.0


In [None]:
# Shape of the merged frame (rows, columns)
total.shape

(306534, 19)

Removing duplicates from transcript data

In [None]:
# Rows that exactly repeat an earlier row in every column
is_repeat = total.duplicated()
dup = total[is_repeat]
dup

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
66123,3dde94fa581145cb9f206624f1a94d5a,offer completed,168,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,51.0,2017-11-14,56000.0,1108.0
66783,e9fb6ed2cecb4980ba98c86abc9c91e3,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,M,78.0,2015-04-21,55000.0,2046.0
67614,a7dc060f6fc94ca7bf71fbb188187dca,offer completed,168,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,O,60.0,2017-02-01,69000.0,1394.0
68562,30478a4c1e884a63a822aa87b833ed7a,offer completed,168,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,73.0,2017-12-09,74000.0,1083.0
69218,84fb57a7fe8045a8bf6236738ee73a0f,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,64.0,2017-06-15,113000.0,1260.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297625,6ba2450a438540999e633a5d99c7c7a0,offer completed,672,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,28.0,2015-01-24,61000.0,2133.0
299471,f39fe7ea4e5946378e6d224504b77797,offer completed,684,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,54.0,2017-08-07,98000.0,1207.0
304756,0785f1fce0b04ba08e01c7d2ebab4917,offer completed,708,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,51.0,2017-08-15,78000.0,1199.0
305551,b7e216b6472b46648272c29a52a86702,offer completed,714,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,M,53.0,2018-07-08,113000.0,872.0


In [None]:
# Keep only the first occurrence of rows that are identical in every column
total_dist = total.drop_duplicates()
total_dist.shape

(306137, 19)

In [None]:
# Random sample of the de-duplicated records (no random_state is set, so it differs on every run)
total_dist.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
185665,c658ac6c1b2142c4b2ad19cd26fdd354,transaction,450,8.16,,,,,,,,,,,F,50.0,2017-08-21,34000.0,1193.0
225192,ae27549c1fcc4407a7d5d270928ff0ab,transaction,522,6.17,,,,,,,,,,,M,38.0,2018-06-15,38000.0,895.0
219199,5e44bc50a2b84b0a89d7246f5a85617e,offer viewed,510,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,M,43.0,2018-07-23,99000.0,857.0
28373,b6471ab102724837b8e65d46fc5b8184,transaction,42,29.11,,,,,,,,,,,F,75.0,2015-08-07,86000.0,1938.0
264617,a58629a6827f48a2a5d3127db9bee811,offer viewed,582,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,
105428,6577d9555cfe49359566c3271289b314,offer completed,300,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,O,64.0,2017-11-13,76000.0,1109.0
21860,2b50d8e66de14044ad46ca57417faadf,offer viewed,18,,,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,29.0,2018-05-19,62000.0,922.0
156543,1473097b38ef4af2baa785f83d10b95a,offer received,408,,,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,64.0,2017-11-09,44000.0,1113.0
26337,30a5ebaf192842c3a31b8fbb90529ac2,transaction,36,9.61,,,,,,,,,,,M,20.0,2018-02-03,39000.0,1027.0
119575,e5476f13ac174d3ab883a26f738d3cdd,offer received,336,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,66.0,2017-11-25,30000.0,1097.0


In [None]:
# Number of records per (event, offer_type) pair; count() ignores rows whose offer_type is missing
total_dist.groupby(['event','offer_type'])['offer_type'].count()

event            offer_type   
offer completed  bogo             15501
                 discount         17681
offer received   bogo             30499
                 discount         30543
                 informational    15235
offer viewed     bogo             25449
                 discount         21445
                 informational    10831
Name: offer_type, dtype: int64

A customer's entire history of events

In [None]:
# Customers who completed an offer.
# The mask is built from `total_dist` itself: using a mask computed on
# `total` (which still contains the dropped duplicate rows) only works
# through implicit index alignment.
total_dist.loc[total_dist['event'] == 'offer completed', 'cust_id'].sample(10)

301563    d9e5e0c806da46268f0a1905c6e212c2
77424     2d5ba4affbf143bfa45deb4b6418eb65
128109    dd2f15d3224349b3a54cbb4a91e89e0c
298420    e4087b3f387b47e38d58e920f779056b
127586    cc3ca7632fda478cb573c06ac9f445aa
176011    19b675ce8cea454d8b849fbff3c0ff75
141705    162e7d9c644b438ea4606e337df0d3d1
131315    c1b48c3da509479ba0df92dd90803fdb
75788     bb8f4c201203433cb610477526658b13
24901     eaeffe7dfa5c475694239b9075ffc943
Name: cust_id, dtype: object

In [None]:
# Check one person's history, in the original record order.
# pd.set_option("max_rows", None)
# Filter with a mask computed on `total_dist` itself rather than on `total`,
# so the mask and the frame always share the same index.
total_dist.loc[total_dist['cust_id'] == 'ebb7a9a6b45f4ba7b7600c651373ac4e', :].sort_index().head(50)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
60081,ebb7a9a6b45f4ba7b7600c651373ac4e,offer received,168,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
80558,ebb7a9a6b45f4ba7b7600c651373ac4e,offer viewed,198,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
82492,ebb7a9a6b45f4ba7b7600c651373ac4e,transaction,204,24.46,,,,,,,,,,,F,60.0,2016-05-06,93000.0,1665.0
82493,ebb7a9a6b45f4ba7b7600c651373ac4e,offer completed,204,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
89836,ebb7a9a6b45f4ba7b7600c651373ac4e,transaction,228,14.21,,,,,,,,,,,F,60.0,2016-05-06,93000.0,1665.0
117731,ebb7a9a6b45f4ba7b7600c651373ac4e,offer received,336,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
125548,ebb7a9a6b45f4ba7b7600c651373ac4e,offer viewed,336,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
134077,ebb7a9a6b45f4ba7b7600c651373ac4e,transaction,354,28.53,,,,,,,,,,,F,60.0,2016-05-06,93000.0,1665.0
134078,ebb7a9a6b45f4ba7b7600c651373ac4e,offer completed,354,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,60.0,2016-05-06,93000.0,1665.0
136465,ebb7a9a6b45f4ba7b7600c651373ac4e,transaction,360,36.3,,,,,,,,,,,F,60.0,2016-05-06,93000.0,1665.0


## Feature Engineering

These are the features we could consider for the machine learning models.

1. gender 
2. age 
3. income 
4. days_as_member 
5. Total offer completed 
6. Total offer received 
7. Total offer viewed 
8. Total transaction 
9. offer completed_via_email 
10. offer completed_via_social 
11. offer completed_via_mobile 
12. offer completed_via_web 
13. offer completed bogo count 
14. offer completed discount count 
15. offer received_via_email 
16. offer received_via_social 
17. offer received_via_mobile 
18. offer received_via_web 
19. offer received bogo count 
20. offer received discount count 
21. offer received informational count 
22. offer viewed_via_email 
23. offer viewed_via_social 
24. offer viewed_via_mobile 
25. offer viewed_via_web 
26. offer viewed bogo count 
27. offer viewed discount count 
28. offer viewed informational count 
29. Total transaction amount 
30. avg_difficulty 
31. avg_duration 
32. total_reward_received 
33. total_reward_defined 
34. total_view_ratio 
35. total_completion_ratio 
36. avg_transaction_amt 
37. transactions_per_membership_days 

In [None]:
# Customer / offer pair of every record
user_offer_relation = total_dist.loc[:, ['cust_id', 'offer_id']]
user_offer_relation

Unnamed: 0,cust_id,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0
...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,
306530,68213b08d99a4ae1b0dcb72aebd9aa35,
306531,a00058cf10334a308c68e7631c529907,
306532,76ddbd6576844afe811f1a3c0fbb5bec,


In [None]:
# Distinct pairs of cust_id and offer_id
combinations = user_offer_relation.drop_duplicates()
combinations

Unnamed: 0,cust_id,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0
...,...,...
306116,542c41f5afc049e7ae7d4721ace9d286,
306203,448dabde725040978b8a247a20bac126,
306259,7718656997f3453db0f5aeca9cd35240,
306278,54463e5d95124b7fb3133fc1eae71952,


In [None]:
# Helper column of ones, summed later to count events per customer.
# `assign` returns a new frame, which avoids the SettingWithCopyWarning raised
# when a column is set directly on the result of `drop_duplicates()`.
total_dist = total_dist.assign(count=1)
total_dist.sample(10)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member,count
277922,d819428f44234b24a1851503756bc537,transaction,612,14.66,,,,,,,,,,,F,76.0,2016-02-29,118000.0,1732.0,1
146468,1476bfc85b584e30a79790be39f7929f,transaction,390,32.47,,,,,,,,,,,F,55.0,2015-10-22,101000.0,1862.0,1
10748,a36a8550a93e4d04ae93e5e6cca1564d,offer received,0,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,M,32.0,2016-07-11,69000.0,1599.0,1
237069,9e71d772ceed40408f7a26dea47cda37,offer viewed,546,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,51.0,2017-10-05,35000.0,1148.0,1
147804,2ba1d65405594702af723081531011ef,transaction,396,13.04,,,,,,,,,,,M,18.0,2015-09-30,38000.0,1884.0,1
138876,0bcecfedf7e448b08e5ed4c00f47217d,transaction,366,4.88,,,,,,,,,,,M,70.0,2013-10-24,58000.0,2590.0,1
290537,b75b00a50cfa4059aef0851e81904a24,offer completed,648,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,67.0,2016-04-09,114000.0,1692.0,1
262072,570f7cc3a63249d9b295d5fb8a7c1d73,offer viewed,576,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,,1
64412,c19daa02b8294bc58af617231fe6d49b,offer received,168,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,informational,1.0,1.0,1.0,0.0,72.0,F,56.0,2018-04-30,51000.0,941.0,1
261176,edbb23d321174301a15049a09df4072b,offer completed,576,,10.0,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,bogo,1.0,1.0,1.0,1.0,120.0,F,100.0,2017-05-07,71000.0,1299.0,1


Total counts for all events

In [None]:
# One row per customer, one column per event type, holding the number of
# such events (sum of the helper 'count' column).
count_of_events = (
    total_dist
    .pivot_table(values='count', index='cust_id', columns='event', aggfunc=np.sum)
    .rename(columns=lambda event_name: "Total " + event_name)
)
count_of_events

event,Total offer completed,Total offer received,Total offer viewed,Total transaction
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0
...,...,...,...,...
fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0,11.0
fff7576017104bcc8677a8d63322b5e1,3.0,5.0,4.0,6.0
fff8957ea8b240a6b5e634b6ee8eafcf,,3.0,2.0,5.0
fffad4f4828548d1b5583907f2e9906b,3.0,4.0,4.0,12.0


Merging profile with 'counts of events'

In [None]:
# Start the feature table from the cleaned profile and attach the per-event
# totals; customers without any event of a given type get NaN for it.
profile_features = profile.merge(count_of_events, how='left', on='cust_id')
profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   gender                 14825 non-null  object        
 1   age                    14825 non-null  int64         
 2   cust_id                14825 non-null  object        
 3   became_member_on       14825 non-null  datetime64[ns]
 4   income                 14825 non-null  float64       
 5   days_as_member         14825 non-null  int64         
 6   Total offer completed  11986 non-null  float64       
 7   Total offer received   14820 non-null  float64       
 8   Total offer viewed     14675 non-null  float64       
 9   Total transaction      14492 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 1.2+ MB


Individual event counts based on offer type and counts of their modes of delivery

In [None]:
event_group = total_dist.groupby('event')
offer_counts = {}

for event, group in event_group:
    if event == 'transaction':
        # Transactions: total amount per customer
        offer_counts[event] = (
            group.groupby(['cust_id'])['amount']
            .agg('sum')
            .to_frame()
            .rename(columns=lambda col: "Total transaction "+col)
        )
    else:
        # Number of events of this kind per customer, split by offer type
        offer_counts[event] = (
            pd.pivot_table(group, values='count', index='cust_id', columns='offer_type', aggfunc=np.sum)
            .rename(columns=lambda offer_type: event+" "+offer_type+" count")
        )

        # Sum of each channel indicator over this customer's events of this kind
        count_of_delivery_by_event = (
            group[['cust_id', 'email', 'social', 'mobile', 'web']]
            .groupby(['cust_id'])
            .sum()
            .rename(columns=lambda channel: event+"_via_"+channel)
        )

        # Channel columns are merged first, so they come before the
        # offer-type columns in the feature table.
        profile_features = profile_features.merge(count_of_delivery_by_event, how='left', on='cust_id')

    profile_features = profile_features.merge(offer_counts[event], how='left', on='cust_id')

profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   gender                              14825 non-null  object        
 1   age                                 14825 non-null  int64         
 2   cust_id                             14825 non-null  object        
 3   became_member_on                    14825 non-null  datetime64[ns]
 4   income                              14825 non-null  float64       
 5   days_as_member                      14825 non-null  int64         
 6   Total offer completed               11986 non-null  float64       
 7   Total offer received                14820 non-null  float64       
 8   Total offer viewed                  14675 non-null  float64       
 9   Total transaction                   14492 non-null  float64       
 10  offer completed_via_em

In [None]:
# Missing values (e.g. customers without a match in the left merges above)
# are replaced with 0 across the whole frame.
profile_features = profile_features.fillna(0)
print(profile_features.shape)
profile_features.sample(n=10)

(14825, 31)


Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount
12636,F,74,134befa315d24b59a267013a8cfcbde9,2015-09-24,73000.0,1890,0.0,2.0,2.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,1.0,1.0,0.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,1.0,22.52
3319,M,46,7dc394b25e2c43c4bd9f00f45fa5cead,2017-09-06,50000.0,1177,0.0,5.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,5.0,4.0,2.0,3.0,0.0,4.0,4.0,4.0,3.0,2.0,2.0,0.0,6.87
2533,M,55,c019e0e0d9a14d0fbdeff01a55f50fcd,2016-12-25,62000.0,1432,2.0,4.0,4.0,20.0,2.0,1.0,1.0,2.0,1.0,1.0,4.0,3.0,3.0,2.0,2.0,1.0,1.0,4.0,3.0,3.0,2.0,2.0,1.0,1.0,185.02
1227,F,55,f4e2258e9c23436e8db7dea8a2e12ec5,2017-08-16,81000.0,1198,0.0,5.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,5.0,3.0,2.0,0.0,3.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,1016.93
10002,M,59,f326530596a54b17a1eb13d402024d40,2018-07-04,64000.0,876,0.0,5.0,5.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.0,4.0,4.0,1.0,2.0,2.0,5.0,3.0,4.0,4.0,1.0,2.0,2.0,49.01
9662,F,66,c39913cf356a4dfc98f0326c42878193,2018-05-04,75000.0,937,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,3.0,3.0,2.0,0.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,1.0,0.0
9307,M,62,2d05ba02bca448b3a9432aa8ff7f4415,2013-11-24,45000.0,2559,2.0,4.0,4.0,18.0,2.0,2.0,2.0,2.0,0.0,2.0,4.0,4.0,4.0,3.0,1.0,2.0,1.0,4.0,4.0,4.0,3.0,1.0,2.0,1.0,50.45
1156,M,62,242ae46c55a74e1797499e12308c3965,2014-03-12,68000.0,2451,6.0,6.0,5.0,8.0,6.0,2.0,4.0,6.0,1.0,5.0,6.0,2.0,4.0,6.0,1.0,5.0,0.0,5.0,2.0,3.0,5.0,1.0,4.0,0.0,117.49
12195,M,37,37fe1a9de1c1480e8557d097835d7c0f,2018-02-08,33000.0,1022,2.0,4.0,4.0,11.0,2.0,2.0,2.0,1.0,1.0,1.0,4.0,3.0,4.0,2.0,2.0,1.0,1.0,4.0,3.0,4.0,2.0,2.0,1.0,1.0,98.01
9560,M,31,4b169bec1aba4aed81b70657da3d8e61,2016-08-19,36000.0,1560,2.0,3.0,2.0,11.0,2.0,1.0,2.0,2.0,0.0,2.0,3.0,1.0,3.0,3.0,0.0,2.0,1.0,2.0,1.0,2.0,2.0,0.0,1.0,1.0,299.09



Average difficulty score and duration

In [None]:
# Per-customer mean of 'difficulty' and 'duration'. The mean is taken over
# all of the customer's rows in total_dist (no filtering by event here).
avg_difficulty_duration = (
    total_dist[['cust_id', 'difficulty', 'duration']]
    .groupby(['cust_id'])
    .mean()
    .add_prefix("avg_")
)
avg_difficulty_duration.sample(n=10)

Unnamed: 0_level_0,avg_difficulty,avg_duration
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1
b193da492dcb4b029ebade62a96eefdc,9.444444,184.0
c811024989454214bffab4c58bcf56a9,10.0,176.0
d209eab2e56f4430a45b0b08d2f3d72a,12.142857,174.857143
bd40b5d8818e40bba31a9c1195255017,9.583333,170.0
3622ada8c6394b32a8d7ba31c737e926,7.307692,173.538462
5b4c24de054a48e8bca2015cc6d583d2,7.5,180.0
e311b9067f484ff8a11bc16c6c234465,8.636364,154.909091
1c28b1041b4a4b06bfb1d2bfd4ef075b,3.5,132.0
04e34b1c5fe64fbcb76bfe47ebbbf015,5.333333,140.8
4757f0969cb440e0883c7dda26151cb2,6.0,144.0


In [None]:
# Attach avg_difficulty / avg_duration to each customer; the left join keeps
# every row of profile_features.
profile_features = pd.merge(profile_features, avg_difficulty_duration, on='cust_id', how='left')
print(profile_features.shape)
profile_features.sample(n=5)

(14825, 33)


Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration
4599,M,80,0e7f9c775784453d8504e4a480d56c8e,2014-03-30,51000.0,2433,5.0,5.0,4.0,10.0,5.0,5.0,5.0,4.0,2.0,3.0,5.0,5.0,5.0,4.0,2.0,3.0,0.0,4.0,4.0,4.0,4.0,1.0,3.0,0.0,142.33,8.714286,173.142857
6376,F,63,7bd5eb568781460c9d49a2a3e3e59494,2017-02-17,45000.0,1378,2.0,4.0,2.0,7.0,2.0,1.0,1.0,2.0,0.0,2.0,4.0,2.0,3.0,3.0,0.0,3.0,1.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,80.33,8.875,162.0
12143,F,90,75346ec64fd44dd196e782cbeab000ef,2016-08-26,54000.0,1553,3.0,4.0,4.0,16.0,3.0,3.0,3.0,2.0,1.0,2.0,4.0,3.0,4.0,3.0,1.0,2.0,1.0,4.0,3.0,4.0,3.0,1.0,2.0,1.0,239.54,6.545455,154.909091
2841,M,50,484e1baf58db4496943ceb9fc29935a3,2018-02-13,94000.0,1017,1.0,4.0,4.0,2.0,1.0,1.0,1.0,1.0,0.0,1.0,4.0,3.0,3.0,3.0,2.0,2.0,0.0,4.0,3.0,3.0,3.0,2.0,2.0,0.0,62.14,12.222222,197.333333
13159,M,83,3d30c8c79e85402bab39c0c5adfa2b08,2017-10-05,94000.0,1148,3.0,4.0,2.0,10.0,3.0,2.0,2.0,2.0,1.0,2.0,4.0,2.0,3.0,3.0,1.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,263.85,11.111111,200.0


Total rewards received

In [None]:
# Per-customer sum of 'reward_received' over all rows of total_dist,
# exposed as 'total_reward_received'.
rewards = total_dist[['cust_id', 'reward_received']]
sum_rewards = rewards.groupby(['cust_id']).sum().add_prefix("total_")
sum_rewards.sample(n=10)

Unnamed: 0_level_0,total_reward_received
cust_id,Unnamed: 1_level_1
8c03c9716ffa4825b57af6b5cfee0f82,0.0
284650de773c461da7dd5c547e5ac590,18.0
28ddc2fd0f9344008828d2fd1d846e24,2.0
2c0c244b5b0c4b2b96e9cfaaa82eb499,13.0
261e8e8518ef4383b1b1f3243a29c337,13.0
fc73a0a1bd924d2998c7ee08c6cc0789,0.0
cb47e296ba87499b98445d2d1b8b5e52,0.0
2f10c8022a9342ce8303cca33023c303,5.0
9c10540f4b6842deb88e827d8d1260da,0.0
c259c5ceb06440219fa381a1fbf4c337,8.0


In [None]:
# Attach the summed rewards to each customer (left join on cust_id).
profile_features = pd.merge(profile_features, sum_rewards, on='cust_id', how='left')
profile_features.sample(n=5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received
3124,M,90,1c8b3756397541c7b4dade350741a9ed,2018-04-09,38000.0,962,0.0,6.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2.0,6.0,6.0,3.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,9.87,6.875,135.0,0.0
9617,M,54,e08c6b48a9804fdeaf5c712a3644d1b2,2018-06-15,40000.0,895,0.0,5.0,2.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2.0,5.0,4.0,3.0,0.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0,10.51,5.0,133.714286,0.0
605,M,48,c87e041cfaac44f9a2e3b4791b915ee6,2017-08-16,56000.0,1198,1.0,6.0,5.0,10.0,1.0,1.0,1.0,1.0,0.0,1.0,6.0,4.0,6.0,4.0,1.0,2.0,3.0,5.0,4.0,5.0,3.0,1.0,2.0,2.0,23.83,5.833333,164.0,2.0
8766,M,69,313b01db6d7441d8814cd7926e114b34,2016-07-11,61000.0,1599,3.0,6.0,6.0,9.0,3.0,2.0,3.0,3.0,1.0,2.0,6.0,3.0,6.0,5.0,2.0,2.0,2.0,6.0,3.0,6.0,5.0,2.0,2.0,2.0,114.0,6.666667,150.4,14.0
8587,M,36,591c00f08b1641a8b7e9826ba360f93c,2018-06-15,64000.0,895,1.0,4.0,3.0,9.0,1.0,1.0,1.0,1.0,0.0,1.0,4.0,3.0,4.0,3.0,0.0,3.0,1.0,3.0,3.0,3.0,2.0,0.0,2.0,1.0,41.91,6.75,171.0,2.0


Total reward defined

In [None]:
# Keep only the rows whose event is 'offer received'.
offer_received_data = total_dist.loc[total_dist['event'].eq('offer received')]
offer_received_data.sample(n=5)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member,count
249669,95bfff94b9b44ba98bb6b69bc3efc3c7,offer received,576,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,61.0,2017-10-19,98000.0,1134.0,1
156597,c621318761c34015b1189d9b8336529b,offer received,408,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,bogo,1.0,1.0,1.0,1.0,120.0,F,72.0,2017-11-05,74000.0,1117.0,1
120079,058406afd91541fbae0060827c0ac156,offer received,336,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,F,70.0,2018-07-06,91000.0,874.0,1
162449,7425c781fb554b79a7a06095c1f813a8,offer received,408,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,80.0,2017-07-28,37000.0,1217.0,1
155835,12c783f75d5847c08e1a0e582a6b735a,offer received,408,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,38.0,2015-10-16,67000.0,1868.0,1


In [None]:
# Per-customer sum of 'reward_defined' over the 'offer received' rows only,
# exposed as 'total_reward_defined'. Note this reuses the names `rewards`
# and `sum_rewards` from the reward_received step.
rewards = offer_received_data[['cust_id', 'reward_defined']]
sum_rewards = rewards.groupby(['cust_id']).sum().add_prefix("total_")
sum_rewards.sample(n=10)

Unnamed: 0_level_0,total_reward_defined
cust_id,Unnamed: 1_level_1
5a7af629a0244c35b98846d4fc8d4603,6.0
6368c154e8014fa49bc178a60f453b04,11.0
a6c6bc7bbbaa46f4a7c36d036cf537b2,29.0
f011ba52208c4c5badede12c114b02e9,9.0
6bb9186fb03e4d99a4c7f10ce849c65b,22.0
9dc0a23131994ab08657bc748f425765,34.0
eff43f470d36465b8aa4a12d79d25986,12.0
7798d98a070845328d838312b65ba846,32.0
6d5da41e6c85436782eafe2fedd00dad,28.0
da9e7cdd4b0e4978b03f92d1245c8c60,9.0


In [None]:
# Attach total_reward_defined to each customer (left join on cust_id).
profile_features = pd.merge(profile_features, sum_rewards, on='cust_id', how='left')
print(profile_features.shape)
profile_features.sample(n=5)

(14825, 35)


Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined
9350,F,48,c93bf8f191f347b0bac4167e507b8a60,2017-04-18,87000.0,1318,3.0,4.0,2.0,6.0,3.0,2.0,3.0,2.0,3.0,0.0,4.0,2.0,4.0,3.0,4.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,0.0,0.0,155.35,6.111111,152.0,20.0,25.0
9429,M,23,d54ad47d89284047809a1626127cad82,2014-05-19,37000.0,2383,1.0,4.0,3.0,12.0,1.0,1.0,1.0,1.0,1.0,0.0,4.0,3.0,4.0,3.0,3.0,0.0,1.0,3.0,3.0,3.0,2.0,3.0,0.0,0.0,38.36,6.875,129.0,5.0,25.0
14079,M,47,9fcbff4f8d7241faa4ab8a9d19c8a812,2017-10-13,94000.0,1140,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,2.0,1.0,0.0,1.0,2.0,1.0,2.0,2.0,1.0,0.0,1.0,0.0,5.0,108.0,0.0,10.0
9223,F,26,741d5560b72c438092cffca139177734,2015-04-24,61000.0,2043,1.0,5.0,4.0,18.0,1.0,1.0,1.0,1.0,1.0,0.0,5.0,4.0,4.0,3.0,4.0,1.0,0.0,4.0,4.0,4.0,2.0,4.0,0.0,0.0,57.06,9.5,151.2,5.0,40.0
11473,M,37,3835f3a19c534c359faaf72b0861c32a,2016-04-04,74000.0,1697,4.0,6.0,3.0,16.0,4.0,3.0,4.0,4.0,1.0,3.0,6.0,4.0,6.0,5.0,1.0,3.0,2.0,3.0,3.0,3.0,2.0,1.0,1.0,1.0,323.71,6.538462,164.307692,11.0,11.0


View and completion ratio

In [None]:
# Share of received offers that were viewed (2 decimals) and completed (3 decimals).
profile_features['total_view_ratio'] = (
    profile_features['Total offer viewed']
    .div(profile_features['Total offer received'])
    .round(2)
)
profile_features['total_completion_ratio'] = (
    profile_features['Total offer completed']
    .div(profile_features['Total offer received'])
    .round(3)
)
profile_features.sample(n=5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined,total_view_ratio,total_completion_ratio
11140,M,62,839649618a584369a51d594acbd1e80d,2018-04-19,116000.0,952,1.0,4.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,4.0,3.0,4.0,4.0,2.0,2.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,32.5,10.0,161.142857,10.0,24.0,0.5,0.25
13356,M,48,422466659dc94890a4e044633de07d39,2015-11-15,62000.0,1838,5.0,5.0,4.0,8.0,5.0,1.0,4.0,5.0,2.0,3.0,5.0,1.0,4.0,5.0,2.0,3.0,0.0,4.0,1.0,4.0,4.0,2.0,2.0,0.0,121.94,9.285714,193.714286,19.0,19.0,0.8,1.0
9720,F,63,5c887483f2a44a8f88771962c8f07f9b,2018-01-08,70000.0,1053,0.0,6.0,4.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,3.0,5.0,5.0,0.0,4.0,2.0,4.0,3.0,4.0,3.0,0.0,3.0,1.0,11.5,7.4,163.2,0.0,12.0,0.67,0.0
8212,F,76,eb1bb72548a04d08a5d473ef59594d10,2016-11-05,71000.0,1482,3.0,5.0,5.0,7.0,3.0,1.0,3.0,3.0,2.0,1.0,5.0,3.0,5.0,4.0,3.0,1.0,1.0,5.0,3.0,5.0,4.0,3.0,1.0,1.0,142.43,6.538462,134.769231,17.0,22.0,1.0,0.6
12182,M,64,0d56277b07ed4a8bb3a19325803e3579,2016-06-13,81000.0,1627,4.0,5.0,5.0,10.0,4.0,2.0,4.0,4.0,1.0,3.0,5.0,3.0,5.0,4.0,1.0,3.0,1.0,5.0,3.0,5.0,4.0,1.0,3.0,1.0,179.79,8.571429,159.428571,16.0,16.0,1.0,0.8


Avg_transaction_amt and transactions_per_membership_days

In [1]:
# Average spend per transaction and transaction frequency relative to
# membership length. Customers with zero transactions yield 0/0 = NaN in the
# ratios; those are zero-filled at the end of this cell.
profile_features['avg_transaction_amt'] = round(profile_features['Total transaction amount']/profile_features['Total transaction'], 2)
profile_features['transactions_per_membership_days'] = profile_features['Total transaction']/profile_features['days_as_member']

# Membership length in weeks, measured up to the fixed reference date 2018-08-01.
# NOTE(review): the saved sample output further down lists this column as
# `membership_weeks` - confirm which name later cells expect.
profile_features['weeks_as_member'] = (pd.to_datetime('20180801', format='%Y%m%d') - pd.to_datetime(profile_features['became_member_on'], format='%Y%m%d'))/np.timedelta64(1, 'W')
profile_features['avg transaction count by membership time'] = profile_features['Total transaction']/profile_features['weeks_as_member']

# A non-zero numerator over a zero denominator gives +/-inf, which fillna()
# leaves untouched. Map it to NaN first so it is zero-filled like 0/0.
derived_ratio_columns = [
    'avg_transaction_amt',
    'transactions_per_membership_days',
    'avg transaction count by membership time',
]
profile_features[derived_ratio_columns] = profile_features[derived_ratio_columns].replace([np.inf, -np.inf], np.nan)

# Whole-frame fill: also covers NaN left by the earlier left merges.
profile_features.fillna(0, inplace=True)
print(profile_features.shape)
profile_features.sample(5)

NameError: ignored

## Labelling Customer Type (Target)

The definition of being an active or passive customer

: A customer whose total view ratio is at least 80% and whose total completion ratio is at least 20% is considered an 'active' customer; every other customer is considered 'passive'.


In [None]:
# Number of customers who completed more offers than they viewed.
int((profile_features['Total offer viewed'] < profile_features['Total offer completed']).sum())

1932

In [None]:
# First labelling method: 'active' when the view ratio is >= 0.8 and the
# completion ratio is >= 0.2, otherwise 'passive'.
profile_features['customer_type'] = "passive"
profile_features.loc[
    profile_features['total_view_ratio'].ge(0.8) & profile_features['total_completion_ratio'].ge(0.2),
    'customer_type',
] = "active"

In [None]:
# Second labelling method: 'active' when the customer averages at least 0.5
# transactions per week of membership. Stored in 'customer type' (with a
# space), separate from the 'customer_type' column of the first method.
profile_features['customer type'] = 'passive'
profile_features.loc[
    profile_features['avg transaction count by membership time'].ge(0.5),
    'customer type',
] = "active"

In [None]:
# Random preview of ten customers, including both label columns.
profile_features.sample(n=10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed_via_email,offer completed_via_social,offer completed_via_mobile,offer completed_via_web,offer completed bogo count,offer completed discount count,offer received_via_email,offer received_via_social,offer received_via_mobile,offer received_via_web,offer received bogo count,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined,total_view_ratio,total_completion_ratio,avg_transaction_amt,transactions_per_membership_days,membership_weeks,avg transaction count by membership time,customer_type,customer type
14803,F,44,e2fd53ed790240c586b3188f23542cca,2017-01-19,51000.0,1407,2.0,3.0,3.0,8.0,2.0,1.0,2.0,2.0,0.0,2.0,3.0,2.0,3.0,2.0,0.0,2.0,1.0,3.0,2.0,3.0,2.0,0.0,2.0,1.0,102.37,6.375,144.0,5.0,5.0,1.0,0.667,12.8,0.005686,79.857143,0.100179,active,passive
4525,M,50,95e94202440a43939f3195fc975e7367,2017-02-09,43000.0,1386,5.0,6.0,4.0,12.0,5.0,4.0,4.0,3.0,3.0,2.0,6.0,4.0,5.0,4.0,3.0,2.0,1.0,4.0,4.0,4.0,2.0,3.0,1.0,0.0,136.95,10.066667,163.2,38.0,38.0,0.67,0.833,11.41,0.008658,76.857143,0.156134,passive,passive
1955,M,50,f37d9566b54547d8bdc64cc949ddab90,2017-09-27,71000.0,1156,5.0,5.0,4.0,9.0,5.0,3.0,5.0,5.0,2.0,3.0,5.0,3.0,5.0,5.0,2.0,3.0,0.0,4.0,3.0,4.0,4.0,1.0,3.0,0.0,188.49,8.642857,173.142857,22.0,22.0,0.8,1.0,20.94,0.007785,44.0,0.204545,active,passive
1600,M,64,393041633e9e4d1c834537ae4ecf063d,2018-04-26,75000.0,945,0.0,3.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,3.0,2.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,16.62,4.0,96.0,0.0,10.0,0.67,0.0,2.77,0.006349,13.857143,0.43299,passive,passive
11422,M,66,b1328662a0944c01945de75da6b0c060,2015-10-01,50000.0,1883,2.0,5.0,0.0,6.0,2.0,0.0,1.0,2.0,1.0,1.0,5.0,1.0,2.0,4.0,2.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,70.29,14.285714,209.142857,10.0,30.0,0.0,0.4,11.72,0.003186,147.857143,0.04058,passive,passive
12245,F,95,249acca93b064e4a81a41c8340e2ef84,2016-06-06,66000.0,1634,4.0,4.0,4.0,10.0,4.0,3.0,4.0,4.0,3.0,1.0,4.0,3.0,4.0,4.0,3.0,1.0,0.0,4.0,3.0,4.0,4.0,3.0,1.0,0.0,129.78,8.75,132.0,27.0,27.0,1.0,1.0,12.98,0.00612,112.285714,0.089059,active,passive
14307,M,48,8f68a2ee85fe41b5bd1b01ed54a34e27,2016-08-18,92000.0,1561,5.0,5.0,5.0,11.0,5.0,4.0,5.0,3.0,4.0,1.0,5.0,4.0,5.0,3.0,4.0,1.0,0.0,5.0,4.0,5.0,3.0,4.0,1.0,0.0,267.01,8.4,158.4,38.0,38.0,1.0,1.0,24.27,0.007047,101.857143,0.107994,active,passive
5382,F,58,808d7d2d38784160aca7a43b7d04eb6f,2017-10-28,37000.0,1125,5.0,5.0,5.0,11.0,5.0,3.0,4.0,3.0,4.0,1.0,5.0,3.0,4.0,3.0,4.0,1.0,0.0,5.0,3.0,4.0,3.0,4.0,1.0,0.0,92.42,11.0,172.8,40.0,40.0,1.0,1.0,8.4,0.009778,39.571429,0.277978,active,passive
12745,M,34,21ce757cdf2441f3921014edc2775e6c,2018-03-02,73000.0,1000,0.0,4.0,2.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2.0,3.0,3.0,2.0,2.0,0.0,2.0,2.0,2.0,1.0,1.0,1.0,0.0,14.07,9.833333,180.0,0.0,23.0,0.5,0.0,3.52,0.004,21.714286,0.184211,passive,passive
1648,F,30,a40887a885c941a38fea086a675285b9,2018-06-19,56000.0,891,3.0,3.0,3.0,9.0,3.0,2.0,3.0,3.0,1.0,2.0,3.0,2.0,3.0,3.0,1.0,2.0,0.0,3.0,2.0,3.0,3.0,1.0,2.0,0.0,118.76,8.333333,176.0,9.0,9.0,1.0,1.0,13.2,0.010101,6.142857,1.465116,active,active


In [None]:
# Class balance under the first labelling method.
profile_features.value_counts(subset='customer_type')

customer_type
passive    8052
active     6773
dtype: int64

In [None]:
# Class balance under the second labelling method.
profile_features.value_counts(subset='customer type')

customer type
passive    13607
active      1218
dtype: int64