# Data preprocessing

## Importing required libraries and reading the input json files:

In [1]:
import os, datetime, json
import pandas as pd
import numpy as np

In [2]:
transcript = pd.read_json('../data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('../data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/profile.json', orient='records', lines=True)

## Data cleansing operations

### Cleaning profile.json

In [3]:
profile.shape

(17000, 5)

In [4]:
profile.dropna(axis=0,inplace=True)
profile.rename(columns={'id': 'cust_id'}, inplace=True)
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'].astype(str), format='%Y%m%d')
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days
profile.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member
13335,M,42,6a048964e72d420eb28dcacabdfe4526,2018-05-26,64000.0,913
4468,M,43,a4ecd1ebb16a4800bdbf57aaf0144078,2018-07-11,93000.0,867
12002,M,68,0a34323c7f254960979fb7c6cafa6b3c,2015-02-10,70000.0,2114
425,M,51,227f2d69e46a4899b70d48182822cff6,2018-05-12,100000.0,927
7564,M,66,db06532760134014aa9445c7560554af,2017-06-06,92000.0,1267
9954,M,48,be36abf9679d4981963d4af7d14eeafe,2017-12-03,45000.0,1087
11349,F,55,d5ea4111ae9c4ccb998c0019c91c9329,2017-09-07,54000.0,1174
12964,M,53,6c3846ea64974f65bfa6611706e769a5,2017-10-08,46000.0,1143
6939,M,67,a1cf198e6f334dcb9a4a4f6d8f6d4b84,2013-08-28,63000.0,2645
13,F,61,aa4862eba776480b8bb9c68455b8c2e1,2017-09-11,57000.0,1170


In [5]:
profile.to_csv('../data/cln_profile.csv')

### Cleaning portfolio.json

In [6]:
portfolio.rename(columns={'id': 'offer_id'}, inplace=True)
portfolio.set_index('offer_id', inplace=True)
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, ae264e3637204a6fb9bb56bc8210ddfd to 2906b810c7d4411798c6938adc9daaa5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
dtypes: int64(3), object(2)
memory usage: 480.0+ bytes


#### Unpacking channels

In [7]:
medium = portfolio['channels'].str.join(',').str.get_dummies(',')
portfolio = pd.concat([portfolio, medium], axis=1) # concat medium columns
portfolio = portfolio.drop(['channels'], axis=1)
portfolio['duration_in_hours'] = portfolio['duration']*24
portfolio

Unnamed: 0_level_0,reward,difficulty,duration,offer_type,email,mobile,social,web,duration_in_hours
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,bogo,1,1,1,0,168
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,bogo,1,1,1,1,120
3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,0,1,96
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,0,1,168
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,0,0,1,240
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,7,discount,1,1,1,1,168
fafdcd668e3743c1bb461111dcafc2a4,2,10,10,discount,1,1,1,1,240
5a8bc65990b245e5a138643cd4eb9837,0,0,3,informational,1,1,1,0,72
f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,120
2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,0,1,168


In [8]:
portfolio.drop(['duration'], axis=1, inplace=True)
portfolio.rename(columns={'duration_in_hours':'duration'},inplace=True)
portfolio

Unnamed: 0_level_0,reward,difficulty,offer_type,email,mobile,social,web,duration
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,bogo,1,1,1,0,168
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,bogo,1,1,1,1,120
3f207df678b143eea3cee63160fa8bed,0,0,informational,1,1,0,1,96
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,bogo,1,1,0,1,168
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,discount,1,0,0,1,240
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,discount,1,1,1,1,168
fafdcd668e3743c1bb461111dcafc2a4,2,10,discount,1,1,1,1,240
5a8bc65990b245e5a138643cd4eb9837,0,0,informational,1,1,1,0,72
f19421c1d4aa40978ebb69ca19b0e20d,5,5,bogo,1,1,1,1,120
2906b810c7d4411798c6938adc9daaa5,2,10,discount,1,1,0,1,168


In [9]:
portfolio.to_csv('../data/cln_portfolio.csv')

### Cleaning and grouping transcript.json based on cust_id, event

In [10]:
transcript.rename(columns={'person': 'cust_id'}, inplace=True)
transcript.rename(columns={'time':'hours_till_action'},inplace=True)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   cust_id            306534 non-null  object
 1   event              306534 non-null  object
 2   value              306534 non-null  object
 3   hours_till_action  306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


#### Unpacking value column into seperate columns

In [11]:
transcript = pd.concat([transcript, transcript['value'].apply(pd.Series)], axis=1)
# selecting from offer id and offer_id for different events
transcript['offer_id_new'] = np.where(transcript['offer id'].isnull() & transcript['offer_id'].notnull(),transcript['offer_id'],transcript['offer id'])
transcript.drop(['offer id','offer_id'],axis=1,inplace=True)
transcript.drop(['value'],axis=1,inplace=True)
transcript.rename(columns={'offer_id_new':'offer_id'},inplace=True)
transcript.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward,offer_id
111168,7f0124b2a6e94e8fba10c4055d8edde3,offer received,336,,,f19421c1d4aa40978ebb69ca19b0e20d
202846,e6d3335fdf2c4cd2ac8edcd2bb06b289,offer received,504,,,3f207df678b143eea3cee63160fa8bed
93205,a10162dac5614ce28bc0e2d154e2224c,offer completed,240,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9
38115,83377d650e4246fa96240dcdfe44e3be,transaction,78,2.41,,
221787,7920c1de943d4acfbb2ab5b623504125,offer viewed,510,,,2298d6c36e964ae4a3e7e9706d1fb8c2
153240,ea8ae9ebc89b41ffb097dabbf8fe3c24,offer received,408,,,2298d6c36e964ae4a3e7e9706d1fb8c2
184025,633c42afdb4d44599ca2683c720dc69b,transaction,444,1.46,,
280255,11e648deac3b4a98ad5c10bb398fd393,transaction,618,1.13,,
216716,b66473610fa74072a1345b6c93d52464,offer viewed,504,,,f19421c1d4aa40978ebb69ca19b0e20d
27631,5d6a854b87274cc6af6f95a83d321af5,transaction,36,4.37,,


In [12]:
transcript.to_csv('../data/cln_transcript.csv')

### Merging all the datasets together

#### Joining Transcript and portfolio

In [13]:
total = transcript.merge(portfolio,how='left',on='offer_id')
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cust_id            306534 non-null  object 
 1   event              306534 non-null  object 
 2   hours_till_action  306534 non-null  int64  
 3   amount             138953 non-null  float64
 4   reward_x           33579 non-null   float64
 5   offer_id           167581 non-null  object 
 6   reward_y           167581 non-null  float64
 7   difficulty         167581 non-null  float64
 8   offer_type         167581 non-null  object 
 9   email              167581 non-null  float64
 10  mobile             167581 non-null  float64
 11  social             167581 non-null  float64
 12  web                167581 non-null  float64
 13  duration           167581 non-null  float64
dtypes: float64(9), int64(1), object(4)
memory usage: 35.1+ MB


#### Joining Transcript and profile

In [14]:
total = total.merge(profile,how='left',on='cust_id')
total.rename(columns={'reward_x':'reward_received', 'reward_y':'reward_defined'},inplace=True)
total.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
206121,9e34cfea6818482197c2a5c397a2e256,offer received,504,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,M,99.0,2018-01-15,108000.0,1044.0
192152,d087fd0166404163b7d1e1e7cf2a9ac7,transaction,468,0.22,,,,,,,,,,,M,71.0,2017-12-04,63000.0,1086.0
209771,ff932c6f8bb641bd816955337d153676,offer received,504,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,bogo,1.0,1.0,1.0,1.0,120.0,M,65.0,2015-09-29,76000.0,1883.0
44318,0b250fe9fab14dac96ba542a572da081,offer viewed,108,,,3f207df678b143eea3cee63160fa8bed,0.0,0.0,informational,1.0,1.0,0.0,1.0,96.0,F,63.0,2016-03-04,85000.0,1726.0
961,82d8f33acf684ed6b16260d1dc1b30d5,offer received,0,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,
168481,d351650b4dfa4351bcfa0ada32c51c63,transaction,414,10.69,,,,,,,,,,,M,54.0,2017-03-27,61000.0,1338.0
33510,4ee75462df554af99b7ea4d5edb7f369,transaction,60,13.46,,,,,,,,,,,F,26.0,2016-06-21,35000.0,1617.0
104257,72412f41d944472dbba2183ae9af8a8e,offer completed,294,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,F,88.0,2016-01-17,105000.0,1773.0
171387,e88f50ba90a44deaa3df2b72b0a3e1f7,transaction,420,1.26,,,,,,,,,,,M,75.0,2013-12-15,50000.0,2536.0
50848,4d5618b1bd0c4d1693b860e5b4daac40,transaction,150,1.88,,,,,,,,,,,M,34.0,2018-01-31,52000.0,1028.0


#### Removing duplicates from transcript data

In [15]:
dupe = total[total.duplicated()]
dupe

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
66123,3dde94fa581145cb9f206624f1a94d5a,offer completed,168,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,51.0,2017-11-14,56000.0,1106.0
66783,e9fb6ed2cecb4980ba98c86abc9c91e3,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,M,78.0,2015-04-21,55000.0,2044.0
67614,a7dc060f6fc94ca7bf71fbb188187dca,offer completed,168,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,O,60.0,2017-02-01,69000.0,1392.0
68562,30478a4c1e884a63a822aa87b833ed7a,offer completed,168,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,73.0,2017-12-09,74000.0,1081.0
69218,84fb57a7fe8045a8bf6236738ee73a0f,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,64.0,2017-06-15,113000.0,1258.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297625,6ba2450a438540999e633a5d99c7c7a0,offer completed,672,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,28.0,2015-01-24,61000.0,2131.0
299471,f39fe7ea4e5946378e6d224504b77797,offer completed,684,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,54.0,2017-08-07,98000.0,1205.0
304756,0785f1fce0b04ba08e01c7d2ebab4917,offer completed,708,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,51.0,2017-08-15,78000.0,1197.0
305551,b7e216b6472b46648272c29a52a86702,offer completed,714,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,M,53.0,2018-07-08,113000.0,870.0


In [16]:
total_dist = total.drop_duplicates()

### Adding features to profile

In [17]:
total_dist['count'] = 1
total_dist.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_dist['count'] = 1


Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member,count
86025,50b72821aa1a459698ad47ff4058c1ed,transaction,216,34.66,,,,,,,,,,,M,72.0,2018-01-01,56000.0,1058.0,1
25125,6ab082b797c54160b62417326d5f0c39,offer completed,30,,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,F,59.0,2017-10-23,57000.0,1128.0,1
34400,dee41cfae12043039ee3ed880778e7dc,transaction,66,12.5,,,,,,,,,,,F,68.0,2018-03-08,68000.0,992.0,1
283980,da6c95e567f94dfdb54c16073807fcfe,transaction,630,11.17,,,,,,,,,,,M,72.0,2017-09-09,39000.0,1172.0,1
183524,ca7bb6b5974c44c0a6de4cc8c1c53cd8,offer viewed,444,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,M,40.0,2013-12-09,65000.0,2542.0,1
33696,81837e891cb6445894f7cf90e36d6e2e,transaction,60,9.41,,,,,,,,,,,M,38.0,2017-08-17,69000.0,1195.0,1
186990,b0d824ac00104503bddefcdf811f8c36,transaction,450,1.6,,,,,,,,,,,,,NaT,,,1
204534,9e3888d67fb44211814cc57556d09c72,offer received,504,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,71.0,2015-06-02,69000.0,2002.0,1
172237,51a798763c81413ca71c4f1f0530a61c,offer viewed,420,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,45.0,2018-05-23,75000.0,916.0,1
129321,2b9346b366f54865ac8436867d969f7d,offer viewed,342,,,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,F,52.0,2016-11-11,108000.0,1474.0,1


#### Total counts for all events

In [18]:
count_of_events = pd.pivot_table(total_dist, values='count', index = 'cust_id', columns='event', aggfunc=np.sum)
count_of_events.rename(columns=lambda x: "Total "+x, inplace=True)
count_of_events

event,Total offer completed,Total offer received,Total offer viewed,Total transaction
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0
...,...,...,...,...
fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0,11.0
fff7576017104bcc8677a8d63322b5e1,3.0,5.0,4.0,6.0
fff8957ea8b240a6b5e634b6ee8eafcf,,3.0,2.0,5.0
fffad4f4828548d1b5583907f2e9906b,3.0,4.0,4.0,12.0


In [19]:
profile_features = profile
profile_features = profile_features.merge(count_of_events,how='left',on='cust_id')
profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   gender                 14825 non-null  object        
 1   age                    14825 non-null  int64         
 2   cust_id                14825 non-null  object        
 3   became_member_on       14825 non-null  datetime64[ns]
 4   income                 14825 non-null  float64       
 5   days_as_member         14825 non-null  int64         
 6   Total offer completed  11986 non-null  float64       
 7   Total offer received   14820 non-null  float64       
 8   Total offer viewed     14675 non-null  float64       
 9   Total transaction      14492 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 1.2+ MB


#### Individual event counts based on offer type and counts of their modes of delivery 

In [20]:
event_group = total_dist.groupby('event')
offer_counts = dict()

for event, group in event_group:
    if event != 'transaction':        
        offer_counts[event] = pd.pivot_table(group, values='count', index='cust_id', columns='offer_type', aggfunc=np.sum)
        offer_counts[event].rename(columns=lambda x: event+" "+x+" count", inplace=True)
        count_of_delivery_by_event = group[['cust_id', 'email', 'social', 'mobile', 'web']]
        count_of_delivery_by_event = count_of_delivery_by_event.groupby(['cust_id']).sum()
        count_of_delivery_by_event.rename(columns=lambda x: event+"_via_"+x, inplace=True)
        profile_features = profile_features.merge(count_of_delivery_by_event,how='left',on='cust_id')
    else:
        offer_counts[event] = group.groupby(['cust_id'])['amount'].agg('sum').to_frame()
        offer_counts[event].rename(columns=lambda x: "Total transaction "+x, inplace=True)
    profile_features = profile_features.merge(offer_counts[event],how='left',on='cust_id')

profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   gender                              14825 non-null  object        
 1   age                                 14825 non-null  int64         
 2   cust_id                             14825 non-null  object        
 3   became_member_on                    14825 non-null  datetime64[ns]
 4   income                              14825 non-null  float64       
 5   days_as_member                      14825 non-null  int64         
 6   Total offer completed               11986 non-null  float64       
 7   Total offer received                14820 non-null  float64       
 8   Total offer viewed                  14675 non-null  float64       
 9   Total transaction                   14492 non-null  float64       
 10  offer completed_via_em

In [21]:
profile_features.fillna(0, inplace=True)    
profile_features.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount
8455,M,46,e7f6dc2a3bb7493ca35cebbbe6e444bb,2013-09-12,39000.0,2630,1.0,4.0,3.0,14.0,...,2.0,1.0,3.0,2.0,3.0,3.0,1.0,2.0,0.0,27.86
3500,M,35,00e8d701c583461e81cc10053681a12b,2018-02-26,70000.0,1002,1.0,4.0,3.0,7.0,...,2.0,1.0,3.0,2.0,3.0,3.0,1.0,1.0,1.0,30.98
13251,M,49,eef5ea58662f4e27b591d39990c10ae2,2016-08-14,97000.0,1563,4.0,6.0,4.0,6.0,...,2.0,2.0,4.0,4.0,4.0,3.0,1.0,2.0,1.0,163.72
777,M,66,15705f6ebfe4407991e5707d830aacb7,2015-03-17,50000.0,2079,0.0,1.0,1.0,8.0,...,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,32.51
8823,M,64,882346bfc610473c8505199a5179d302,2017-09-10,69000.0,1171,1.0,3.0,3.0,4.0,...,1.0,1.0,3.0,2.0,3.0,1.0,1.0,1.0,1.0,508.74
8717,F,64,5917ca26c4984c5f8e950da09aaf46ce,2014-07-19,56000.0,2320,0.0,6.0,2.0,7.0,...,3.0,1.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,11.69
7934,F,79,36e540dd327144b58b77729db6167af4,2013-12-17,50000.0,2534,3.0,4.0,4.0,6.0,...,1.0,1.0,4.0,2.0,4.0,3.0,2.0,1.0,1.0,107.43
12959,F,63,b0fc778d61024cd29eea6273bec6ba5a,2018-01-01,115000.0,1058,3.0,4.0,3.0,7.0,...,1.0,0.0,3.0,3.0,3.0,3.0,3.0,0.0,0.0,194.68
7970,F,52,e61a428312024ef493facecdbf5188cb,2017-10-01,76000.0,1150,3.0,4.0,3.0,7.0,...,2.0,1.0,3.0,2.0,3.0,3.0,1.0,1.0,1.0,160.44
719,M,58,86dd4706a3054adbbaf7287b7fa9aade,2017-09-06,83000.0,1175,2.0,5.0,4.0,3.0,...,2.0,1.0,4.0,3.0,3.0,3.0,1.0,2.0,1.0,83.24


#### Average difficulty score and duration

In [22]:
avg_difficulty_duration = total_dist[['cust_id', 'difficulty', 'duration']]
avg_difficulty_duration = avg_difficulty_duration.groupby(['cust_id']).mean()
avg_difficulty_duration.rename(columns=lambda x: "avg_"+x, inplace=True)
avg_difficulty_duration.sample(10)

Unnamed: 0_level_0,avg_difficulty,avg_duration
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1
09cb1b009f154a9a93af117920ddb353,8.846154,162.461538
d885cede482e4a03b42991c72c254265,8.75,150.0
e8e04794782945cd9c436f996a8ba64f,10.0,189.6
82feefcd3f904269afdabda68c9b73e2,10.785714,183.428571
a2a61641e2d84d66b4c59266e4ddc4f9,6.0,130.666667
8f5430f8f2f442abaf2e6661a26a22a4,13.214286,198.857143
ebc303bcc1b44aa6b4d1762960451468,8.923077,179.076923
95e57e801cf84720b4249cc1d70314f5,5.538462,142.153846
b23fd66b4bd4463b8382ff9007f7f861,10.769231,188.307692
5359ca16190a4a27b65e850b05f8c129,9.5,164.0


In [23]:
profile_features = profile_features.merge(avg_difficulty_duration,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration
535,F,20,79aa711ee90c4297b1c6cb6f7d40421a,2018-02-03,60000.0,1025,0.0,4.0,4.0,8.0,...,4.0,3.0,4.0,3.0,0.0,3.0,1.0,17.4,6.0,144.0
73,M,64,cbebad9819494b479dbf0c1207b4c0c5,2017-03-05,70000.0,1360,4.0,4.0,4.0,9.0,...,4.0,3.0,3.0,4.0,1.0,3.0,0.0,140.34,11.75,192.0
12462,O,49,bd85eeff28ee40a2a6b0c4829a4f211e,2017-01-10,95000.0,1414,3.0,6.0,6.0,7.0,...,6.0,5.0,6.0,4.0,2.0,1.0,3.0,147.99,5.4,113.6
499,F,50,55540073b2be424a9729852465b1b8df,2018-01-06,75000.0,1053,3.0,4.0,3.0,11.0,...,3.0,3.0,3.0,2.0,0.0,2.0,1.0,221.95,6.1,170.4
14506,M,91,da7aced8db154427a94a9008200267e4,2013-12-15,49000.0,2536,2.0,5.0,2.0,8.0,...,2.0,2.0,2.0,2.0,1.0,1.0,0.0,28.26,9.222222,160.0


#### Total rewards defined and total rewards received

In [24]:
full_rewards = total_dist[['cust_id', 'reward_defined', 'reward_received']]
sum_rewards = full_rewards.groupby(['cust_id']).sum()
sum_rewards.rename(columns=lambda x: "total_"+x, inplace=True)
sum_rewards.sample(10)

Unnamed: 0_level_0,total_reward_defined,total_reward_received
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1
38c60a8b1136438690a80c3b5a211200,65.0,0.0
ca5815021d3c42e2bef68f023a7764d0,28.0,0.0
05fc9b1280454d84a712d8cb763c7c79,95.0,25.0
848d42f1ff704218827e8efeadca474d,14.0,3.0
61e7bf20af4c434ab2d89cdc565d8046,27.0,0.0
d52a2606f8654e009920fd75d0fdbf45,39.0,0.0
dce6d06c87cb4657bc6b123cc35373d1,65.0,15.0
4fdfb0cfb2164596ac8df0661a13330a,39.0,13.0
6bb33215d6c549f3afc5e743b8ff3a85,35.0,0.0
46f14dabab414aadaf119bc1e633ea3e,10.0,0.0


In [25]:
profile_features = profile_features.merge(sum_rewards,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_defined,total_reward_received
4824,M,76,66a04ad46d83408ea8e2e28e8ccf3300,2016-11-23,85000.0,1462,3.0,5.0,5.0,6.0,...,4.0,3.0,2.0,2.0,1.0,146.09,10.307692,158.769231,81.0,25.0
12689,M,63,815b2181c57b438086c2661cc6c33d78,2018-01-09,95000.0,1050,0.0,4.0,4.0,4.0,...,4.0,2.0,3.0,0.0,1.0,103.13,6.25,120.0,50.0,0.0
4409,M,32,c4875ed3bb9e4823bdd7ab5171326dbb,2017-09-06,35000.0,1175,0.0,5.0,3.0,6.0,...,3.0,2.0,1.0,1.0,1.0,13.06,6.75,132.0,31.0,0.0
9374,M,20,b8f309e5c6b34d1b8158468dcf6cbdec,2017-12-05,32000.0,1085,0.0,3.0,2.0,0.0,...,2.0,0.0,0.0,0.0,2.0,0.0,2.0,91.2,10.0,0.0
7758,M,28,da3d478411cd4a30a6efa9d673530f00,2017-05-23,30000.0,1281,2.0,5.0,3.0,14.0,...,3.0,1.0,1.0,0.0,2.0,87.57,5.0,134.4,34.0,7.0


#### Count of offer delivery method

In [26]:
count_of_delivery = total_dist[['cust_id', 'email', 'social', 'mobile', 'web']]
count_of_delivery = count_of_delivery.groupby(['cust_id']).sum()
count_of_delivery.rename(columns=lambda x: "Total_offers_via_"+x, inplace=True)
count_of_delivery.sample(10)

Unnamed: 0_level_0,Total_offers_via_email,Total_offers_via_social,Total_offers_via_mobile,Total_offers_via_web
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7ee0436c7abc4526a1d97f598701b4a8,10.0,8.0,10.0,8.0
4f1676b2b52549fb9ffd6c9980437df9,11.0,8.0,8.0,9.0
d8d2466b53034391b840556be4b1be2c,14.0,12.0,12.0,11.0
63f1bfb1a79949bf9bc0d0b8b184fe4e,12.0,12.0,12.0,9.0
542c41f5afc049e7ae7d4721ace9d286,12.0,6.0,12.0,10.0
2531d3b6001a4d48bff0a91ceceb97a0,15.0,11.0,13.0,13.0
abc440bf04f4476d943077f226cbedf7,8.0,2.0,5.0,8.0
dadae228edc442d59471e7e8d42e1d9e,12.0,7.0,12.0,8.0
a414d8006bc24fe18b564304efdf559b,12.0,9.0,12.0,9.0
b5105f071088481c8382eb9f73cd6a27,5.0,2.0,4.0,3.0


In [27]:
profile_features = profile_features.merge(count_of_delivery,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_defined,total_reward_received,Total_offers_via_email,Total_offers_via_social,Total_offers_via_mobile,Total_offers_via_web
13676,M,35,d6cd472ecc5144f5b3fbe284e8da942f,2016-05-11,33000.0,1658,3.0,5.0,2.0,14.0,...,0.0,100.55,12.0,211.2,51.0,12.0,10.0,5.0,7.0,8.0
7764,M,65,86e85d52b66549dd866e69eda19919c2,2017-09-15,52000.0,1166,4.0,5.0,4.0,5.0,...,0.0,102.67,9.615385,179.076923,62.0,19.0,13.0,9.0,12.0,10.0
14643,M,21,c04f144ebb8e4694b3c954574c7631bf,2016-04-28,42000.0,1671,0.0,3.0,3.0,18.0,...,1.0,54.37,6.666667,104.0,40.0,0.0,6.0,6.0,6.0,4.0
5719,M,68,74e06417ac8043b08b5f66b3a0a16c4a,2018-01-20,40000.0,1039,0.0,4.0,3.0,2.0,...,0.0,6.63,10.714286,161.142857,45.0,0.0,7.0,4.0,5.0,7.0
11370,M,29,2bb22e7931aa49ebabff26b9cbc6260a,2018-01-19,45000.0,1040,2.0,5.0,3.0,19.0,...,1.0,47.49,10.1,160.8,44.0,8.0,10.0,7.0,7.0,8.0


#### View and completion ratio

In [28]:
profile_features['total_view_ratio'] = round(profile_features['Total offer viewed']/profile_features['Total offer received'], 2)
profile_features['total_completion_ratio'] = round(profile_features['Total offer completed']/profile_features['Total offer received'], 3) 
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,avg_difficulty,avg_duration,total_reward_defined,total_reward_received,Total_offers_via_email,Total_offers_via_social,Total_offers_via_mobile,Total_offers_via_web,total_view_ratio,total_completion_ratio
8985,M,18,5a52e27405c84e58a0d7414562df9e10,2018-04-17,40000.0,952,0.0,6.0,4.0,4.0,...,5.0,127.2,26.0,0.0,10.0,8.0,10.0,6.0,0.67,0.0
3178,F,43,e2fd3352911844f0b3bc2965978c2798,2017-07-05,66000.0,1238,3.0,3.0,1.0,12.0,...,10.714286,188.571429,25.0,11.0,7.0,5.0,5.0,7.0,0.33,1.0
13352,M,35,3d3bad0437e3459d873772f8023653ac,2016-09-01,64000.0,1545,3.0,6.0,2.0,11.0,...,7.0,168.0,38.0,11.0,11.0,6.0,10.0,11.0,0.33,0.5
2484,M,76,990080173c3f4871bb8634a9b50e118a,2017-11-18,75000.0,1102,2.0,2.0,1.0,3.0,...,8.0,139.2,40.0,15.0,5.0,3.0,5.0,5.0,0.5,1.0
3416,M,43,5e44bc50a2b84b0a89d7246f5a85617e,2018-07-23,99000.0,855,3.0,5.0,5.0,6.0,...,8.846154,173.538462,71.0,15.0,13.0,13.0,13.0,11.0,1.0,0.6


#### Avg_transaction_amt and transactions_per_membership_days

In [29]:
profile_features['avg_transaction_amt'] = round(profile_features['Total transaction amount']/profile_features['Total transaction'], 2)
profile_features['transactions_per_membership_days'] = profile_features['Total transaction']/profile_features['days_as_member']
profile_features.fillna(0, inplace=True)
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,total_reward_defined,total_reward_received,Total_offers_via_email,Total_offers_via_social,Total_offers_via_mobile,Total_offers_via_web,total_view_ratio,total_completion_ratio,avg_transaction_amt,transactions_per_membership_days
11989,M,67,0b680efe1a0a40788ebb6fb2c587b4a7,2017-08-13,67000.0,1199,0.0,3.0,2.0,4.0,...,30.0,0.0,5.0,3.0,5.0,3.0,0.67,0.0,17.01,0.003336
11694,M,49,8956d5a699344f2ca654cda962b0d953,2016-08-26,70000.0,1551,3.0,5.0,5.0,14.0,...,30.0,10.0,13.0,8.0,13.0,11.0,1.0,0.6,16.09,0.009026
10940,F,65,2db15c88abc743109a09cd8c066eb5da,2017-12-12,46000.0,1078,5.0,5.0,4.0,13.0,...,61.0,21.0,14.0,12.0,14.0,11.0,0.8,1.0,11.32,0.012059
8690,M,86,6d43ca076aef48c3a20af0ce5cce1a52,2016-01-13,44000.0,1777,1.0,5.0,3.0,11.0,...,36.0,3.0,9.0,7.0,8.0,7.0,0.6,0.2,2.58,0.00619
12708,M,59,6b64bd90a25b4c1399eb9f6e29a4d6f4,2017-12-08,65000.0,1082,0.0,3.0,3.0,6.0,...,24.0,0.0,6.0,0.0,6.0,6.0,1.0,0.0,3.06,0.005545


In [30]:
len(profile_features[profile_features['Total offer viewed'] < profile_features['Total offer completed']])

1932

In [31]:
profile_features.to_csv('../data/xtr_profile.csv')