# Data preprocessing

## Importing required libraries and reading the input json files:

In [1]:
import os, datetime, json
import pandas as pd
import numpy as np

In [2]:
transcript = pd.read_json('../data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('../data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/profile.json', orient='records', lines=True)

## Data cleansing operations

### Cleaning profile.json

In [3]:
profile.shape

(17000, 5)

In [4]:
profile.dropna(axis=0,inplace=True)
profile.rename(columns={'id': 'cust_id'}, inplace=True)
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'].astype(str), format='%Y%m%d')
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days
profile.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member
2585,F,85,1ab5e1d8f3fa4588af94253e2a09520f,2017-12-02,87000.0,1088
7336,F,58,6315392db2274a1f89ad516e9071bce0,2018-07-11,93000.0,867
1871,M,47,e7147567b7de4eb4a0ca7d48a3388855,2017-08-06,53000.0,1206
916,F,74,941d896288ab4ceb8d96d8b02e2e96e4,2017-08-14,97000.0,1198
13970,M,69,b8725275fd214d7ab37d31d365866d54,2018-04-03,84000.0,966
4678,M,54,02a3aa431c1047be8eafec3dcd6b5fd0,2018-06-06,70000.0,902
2435,M,78,89ea78d702ef425e8802f4dbfc4790f0,2017-09-19,51000.0,1162
4347,M,64,0222d267445f4f078bc325224e471766,2018-01-02,43000.0,1057
3629,F,71,41e4610d1aa74853b2d590e321699b1c,2016-05-17,89000.0,1652
1885,F,23,478f3d690f8640e8b017e13a794936a0,2017-06-18,48000.0,1255


In [5]:
profile.to_csv('../data/cln_profile.csv')

### Cleaning portfolio.json

In [6]:
portfolio.rename(columns={'id': 'offer_id'}, inplace=True)
portfolio.set_index('offer_id', inplace=True)
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, ae264e3637204a6fb9bb56bc8210ddfd to 2906b810c7d4411798c6938adc9daaa5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
dtypes: int64(3), object(2)
memory usage: 480.0+ bytes


#### Unpacking channels

In [7]:
medium = portfolio['channels'].str.join(',').str.get_dummies(',')
portfolio = pd.concat([portfolio, medium], axis=1) # concat medium columns
portfolio = portfolio.drop(['channels'], axis=1)
portfolio['duration_in_hours'] = portfolio['duration']*24
portfolio

Unnamed: 0_level_0,reward,difficulty,duration,offer_type,email,mobile,social,web,duration_in_hours
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,bogo,1,1,1,0,168
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,bogo,1,1,1,1,120
3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,0,1,96
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,0,1,168
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,0,0,1,240
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,7,discount,1,1,1,1,168
fafdcd668e3743c1bb461111dcafc2a4,2,10,10,discount,1,1,1,1,240
5a8bc65990b245e5a138643cd4eb9837,0,0,3,informational,1,1,1,0,72
f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,120
2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,0,1,168


In [8]:
portfolio.drop(['duration'], axis=1, inplace=True)
portfolio.rename(columns={'duration_in_hours':'duration'},inplace=True)
portfolio

Unnamed: 0_level_0,reward,difficulty,offer_type,email,mobile,social,web,duration
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,bogo,1,1,1,0,168
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,bogo,1,1,1,1,120
3f207df678b143eea3cee63160fa8bed,0,0,informational,1,1,0,1,96
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,bogo,1,1,0,1,168
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,discount,1,0,0,1,240
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,discount,1,1,1,1,168
fafdcd668e3743c1bb461111dcafc2a4,2,10,discount,1,1,1,1,240
5a8bc65990b245e5a138643cd4eb9837,0,0,informational,1,1,1,0,72
f19421c1d4aa40978ebb69ca19b0e20d,5,5,bogo,1,1,1,1,120
2906b810c7d4411798c6938adc9daaa5,2,10,discount,1,1,0,1,168


In [9]:
portfolio.to_csv('../data/cln_portfolio.csv')

### Cleaning and grouping transcript.json based on cust_id, event

In [10]:
transcript.rename(columns={'person': 'cust_id'}, inplace=True)
transcript.rename(columns={'time':'hours_till_action'},inplace=True)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   cust_id            306534 non-null  object
 1   event              306534 non-null  object
 2   value              306534 non-null  object
 3   hours_till_action  306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


#### Unpacking value column into seperate columns

In [11]:
transcript = pd.concat([transcript, transcript['value'].apply(pd.Series)], axis=1)
# selecting from offer id and offer_id for different events
transcript['offer_id_new'] = np.where(transcript['offer id'].isnull() & transcript['offer_id'].notnull(),transcript['offer_id'],transcript['offer id'])
transcript.drop(['offer id','offer_id'],axis=1,inplace=True)
transcript.drop(['value'],axis=1,inplace=True)
transcript.rename(columns={'offer_id_new':'offer_id'},inplace=True)
transcript.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward,offer_id
244871,60c0f65a8d49497e86c58a37c596a551,offer completed,570,,2.0,fafdcd668e3743c1bb461111dcafc2a4
26721,e1711365801040bd9300b970b19efa32,offer viewed,36,,,f19421c1d4aa40978ebb69ca19b0e20d
139717,c9121c7664ca4e16b88cf81e8a964cf7,transaction,372,16.22,,
141445,5293428f33b0427e8dcbbf9c46d77d6e,transaction,372,7.34,,
139701,2d4ca96dc1584507a5187a054d781623,offer viewed,372,,,ae264e3637204a6fb9bb56bc8210ddfd
1339,2f4bde67035f445d9c805e09bef359fe,offer received,0,,,9b98b8c7a33c4b65b9aebfe6a799e6d9
228323,cc5acecea5ce4cae8b4529d418993f00,transaction,528,2.84,,
63574,7a910b1814714b69b40ba0cb0fe75870,offer received,168,,,f19421c1d4aa40978ebb69ca19b0e20d
123464,8b32527512f641f9a5447ba5a811f3af,offer received,336,,,3f207df678b143eea3cee63160fa8bed
138699,01162252405b4524a8fa1bf8e6d5f04b,transaction,366,7.0,,


In [12]:
transcript.to_csv('../data/cln_transcript.csv')

### Merging all the datasets together

#### Joining Transcript and portfolio

In [13]:
total = transcript.merge(portfolio,how='left',on='offer_id')
total.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306534 entries, 0 to 306533
Data columns (total 14 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   cust_id            306534 non-null  object 
 1   event              306534 non-null  object 
 2   hours_till_action  306534 non-null  int64  
 3   amount             138953 non-null  float64
 4   reward_x           33579 non-null   float64
 5   offer_id           167581 non-null  object 
 6   reward_y           167581 non-null  float64
 7   difficulty         167581 non-null  float64
 8   offer_type         167581 non-null  object 
 9   email              167581 non-null  float64
 10  mobile             167581 non-null  float64
 11  social             167581 non-null  float64
 12  web                167581 non-null  float64
 13  duration           167581 non-null  float64
dtypes: float64(9), int64(1), object(4)
memory usage: 35.1+ MB


#### Joining Transcript and profile

In [14]:
total = total.merge(profile,how='left',on='cust_id')
total.rename(columns={'reward_x':'reward_received', 'reward_y':'reward_defined'},inplace=True)
total.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
128755,fade355249524f0c8bac7c097198d183,offer viewed,342,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,F,61.0,2016-10-14,102000.0,1502.0
194974,bae9e6a975554e4590766902957ebb9b,offer completed,474,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,M,50.0,2013-09-18,69000.0,2624.0
208153,5cb68688b66b42db8d1985340c289eb7,offer received,504,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,18.0,2018-03-25,48000.0,975.0
292286,0ff34474e5b245b79f396d4c5ddef96c,offer viewed,654,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,M,26.0,2017-09-08,42000.0,1173.0
36772,d674b3444fa74d15a61baa3c882f4c20,transaction,72,1.56,,,,,,,,,,,M,40.0,2014-01-20,40000.0,2500.0
18357,4a6b6b268e7b4c6ebf96bc92cd67cf0f,offer viewed,12,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,,,NaT,,
153524,919f3f6565a544d8a06eaef8e6a86d9d,offer received,408,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,informational,1.0,1.0,1.0,0.0,72.0,F,58.0,2017-02-02,73000.0,1391.0
276974,a95c8b2d84b04c22b525d6fec9e1dda3,offer viewed,606,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,71.0,2017-09-24,55000.0,1157.0
73822,d1f41b9ca2d84f90bc7b48421a6c36f3,offer viewed,180,,,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,59.0,2016-07-25,76000.0,1583.0
173629,33bdcb9aa1fe4ed4a181b9e028ef1245,offer completed,420,,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,O,66.0,2015-11-09,70000.0,1842.0


#### Removing duplicates from transcript data

In [15]:
dupe = total[total.duplicated()]
dupe

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member
66123,3dde94fa581145cb9f206624f1a94d5a,offer completed,168,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,F,51.0,2017-11-14,56000.0,1106.0
66783,e9fb6ed2cecb4980ba98c86abc9c91e3,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,M,78.0,2015-04-21,55000.0,2044.0
67614,a7dc060f6fc94ca7bf71fbb188187dca,offer completed,168,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,O,60.0,2017-02-01,69000.0,1392.0
68562,30478a4c1e884a63a822aa87b833ed7a,offer completed,168,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,73.0,2017-12-09,74000.0,1081.0
69218,84fb57a7fe8045a8bf6236738ee73a0f,offer completed,168,,10.0,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,64.0,2017-06-15,113000.0,1258.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
297625,6ba2450a438540999e633a5d99c7c7a0,offer completed,672,,5.0,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,28.0,2015-01-24,61000.0,2131.0
299471,f39fe7ea4e5946378e6d224504b77797,offer completed,684,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,54.0,2017-08-07,98000.0,1205.0
304756,0785f1fce0b04ba08e01c7d2ebab4917,offer completed,708,,5.0,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,51.0,2017-08-15,78000.0,1197.0
305551,b7e216b6472b46648272c29a52a86702,offer completed,714,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,M,53.0,2018-07-08,113000.0,870.0


In [16]:
total_dist = total.drop_duplicates()

In [17]:
user_offer_relation = total_dist[['cust_id','offer_id']]
user_offer_relation

Unnamed: 0,cust_id,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0
...,...,...
306529,b3a1272bc9904337b331bf348c3e8c17,
306530,68213b08d99a4ae1b0dcb72aebd9aa35,
306531,a00058cf10334a308c68e7631c529907,
306532,76ddbd6576844afe811f1a3c0fbb5bec,


In [18]:
combinations = user_offer_relation.drop_duplicates()
combinations

Unnamed: 0,cust_id,offer_id
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0
...,...,...
306116,542c41f5afc049e7ae7d4721ace9d286,
306203,448dabde725040978b8a247a20bac126,
306259,7718656997f3453db0f5aeca9cd35240,
306278,54463e5d95124b7fb3133fc1eae71952,


### Adding features to profile

In [19]:
total_dist['count'] = 1
total_dist.sample(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  total_dist['count'] = 1


Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member,count
132942,720d1757d1d8444294aea1f0b05cf3fc,offer viewed,354,,,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,F,20.0,2014-03-06,47000.0,2455.0,1
44066,1738163ca571451b9ea0cd9df19f2a6f,transaction,108,21.32,,,,,,,,,,,M,58.0,2018-02-12,89000.0,1016.0,1
16051,44f1f503047642ac83f70785c4992032,offer viewed,6,,,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,M,67.0,2018-06-19,89000.0,889.0,1
275524,5a5bffb68f8d4527b532dbfb19da7d3f,offer completed,606,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,M,58.0,2015-05-13,52000.0,2022.0,1
18521,683e5e45c24849fdb1b0bcde6121cc28,transaction,12,2.41,,,,,,,,,,,M,71.0,2016-04-26,60000.0,1673.0,1
235512,ba4b4ee409d24af89e7a350fc51efac9,offer completed,546,,2.0,2906b810c7d4411798c6938adc9daaa5,2.0,10.0,discount,1.0,1.0,0.0,1.0,168.0,M,68.0,2016-12-14,90000.0,1441.0,1
195835,a1183855451e4250adb676f87d5f8398,transaction,480,0.23,,,,,,,,,,,M,45.0,2014-05-13,54000.0,2387.0,1
162024,37a60351fd89457d8ea4b19df2b0b75f,offer received,408,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,M,20.0,2018-03-26,70000.0,974.0,1
180060,01443a2afce54939a323c978f467c540,transaction,438,28.39,,,,,,,,,,,F,61.0,2015-10-10,118000.0,1872.0,1
2940,f86cd307248848cda600b65e282836b9,offer received,0,,,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,,,NaT,,,1


#### Total counts for all events

In [20]:
count_of_events = pd.pivot_table(total_dist, values='count', index = 'cust_id', columns='event', aggfunc=np.sum)
count_of_events.rename(columns=lambda x: "Total "+x, inplace=True)
count_of_events

event,Total offer completed,Total offer received,Total offer viewed,Total transaction
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0009655768c64bdeb2e877511632db8f,3.0,5.0,4.0,8.0
00116118485d4dfda04fdbaba9a87b5c,,2.0,2.0,3.0
0011e0d4e6b944f998e987f904e8c1e5,3.0,5.0,5.0,5.0
0020c2b971eb4e9188eac86d93036a77,3.0,5.0,3.0,8.0
0020ccbbb6d84e358d3414a3ff76cffd,3.0,4.0,4.0,12.0
...,...,...,...,...
fff3ba4757bd42088c044ca26d73817a,3.0,6.0,3.0,11.0
fff7576017104bcc8677a8d63322b5e1,3.0,5.0,4.0,6.0
fff8957ea8b240a6b5e634b6ee8eafcf,,3.0,2.0,5.0
fffad4f4828548d1b5583907f2e9906b,3.0,4.0,4.0,12.0


In [21]:
profile_features = profile
profile_features = profile_features.merge(count_of_events,how='left',on='cust_id')
profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   gender                 14825 non-null  object        
 1   age                    14825 non-null  int64         
 2   cust_id                14825 non-null  object        
 3   became_member_on       14825 non-null  datetime64[ns]
 4   income                 14825 non-null  float64       
 5   days_as_member         14825 non-null  int64         
 6   Total offer completed  11986 non-null  float64       
 7   Total offer received   14820 non-null  float64       
 8   Total offer viewed     14675 non-null  float64       
 9   Total transaction      14492 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(2), object(2)
memory usage: 1.2+ MB


#### Individual event counts based on offer type and counts of their modes of delivery 

In [22]:
event_group = total_dist.groupby('event')
offer_counts = dict()

for event, group in event_group:
    if event != 'transaction':        
        offer_counts[event] = pd.pivot_table(group, values='count', index='cust_id', columns='offer_type', aggfunc=np.sum)
        offer_counts[event].rename(columns=lambda x: event+" "+x+" count", inplace=True)
        count_of_delivery_by_event = group[['cust_id', 'email', 'social', 'mobile', 'web']]
        count_of_delivery_by_event = count_of_delivery_by_event.groupby(['cust_id']).sum()
        count_of_delivery_by_event.rename(columns=lambda x: event+"_via_"+x, inplace=True)
        profile_features = profile_features.merge(count_of_delivery_by_event,how='left',on='cust_id')
    else:
        offer_counts[event] = group.groupby(['cust_id'])['amount'].agg('sum').to_frame()
        offer_counts[event].rename(columns=lambda x: "Total transaction "+x, inplace=True)
    profile_features = profile_features.merge(offer_counts[event],how='left',on='cust_id')

profile_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14825 entries, 0 to 14824
Data columns (total 31 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   gender                              14825 non-null  object        
 1   age                                 14825 non-null  int64         
 2   cust_id                             14825 non-null  object        
 3   became_member_on                    14825 non-null  datetime64[ns]
 4   income                              14825 non-null  float64       
 5   days_as_member                      14825 non-null  int64         
 6   Total offer completed               11986 non-null  float64       
 7   Total offer received                14820 non-null  float64       
 8   Total offer viewed                  14675 non-null  float64       
 9   Total transaction                   14492 non-null  float64       
 10  offer completed_via_em

In [23]:
profile_features.fillna(0, inplace=True)    
profile_features.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer received discount count,offer received informational count,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount
8104,O,43,301fd1800ffd4897aed469da5705e223,2017-07-18,70000.0,1225,3.0,5.0,5.0,12.0,...,0.0,0.0,5.0,4.0,5.0,3.0,5.0,0.0,0.0,187.22
13363,M,28,40c7f044b1cb4eda9de3789b08025b7e,2016-11-02,60000.0,1483,3.0,5.0,2.0,9.0,...,1.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,1.0,150.23
13029,F,54,15e8cabbde0f40228658245d340af15d,2016-03-27,88000.0,1703,4.0,6.0,5.0,8.0,...,3.0,1.0,5.0,5.0,5.0,3.0,2.0,2.0,1.0,162.93
7765,M,62,1ee3e36220f5437fb0523779f9318b85,2018-02-20,90000.0,1008,1.0,2.0,1.0,5.0,...,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,87.6
4481,F,61,9670cd63114f48ebb1b8ff3858904c82,2016-07-27,106000.0,1581,1.0,5.0,4.0,4.0,...,1.0,2.0,4.0,4.0,4.0,3.0,2.0,1.0,1.0,100.6
253,F,48,9c64321646bf40f6ad1bc98864eaa6b8,2016-10-14,53000.0,1502,3.0,4.0,3.0,4.0,...,3.0,0.0,3.0,2.0,3.0,3.0,1.0,2.0,0.0,55.99
10439,F,69,7da25b87262f4c75bac253cf5e5d9039,2017-04-06,109000.0,1328,1.0,3.0,2.0,4.0,...,2.0,1.0,2.0,2.0,2.0,1.0,0.0,1.0,1.0,118.68
12228,F,26,b887fc9ffbe84892990236bc2e2d710f,2015-12-01,62000.0,1820,2.0,4.0,3.0,15.0,...,1.0,2.0,3.0,2.0,3.0,3.0,1.0,1.0,1.0,212.27
1950,F,67,23dfcdb8237347dca870bf78d7fa823c,2017-01-09,86000.0,1415,3.0,3.0,3.0,11.0,...,0.0,0.0,3.0,2.0,3.0,3.0,3.0,0.0,0.0,1058.24
425,M,43,9c48c9b259b042f2b7cebe52df7fc36b,2014-07-01,40000.0,2338,1.0,4.0,3.0,22.0,...,1.0,2.0,3.0,1.0,3.0,3.0,1.0,0.0,2.0,71.03


#### Average difficulty score and duration

In [24]:
avg_difficulty_duration = total_dist[['cust_id', 'difficulty', 'duration']]
avg_difficulty_duration = avg_difficulty_duration.groupby(['cust_id']).mean()
avg_difficulty_duration.rename(columns=lambda x: "avg_"+x, inplace=True)
avg_difficulty_duration.sample(10)

Unnamed: 0_level_0,avg_difficulty,avg_duration
cust_id,Unnamed: 1_level_1,Unnamed: 2_level_1
b89f5ad3d676421a93b967dade2cccb4,9.142857,154.285714
e0cfbeda9c01448e86010ce2e65884ff,7.461538,156.923077
3c0eb58dc0d7443fa97c4cdc2996728a,6.363636,141.818182
0f54094029ca4107bfd36203aeb3ed14,5.545455,128.727273
a9996f30c54d47d695bf54afbf7bcb4c,11.75,204.0
e2602891e0704a9abd1989fc568cc8ed,10.0,201.6
889d81fe350643e280f90c0faff01118,5.6,148.8
b3f19a6e6036426d91de0d2d09dbca9f,6.666667,144.0
a0a5f5a4a5e449ad8fb7b922684dfff9,13.333333,216.0
013f2c82889f4641a9b847a48861cce0,8.181818,144.0


In [25]:
profile_features = profile_features.merge(avg_difficulty_duration,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed_via_email,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration
13214,F,65,c00d1f6e8cba4787a00766232ae4711c,2017-08-05,71000.0,1207,0.0,5.0,4.0,0.0,...,4.0,3.0,4.0,4.0,0.0,3.0,1.0,0.0,5.777778,152.0
8420,M,67,6613c82a3acd4cbca267d7b056b35780,2016-07-22,38000.0,1586,0.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.25,17.5,222.0
6784,M,81,ad9e18d628fd4771bb3afdc2993cfdd7,2017-11-10,68000.0,1110,0.0,6.0,5.0,5.0,...,5.0,5.0,5.0,2.0,2.0,0.0,3.0,120.58,2.727273,91.636364
5815,F,65,388bda1fea454949abd8d927b04cbd6b,2016-07-10,98000.0,1598,2.0,3.0,2.0,7.0,...,2.0,2.0,2.0,1.0,1.0,0.0,1.0,193.48,10.0,140.571429
6911,F,61,dd2d770b5b9f41bd9382964971dde25e,2016-05-12,81000.0,1657,2.0,5.0,5.0,8.0,...,5.0,2.0,5.0,5.0,1.0,2.0,2.0,172.15,4.666667,132.0


#### Total rewards received

In [26]:
rewards = total_dist[['cust_id', 'reward_received']]
sum_rewards = rewards.groupby(['cust_id']).sum()
sum_rewards.rename(columns=lambda x: "total_"+x, inplace=True)
sum_rewards.sample(10)

Unnamed: 0_level_0,total_reward_received
cust_id,Unnamed: 1_level_1
50f4cc9c76be42dcb51805d48c033408,2.0
e0f1ca131f4b40a7a5f232b11efa4aea,3.0
9bf243bbdc364f2a8efc698555cfc1d9,2.0
f44b44b53c0548d5b27c812fda4c358a,9.0
b6d51d4c675149cda7524218e8e82364,17.0
86dd4706a3054adbbaf7287b7fa9aade,7.0
2d34608251f544bab8cc9c888d979037,0.0
ae5f3528d8324fcbbdbc51cf8a46cffa,5.0
c727c0939de24b92a35d4fd7c9fec518,0.0
fd3505ed43464f8ca88b7b0fc19c94d1,9.0


In [27]:
profile_features = profile_features.merge(sum_rewards,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed_via_social,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received
4559,M,20,bb272cd4cac24fcc8f0168cfb5c6f063,2013-11-16,40000.0,2565,2.0,4.0,3.0,20.0,...,3.0,3.0,2.0,3.0,0.0,0.0,54.1,7.222222,141.333333,10.0
2378,F,22,66c9c5773a8444a79abce6644401de88,2016-09-28,43000.0,1518,1.0,5.0,2.0,9.0,...,2.0,2.0,0.0,0.0,0.0,2.0,122.34,5.625,129.0,2.0
14339,F,27,fd375e03ad394a7bb85b62eb15263f14,2016-01-13,53000.0,1777,5.0,5.0,5.0,18.0,...,5.0,5.0,3.0,4.0,1.0,0.0,255.22,9.0,163.2,37.0
2488,M,67,463fead979df403484f2d357f3619aea,2016-05-06,60000.0,1663,2.0,3.0,3.0,12.0,...,3.0,3.0,2.0,1.0,1.0,1.0,53.59,6.375,126.0,13.0
14773,F,89,da7bf9d84fd74a72bdee595007bcca7a,2017-04-13,68000.0,1321,5.0,5.0,4.0,21.0,...,3.0,3.0,4.0,1.0,3.0,0.0,323.78,9.714286,188.571429,20.0


####  Total rewards defined

In [28]:
offer_received_data = total_dist[total_dist['event'] == 'offer received']
offer_received_data.sample(5)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,offer_type,email,mobile,social,web,duration,gender,age,became_member_on,income,days_as_member,count
205355,95e94202440a43939f3195fc975e7367,offer received,504,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,M,50.0,2017-02-09,43000.0,1384.0,1
6531,c381677d2703470a9c1f7fddc39a32a6,offer received,0,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,discount,1.0,0.0,0.0,1.0,240.0,F,66.0,2018-05-01,62000.0,938.0,1
112112,680fc72906a1408abdb350adc8c35667,offer received,336,,,ae264e3637204a6fb9bb56bc8210ddfd,10.0,10.0,bogo,1.0,1.0,1.0,0.0,168.0,F,43.0,2015-12-12,55000.0,1809.0,1
155014,8c8c323df8e2415fbda5b2bbf253ca7b,offer received,408,,,9b98b8c7a33c4b65b9aebfe6a799e6d9,5.0,5.0,bogo,1.0,1.0,0.0,1.0,168.0,M,60.0,2015-12-03,105000.0,1818.0,1
207342,17c0490a3b724ae6b5c109a20e890fce,offer received,504,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,informational,1.0,1.0,1.0,0.0,72.0,M,77.0,2017-10-23,54000.0,1128.0,1


In [29]:
rewards = offer_received_data[['cust_id', 'reward_defined']]
sum_rewards = rewards.groupby(['cust_id']).sum()
sum_rewards.rename(columns=lambda x: "total_"+x, inplace=True)
sum_rewards.sample(10)

Unnamed: 0_level_0,total_reward_defined
cust_id,Unnamed: 1_level_1
484cd5288cd2460aab5d3d3ddf8646ad,17.0
cabe4c65156b43039a30511ec6bfdfae,17.0
49589dc19aec449f813a3486fc004af3,26.0
1b72ba4f73a54e6382f1a5eab5342bdc,33.0
a7a47698ba4643cf94a254ca8aedb92d,2.0
b75f60f949e54f649375f68f980a771f,15.0
834d166621774353815b037a1d1fcadd,22.0
cf0ee2e478b7446c9c07a63c35508464,45.0
67ce1a3e90f748b2b21723186529bdc2,7.0
c80e5ece0bfc4cec9e2116935adc3958,40.0


In [30]:
profile_features = profile_features.merge(sum_rewards,how='left',on='cust_id')
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed_via_mobile,offer viewed_via_web,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined
11411,F,94,dcb02091ffbd4d6ab1375ee64a8b180b,2015-10-03,84000.0,1879,2.0,2.0,1.0,8.0,...,1.0,1.0,0.0,1.0,0.0,139.02,14.0,196.8,7.0,7.0
13313,M,31,09752271072d4a259ee5f35e077bbe3e,2014-06-24,58000.0,2345,1.0,5.0,2.0,9.0,...,2.0,2.0,0.0,2.0,0.0,27.59,10.0,195.0,2.0,11.0
11642,M,74,59ee82e069fc4a2084f3c784cc702d33,2015-09-08,89000.0,1904,1.0,3.0,3.0,5.0,...,3.0,2.0,1.0,0.0,2.0,113.44,4.285714,99.428571,10.0,10.0
7073,F,62,772e849945e24482b88b25800fa17f55,2015-03-23,69000.0,2073,3.0,5.0,4.0,23.0,...,4.0,4.0,2.0,2.0,0.0,92.48,9.166667,160.0,14.0,24.0
13890,M,62,cd34d07a21c84d72b9856515a04f8efa,2018-05-05,93000.0,934,0.0,5.0,4.0,0.0,...,3.0,4.0,2.0,1.0,1.0,0.0,10.0,165.333333,0.0,25.0


#### View and completion ratio

In [31]:
profile_features['total_view_ratio'] = round(profile_features['Total offer viewed']/profile_features['Total offer received'], 2)
profile_features['total_completion_ratio'] = round(profile_features['Total offer completed']/profile_features['Total offer received'], 3) 
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined,total_view_ratio,total_completion_ratio
243,F,74,28416e7d96d746fb9c1fb8223d9e4914,2015-09-24,75000.0,1888,2.0,4.0,2.0,5.0,...,1.0,0.0,1.0,99.47,11.25,171.0,15.0,20.0,0.5,0.5
14040,M,85,a60dc180138d4278ae6c6eba2c364356,2016-06-24,53000.0,1614,5.0,5.0,5.0,15.0,...,2.0,3.0,0.0,160.5,11.4,196.8,30.0,30.0,1.0,1.0
4317,F,82,5871252c52ca470ea8ebddc48f0ea988,2017-08-22,65000.0,1190,0.0,4.0,3.0,5.0,...,0.0,2.0,1.0,18.84,4.857143,150.857143,0.0,5.0,0.75,0.0
3715,M,62,9fa090f47ea946b7a715b8a7ff1eaa97,2017-06-21,72000.0,1252,1.0,5.0,1.0,10.0,...,1.0,0.0,0.0,31.49,10.714286,178.285714,2.0,24.0,0.2,0.2
11811,F,59,d091dfa1a1d7414da78027d8832ff681,2017-01-31,95000.0,1393,3.0,5.0,4.0,9.0,...,2.0,1.0,1.0,190.15,7.5,150.0,22.0,22.0,0.8,0.6


#### Avg_transaction_amt and transactions_per_membership_days

In [32]:
profile_features['avg_transaction_amt'] = round(profile_features['Total transaction amount']/profile_features['Total transaction'], 2)
profile_features['transactions_per_membership_days'] = profile_features['Total transaction']/profile_features['days_as_member']
profile_features.fillna(0, inplace=True)
profile_features.sample(5)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer viewed informational count,Total transaction amount,avg_difficulty,avg_duration,total_reward_received,total_reward_defined,total_view_ratio,total_completion_ratio,avg_transaction_amt,transactions_per_membership_days
14764,F,57,32f08cbbcf93435d9c32b29833e47d55,2017-12-29,81000.0,1061,1.0,3.0,3.0,3.0,...,0.0,55.03,7.142857,168.0,10.0,20.0,1.0,0.333,18.34,0.002828
2404,F,66,83313f9487fe41ca990d393127406161,2016-09-06,108000.0,1540,3.0,5.0,3.0,8.0,...,1.0,945.09,8.636364,150.545455,12.0,22.0,0.6,0.6,118.14,0.005195
967,F,38,fb1cc1b009db4a6caf274f2a02571ca2,2017-08-09,60000.0,1203,3.0,5.0,3.0,9.0,...,2.0,124.6,6.818182,150.545455,12.0,12.0,0.6,0.6,13.84,0.007481
5718,M,67,654250dd0ada40ec8bab4e7f17ce9ce2,2018-04-12,41000.0,957,1.0,6.0,4.0,7.0,...,2.0,24.16,6.363636,152.727273,2.0,17.0,0.67,0.167,3.45,0.007315
1222,F,52,842fda9faeff4c42a708e20adbccd1dc,2017-12-01,80000.0,1089,2.0,6.0,6.0,2.0,...,1.0,46.32,9.285714,149.142857,15.0,35.0,1.0,0.333,23.16,0.001837


In [33]:
len(profile_features[profile_features['Total offer viewed'] < profile_features['Total offer completed']])

1932

In [34]:
profile_features.to_csv('../data/xtr_profile.csv')