# Data preprocessing

## Importing required libraries and reading the input json files:

In [1]:
import os, datetime, json
import pandas as pd
import numpy as np

In [2]:
transcript = pd.read_json('../data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('../data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/profile.json', orient='records', lines=True)

## Data cleansing operations

### Cleaning profile.json

In [3]:
profile.dropna(axis=0,inplace=True)
profile.rename(columns={'id': 'cust_id'}, inplace=True)
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'].astype(str), format='%Y%m%d')
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days
profile.sample(20)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member
1227,F,40,c18f1cdfae78482aa7d67712d423c83e,2016-06-28,41000.0,1610
908,M,36,a02bb1e96a9b452cbd38a5a3c250b2c6,2018-07-21,67000.0,857
12856,F,48,8828821b64b24731a6da16806b3f50c5,2018-05-06,41000.0,933
1282,M,74,3c3f422206914e7dbc91af612257a376,2016-10-05,75000.0,1511
16703,F,57,ff138957836849ccadfb5b232a3dc1b4,2016-07-14,92000.0,1594
1809,F,69,d19849e046cc441fb7cfc4c8a68dbf6d,2016-11-15,89000.0,1470
4212,F,43,3347e6cc276f45d6b7a3b18e955f7bb9,2017-01-25,69000.0,1399
7437,M,26,2fd7a38e7640433facbad7081668f671,2017-02-05,55000.0,1388
6299,F,48,8ba88f5a415f49d6be2eb2ffa484cf56,2017-05-12,63000.0,1292
995,M,63,784bc75af56e403e9030470bd9c49fcf,2017-09-07,64000.0,1174


In [4]:
profile.to_csv('../data/cln_profile.csv')

### Cleaning portfolio.json

In [5]:
portfolio.rename(columns={'id': 'offer_id'}, inplace=True)
portfolio.set_index('offer_id', inplace=True)
portfolio.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, ae264e3637204a6fb9bb56bc8210ddfd to 2906b810c7d4411798c6938adc9daaa5
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reward      10 non-null     int64 
 1   channels    10 non-null     object
 2   difficulty  10 non-null     int64 
 3   duration    10 non-null     int64 
 4   offer_type  10 non-null     object
dtypes: int64(3), object(2)
memory usage: 480.0+ bytes


#### Unpacking channels

In [6]:
medium = portfolio['channels'].str.join(',').str.get_dummies(',')
portfolio = pd.concat([portfolio, medium], axis=1) # concat medium columns
portfolio = portfolio.drop(['channels'], axis=1)
portfolio['duration_in_hours'] = portfolio['duration']*24
portfolio

Unnamed: 0_level_0,reward,difficulty,duration,offer_type,email,mobile,social,web,duration_in_hours
offer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ae264e3637204a6fb9bb56bc8210ddfd,10,10,7,bogo,1,1,1,0,168
4d5c57ea9a6940dd891ad53e9dbe8da0,10,10,5,bogo,1,1,1,1,120
3f207df678b143eea3cee63160fa8bed,0,0,4,informational,1,1,0,1,96
9b98b8c7a33c4b65b9aebfe6a799e6d9,5,5,7,bogo,1,1,0,1,168
0b1e1539f2cc45b7b9fa7c272da2e1d7,5,20,10,discount,1,0,0,1,240
2298d6c36e964ae4a3e7e9706d1fb8c2,3,7,7,discount,1,1,1,1,168
fafdcd668e3743c1bb461111dcafc2a4,2,10,10,discount,1,1,1,1,240
5a8bc65990b245e5a138643cd4eb9837,0,0,3,informational,1,1,1,0,72
f19421c1d4aa40978ebb69ca19b0e20d,5,5,5,bogo,1,1,1,1,120
2906b810c7d4411798c6938adc9daaa5,2,10,7,discount,1,1,0,1,168


In [7]:
portfolio.to_csv('../data/cln_portfolio.csv')

### Cleaning and grouping transcript.json based on cust_id, event

In [8]:
transcript.rename(columns={'person': 'cust_id'}, inplace=True)
transcript.rename(columns={'time':'hours_till_action'},inplace=True)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   cust_id            306534 non-null  object
 1   event              306534 non-null  object
 2   value              306534 non-null  object
 3   hours_till_action  306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


#### Unpacking value 

In [9]:
transcript = pd.concat([transcript, transcript['value'].apply(pd.Series)], axis=1)
transcript['offer_id_new'] = np.where(transcript['offer id'].isnull() & transcript['offer_id'].notnull(),transcript['offer_id'],transcript['offer id'])
transcript.drop(['offer id','offer_id'],axis=1,inplace=True)
transcript.drop(['value'],axis=1,inplace=True)
transcript.rename(columns={'offer_id_new':'offer_id'},inplace=True)
transcript.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward,offer_id
92179,9e784b0f6f3c412db22ad03f7a415f31,transaction,240,15.79,,
296923,8dbfa485249f409aa223a2130f40634a,transaction,672,7.41,,
84752,2a2c5ee51e784e0bb4a956a4d36ae159,transaction,210,11.53,,
10570,c7a6bf2558554bf5954ed506eb3968f1,offer received,0,,,ae264e3637204a6fb9bb56bc8210ddfd
293771,693f27280c58443eb4177ee91d67dfb9,transaction,660,0.08,,
147241,21b02228cd4441b49bf0b5ff046f4b75,transaction,396,2.07,,
101133,d2925895f1094ecb99cf24d0a62ce1c2,transaction,276,0.22,,
242395,14ccd761d68248979e0fb857ae5e9aa5,transaction,564,11.63,,
1983,7be78ebad1ac4467bbe93813fe0b7d28,offer received,0,,,4d5c57ea9a6940dd891ad53e9dbe8da0
303424,32042db985ad426a8e3c792b1585291a,transaction,702,407.87,,


In [10]:
transcript.to_csv('../data/cln_transcript.csv')

### Creating total df

In [11]:
total = transcript.merge(portfolio,how='left',on='offer_id')
total = total.merge(profile,how='left',on='cust_id')
total.rename(columns={'reward_x':'reward_received', 'reward_y':'reward_defined'},inplace=True)
total.sample(10)

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,duration,offer_type,email,mobile,social,web,duration_in_hours,gender,age,became_member_on,income,days_as_member
6256,33019dc9074843f5be84c2bba9c2438a,offer received,0,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,F,50.0,2017-09-13,63000.0,1168.0
222711,530cc2d0fec84812acccc4214cd87fef,transaction,516,13.58,,,,,,,,,,,,M,46.0,2016-01-15,68000.0,1775.0
257243,cd8a88670e404eef9d4edc3010dd84aa,offer received,576,,,4d5c57ea9a6940dd891ad53e9dbe8da0,10.0,10.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,,,NaT,,
106613,47e925c3997c4781b36d5b16047dd4cb,transaction,306,1.03,,,,,,,,,,,,M,34.0,2018-01-07,31000.0,1052.0
276571,5231af830b174aef98790a75b95f00f3,offer completed,606,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,M,75.0,2016-06-07,97000.0,1631.0
221950,91d4029efe87429984d0152b669cea65,offer viewed,516,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,,,NaT,,
261363,e061a38b56834564937652a7c2fe75a7,offer viewed,576,,,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,,,NaT,,
83034,df1ec0d168064e74a20fb178bc9082ae,transaction,204,16.9,,,,,,,,,,,,F,42.0,2017-08-09,71000.0,1203.0
224852,1e0e4e6416164848b930f12d2719e750,offer viewed,516,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0,240.0,F,34.0,2018-01-29,31000.0,1030.0
286730,cf3bd0d77cdc487fb6a46a636b9e36ab,offer completed,636,,5.0,f19421c1d4aa40978ebb69ca19b0e20d,5.0,5.0,5.0,bogo,1.0,1.0,1.0,1.0,120.0,M,42.0,2018-06-13,65000.0,895.0


In [12]:
total[total['hours_till_action'] > total['duration_in_hours']]

Unnamed: 0,cust_id,event,hours_till_action,amount,reward_received,offer_id,reward_defined,difficulty,duration,offer_type,email,mobile,social,web,duration_in_hours,gender,age,became_member_on,income,days_as_member
36952,1ccdd48ea41247248269cb0c5805ca12,offer viewed,78,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,F,83.0,2017-12-23,99000.0,1067.0
36981,b5c5091888604fefb1219e5fa0aece97,offer viewed,78,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,M,71.0,2017-05-02,87000.0,1302.0
36987,5137dcb4eff644888b63af6dcaf8b560,offer viewed,78,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,M,59.0,2016-04-08,64000.0,1691.0
37004,422a5be8f91e4a65854b715254db72e4,offer viewed,78,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,M,48.0,2016-02-06,52000.0,1753.0
37068,2b6d8eb54c964e73aa022301f51a621a,offer viewed,78,,,5a8bc65990b245e5a138643cd4eb9837,0.0,0.0,3.0,informational,1.0,1.0,1.0,0.0,72.0,F,52.0,2017-03-16,42000.0,1349.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
306497,a6f84f4e976f44508c358cc9aba6d2b3,offer completed,714,,3.0,2298d6c36e964ae4a3e7e9706d1fb8c2,3.0,7.0,7.0,discount,1.0,1.0,1.0,1.0,168.0,,,NaT,,
306506,b895c57e8cd047a8872ce02aa54759d6,offer completed,714,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,,,NaT,,
306507,8dda575c2a1d44b9ac8e8b07b93d1f8e,offer viewed,714,,,0b1e1539f2cc45b7b9fa7c272da2e1d7,5.0,20.0,10.0,discount,1.0,0.0,0.0,1.0,240.0,F,60.0,2017-09-08,64000.0,1173.0
306509,8431c16f8e1d440880db371a68f82dd0,offer completed,714,,2.0,fafdcd668e3743c1bb461111dcafc2a4,2.0,10.0,10.0,discount,1.0,1.0,1.0,1.0,240.0,M,39.0,2018-06-27,39000.0,881.0


In [13]:
transcript.to_csv('../data/cln_transcript.csv')

### Adding features to profile

#### Total of all events

In [14]:
total['count_of_events'] = 1
count_of_events = pd.pivot_table(total, values='count_of_events', index = 'cust_id', columns='event', aggfunc=np.sum)
count_of_events.rename(columns=lambda x: "Total "+x, inplace=True)

profile_features = profile
profile_features = profile_features.merge(count_of_events,how='left',on='cust_id')

#### Individual event counts based on offer type

In [15]:
event_group = total.groupby('event')
offer_counts = dict()

for event, group in event_group:
    if event != 'transaction':        
        group['offer_counts'] = 1
        offer_counts[event] = pd.pivot_table(group, values='offer_counts', index='cust_id', columns='offer_type', aggfunc=np.sum)
        offer_counts[event].rename(columns=lambda x: event+" "+x+" count", inplace=True)
        
    else:
        offer_counts[event] = group.groupby(['cust_id'])['amount'].agg('sum').to_frame()
        offer_counts[event].rename(columns=lambda x: "Total transaction "+x, inplace=True)
    profile_features = profile_features.merge(offer_counts[event],how='left',on='cust_id')

profile_features.fillna(0, inplace=True)    
profile_features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['offer_counts'] = 1


Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,offer completed bogo count,offer completed discount count,offer received bogo count,offer received discount count,offer received informational count,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount
0,F,55,0610b486422d4921ae7d2bf64640c50b,2017-07-15,112000.0,1228,1.0,2.0,0.0,3.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,77.01
1,F,75,78afa995795e4d85b5d9ceeca43f5fef,2017-05-09,100000.0,1295,3.0,4.0,4.0,7.0,3.0,0.0,3.0,0.0,1.0,3.0,0.0,1.0,159.27
2,M,68,e2127556f4f64592b11af22de27a7932,2018-04-26,70000.0,943,2.0,4.0,3.0,3.0,1.0,1.0,1.0,2.0,1.0,1.0,2.0,0.0,57.73
3,M,65,389bc3fa690240e798340f5a15918d5c,2018-02-09,53000.0,1019,5.0,6.0,6.0,3.0,3.0,2.0,4.0,2.0,0.0,4.0,2.0,0.0,36.43
4,M,58,2eeac8d8feae4a8cad5a6af0499a211d,2017-11-11,51000.0,1109,1.0,3.0,2.0,4.0,0.0,1.0,0.0,2.0,1.0,0.0,2.0,0.0,15.62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14820,F,45,6d5f3a774f3d4714ab0c092238f3a1d7,2018-06-04,54000.0,904,0.0,3.0,3.0,7.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,20.03
14821,M,61,2cb4f97358b841b9a9773a7aa05a9d77,2018-07-13,72000.0,865,1.0,3.0,1.0,7.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,1.0,25.97
14822,M,49,01d26f638c274aa0b965d24cefe3183f,2017-01-26,73000.0,1398,0.0,3.0,1.0,8.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,39.74
14823,F,83,9dc1421481194dcd9400aec7c9ae6366,2016-03-07,50000.0,1723,3.0,3.0,3.0,14.0,3.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,189.67


In [19]:
profile_features['total_view_ratio'] = round(profile_features['Total offer viewed']/profile_features['Total offer received'], 2)
profile_features['total_completion_ratio'] = round(profile_features['Total offer completed']/profile_features['Total offer received'], 3)
profile_features['avg_transaction_amt'] = round(profile_features['Total transaction amount']/profile_features['Total transaction'], 2)
profile_features['transactions_per_membership_days'] = profile_features['Total transaction']/profile_features['days_as_member']
profile_features.fillna(0, inplace=True) 
profile_features.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,Total offer completed,Total offer received,Total offer viewed,Total transaction,...,offer received discount count,offer received informational count,offer viewed bogo count,offer viewed discount count,offer viewed informational count,Total transaction amount,total_view_ratio,total_completion_ratio,avg_transaction_amt,transactions_per_membership_days
8098,F,67,084afe9e774d4e54b5ccaf786cd29f67,2016-08-20,52000.0,1557,2.0,5.0,5.0,8.0,...,1.0,3.0,1.0,1.0,3.0,93.49,1.0,0.4,11.69,0.005138
1035,F,70,b4c29d77c6d045698bf125545d1520f8,2016-07-04,87000.0,1604,2.0,4.0,3.0,2.0,...,1.0,1.0,2.0,0.0,1.0,52.24,0.75,0.5,26.12,0.001247
7852,O,36,378a544efddc47609c80a9e825c285b5,2018-03-08,88000.0,992,2.0,5.0,5.0,2.0,...,1.0,1.0,3.0,1.0,1.0,61.69,1.0,0.4,30.84,0.002016
3582,M,40,1f72aa042368414d90fb3de6801238ac,2016-09-16,70000.0,1530,5.0,6.0,6.0,13.0,...,4.0,1.0,1.0,4.0,1.0,251.27,1.0,0.833,19.33,0.008497
172,M,61,3f279ee9fd0d4195b0b9574ce0ae6923,2017-09-26,82000.0,1155,1.0,3.0,1.0,3.0,...,1.0,1.0,1.0,0.0,0.0,62.65,0.33,0.333,20.88,0.002597
10355,F,56,5902c7eb48c4438c90b17439f2f6988c,2015-12-17,76000.0,1804,1.0,4.0,4.0,6.0,...,0.0,3.0,1.0,0.0,3.0,126.42,1.0,0.25,21.07,0.003326
1296,M,89,7d66deb1596942dbb626b4f31a4b44f8,2018-06-18,82000.0,890,1.0,3.0,2.0,1.0,...,2.0,1.0,0.0,2.0,0.0,18.08,0.67,0.333,18.08,0.001124
11689,F,51,f105b8f61dda45739cd5b0d64807ec0a,2016-05-20,118000.0,1649,2.0,3.0,1.0,7.0,...,1.0,0.0,1.0,0.0,0.0,196.05,0.33,0.667,28.01,0.004245
2525,F,82,715b22238df84b9fbad9485f23192c4d,2018-03-25,75000.0,975,3.0,4.0,2.0,3.0,...,1.0,0.0,2.0,0.0,0.0,58.63,0.5,0.75,19.54,0.003077
10598,M,68,df6c3af2b21049fe9e676c9599ccc0ef,2013-09-06,93000.0,2636,2.0,4.0,4.0,8.0,...,1.0,2.0,1.0,1.0,2.0,190.49,1.0,0.5,23.81,0.003035


In [20]:
len(profile_features[profile_features['Total offer viewed'] < profile_features['Total offer completed']])

2005

In [21]:
profile_features.to_csv('../data/xtr_profile.csv')