# Data preprocessing

## Importing required libraries and reading the input json files:

In [1]:
import os, datetime, json
import pandas as pd

In [2]:
transcript = pd.read_json('../data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('../data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/profile.json', orient='records', lines=True)

## Data cleansing operations

### Cleaning profile.json

In [3]:
profile.dropna(axis=0,inplace=True)
profile.rename(columns={'id': 'cust_id'}, inplace=True)
profile['became_member_on'] = pd.to_datetime(profile['became_member_on'].astype(str), format='%Y%m%d')
profile['days_as_member'] = (datetime.datetime.today() - profile['became_member_on']).dt.days
profile.sample(20)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member
676,F,48,5647967694bb4af49ea07dd2dc835ea8,2015-09-23,87000.0,1885
1650,M,32,ed67dd8bc29b455ab00407fa2500b7d6,2015-05-01,47000.0,2030
4367,M,46,530cc2d0fec84812acccc4214cd87fef,2016-01-15,68000.0,1771
8290,M,39,1b7a81560a7c489fb80865bcc4e10ddd,2013-10-15,36000.0,2593
3702,F,63,fd45aee3900f4b3c88a35b53d0bf630b,2018-03-28,65000.0,968
12319,M,64,072ee847d4c348db9c24669902dc6abb,2018-05-28,32000.0,907
10613,M,55,3a53a6aa0b7642be9f387c9573a136fd,2015-09-21,35000.0,1887
15353,M,22,5cbb77c95e4e4371a82ac774d9ad35d5,2018-05-19,39000.0,916
955,F,89,44302ac6a6c34a55aec6de40b61e286f,2017-09-02,81000.0,1175
5120,O,42,19210a70016c4ea98606657cd0c55d8b,2016-10-27,58000.0,1485


### Cleaning portfolio.json

In [4]:
portfolio.rename(columns={'id': 'offer_id'}, inplace=True)
portfolio.set_index('offer_id', inplace=True)
offer_dict = portfolio.to_dict(orient='index')
offer_dict

{'ae264e3637204a6fb9bb56bc8210ddfd': {'reward': 10,
  'channels': ['email', 'mobile', 'social'],
  'difficulty': 10,
  'duration': 7,
  'offer_type': 'bogo'},
 '4d5c57ea9a6940dd891ad53e9dbe8da0': {'reward': 10,
  'channels': ['web', 'email', 'mobile', 'social'],
  'difficulty': 10,
  'duration': 5,
  'offer_type': 'bogo'},
 '3f207df678b143eea3cee63160fa8bed': {'reward': 0,
  'channels': ['web', 'email', 'mobile'],
  'difficulty': 0,
  'duration': 4,
  'offer_type': 'informational'},
 '9b98b8c7a33c4b65b9aebfe6a799e6d9': {'reward': 5,
  'channels': ['web', 'email', 'mobile'],
  'difficulty': 5,
  'duration': 7,
  'offer_type': 'bogo'},
 '0b1e1539f2cc45b7b9fa7c272da2e1d7': {'reward': 5,
  'channels': ['web', 'email'],
  'difficulty': 20,
  'duration': 10,
  'offer_type': 'discount'},
 '2298d6c36e964ae4a3e7e9706d1fb8c2': {'reward': 3,
  'channels': ['web', 'email', 'mobile', 'social'],
  'difficulty': 7,
  'duration': 7,
  'offer_type': 'discount'},
 'fafdcd668e3743c1bb461111dcafc2a4': {'r

In [5]:
portfolio.to_csv('../data/cln_portfolio.csv')

### Cleaning and grouping transcript.json based on cust_id, event

In [6]:
transcript.rename(columns={'person': 'cust_id'}, inplace=True)
transcript['days'] = round(transcript['time']/24, 3)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   cust_id  306534 non-null  object 
 1   event    306534 non-null  object 
 2   value    306534 non-null  object 
 3   time     306534 non-null  int64  
 4   days     306534 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ MB


#### Creating offer expired label

In [7]:
completes = transcript.loc[(transcript['event'] == "offer completed")]

completes['offer_completed_id'] = completes['value'].apply(lambda x: list(x.values())[0])
completes['offer_completed_reward'] = completes['value'].apply(lambda x: list(x.values())[1])
completes['offer_duration'] = completes['offer_completed_id'].apply(lambda x: offer_dict[x]['duration'])


indexes = completes[completes['days'] > completes['offer_duration']].index
transcript['event'][indexes] = "offer expired"


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_completed_id'] = completes['value'].apply(lambda x: list(x.values())[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_completed_reward'] = completes['value'].apply(lambda x: list(x.values())[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_dura

In [8]:
transcript['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer expired       27698
offer completed      5881
Name: event, dtype: int64

In [9]:
transcript.to_csv('../data/cln_transcript.csv')

In [10]:
for label, group in transcript.groupby(['cust_id', 'event']):
    custid = label[0]
    event_name = label[1]
    if (event_name == "offer completed"):
        profile.loc[profile['cust_id'] == custid, 'offers_completed'] = len(group)
    elif (event_name == "offer received"):
        profile.loc[profile['cust_id'] == custid, 'offers_received'] = len(group)
    elif (event_name == "offer viewed"):
        profile.loc[profile['cust_id'] == custid, 'offers_viewed'] = len(group)

In [11]:
profile.fillna(0, inplace=True)
profile[profile['offers_viewed'] < profile['offers_completed']]

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,offers_received,offers_viewed,offers_completed
684,F,53,05beec1cbeb8481d9bfb5844343ba374,2015-12-14,83000.0,1803,5.0,1.0,2.0
793,F,49,b860d355ef6e4c66b5d5a837c56ef32d,2017-12-26,110000.0,1060,4.0,0.0,1.0
989,F,88,7eb42a9d454c4cb8b18d363026192959,2017-03-25,71000.0,1336,5.0,0.0,1.0
1116,F,58,b16af11771c84527b9f6ba177f33b661,2016-09-06,77000.0,1536,5.0,1.0,2.0
1476,F,51,3dde94fa581145cb9f206624f1a94d5a,2017-11-14,56000.0,1102,4.0,1.0,2.0
3434,F,65,0a43626c07724771836f653098a19ec8,2017-08-10,112000.0,1198,4.0,1.0,2.0
4082,M,78,c10dcc31b83b419d8e577f4edad59600,2018-07-25,103000.0,849,4.0,1.0,2.0
5241,M,55,f3d42b8e20a94d4ea5f6d1efce18b2c2,2015-03-23,34000.0,2069,2.0,1.0,2.0
5788,F,61,6fae0fe809b34a45bd87deb5e317adec,2018-05-26,77000.0,909,4.0,0.0,1.0
10213,M,28,0aea37c300ba4262b523881c7112484d,2015-04-20,74000.0,2041,3.0,0.0,1.0


In [12]:
len(profile[profile['offers_viewed'] < profile['offers_completed']])

24

In [13]:
profile['view_ratio'] = round(profile['offers_viewed']/profile['offers_received'], 2)
profile['completion_ratio'] = round(profile['offers_completed']/profile['offers_received'], 3)
profile.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,days_as_member,offers_received,offers_viewed,offers_completed,view_ratio,completion_ratio
3591,F,92,f94d9d9d979649799c38a27971dc9ca5,2017-12-07,37000.0,1079,5.0,4.0,1.0,0.8,0.2
4219,F,56,97784e4285e24be886c6dad4fb111df2,2018-03-31,98000.0,965,5.0,4.0,1.0,0.8,0.2
3364,F,28,f80eb9caef944ca899f2bef52b6c2f5e,2017-11-11,58000.0,1105,5.0,5.0,0.0,1.0,0.0
9905,M,31,39af105b5ff94f609fcd37297911791d,2016-11-10,46000.0,1471,4.0,4.0,1.0,1.0,0.25
10948,M,28,c20f50e786bc4ee7951bffd091a171a7,2017-09-17,43000.0,1160,5.0,5.0,1.0,1.0,0.2
6641,F,49,a8dc1ad7fda243edad2b45982dd541bb,2018-01-09,73000.0,1046,6.0,4.0,0.0,0.67,0.0
16560,M,59,f34df6b46f3f42c78f9ef4e9ab7b6fe4,2018-04-15,47000.0,950,3.0,2.0,0.0,0.67,0.0
5237,F,60,cec6feb9507e4fb3b152745e779d5660,2017-08-03,58000.0,1205,5.0,5.0,0.0,1.0,0.0
4692,M,56,eb026cdc87f0460f8959c0198b89af12,2018-04-02,60000.0,963,4.0,3.0,0.0,0.75,0.0
7341,F,47,6e23a5486c6f424ba3ea13019f6a5806,2018-01-20,93000.0,1035,5.0,5.0,0.0,1.0,0.0


In [14]:
profile.to_csv('../data/cln_profile.csv')