# Data preprocessing

## Importing required libraries and reading the input json files:

In [1]:
import os, datetime, json
import pandas as pd

In [2]:
transcript = pd.read_json('../data/transcript.json', orient='records', lines=True)
portfolio = pd.read_json('../data/portfolio.json', orient='records', lines=True)
profile = pd.read_json('../data/profile.json', orient='records', lines=True)

## Data cleansing operations

### Cleaning profile.json

In [3]:
profile.dropna(axis=0,inplace=True)
profile.rename(columns={'id': 'cust_id'}, inplace=True)
profile.head()

Unnamed: 0,gender,age,cust_id,became_member_on,income
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
5,M,68,e2127556f4f64592b11af22de27a7932,20180426,70000.0
8,M,65,389bc3fa690240e798340f5a15918d5c,20180209,53000.0
12,M,58,2eeac8d8feae4a8cad5a6af0499a211d,20171111,51000.0


### Cleaning portfolio.json

In [4]:
portfolio.rename(columns={'id': 'offer_id'}, inplace=True)
portfolio.set_index('offer_id', inplace=True)
offer_dict = portfolio.to_dict(orient='index')
offer_dict

{'ae264e3637204a6fb9bb56bc8210ddfd': {'reward': 10,
  'channels': ['email', 'mobile', 'social'],
  'difficulty': 10,
  'duration': 7,
  'offer_type': 'bogo'},
 '4d5c57ea9a6940dd891ad53e9dbe8da0': {'reward': 10,
  'channels': ['web', 'email', 'mobile', 'social'],
  'difficulty': 10,
  'duration': 5,
  'offer_type': 'bogo'},
 '3f207df678b143eea3cee63160fa8bed': {'reward': 0,
  'channels': ['web', 'email', 'mobile'],
  'difficulty': 0,
  'duration': 4,
  'offer_type': 'informational'},
 '9b98b8c7a33c4b65b9aebfe6a799e6d9': {'reward': 5,
  'channels': ['web', 'email', 'mobile'],
  'difficulty': 5,
  'duration': 7,
  'offer_type': 'bogo'},
 '0b1e1539f2cc45b7b9fa7c272da2e1d7': {'reward': 5,
  'channels': ['web', 'email'],
  'difficulty': 20,
  'duration': 10,
  'offer_type': 'discount'},
 '2298d6c36e964ae4a3e7e9706d1fb8c2': {'reward': 3,
  'channels': ['web', 'email', 'mobile', 'social'],
  'difficulty': 7,
  'duration': 7,
  'offer_type': 'discount'},
 'fafdcd668e3743c1bb461111dcafc2a4': {'r

In [5]:
portfolio.to_csv('../data/cln_portfolio.csv')

### Cleaning and grouping transcript.json based on cust_id, event

In [6]:
transcript.rename(columns={'person': 'cust_id'}, inplace=True)
transcript['days'] = round(transcript['time']/24, 3)
transcript.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   cust_id  306534 non-null  object 
 1   event    306534 non-null  object 
 2   value    306534 non-null  object 
 3   time     306534 non-null  int64  
 4   days     306534 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 11.7+ MB


#### Creating offer expired label

In [7]:
completes = transcript.loc[(transcript['event'] == "offer completed")]

completes['offer_completed_id'] = completes['value'].apply(lambda x: list(x.values())[0])
completes['offer_completed_reward'] = completes['value'].apply(lambda x: list(x.values())[1])
completes['offer_duration'] = completes['offer_completed_id'].apply(lambda x: offer_dict[x]['duration'])


indexes = completes.loc[completes['days'] > completes['offer_duration']].index
transcript['event'][indexes] = "offer expired"


#for val in completes[completes['days'] > completes['offer_duration']].index.values:
#    transcript['event'][val] = "offer expired"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_completed_id'] = completes['value'].apply(lambda x: list(x.values())[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_completed_reward'] = completes['value'].apply(lambda x: list(x.values())[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  completes['offer_dura

In [8]:
transcript['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer expired       27698
offer completed      5881
Name: event, dtype: int64

In [9]:
transcript.to_csv('../data/cln_transcript.csv')

In [10]:
for label, group in transcript.groupby(['cust_id', 'event']):
    custid = label[0]
    event_name = label[1]
    if (event_name == "offer completed"):
        profile.loc[profile['cust_id'] == custid, 'offers_completed'] = len(group)
    elif (event_name == "offer received"):
        profile.loc[profile['cust_id'] == custid, 'offers_received'] = len(group)
    elif (event_name == "offer viewed"):
        profile.loc[profile['cust_id'] == custid, 'offers_viewed'] = len(group)

In [11]:
profile.fillna(0, inplace=True)
profile[profile['offers_viewed'] < profile['offers_completed']]

In [13]:
len(profile[profile['offers_viewed'] < profile['offers_completed']])

24

In [15]:
profile['view_ratio'] = round(profile['offers_viewed']/profile['offers_received'], 2)
profile['completion_ratio'] = round(profile['offers_completed']/profile['offers_received'], 3)
profile.sample(10)

Unnamed: 0,gender,age,cust_id,became_member_on,income,offers_received,offers_viewed,offers_completed,view_ratio,completion_ratio
1396,F,58,ab2cb6b756004af9b12787be834a3e63,20170408,93000.0,5.0,4.0,0.0,0.8,0.0
9751,M,55,4d4216b868fe43ddb9c9f0b77212c0cb,20180302,56000.0,6.0,6.0,2.0,1.0,0.333
13979,M,76,a0b6b36c5f2b445680b119b488e84df3,20160627,81000.0,5.0,5.0,0.0,1.0,0.0
14706,M,61,9292d9ed365349babd237a99f004ddf7,20160502,38000.0,5.0,3.0,1.0,0.6,0.2
8865,F,22,c1026690b6f14bdeb431557ae9ef855e,20140612,73000.0,5.0,4.0,1.0,0.8,0.2
9687,F,52,72badd9799484051b932f217a75f11b5,20171106,92000.0,3.0,2.0,0.0,0.67,0.0
16353,F,78,27b2dc1b194f4451b0966de7a29c55a9,20171130,78000.0,4.0,2.0,1.0,0.5,0.25
8057,F,72,434895bf1bd14ee4b064961958b9d1a4,20180118,109000.0,4.0,3.0,0.0,0.75,0.0
10124,M,64,882346bfc610473c8505199a5179d302,20170910,69000.0,3.0,3.0,0.0,1.0,0.0
7595,F,62,67270b961f1c422e991373e75a4f2f40,20140922,99000.0,5.0,3.0,1.0,0.6,0.2


In [16]:
profile.to_csv('../data/cln_profile.csv')