<a href="https://colab.research.google.com/github/nthammadi-uncc/StarbucksPromotionAnalysis/blob/main/Jupyter%20Notebooks/Exploratory_Data_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Exploratory Data Analysis

### Load necessary libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Read data files

In [2]:
# Build the URL of each raw CSV file from the repository's raw-data folder
parent_url = 'https://raw.githubusercontent.com/nthammadi-uncc/StarbucksPromotionAnalysis/main/data/raw/'
portfolio_url = f'{parent_url}portfolio.csv'
profile_url = f'{parent_url}profile.csv'
transcript_url = f'{parent_url}transcript.csv'

# Load each file into a dataframe and discard its first column
# (presumably a row index saved with the CSV; verify against the raw files)
portfolio_df = pd.read_csv(portfolio_url).iloc[:, 1:]
profile_df = pd.read_csv(profile_url).iloc[:, 1:]
transcript_df = pd.read_csv(transcript_url).iloc[:, 1:]

### Feature Engineering

##### Offer Portfolio

In [3]:
portfolio_df.head(20)

Unnamed: 0,reward,channels,difficulty,duration,offer_type,id
0,10,"['email', 'mobile', 'social']",10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd
1,10,"['web', 'email', 'mobile', 'social']",10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0
2,0,"['web', 'email', 'mobile']",0,4,informational,3f207df678b143eea3cee63160fa8bed
3,5,"['web', 'email', 'mobile']",5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9
4,5,"['web', 'email']",20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7
5,3,"['web', 'email', 'mobile', 'social']",7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2
6,2,"['web', 'email', 'mobile', 'social']",10,10,discount,fafdcd668e3743c1bb461111dcafc2a4
7,0,"['email', 'mobile', 'social']",0,3,informational,5a8bc65990b245e5a138643cd4eb9837
8,5,"['web', 'email', 'mobile', 'social']",5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d
9,2,"['web', 'email', 'mobile']",10,7,discount,2906b810c7d4411798c6938adc9daaa5


#### Since every offer is sent over email, the email channel does not distinguish one offer from another, so we create indicator columns only for the web, mobile and social channels

In [4]:
# Add one 0/1 indicator column per channel: 1 when the channel name appears
# in the offer's `channels` text, 0 otherwise
for channel_name in ('web', 'mobile', 'social'):
    uses_channel = portfolio_df['channels'].str.contains(channel_name)
    portfolio_df[f'{channel_name}_channel'] = np.where(uses_channel, 1, 0)

# `channels` is fully described by the indicator columns now, so remove it
portfolio_df.drop(columns=['channels'], inplace=True)

In [5]:
portfolio_df.head(20)

Unnamed: 0,reward,difficulty,duration,offer_type,id,web_channel,mobile_channel,social_channel
0,10,10,7,bogo,ae264e3637204a6fb9bb56bc8210ddfd,0,1,1
1,10,10,5,bogo,4d5c57ea9a6940dd891ad53e9dbe8da0,1,1,1
2,0,0,4,informational,3f207df678b143eea3cee63160fa8bed,1,1,0
3,5,5,7,bogo,9b98b8c7a33c4b65b9aebfe6a799e6d9,1,1,0
4,5,20,10,discount,0b1e1539f2cc45b7b9fa7c272da2e1d7,1,0,0
5,3,7,7,discount,2298d6c36e964ae4a3e7e9706d1fb8c2,1,1,1
6,2,10,10,discount,fafdcd668e3743c1bb461111dcafc2a4,1,1,1
7,0,0,3,informational,5a8bc65990b245e5a138643cd4eb9837,0,1,1
8,5,5,5,bogo,f19421c1d4aa40978ebb69ca19b0e20d,1,1,1
9,2,10,7,discount,2906b810c7d4411798c6938adc9daaa5,1,1,0


##### User Profiles

In [6]:
profile_df.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,20170212,
1,F,55,0610b486422d4921ae7d2bf64640c50b,20170715,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,20180712,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,20170509,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,20170804,


In [7]:
# check the type on columns
profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  int64  
 4   income            14825 non-null  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 664.2+ KB


`became_member_on` is stored as an integer in YYYYMMDD form (for example 20170212). Convert it to a readable MM/DD/YYYY date string.

In [8]:
# Parse the integer YYYYMMDD values (e.g. 20170212) and render them as
# MM/DD/YYYY strings. The column stays a string (object) column, exactly as
# before; only the implementation changed: vectorised pandas calls replace the
# two element-wise applymap passes (DataFrame.applymap is deprecated in pandas 2.1).
profile_df['became_member_on'] = (
    pd.to_datetime(profile_df['became_member_on'].astype(str), format='%Y%m%d')
    .dt.strftime('%m/%d/%Y')
)

In [9]:
# check the type on columns
profile_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            14825 non-null  object 
 1   age               17000 non-null  int64  
 2   id                17000 non-null  object 
 3   became_member_on  17000 non-null  object 
 4   income            14825 non-null  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 664.2+ KB


In [10]:
profile_df.head()

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,02/12/2017,
1,F,55,0610b486422d4921ae7d2bf64640c50b,07/15/2017,112000.0
2,,118,38fe809add3b4fcf9315a9694bb96ff5,07/12/2018,
3,F,75,78afa995795e4d85b5d9ceeca43f5fef,05/09/2017,100000.0
4,,118,a03223e636434f42ac4c3df47e8bac43,08/04/2017,


In [11]:
# check for duplicates
profile_df[profile_df.duplicated()]

Unnamed: 0,gender,age,id,became_member_on,income


In [12]:
#check for null values in each column
profile_df.isnull().sum()

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

In [13]:
profile_df.describe()

Unnamed: 0,age,income
count,17000.0,14825.0
mean,62.531412,65404.991568
std,26.73858,21598.29941
min,18.0,30000.0
25%,45.0,49000.0
50%,58.0,64000.0
75%,73.0,80000.0
max,118.0,120000.0


In [14]:
profile_df['age'].value_counts().sort_index()

18       70
19      135
20      135
21      140
22      131
       ... 
98        5
99        5
100      12
101       5
118    2175
Name: age, Length: 85, dtype: int64

In [15]:
profile_df[profile_df['age']==100].head(15)

Unnamed: 0,gender,age,id,became_member_on,income
283,F,100,24c1287eb7a84cfd80472a82e05b1d57,08/28/2015,63000.0
1171,F,100,28bbebb2b76f4057b1115dd80441e37a,09/05/2017,118000.0
1612,F,100,857fc6e18cf74f7b8ebce26bbfbd3028,01/23/2018,109000.0
4277,M,100,b12e8e0f14ae4ad0b576f7c016e7e89b,09/11/2015,98000.0
6061,F,100,9b675a966bd040eeac75f9aa01b14773,04/10/2018,70000.0
6902,F,100,85f4e5ea5c0044619482bfe16ae32c01,12/31/2017,71000.0
7129,F,100,be79870ff776461d97e1aee515269644,11/26/2017,47000.0
10939,O,100,9eb812054d564bdba180dc365f186e23,06/29/2016,83000.0
13120,F,100,edbb23d321174301a15049a09df4072b,05/07/2017,71000.0
15164,F,100,3b34370727654cfca5322bca2aba9ffd,03/30/2018,96000.0


In [16]:
profile_df[profile_df['age']==101].head(15)

Unnamed: 0,gender,age,id,became_member_on,income
1556,F,101,4d2ccfcbbebf4bd9baf4b7e433d0e288,10/04/2017,43000.0
4100,F,101,d2fdc2be8ab64e4ba04830d441e53fd5,05/26/2015,99000.0
14846,F,101,e0ea90ddd2f147e082d21e97f51ec1b1,11/09/2017,56000.0
15800,F,101,047ad0135cfe4c0ea5ba019da4de9c52,03/09/2017,59000.0
16864,F,101,1593d617fac246ef8e50dbb0ffd77f5f,11/27/2017,82000.0


In [17]:
profile_df[profile_df['age']>101].head(10)

Unnamed: 0,gender,age,id,became_member_on,income
0,,118,68be06ca386d4c31939f3a4f0e3dd783,02/12/2017,
2,,118,38fe809add3b4fcf9315a9694bb96ff5,07/12/2018,
4,,118,a03223e636434f42ac4c3df47e8bac43,08/04/2017,
6,,118,8ec6ce2a7e7949b1bf142def7d0e0586,09/25/2017,
7,,118,68617ca6246f4fbc85e91a2a49552598,10/02/2017,
9,,118,8974fc5686fe429db53ddde067b88302,11/22/2016,
10,,118,c4863c7985cf408faee930f111475da3,08/24/2017,
11,,118,148adfcaa27d485b82f323aaaad036bd,09/19/2015,
17,,118,744d603ef08c4f33af5a61c8c7628d1c,08/01/2017,
23,,118,2b826eba31074a059d63b0ae8f50b7d5,09/07/2017,


In [18]:
profile_df[profile_df['age']>101].describe()

Unnamed: 0,age,income
count,2175.0,0.0
mean,118.0,
std,0.0,
min,118.0,
25%,118.0,
50%,118.0,
75%,118.0,
max,118.0,


In [19]:
profile_df[profile_df['age']>101].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2175 entries, 0 to 16994
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            0 non-null      object 
 1   age               2175 non-null   int64  
 2   id                2175 non-null   object 
 3   became_member_on  2175 non-null   object 
 4   income            0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 102.0+ KB


In [20]:
#confirming null values check
profile_df[profile_df['age']>101].isnull().sum()

gender              2175
age                    0
id                     0
became_member_on       0
income              2175
dtype: int64

#### An age of 118 appears 2,175 times in the dataset, and every one of those rows is also missing gender and income. This is most likely a placeholder or data entry error rather than a real age, so we move these rows into a separate dataframe to analyze later.

In [21]:
# Rows with age 118 carry no gender or income (see the checks above), so they
# are set aside. Build the mask once and use it and its complement, so the two
# frames are an exact partition of profile_df. The previous pair of conditions
# (age > 101 for one frame, age < 118 for the other) would have put any age
# between 102 and 117 into both frames.
placeholder_age_mask = profile_df['age'] >= 118

# .copy() makes both results independent frames rather than views of the original
error_profile_df = profile_df[placeholder_age_mask].copy()
profile_df = profile_df[~placeholder_age_mask].copy()

In [22]:
print(error_profile_df.shape,profile_df.shape)

(2175, 5) (14825, 5)


##### Offer Transactions

In [23]:
transcript_df.head()

Unnamed: 0,person,event,value,time
0,78afa995795e4d85b5d9ceeca43f5fef,offer received,{'offer id': '9b98b8c7a33c4b65b9aebfe6a799e6d9'},0
1,a03223e636434f42ac4c3df47e8bac43,offer received,{'offer id': '0b1e1539f2cc45b7b9fa7c272da2e1d7'},0
2,e2127556f4f64592b11af22de27a7932,offer received,{'offer id': '2906b810c7d4411798c6938adc9daaa5'},0
3,8ec6ce2a7e7949b1bf142def7d0e0586,offer received,{'offer id': 'fafdcd668e3743c1bb461111dcafc2a4'},0
4,68617ca6246f4fbc85e91a2a49552598,offer received,{'offer id': '4d5c57ea9a6940dd891ad53e9dbe8da0'},0


In [24]:
# check the type on columns
transcript_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306534 entries, 0 to 306533
Data columns (total 4 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   person  306534 non-null  object
 1   event   306534 non-null  object
 2   value   306534 non-null  object
 3   time    306534 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 9.4+ MB


In [25]:
#check for null values
transcript_df.isnull().sum()

person    0
event     0
value     0
time      0
dtype: int64

In [26]:
transcript_df['event'].value_counts()

transaction        138953
offer received      76277
offer viewed        57725
offer completed     33579
Name: event, dtype: int64

#### Since each event type represents a different customer reaction to an offer, we separate the events into different dataframes

##### Transactions

In [27]:
# Keep only the purchase events; the copy leaves transcript_df untouched
is_transaction = transcript_df['event'].eq('transaction')
transaction_df = transcript_df.loc[is_transaction].copy()

# every remaining row is a transaction, so the event column adds nothing
transaction_df.drop(columns=['event'], inplace=True)
transaction_df.head()

Unnamed: 0,person,value,time
12654,02c083884c7d45b39cc68e1314fec56c,{'amount': 0.8300000000000001},0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,{'amount': 34.56},0
12659,54890f68699049c2a04d415abc25e717,{'amount': 13.23},0
12670,b2f1cd155b864803ad8334cdf13c4bd2,{'amount': 19.51},0
12671,fe97aa22dd3e48c8b143116a8403dd52,{'amount': 18.97},0


In [28]:
# Each value looks like "{'amount': 34.56}". Extract the number that follows
# the 'amount' key and store it as a float. A single vectorised regex replaces
# the chain of five element-wise applymap passes (DataFrame.applymap is
# deprecated in pandas 2.1) and no longer depends on splitting at ': '.
transaction_df['value'] = (
    transaction_df['value']
    .astype(str)
    .str.extract(r"'amount'\s*:\s*([^,}]+)", expand=False)
    .astype(float)
)
transaction_df.rename(columns={'value': 'transaction_amount'}, inplace=True)

# A value without an 'amount' entry would become NaN; fail loudly instead
assert transaction_df['transaction_amount'].notna().all(), "some transactions have no parsable amount"
transaction_df.head()

Unnamed: 0,person,transaction_amount,time
12654,02c083884c7d45b39cc68e1314fec56c,0.83,0
12657,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,34.56,0
12659,54890f68699049c2a04d415abc25e717,13.23,0
12670,b2f1cd155b864803ad8334cdf13c4bd2,19.51,0
12671,fe97aa22dd3e48c8b143116a8403dd52,18.97,0


In [29]:
transaction_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138953 entries, 12654 to 306533
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   person              138953 non-null  object 
 1   transaction_amount  138953 non-null  float64
 2   time                138953 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 4.2+ MB


In [30]:
# check for duplicates
transaction_df[transaction_df.duplicated()]

Unnamed: 0,person,transaction_amount,time


#### Function to transform offer dataframes

In [31]:
def transform_offer_data(df):
    """Reshape an offer-event frame in place: drop `event`, turn `value` into `offer_id`.

    The frame is modified in place and nothing is returned, so pass a copy when
    the source frame must stay untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an `event` column and a `value` column. Each `value` entry
        is a dict (or its string form) that stores the offer under the key
        'offer id' or 'offer_id', e.g. "{'offer id': '9b98...'}". Other keys may
        appear before or after it.

    Raises
    ------
    ValueError
        If any `value` entry contains no offer id. The frame is left unchanged
        in that case because parsing happens before any column is touched.
    """
    # Look the id up by its key instead of taking whatever follows the first
    # ': ', which returned the wrong token whenever another key came first.
    offer_ids = (
        df['value']
        .astype(str)
        .str.extract(r"'offer[ _]id'\s*:\s*'([^']*)'", expand=False)
    )

    unparsed = offer_ids.isna()
    if unparsed.any():
        raise ValueError(
            f"{int(unparsed.sum())} value entries contain no offer id "
            f"(first offending index: {unparsed.idxmax()!r})"
        )

    # drop the event column as it is redundant once the frame holds one event type
    df.drop(columns=['event'], inplace=True)
    # overwrite in place, then rename, so the column keeps its original position
    df['value'] = offer_ids
    df.rename(columns={'value': 'offer_id'}, inplace=True)

##### Offer Received

In [32]:
# Select the 'offer received' events. The copy keeps transcript_df untouched,
# because transform_offer_data edits the frame it is given in place.
received_rows = transcript_df['event'].eq('offer received')
offer_received_df = transcript_df.loc[received_rows].copy()
transform_offer_data(offer_received_df)
offer_received_df.head()

Unnamed: 0,person,offer_id,time
0,78afa995795e4d85b5d9ceeca43f5fef,9b98b8c7a33c4b65b9aebfe6a799e6d9,0
1,a03223e636434f42ac4c3df47e8bac43,0b1e1539f2cc45b7b9fa7c272da2e1d7,0
2,e2127556f4f64592b11af22de27a7932,2906b810c7d4411798c6938adc9daaa5,0
3,8ec6ce2a7e7949b1bf142def7d0e0586,fafdcd668e3743c1bb461111dcafc2a4,0
4,68617ca6246f4fbc85e91a2a49552598,4d5c57ea9a6940dd891ad53e9dbe8da0,0


In [33]:
offer_received_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 76277 entries, 0 to 257886
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   person    76277 non-null  object
 1   offer_id  76277 non-null  object
 2   time      76277 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


In [34]:
# check for duplicates
offer_received_df[offer_received_df.duplicated()]

Unnamed: 0,person,offer_id,time


##### Offer Viewed

In [35]:
# Select the 'offer viewed' events. The copy keeps transcript_df untouched,
# because transform_offer_data edits the frame it is given in place.
viewed_rows = transcript_df['event'].eq('offer viewed')
offer_viewed_df = transcript_df.loc[viewed_rows].copy()
transform_offer_data(offer_viewed_df)
offer_viewed_df.head()

Unnamed: 0,person,offer_id,time
12650,389bc3fa690240e798340f5a15918d5c,f19421c1d4aa40978ebb69ca19b0e20d,0
12651,d1ede868e29245ea91818a903fec04c6,5a8bc65990b245e5a138643cd4eb9837,0
12652,102e9454054946fda62242d2e176fdce,4d5c57ea9a6940dd891ad53e9dbe8da0,0
12653,02c083884c7d45b39cc68e1314fec56c,ae264e3637204a6fb9bb56bc8210ddfd,0
12655,be8a5d1981a2458d90b255ddc7e0d174,5a8bc65990b245e5a138643cd4eb9837,0


In [36]:
offer_viewed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57725 entries, 12650 to 306507
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   person    57725 non-null  object
 1   offer_id  57725 non-null  object
 2   time      57725 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.8+ MB


In [37]:
# check for duplicates
offer_viewed_df[offer_viewed_df.duplicated()]

Unnamed: 0,person,offer_id,time


##### Offer Completed

In [38]:
# Select the 'offer completed' events. The copy keeps transcript_df untouched,
# because transform_offer_data edits the frame it is given in place.
completed_rows = transcript_df['event'].eq('offer completed')
offer_completed_df = transcript_df.loc[completed_rows].copy()
transform_offer_data(offer_completed_df)
offer_completed_df.head()

Unnamed: 0,person,offer_id,time
12658,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,2906b810c7d4411798c6938adc9daaa5,0
12672,fe97aa22dd3e48c8b143116a8403dd52,fafdcd668e3743c1bb461111dcafc2a4,0
12679,629fc02d56414d91bca360decdfa9288,9b98b8c7a33c4b65b9aebfe6a799e6d9,0
12692,676506bad68e4161b9bbaffeb039626b,ae264e3637204a6fb9bb56bc8210ddfd,0
12697,8f7dd3b2afe14c078eb4f6e6fe4ba97d,4d5c57ea9a6940dd891ad53e9dbe8da0,0


In [39]:
offer_completed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33579 entries, 12658 to 306527
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   person    33579 non-null  object
 1   offer_id  33579 non-null  object
 2   time      33579 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [40]:
# check for duplicates
# NOTE(review): unlike the received and viewed frames, this check returns rows:
# the same person/offer/time combination occurs more than once among the
# completions. Nothing visible in this notebook removes them, so they flow into
# the merged and saved offer_completed data. Confirm whether that is intended.
offer_completed_df[offer_completed_df.duplicated()]

Unnamed: 0,person,offer_id,time
66123,3dde94fa581145cb9f206624f1a94d5a,2906b810c7d4411798c6938adc9daaa5,168
66783,e9fb6ed2cecb4980ba98c86abc9c91e3,ae264e3637204a6fb9bb56bc8210ddfd,168
67614,a7dc060f6fc94ca7bf71fbb188187dca,9b98b8c7a33c4b65b9aebfe6a799e6d9,168
68562,30478a4c1e884a63a822aa87b833ed7a,2298d6c36e964ae4a3e7e9706d1fb8c2,168
69218,84fb57a7fe8045a8bf6236738ee73a0f,ae264e3637204a6fb9bb56bc8210ddfd,168
...,...,...,...
297625,6ba2450a438540999e633a5d99c7c7a0,9b98b8c7a33c4b65b9aebfe6a799e6d9,672
299471,f39fe7ea4e5946378e6d224504b77797,0b1e1539f2cc45b7b9fa7c272da2e1d7,684
304756,0785f1fce0b04ba08e01c7d2ebab4917,0b1e1539f2cc45b7b9fa7c272da2e1d7,708
305551,b7e216b6472b46648272c29a52a86702,fafdcd668e3743c1bb461111dcafc2a4,714


### Combine dataframes

In [41]:
# Attach each member's profile to their transactions.
# - how='left' keeps every transaction; members without a row in profile_df
#   get NaN in the profile columns.
# - validate='many_to_one' raises if a profile id occurs more than once, which
#   would otherwise silently duplicate transactions.
# - the join key from the profile side ('id') is dropped and 'person' becomes 'person_id'.
transaction_df = (
    transaction_df
    .merge(profile_df, left_on='person', right_on='id', how='left', validate='many_to_one')
    .drop(columns='id')
    .rename(columns={'person': 'person_id'})
)

# reorder columns by name (not by position) so the layout cannot be scrambled
# if an upstream step ever adds or moves a column
transaction_df = transaction_df[
    ['person_id', 'gender', 'age', 'became_member_on', 'income', 'transaction_amount', 'time']
]
transaction_df.head()

Unnamed: 0,person_id,gender,age,became_member_on,income,transaction_amount,time
0,02c083884c7d45b39cc68e1314fec56c,F,20.0,07/11/2016,30000.0,0.83,0
1,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,M,42.0,01/17/2016,96000.0,34.56,0
2,54890f68699049c2a04d415abc25e717,M,36.0,12/28/2017,56000.0,13.23,0
3,b2f1cd155b864803ad8334cdf13c4bd2,F,55.0,10/16/2017,94000.0,19.51,0
4,fe97aa22dd3e48c8b143116a8403dd52,F,39.0,12/17/2017,67000.0,18.97,0


#### Function to merge offer datasets with person and portfolio

In [42]:
def merge_datasets(df, portfolio=None, profiles=None):
    """Attach offer details and member profiles to a frame of offer events.

    Parameters
    ----------
    df : pandas.DataFrame
        Offer events with a `person` column and an `offer_id` column (plus any
        other event columns such as `time`). It is not modified.
    portfolio : pandas.DataFrame, optional
        Offer details keyed by `id`. Defaults to the notebook-level `portfolio_df`.
    profiles : pandas.DataFrame, optional
        Member profiles keyed by `id`. Defaults to the notebook-level `profile_df`.

    Returns
    -------
    pandas.DataFrame
        One row per input event. Columns are ordered as: the event columns
        (without `person`), the offer columns, `person_id`, the profile columns.
        Events whose offer or person has no match keep NaN in those columns.

    Raises
    ------
    pandas.errors.MergeError
        If `id` is not unique in `portfolio` or `profiles`; a duplicated key
        would silently multiply event rows.
    """
    # Fall back to the notebook-level frames so existing one-argument calls keep working
    if portfolio is None:
        portfolio = portfolio_df
    if profiles is None:
        profiles = profile_df

    merged = (
        df
        # merge offer(received/viewed/completed) and portfolios(offer details)
        .merge(portfolio, left_on='offer_id', right_on='id', how='left', validate='many_to_one')
        # drop redundant offer id column
        .drop(columns='id')
        # merge in the member profiles
        .merge(profiles, left_on='person', right_on='id', how='left', validate='many_to_one')
        # drop redundant person id column
        .drop(columns='id')
        # rename person column
        .rename(columns={'person': 'person_id'})
    )

    # reorder columns by name rather than by hard-coded positions, so the
    # layout follows the inputs instead of assuming exactly fourteen columns
    event_cols = [col for col in df.columns if col != 'person']
    offer_cols = [col for col in portfolio.columns if col != 'id']
    profile_cols = [col for col in profiles.columns if col != 'id']

    return merged[event_cols + offer_cols + ['person_id'] + profile_cols]

In [43]:
# Combine the 'offer received' events with offer (portfolio) and person details.
# NOTE: the result is bound back to offer_received_df, so this cell cannot be
# re-run on its own: merge_datasets joins on the 'person' column, which it
# renames to 'person_id'. Re-run the "Offer Received" cell first.
offer_received_df=merge_datasets(offer_received_df)
offer_received_df.head()

Unnamed: 0,offer_id,time,reward,difficulty,duration,offer_type,web_channel,mobile_channel,social_channel,person_id,gender,age,became_member_on,income
0,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,5,5,7,bogo,1,1,0,78afa995795e4d85b5d9ceeca43f5fef,F,75.0,05/09/2017,100000.0
1,0b1e1539f2cc45b7b9fa7c272da2e1d7,0,5,20,10,discount,1,0,0,a03223e636434f42ac4c3df47e8bac43,,,,
2,2906b810c7d4411798c6938adc9daaa5,0,2,10,7,discount,1,1,0,e2127556f4f64592b11af22de27a7932,M,68.0,04/26/2018,70000.0
3,fafdcd668e3743c1bb461111dcafc2a4,0,2,10,10,discount,1,1,1,8ec6ce2a7e7949b1bf142def7d0e0586,,,,
4,4d5c57ea9a6940dd891ad53e9dbe8da0,0,10,10,5,bogo,1,1,1,68617ca6246f4fbc85e91a2a49552598,,,,


In [44]:
# Combine the 'offer viewed' events with offer (portfolio) and person details.
# NOTE: the result is bound back to offer_viewed_df, so this cell cannot be
# re-run on its own: merge_datasets joins on the 'person' column, which it
# renames to 'person_id'. Re-run the "Offer Viewed" cell first.
offer_viewed_df=merge_datasets(offer_viewed_df)
offer_viewed_df.head()

Unnamed: 0,offer_id,time,reward,difficulty,duration,offer_type,web_channel,mobile_channel,social_channel,person_id,gender,age,became_member_on,income
0,f19421c1d4aa40978ebb69ca19b0e20d,0,5,5,5,bogo,1,1,1,389bc3fa690240e798340f5a15918d5c,M,65.0,02/09/2018,53000.0
1,5a8bc65990b245e5a138643cd4eb9837,0,0,0,3,informational,0,1,1,d1ede868e29245ea91818a903fec04c6,O,53.0,09/16/2017,52000.0
2,4d5c57ea9a6940dd891ad53e9dbe8da0,0,10,10,5,bogo,1,1,1,102e9454054946fda62242d2e176fdce,F,69.0,08/14/2016,57000.0
3,ae264e3637204a6fb9bb56bc8210ddfd,0,10,10,7,bogo,0,1,1,02c083884c7d45b39cc68e1314fec56c,F,20.0,07/11/2016,30000.0
4,5a8bc65990b245e5a138643cd4eb9837,0,0,0,3,informational,0,1,1,be8a5d1981a2458d90b255ddc7e0d174,M,39.0,05/27/2014,51000.0


In [45]:
# Combine the 'offer completed' events with offer (portfolio) and person details.
# NOTE: the result is bound back to offer_completed_df, so this cell cannot be
# re-run on its own: merge_datasets joins on the 'person' column, which it
# renames to 'person_id'. Re-run the "Offer Completed" cell first.
offer_completed_df=merge_datasets(offer_completed_df)
offer_completed_df.head()

Unnamed: 0,offer_id,time,reward,difficulty,duration,offer_type,web_channel,mobile_channel,social_channel,person_id,gender,age,became_member_on,income
0,2906b810c7d4411798c6938adc9daaa5,0,2,10,7,discount,1,1,0,9fa9ae8f57894cc9a3b8a9bbe0fc1b2f,M,42.0,01/17/2016,96000.0
1,fafdcd668e3743c1bb461111dcafc2a4,0,2,10,10,discount,1,1,1,fe97aa22dd3e48c8b143116a8403dd52,F,39.0,12/17/2017,67000.0
2,9b98b8c7a33c4b65b9aebfe6a799e6d9,0,5,5,7,bogo,1,1,0,629fc02d56414d91bca360decdfa9288,M,52.0,06/05/2018,72000.0
3,ae264e3637204a6fb9bb56bc8210ddfd,0,10,10,7,bogo,0,1,1,676506bad68e4161b9bbaffeb039626b,M,37.0,05/15/2017,92000.0
4,4d5c57ea9a6940dd891ad53e9dbe8da0,0,10,10,5,bogo,1,1,1,8f7dd3b2afe14c078eb4f6e6fe4ba97d,M,48.0,09/03/2015,62000.0


### Save files to be used later

In [46]:
# Write every cleaned dataframe to a CSV file in the working directory and,
# when running in Google Colab, trigger a browser download of each file.
# Skip this cell if you do not need fresh copies: all of these files are
# available in the GitHub repository under folder data --> clean.
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: the CSV files are still written, just not downloaded.
    files = None

# file name -> dataframe, in the order the files are written
cleaned_frames = {
    'portfolio.csv': portfolio_df,
    'profile.csv': profile_df,
    'error_profile.csv': error_profile_df,
    'transactions.csv': transaction_df,
    'offer_received.csv': offer_received_df,
    'offer_viewed.csv': offer_viewed_df,
    'offer_completed.csv': offer_completed_df,
}

for csv_name, frame in cleaned_frames.items():
    # The row index is written as the first column (pandas default);
    # utf-8-sig prefixes the file with a byte-order mark.
    frame.to_csv(csv_name, encoding='utf-8-sig')
    if files is not None:
        files.download(csv_name)

# To keep the files in a local clone of the repository instead, move them to
# ../data/clean/ (same file names).

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>