In [299]:
import pandas as pd
import numpy as np

## Loading the Data

In [300]:
# Load both marketing-department extracts with pandas' default (comma) separator.
# NOTE: campaign_data.csv comes back as a single column because its fields are
# tab-separated inside one quoted value; that column is split apart in the
# cleaning section further down.
df_campaign_data, df_transaction_campaign_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../marketing_department/campaign_data.csv',
        '../marketing_department/transactional_campaign_data.csv',
    )
)

## Data Profiling - df_campaign_data

In [301]:
# Inspect column names, dtypes and non-null counts of the raw campaign table.
df_campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 1 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   	campaign_id	campaign_name	campaign_description	discount  10 non-null     object
dtypes: object(1)
memory usage: 212.0+ bytes


In [302]:
# Preview the raw rows to see how the fields are packed into the single column.
df_campaign_data.head(10)

Unnamed: 0,\tcampaign_id\tcampaign_name\tcampaign_description\tdiscount
0,"0\tCAMPAIGN24776\twouldn't you know it\t""""""Twee retro vinyl single-origin coffee sartorial fanny pack brunch offal health."""" - Raleigh Senger""\t1%"
1,"1\tCAMPAIGN33679\tcould be written on the back of a postage stamp\t""""""Fanny pack gentrify cardigan messenger bag."""" - Bradley Stamm""\t1pct"
2,"2\tCAMPAIGN49972\tme neither\t""""""DIY pug leggings everyday craft beer cardigan knausgaard +1 crucifix flannel."""" - Tremayne Nader""\t10%%"
3,"3\tCAMPAIGN61872\ton the huh\t""""""Trust fund pinterest chambray."""" - Claude Aufderhar""\t5%"
4,"4\tCAMPAIGN03110\tstick a fork in it\t""""""YOLO tumblr Yuccie austin."""" - Jordi Kunde""\t1percent"
5,"5\tCAMPAIGN46302\tyou must be new here\t""""""Craft beer XOXO hella tacos chillwave cred organic letterpress disrupt artisan."""" - Rodrick Lebsack""\t1pct"
6,"6\tCAMPAIGN11190\tmind your own beeswax\t""""""Street shoreditch viral before they sold out yr ramps skateboard skateboard bitters pabst."""" - Brendan Miller""\t10pct"
7,"7\tCAMPAIGN53595\thow do I get to the train station\t""""""Craft beer venmo lomo fixie readymade marfa."""" - Benny Bogan""\t20pct"
8,"8\tCAMPAIGN29983\twould it hurt\t""""""Vegan migas ramps keytar wolf cray kickstarter five dollar toast."""" - Adeline Brakus""\t1%"
9,"9\tCAMPAIGN52447\tpound for pound\t""""""Semiotics biodiesel everyday craft beer etsy semiotics keffiyeh meditation single-origin coffee."""" - Bernadette Pollich""\t1percent"


In [303]:
# Report (rows, columns) of the raw campaign table.
print(df_campaign_data.shape)

(10, 1)


In [304]:
# Count rows that are exact duplicates of an earlier row.
df_campaign_data.duplicated().sum()

0

## Data Cleaning - df_campaign_data

### Separate the data into different columns

In [305]:
# Split the single tab-delimited column into five columns. The first piece
# ('Unnamed') is the value that precedes the first tab in each row.
df_campaign_data[
    ['Unnamed', 'campaign_id', 'campaign_name', 'campaign_description', 'discount']
] = (
    df_campaign_data['\tcampaign_id\tcampaign_name\tcampaign_description\tdiscount']
    .str.split('\t', expand=True)
)

In [306]:
# Discard the leading split piece; in the preview it only repeats the row index.
df_campaign_data = df_campaign_data.drop(columns='Unnamed')

In [307]:
# The original packed column is redundant once its parts live in their own columns.
df_campaign_data = df_campaign_data.drop(
    columns='\tcampaign_id\tcampaign_name\tcampaign_description\tdiscount'
)

### Separate campaign description from writer

In [308]:
# Separate the quoted description from the writer's name that follows it.
# The writer is whatever comes after the LAST " - ", so split once from the
# right: a description that itself contains " - " still yields exactly two
# columns instead of breaking the two-column assignment.
df_campaign_data[['campaign_description', 'campaign_writer']] = (
    df_campaign_data['campaign_description'].str.rsplit(" - ", n=1, expand=True)
)

### Removing unnecessary symbols

In [309]:
# Strip the literal double-quote characters that surround the description and
# trail the writer's name in the raw values.
df_campaign_data = df_campaign_data.assign(
    campaign_description=lambda frame: frame['campaign_description'].str.replace('"', ''),
    campaign_writer=lambda frame: frame['campaign_writer'].str.replace('"', ''),
)

In [310]:
# Normalise the discount suffix: the raw data mixes '%', '%%', 'pct' and
# 'percent'; every variant is rewritten to a single '%'.
df_campaign_data['discount'] = (
    df_campaign_data['discount']
    .str.replace('%%', '%')
    .str.replace('pct', '%')
    .str.replace('percent', '%')
)

In [311]:
# Show full cell contents in displayed frames instead of truncating long strings.
pd.options.display.max_colwidth = None

### Renaming discount to campaign_discount

In [312]:
# Prefix the column with 'campaign_' to match the other campaign_* columns.
df_campaign_data = df_campaign_data.rename(
    {'discount': 'campaign_discount'},
    axis='columns',
)

### Converting text values and column names to uppercase

In [313]:
# Upper-case the campaign name and writer values (the description keeps its case).
df_campaign_data[['campaign_name', 'campaign_writer']] = (
    df_campaign_data[['campaign_name', 'campaign_writer']]
    .apply(lambda column: column.str.upper())
)

In [314]:
# Upper-case every column name.
df_campaign_data = df_campaign_data.rename(columns=str.upper)

In [315]:
# Review the cleaned campaign table.
df_campaign_data.head(10)

Unnamed: 0,CAMPAIGN_ID,CAMPAIGN_NAME,CAMPAIGN_DESCRIPTION,CAMPAIGN_DISCOUNT,CAMPAIGN_WRITER
0,CAMPAIGN24776,WOULDN'T YOU KNOW IT,Twee retro vinyl single-origin coffee sartorial fanny pack brunch offal health.,1%,RALEIGH SENGER
1,CAMPAIGN33679,COULD BE WRITTEN ON THE BACK OF A POSTAGE STAMP,Fanny pack gentrify cardigan messenger bag.,1%,BRADLEY STAMM
2,CAMPAIGN49972,ME NEITHER,DIY pug leggings everyday craft beer cardigan knausgaard +1 crucifix flannel.,10%,TREMAYNE NADER
3,CAMPAIGN61872,ON THE HUH,Trust fund pinterest chambray.,5%,CLAUDE AUFDERHAR
4,CAMPAIGN03110,STICK A FORK IN IT,YOLO tumblr Yuccie austin.,1%,JORDI KUNDE
5,CAMPAIGN46302,YOU MUST BE NEW HERE,Craft beer XOXO hella tacos chillwave cred organic letterpress disrupt artisan.,1%,RODRICK LEBSACK
6,CAMPAIGN11190,MIND YOUR OWN BEESWAX,Street shoreditch viral before they sold out yr ramps skateboard skateboard bitters pabst.,10%,BRENDAN MILLER
7,CAMPAIGN53595,HOW DO I GET TO THE TRAIN STATION,Craft beer venmo lomo fixie readymade marfa.,20%,BENNY BOGAN
8,CAMPAIGN29983,WOULD IT HURT,Vegan migas ramps keytar wolf cray kickstarter five dollar toast.,1%,ADELINE BRAKUS
9,CAMPAIGN52447,POUND FOR POUND,Semiotics biodiesel everyday craft beer etsy semiotics keffiyeh meditation single-origin coffee.,1%,BERNADETTE POLLICH


## Data Profiling - df_transaction_campaign_data

In [316]:
# Inspect column names, dtypes and non-null counts of the raw transaction table.
df_transaction_campaign_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124887 entries, 0 to 124886
Data columns (total 6 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Unnamed: 0         124887 non-null  int64 
 1   transaction_date   124887 non-null  object
 2   campaign_id        124887 non-null  object
 3   order_id           124887 non-null  object
 4   estimated arrival  124887 non-null  object
 5   availed            124887 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 5.7+ MB


In [317]:
# Preview the first ten transactions.
df_transaction_campaign_data.head(10)

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
0,2,2023-08-14,CAMPAIGN49972,0612c246-57f1-40e8-9993-0f8d41992049,10days,1
1,4,2021-10-12,CAMPAIGN46302,b4c411de-2fd3-4806-91ae-165edc9baa12,13days,0
2,6,2023-01-30,CAMPAIGN29983,26de6b40-db2d-40b9-a64c-58736eaf0381,3days,1
3,8,2022-01-05,CAMPAIGN46302,26b60a4e-aafe-4b99-bace-034d088a4a53,8days,1
4,9,2023-09-19,CAMPAIGN46302,4aab29ae-e610-46bf-92af-199f6f420cee,14days,1
5,16,2020-07-01,CAMPAIGN46302,3312851e-28dc-4508-8003-09ba304ce9e2,11days,1
6,22,2022-07-18,CAMPAIGN29983,9ae58335-164a-467c-ae22-fe5dabe806b9,10days,1
7,24,2020-08-28,CAMPAIGN29983,65abaa49-3ae1-44bb-8366-ef9d7e264a4c,7days,1
8,27,2023-11-13,CAMPAIGN03110,fd92cc32-e28e-4f14-b8f8-4526ea58fb3f,3days,1
9,33,2022-09-21,CAMPAIGN46302,61c61e9b-9838-4c95-8390-4343fbac7a18,9days,1


In [318]:
# Preview the last ten transactions to check the end of the file looks like the start.
df_transaction_campaign_data.tail(10)

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
124877,499963,2020-02-04,CAMPAIGN24776,a397c17b-956f-4dce-8b0c-d8d6234f1de8,14days,1
124878,499965,2021-11-23,CAMPAIGN29983,d5dbea32-829e-474c-8d88-cadcb993a944,8days,1
124879,499967,2022-01-06,CAMPAIGN33679,238f761a-f1bf-4f6c-a37a-fff7f6574f8c,10days,1
124880,499982,2022-02-14,CAMPAIGN46302,ceafa665-307f-4c9e-b1bd-79577a450eb9,12days,1
124881,499986,2022-01-23,CAMPAIGN11190,f628455c-ad0a-444f-96c3-931f6a1912e9,9days,0
124882,499990,2022-04-14,CAMPAIGN61872,6a5163c2-ba62-461c-a729-5b06fe266bd9,12days,0
124883,499991,2022-11-23,CAMPAIGN11190,883e2fe2-bbdf-4b5e-8348-c2bcba07da0e,15days,1
124884,499992,2023-07-28,CAMPAIGN33679,575b0a2d-5539-4565-a10b-1f4f6516cd27,12days,1
124885,499995,2020-04-12,CAMPAIGN49972,ba506489-1505-481d-b0b1-8cdbf04a515f,6days,1
124886,499997,2023-03-24,CAMPAIGN52447,2b2e775f-d6cd-4659-9a88-b4950bc326c0,15days,1


In [319]:
# Number of distinct campaigns referenced by the transactions.
df_transaction_campaign_data['campaign_id'].nunique()

10

In [320]:
# Spot-check a slice away from the head/tail (row positions 300 through 499).
df_transaction_campaign_data.iloc[300:500]

Unnamed: 0.1,Unnamed: 0,transaction_date,campaign_id,order_id,estimated arrival,availed
300,1103,2020-09-22,CAMPAIGN29983,4f1da4b2-1782-4ebe-b6c9-7dd2256e3189,9days,1
301,1104,2023-08-30,CAMPAIGN52447,65872179-0051-48cb-9390-8484ee4a5f56,15days,1
302,1110,2022-05-27,CAMPAIGN46302,175aba9f-ba56-4d55-80ac-55fe5ced4cbd,9days,0
303,1123,2021-11-16,CAMPAIGN11190,957a8b8c-3996-4780-84cf-ebc92485d3cc,13days,0
304,1126,2021-03-02,CAMPAIGN46302,54f13dae-2641-4e71-8201-4a1e4e762d9d,11days,1
...,...,...,...,...,...,...
495,1950,2021-08-25,CAMPAIGN29983,9eade34d-7cfe-4797-af25-78ce21e9ff23,8days,0
496,1952,2021-05-01,CAMPAIGN53595,d2030212-0411-46a2-af96-f37128864655,11days,1
497,1954,2022-06-30,CAMPAIGN24776,fe1f4d70-34cb-4a7b-8927-d5078acad8bc,12days,1
498,1955,2022-06-02,CAMPAIGN46302,b448a2a2-3a5e-415b-89d2-680adb32714f,8days,1


In [321]:
# Count rows that are exact duplicates of an earlier row.
df_transaction_campaign_data.duplicated().sum()

0

In [322]:
# Summarise the still-string transaction dates: count, distinct values, most frequent value.
df_transaction_campaign_data['transaction_date'].describe()

count         124887
unique          1432
top       2023-07-01
freq             178
Name: transaction_date, dtype: object

### Total number of avails per campaign

In [323]:
# Sum the `availed` column per campaign; as_index=False keeps campaign_id as a
# regular column rather than the index.
total_avails_per_campaign = (
    df_transaction_campaign_data
    .groupby('campaign_id', as_index=False)['availed']
    .sum()
)
print(total_avails_per_campaign)

     campaign_id  availed
0  CAMPAIGN03110     8872
1  CAMPAIGN11190     8747
2  CAMPAIGN24776     8696
3  CAMPAIGN29983     8692
4  CAMPAIGN33679     8663
5  CAMPAIGN46302     8785
6  CAMPAIGN49972     8997
7  CAMPAIGN52447     8719
8  CAMPAIGN53595     8892
9  CAMPAIGN61872     8474


## Data Cleaning - df_transaction_campaign_data

### Rename column estimated arrival

In [324]:
# Replace the space in the column name with an underscore (snake_case, like the other columns).
df_transaction_campaign_data = df_transaction_campaign_data.rename(
    {'estimated arrival': 'estimated_arrival'},
    axis='columns',
)

### Converting transaction_date to date

In [325]:
# Parse the transaction_date strings into datetime values.
df_transaction_campaign_data = df_transaction_campaign_data.assign(
    transaction_date=lambda frame: pd.to_datetime(frame['transaction_date'])
)

### Adding space between the number and days

In [326]:
# Normalise values such as "10days" to "10 days".
# One pattern captures the count and the unit as two groups and skips any
# whitespace between them, so re-running this cell leaves "10 days" unchanged
# (two independent extracts would pick up the existing space and emit
# "10  days"). Values that do not match the pattern become NaN.
arrival_parts = df_transaction_campaign_data['estimated_arrival'].str.extract(r'(\d+)\s*(\D+)')
df_transaction_campaign_data = df_transaction_campaign_data.assign(
    estimated_arrival=arrival_parts[0] + " " + arrival_parts[1]
)

### Remove unnamed column

In [327]:
# Drop 'Unnamed: 0'. In the previews its values are non-contiguous integers
# (2, 4, 6, 8, 9, 16, ... 499997), which looks like a row index carried over from
# a larger source table rather than a business field.
# TODO: justification still to be confirmed - presumably order_id is the real
# row identifier; verify its uniqueness and that nothing downstream joins on
# this index before treating the drop as final.
df_transaction_campaign_data = df_transaction_campaign_data.drop(columns='Unnamed: 0')

### Change order of columns

In [328]:
# Reorder the columns so the two identifiers come first; only the listed columns are kept.
df_transaction_campaign_data = df_transaction_campaign_data.loc[
    :,
    ['order_id', 'campaign_id', 'transaction_date', 'estimated_arrival', 'availed'],
]

In [329]:
# List the distinct estimated_arrival values to confirm they all follow the "<n> days" form.
df_transaction_campaign_data['estimated_arrival'].unique()

array(['10 days', '13 days', '3 days', '8 days', '14 days', '11 days',
       '7 days', '9 days', '4 days', '5 days', '12 days', '6 days',
       '15 days'], dtype=object)

In [330]:
# Show every row that has a missing value in any column (an empty frame means none).
# `null_prod_type` is simply the per-row boolean mask; the name is kept as-is.
null_prod_type = df_transaction_campaign_data.isna().any(axis='columns')
null_rows = df_transaction_campaign_data.loc[null_prod_type]
print(null_rows)

Empty DataFrame
Columns: [order_id, campaign_id, transaction_date, estimated_arrival, availed]
Index: []


### Uppercase all column names and text values

In [331]:
# Upper-case every column name.
df_transaction_campaign_data = df_transaction_campaign_data.rename(columns=str.upper)

In [332]:
# Upper-case the three text columns; the date and availed columns are left untouched.
df_transaction_campaign_data = df_transaction_campaign_data.assign(
    ORDER_ID=lambda frame: frame['ORDER_ID'].str.upper(),
    CAMPAIGN_ID=lambda frame: frame['CAMPAIGN_ID'].str.upper(),
    ESTIMATED_ARRIVAL=lambda frame: frame['ESTIMATED_ARRIVAL'].str.upper(),
)