# **Data Wrangling Project 2024**
---

In fulfillment of the course *Data Wrangling XB_0014* at the *Vrije Universiteit Amsterdam*.<br>
This project was conducted by **Alvaro Pratama Maharto, Michael Evan Sutanto, Mahmoud Ashtar, and Miguel Sadorra.**

#### **TOPIC**: Marketing Strategy Effectivity
This research will be taking a look at the effects of different types of marketing campaigns and their effectivity in terms of clickthrough rate, customer engagement, and customer acquisition.
We will be having a look at the dataset with the following question in mind:
### **RESEARCH QUESTION**:
*How do different marketing strategies, particularly those used in bulk campaigns, impact the overall effectiveness of marketing initiatives, considering metrics such as clickthrough rates, customer engagement, and customer acquisition?*

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [61]:
campaigns_df = pd.read_csv("campaigns.csv")
messages_df = pd.read_csv("messages-demo.csv")

messages_df.head()

  messages_df = pd.read_csv("messages-demo.csv")


Unnamed: 0,id,message_id,campaign_id,message_type,client_id,channel,category,platform,email_provider,stream,date,sent_at,is_opened,opened_first_time_at,opened_last_time_at,is_clicked,clicked_first_time_at,clicked_last_time_at,is_unsubscribed,unsubscribed_at,is_hard_bounced,hard_bounced_at,is_soft_bounced,soft_bounced_at,is_complained,complained_at,is_blocked,blocked_at,is_purchased,purchased_at,created_at,updated_at
0,3527358,3f6aaad3-bab7-4886-b083-fe8c1f210066,31,transactional,1515915625489833514,email,,,mail.ru,desktop,2021-04-30,2021-04-30 11:27:43,t,2021-05-04 05:47:05,2021-05-04 05:47:05,t,2021-05-04 05:47:46,2021-05-04 05:47:46,f,,f,,f,,f,,f,,t,2021-05-06 16:40:38,2023-04-27 08:55:05.883908,2023-04-27 08:57:33.080129
1,3527619,0e670ecc-4549-44f6-86ed-469682d34837,32,transactional,1515915625489220445,email,,,yandex.ru,desktop,2021-04-30,2021-04-30 08:00:35,t,2021-05-04 15:38:20,2021-05-04 15:38:20,f,,,f,,f,,f,,f,,f,,f,,2023-04-27 08:55:06.265821,2023-04-27 08:56:18.60223
2,3527980,276b25cf-1bda-4faf-b3a4-98e4161f9357,32,transactional,1515915625489854185,email,,,mail.ru,desktop,2021-04-30,2021-04-30 05:56:37,t,2021-05-04 15:16:32,2021-05-04 15:16:32,f,,,f,,f,,f,,f,,f,,f,,2023-04-27 08:55:06.777039,2023-04-27 08:56:19.112546
3,3528369,4545aff2-09b3-45e3-9abd-c680357e5429,32,transactional,1515915625489101550,email,,,mail.ru,desktop,2021-04-30,2021-04-30 05:53:59,t,2021-05-04 09:47:25,2021-05-04 09:47:25,f,,,f,,f,,f,,f,,f,,f,,2023-04-27 08:55:07.325906,2023-04-27 08:56:19.590637
4,3528648,5850858d-2dcf-4f31-a0d3-5db5649b17c4,32,transactional,1515915625490455948,email,,,mail.ru,desktop,2021-04-30,2021-04-30 06:20:37,t,2021-05-04 18:56:16,2021-05-04 18:56:16,f,,,f,,f,,f,,f,,f,,f,,2023-04-27 08:55:07.727792,2023-04-27 08:56:19.926474


In [62]:
print(len(messages_df))
print(len(campaigns_df))

10000000
1907


### Sampling
Since the data is about 10 million rows big, the group decided to use only a sample of 40% of the data to cut down on computing time.

In [63]:
messages_df = messages_df[messages_df["message_type"] == "bulk"] # since bulk are over 93% of the campains we are only working with bulk messages and bulk campaings
campaigns_df = campaigns_df[campaigns_df["campaign_type"] == "bulk"]

messages_sample = messages_df.sample(frac= 0.40, random_state= 42, replace= False)

In [64]:
messages_sample.head()
print(len(messages_sample))

2824958


### Cleaning up the data
We will eliminate all of the unneccessary columns from the dataframes we created above and consolidate them into one `merged_df`

In [65]:
print("Messages Database:\n",list(messages_df.columns),"\n")
print("Campaigns Database:\n",list(campaigns_df.columns))

# our sample has many columns >>> some are disposable and some are missing the majority of the values => these will be dropped

messages_sample = messages_sample.drop(["id", "category", "platform", "stream", "hard_bounced_at", "soft_bounced_at",
                                         "is_soft_bounced", "is_hard_bounced",  "blocked_at", "updated_at", "is_complained",
                                         "complained_at", "opened_first_time_at", "clicked_last_time_at", "unsubscribed_at", 
                                         "purchased_at", "created_at", "message_type", "message_id", "date", "email_provider",
                                         "opened_last_time_at", "clicked_first_time_at"],
                                           axis= 1)

campaigns_df = campaigns_df.drop(["ab_test", "warmup_mode", "hour_limit", "is_test", "position", "campaign_type"], axis=1)



Messages Database:
 ['id', 'message_id', 'campaign_id', 'message_type', 'client_id', 'channel', 'category', 'platform', 'email_provider', 'stream', 'date', 'sent_at', 'is_opened', 'opened_first_time_at', 'opened_last_time_at', 'is_clicked', 'clicked_first_time_at', 'clicked_last_time_at', 'is_unsubscribed', 'unsubscribed_at', 'is_hard_bounced', 'hard_bounced_at', 'is_soft_bounced', 'soft_bounced_at', 'is_complained', 'complained_at', 'is_blocked', 'blocked_at', 'is_purchased', 'purchased_at', 'created_at', 'updated_at'] 

Campaigns Database:
 ['id', 'campaign_type', 'channel', 'topic', 'started_at', 'finished_at', 'total_count', 'ab_test', 'warmup_mode', 'hour_limit', 'subject_length', 'subject_with_personalization', 'subject_with_deadline', 'subject_with_emoji', 'subject_with_bonuses', 'subject_with_discount', 'subject_with_saleout', 'is_test', 'position']


In [66]:
display(messages_sample[:10])

Unnamed: 0,campaign_id,client_id,channel,sent_at,is_opened,is_clicked,is_unsubscribed,is_blocked,is_purchased
7286261,230,1515915625488886720,email,2021-05-27 08:12:44,f,f,f,f,f
848463,64,1515915625608891382,mobile_push,2021-04-30 08:32:01,f,f,f,f,f
5778692,150,1515915625486927106,email,2021-05-21 08:58:57,f,f,f,f,f
7539782,230,1515915625488084724,email,2021-05-27 10:32:50,t,f,f,f,f
6710082,152,1515915625490799486,email,2021-05-24 08:41:10,f,f,f,f,f
827892,64,1515915625571018047,mobile_push,2021-04-30 07:58:37,f,f,f,f,f
7945106,257,1515915625490421081,mobile_push,2021-05-28 13:00:24,f,f,f,f,f
7965469,257,1515915625501578324,mobile_push,2021-05-28 13:00:33,f,f,f,f,f
6111520,150,1515915625500830959,email,2021-05-21 07:17:51,t,f,f,f,f
9701465,366,1515915625500834555,email,2021-06-10 08:30:54,f,f,f,f,f


In [67]:
display(campaigns_df[:10])

Unnamed: 0,id,channel,topic,started_at,finished_at,total_count,subject_length,subject_with_personalization,subject_with_deadline,subject_with_emoji,subject_with_bonuses,subject_with_discount,subject_with_saleout
0,63,mobile_push,sale out,2021-04-30 07:22:36.615023,2021-04-30 07:23:41,48211.0,146.0,False,False,True,False,False,False
1,64,mobile_push,sale out,2021-04-30 09:02:50.817227,2021-04-30 09:04:08,1037337.0,97.0,False,False,True,False,False,False
2,78,mobile_push,sale out,2021-05-06 07:14:10.533318,2021-05-06 07:15:17,70080.0,146.0,False,False,True,False,False,False
3,79,mobile_push,sale out,2021-05-06 09:03:56.486750,2021-05-06 09:42:15,921838.0,97.0,False,False,True,False,False,False
4,89,mobile_push,,2021-05-07 11:54:06.168664,2021-05-07 11:54:38,45503.0,109.0,False,True,True,False,False,False
5,110,mobile_push,sale out,2021-05-12 07:38:32.980268,2021-05-12 07:40:16,90816.0,146.0,False,False,True,False,False,False
6,111,mobile_push,sale out,2021-05-12 07:44:41.865082,2021-05-12 08:16:07,1045217.0,97.0,False,False,True,False,False,False
7,129,email,sale out,2021-05-18 07:38:49.825780,2021-05-18 07:38:58,3.0,133.0,False,False,True,False,False,False
8,136,email,sale out,2021-05-19 07:00:11.121170,2021-05-19 08:03:19,177363.0,133.0,False,False,True,False,False,False
9,138,mobile_push,sale out,2021-05-19 07:00:11.105536,2021-05-19 07:21:42,37889.0,146.0,False,False,True,False,False,False


In [68]:
merged_df = pd.merge(messages_sample, campaigns_df, left_on = "campaign_id", right_on= "id" ,how= "inner")
print(merged_df.isna().sum())

campaign_id                         0
client_id                           0
channel_x                           0
sent_at                             0
is_opened                           0
is_clicked                          0
is_unsubscribed                     0
is_blocked                          0
is_purchased                        0
id                                  0
channel_y                           0
topic                           28757
started_at                          0
finished_at                         0
total_count                         0
subject_length                      0
subject_with_personalization        0
subject_with_deadline               0
subject_with_emoji                  0
subject_with_bonuses                0
subject_with_discount               0
subject_with_saleout                0
dtype: int64


In [125]:
print(merged_df['topic'].value_counts())
# I WOULD SAY ['TOPIC'] is a pretty irrelevant column (TO_DROP())

bool_channel = (merged_df['channel_x'] == merged_df['channel_y'])
print(bool_channel.value_counts())
# channel_x and channel_y are identical


bool_id = (merged_df['campaign_id'] == merged_df['id'])
print(bool_id.value_counts())
# campaign_id and id are identical


sale out    2789750
event          6451
Name: topic, dtype: int64
True    2824958
dtype: int64
True    2824958
dtype: int64


### Converting string and bool values to binary
Since the database contains a lot of string values to represent True or False, we will convert these values to binary to have an easier time understanding the database and using it to analyze the data.

In [69]:
def convert_to_binary(value):
    value_lower = str(value).lower()
    if value_lower == 't':
        return 1
    elif value_lower == 'f':
        return 0
    else:
        return value

merged_df = merged_df.applymap(lambda x: int(x) if isinstance(x, bool) else x)
merged_df = merged_df.applymap(convert_to_binary)


In [70]:
pd.set_option('display.max_columns', None)
display(merged_df[:10]) 

Unnamed: 0,campaign_id,client_id,channel_x,sent_at,is_opened,is_clicked,is_unsubscribed,is_blocked,is_purchased,id,channel_y,topic,started_at,finished_at,total_count,subject_length,subject_with_personalization,subject_with_deadline,subject_with_emoji,subject_with_bonuses,subject_with_discount,subject_with_saleout
0,230,1515915625488886720,email,2021-05-27 08:12:44,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
1,230,1515915625488084724,email,2021-05-27 10:32:50,1,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
2,230,1515915625501096510,email,2021-05-27 09:19:47,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
3,230,1515915625502848292,email,2021-05-27 08:16:56,0,0,1,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
4,230,1515915625487737557,email,2021-05-27 09:44:49,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
5,230,1515915625488545980,email,2021-05-27 10:02:39,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
6,230,1515915625488327822,email,2021-05-27 09:57:28,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
7,230,1515915625489229184,email,2021-05-27 09:41:51,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
8,230,1515915625500283542,email,2021-05-27 10:11:37,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0
9,230,1515915625487439364,email,2021-05-27 08:05:55,0,0,0,0,0,230,email,sale out,2021-05-27 08:00:10.980791,2021-05-27 10:35:06,651859.0,115.0,0,0,1,0,0,0


In [71]:
merged_df['channel_x'].value_counts()

mobile_push    1690477
email          1134481
Name: channel_x, dtype: int64

Need to check whether each type of `subject_type` affects `is_opened` and `is_purchased`

### Analyzing the Data
The group will now be analyzing the data in this section. We will be looking at important characteristics of the marketing campaign subjects and its effectiveness with regard to clickthrough rate which campaigns result in client purchase.

In [171]:
pd.pivot_table(merged_df, 
                values='is_purchased', 
                index=['subject_with_emoji', 'subject_with_personalization'], 
                aggfunc='count')

Unnamed: 0_level_0,Unnamed: 1_level_0,is_purchased
subject_with_emoji,subject_with_personalization,Unnamed: 2_level_1
1,0,2822501
1,1,2457


In [184]:
# can remove since every row has an emoji
emoji = merged_df.groupby("subject_with_emoji")['is_purchased'].value_counts()

# can remove since none have bonuses lol
bonuses = merged_df.groupby("subject_with_bonuses")['is_purchased'].value_counts()
display(bonuses)
display(emoji)

subject_with_bonuses  is_purchased
0                     0               2823930
                      1                  1028
Name: is_purchased, dtype: int64

subject_with_emoji  is_purchased
1                   0               2823930
                    1                  1028
Name: is_purchased, dtype: int64

### Marketing effect on Purchases
We will look at the effectiveness of advertising **<u>subject customization</u>** on sales.

In [270]:
personalization = merged_df.groupby('subject_with_personalization')['is_purchased'].value_counts()
deadline = merged_df.groupby("subject_with_deadline")['is_purchased'].value_counts()
discounts = merged_df.groupby("subject_with_discount")['is_purchased'].value_counts()
saleout = merged_df.groupby('subject_with_saleout')['is_purchased'].value_counts()

result = pd.concat([personalization, deadline, discounts, saleout], axis=1, keys=['personalization', 'deadline', 'discount', 'saleout'])
index = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)], names=['is_purchased', ''])
final_result = pd.DataFrame(result, index=index)
final_result = final_result.unstack().fillna(0).astype(int)

In [271]:
display(final_result)

Unnamed: 0_level_0,personalization,personalization,deadline,deadline,discount,discount,saleout,saleout
Unnamed: 0_level_1,0,1,0,1,0,1,0,1
is_purchased,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2821476,1025,2782147,826,2811739,1025,2731167,1017
1,2454,3,41783,202,12191,3,92763,11


After getting the correct dataframe with the effects of subject customization, we will then look at the improvement on sales per customization type (e.g. message personalization, messages with deadlines, message with discounts, message with saleouts).

In [272]:
final_result = final_result.transpose()
diff = final_result.div(final_result.sum(1),axis=0)
display(final_result.div(final_result.sum(1),axis=0))

Unnamed: 0,is_purchased,0,1
,,,
personalization,0.0,0.999131,0.000869
personalization,1.0,0.997082,0.002918
deadline,0.0,0.985204,0.014796
deadline,1.0,0.803502,0.196498
discount,0.0,0.995683,0.004317
discount,1.0,0.997082,0.002918
saleout,0.0,0.967151,0.032849
saleout,1.0,0.9893,0.0107


<strong>Marketing results on Sales</strong>

In [273]:
def getdiff(diff_df, feature):
    diff_1 = diff_df[1][(feature, 1)] - diff_df[1][(feature, 0)]
    percentage_inc = (diff_1 / diff_df[1][(feature, 0)]) * 100
    return diff_1, percentage_inc
    
personalization_diff, personalization_percent_inc = getdiff(diff, 'personalization')
deadline_diff, deadline_percent_inc = getdiff(diff, 'deadline')
discount_diff, discount_percent_inc = getdiff(diff, 'discount')
saleout_diff, saleout_percent_inc = getdiff(diff, 'saleout')

sales_performance = pd.DataFrame({'Difference': [personalization_diff, deadline_diff, discount_diff, saleout_diff], 
                                  'Percent Increase': [personalization_percent_inc,deadline_percent_inc, discount_percent_inc, saleout_percent_inc]}, 
                                  index = ['Personalization', 'Deadline', 'Discount', 'Saleout'])

display(sales_performance)

sales_performance['Percent Increase'].plot(kind='bar', figsize=(8, 5))
plt.title('Sales Performance by Feature')
plt.xlabel('Feature')
plt.ylabel('Values')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

Unnamed: 0,Difference,Percent Increase
Personalization,0.002049,235.820736
Deadline,0.181702,1228.044303
Discount,-0.001399,-32.400616
Saleout,-0.022149,-67.425428


### Marketing Performance on Clickthrough Rate


In [274]:
personalization = merged_df.groupby('subject_with_personalization')['is_clicked'].value_counts()
deadline = merged_df.groupby("subject_with_deadline")['is_clicked'].value_counts()
discounts = merged_df.groupby("subject_with_discount")['is_clicked'].value_counts()
saleout = merged_df.groupby('subject_with_saleout')['is_clicked'].value_counts()

result = pd.concat([personalization, deadline, discounts, saleout], axis=1, keys=['personalization', 'deadline', 'discount', 'saleout'])
index = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)], names=['is_clicked', ''])
final_result = pd.DataFrame(result, index=index)
final_result = final_result.unstack().fillna(0).astype(int)

In [275]:
final_result = final_result.transpose()
diff = final_result.div(final_result.sum(1),axis=0)
display(final_result.div(final_result.sum(1),axis=0))

Unnamed: 0,is_clicked,0,1
,,,
personalization,0.0,0.999165,0.000835
personalization,1.0,0.996348,0.003652
deadline,0.0,0.985534,0.014466
deadline,1.0,0.953079,0.046921
discount,0.0,0.995718,0.004282
discount,1.0,0.992871,0.007129
saleout,0.0,0.967038,0.032962
saleout,1.0,0.976931,0.023069


In [278]:
def getdiff(diff_df, feature):
    diff_1 = diff_df[1][(feature, 1)] - diff_df[1][(feature, 0)]
    percentage_inc = (diff_1 / diff_df[1][(feature, 0)]) * 100
    return diff_1, percentage_inc
    
personalization_diff, personalization_percent_inc = getdiff(diff, 'personalization')
deadline_diff, deadline_percent_inc = getdiff(diff, 'deadline')
discount_diff, discount_percent_inc = getdiff(diff, 'discount')
saleout_diff, saleout_percent_inc = getdiff(diff, 'saleout')

sales_performance = pd.DataFrame({'Difference': [personalization_diff, deadline_diff, discount_diff, saleout_diff], 
                                  'Percent Increase': [personalization_percent_inc,deadline_percent_inc, discount_percent_inc, saleout_percent_inc]}, 
                                  index = ['Personalization', 'Deadline', 'Discount', 'Saleout'])
display(sales_performance)

Unnamed: 0,Difference,Percent Increase
Personalization,0.002816,337.140407
Deadline,0.032455,224.357402
Discount,0.002848,66.50703
Saleout,-0.009893,-30.01229


### Marketing Performance on Customer Engagement

In [279]:
personalization = merged_df.groupby('subject_with_personalization')['is_unsubscribed'].value_counts()
deadline = merged_df.groupby("subject_with_deadline")['is_unsubscribed'].value_counts()
discounts = merged_df.groupby("subject_with_discount")['is_unsubscribed'].value_counts()
saleout = merged_df.groupby('subject_with_saleout')['is_unsubscribed'].value_counts()

result = pd.concat([personalization, deadline, discounts, saleout], axis=1, keys=['personalization', 'deadline', 'discount', 'saleout'])
index = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0), (1, 1)], names=['is_unsubscribed', ''])
final_result = pd.DataFrame(result, index=index)
final_result = final_result.unstack().fillna(0).astype(int)

In [280]:
final_result = final_result.transpose()
diff = final_result.div(final_result.sum(1),axis=0)
display(final_result.div(final_result.sum(1),axis=0))

Unnamed: 0,is_unsubscribed,0,1
,,,
personalization,0.0,0.999094,0.000906
personalization,1.0,0.999966,3.4e-05
deadline,0.0,0.984588,0.015412
deadline,1.0,0.997871,0.002129
discount,0.0,0.995636,0.004364
discount,1.0,0.996777,0.003223
saleout,0.0,0.96585,0.03415
saleout,1.0,0.997478,0.002522


In [283]:
def getdiff(diff_df, feature):
    diff_1 = diff_df[0][(feature, 1)] - diff_df[0][(feature, 0)]
    percentage_inc = (diff_1 / diff_df[0][(feature, 0)]) * 100
    return diff_1, percentage_inc
    
personalization_diff, personalization_percent_inc = getdiff(diff, 'personalization')
deadline_diff, deadline_percent_inc = getdiff(diff, 'deadline')
discount_diff, discount_percent_inc = getdiff(diff, 'discount')
saleout_diff, saleout_percent_inc = getdiff(diff, 'saleout')

sales_performance = pd.DataFrame({'Difference': [personalization_diff, deadline_diff, discount_diff, saleout_diff], 
                                  'Percent Increase': [personalization_percent_inc,deadline_percent_inc, discount_percent_inc, saleout_percent_inc]}, 
                                  index = ['Personalization', 'Deadline', 'Discount', 'Saleout'])
display(sales_performance)

sales_performance['Percent Increase'].plot(kind='bar', figsize=(8, 5))
plt.title('Sales Performance by Feature')
plt.xlabel('Feature')
plt.ylabel('Values')
plt.xticks(rotation=0)
plt.grid(axis='y', linestyle='--')
plt.tight_layout()
plt.show()

Unnamed: 0,Difference,Percent Increase
Personalization,0.000872,0.087242
Deadline,0.013283,1.3491
Discount,0.00114,0.114527
Saleout,0.031628,3.274621


### BLAH BLAH EDIT LATER

In [277]:
channel_type = merged_df.groupby('channel_x')
channel_purchases = channel_type['is_purchased'].value_counts()
channel_clicks = channel_type['is_clicked'].value_counts()

result = pd.concat([channel_purchases, channel_clicks], axis=1, keys=['purchased', 'clicked'])
display(result)

Unnamed: 0_level_0,Unnamed: 1_level_0,purchased,clicked
channel_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
email,0,1133453.0,1102125
email,1,1028.0,32356
mobile_push,0,1690477.0,1688328
mobile_push,1,,2149


### The Impact of Subject Length on Email Open Rates

In [None]:
grouped_data = merged_df.groupby(['subject_length', 'is_opened']).size().reset_index(name='count')
opened_data = grouped_data[grouped_data['is_opened'] == 1]
ignored_data = grouped_data[grouped_data['is_opened'] == 0]

pivot_data = grouped_data.pivot(index='subject_length', columns='is_opened', values='count')
pivot_data['percentage'] = (pivot_data[1] / (pivot_data[1] + pivot_data[0])) * 100
pivot_data.reset_index(inplace=True)
print(pivot_data[['subject_length', 'percentage']])

pivot_data.plot(kind='bar', x='subject_length', y='percentage', color='olive')
plt.xlabel('Subject Length')
plt.ylabel('Percentage of Opened Email')
plt.title('Percentage of Opened Email Count by Subject Length')
plt.show()

stacked_bar = grouped_data.pivot(index='subject_length', columns='is_opened', values='count')
stacked_bar.plot(kind='bar', stacked='True', color=['red', 'blue'])
plt.xlabel('Subject Length')
plt.ylabel('Count')
plt.title('Opened and Ignored Email Count by Subject Length')
plt.legend(title='is_opened', labels=['Ignored', 'Opened'])
plt.show()

### Checking if holidays correspond to client purchases

In [143]:
holidays_df = pd.read_csv('holidays.csv')
first_purchase_df = pd.read_csv('client_first_purchase_date.csv')

from datetime import datetime
import pytz

In [152]:
print(first_purchase_df['first_purchase_date'][0])
print(pd.to_datetime(holidays_df['date'][0]))
print((first_purchase_df['first_purchase_date'][101]))
print(pd.to_datetime(first_purchase_dfpurchase_df['first_purchase_date'][101]))

2022-03-04
2021-01-01 00:00:00
2023-08-22
2023-08-22 00:00:00
