In [1]:
# 📌 EMAIL CAMPAIGN ANALYSIS & OPTIMIZATION

# 👨‍💻 Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# 📂 Load the datasets
email_df = pd.read_csv('email_table.csv')
opened_df = pd.read_csv('email_opened_table.csv')
clicked_df = pd.read_csv('link_clicked_table.csv')


In [2]:
# ✅ Q1. What percentage of users opened the email and clicked on the link?
total_emails = email_df.shape[0]

opened_emails = opened_df.shape[0]
clicked_emails = clicked_df.shape[0]
opened_pct = (opened_emails / total_emails) * 100
clicked_pct = (clicked_emails / total_emails) * 100

print(f"📬 Email Open Rate: {opened_pct:.2f}%")
print(f"🔗 Click-through Rate (CTR): {clicked_pct:.2f}%")


📬 Email Open Rate: 10.35%
🔗 Click-through Rate (CTR): 2.12%


In [3]:
# 🧠 Q2. Can we build a model to optimize sending for clicks?
email_df['clicked'] = email_df['email_id'].isin(clicked_df['email_id']).astype(int)

for col in ['email_text', 'email_version', 'weekday', 'user_country']:
    email_df[col] = LabelEncoder().fit_transform(email_df[col])
X = email_df.drop(['email_id', 'clicked'], axis=1)
y = email_df['clicked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("📊 Model Performance:\n")
print(classification_report(y_test, y_pred))


📊 Model Performance:

              precision    recall  f1-score   support

           0       0.98      1.00      0.99     19547
           1       0.06      0.01      0.02       453

    accuracy                           0.97     20000
   macro avg       0.52      0.50      0.50     20000
weighted avg       0.96      0.97      0.97     20000



We trained a model that can rank users based on how likely they are to click an email. The current version struggles with rare clicks due to class imbalance. After applying techniques like resampling or cost-sensitive learning, the model will be more effective in selectively targeting high-potential users, improving email ROI significantly.

In [4]:
#📈 Q3. By how much would the model improve the CTR?
email_df['pred_proba'] = model.predict_proba(X)[:, 1]
top_20 = email_df.sort_values('pred_proba', ascending=False).head(int(0.2 * len(email_df)))
improved_ctr = top_20['clicked'].mean()
original_ctr = email_df['clicked'].mean()
improvement = ((improved_ctr - original_ctr) / original_ctr) * 100

print(f"🎯 Original CTR (random send): {original_ctr:.2%}")
print(f"🚀 Improved CTR (model-based targeting): {improved_ctr:.2%}")
print(f"📈 Estimated CTR Improvement: {improvement:.2f}%")


🎯 Original CTR (random send): 2.12%
🚀 Improved CTR (model-based targeting): 9.21%
📈 Estimated CTR Improvement: 334.40%


To validate the improvement,we need to run an A/B Test:

Group	Description
Control	Random sample of users (same as current method)
Test	Top 20% users based on model predictions
Measure CTR in both groups.

Use a statistical significance test (like Chi-Squared or z-test for proportions) to ensure the lift is real and not due to chance.
Optionally, monitor long-term behavior (e.g., purchases post-click) to validate downstream impact.

In [5]:
email_df_original = pd.read_csv('email_table.csv')
email_df_original['clicked'] = email_df_original['email_id'].isin(clicked_df['email_id']).astype(int)
country_ctr_named = email_df_original.groupby('user_country')['clicked'].mean().sort_values(ascending=False)
version_ctr_named = email_df_original.groupby('email_version')['clicked'].mean()
text_ctr_named = email_df_original.groupby('email_text')['clicked'].mean()
email_df_original['purchase_bin'] = pd.cut(
    email_df_original['user_past_purchases'], 
    bins=[-1, 0, 1, 3, 5, 10, 100], 
    labels=["0", "1", "2-3", "4-5", "6-10", "10+"]
)
purchase_ctr_named = email_df_original.groupby('purchase_bin')['clicked'].mean()
print("\n🌍 CTR by Country:\n", country_ctr_named)
print("\n✉️ CTR by Email Version (generic/personalized):\n", version_ctr_named)
print("\n📝 CTR by Email Text Length (short/long):\n", text_ctr_named)
print("\n🛍️ CTR by Past Purchases:\n", purchase_ctr_named)



🌍 CTR by Country:
 user_country
UK    0.024675
US    0.024360
ES    0.008327
FR    0.008004
Name: clicked, dtype: float64

✉️ CTR by Email Version (generic/personalized):
 email_version
generic         0.015137
personalized    0.027294
Name: clicked, dtype: float64

📝 CTR by Email Text Length (short/long):
 email_text
long_email     0.018538
short_email    0.023872
Name: clicked, dtype: float64

🛍️ CTR by Past Purchases:
 purchase_bin
0       0.000504
1       0.011199
2-3     0.015928
4-5     0.021784
6-10    0.036451
10+     0.069037
Name: clicked, dtype: float64


CTR by Country:
Users from English-speaking countries (UK & US) responded significantly better to the campaign. Non-English-speaking countries like Spain and France had much lower engagement likely due to language barriers or cultural mismatch in communication.

CTR by Email Version:
Personalized emails had ~80% higher CTR than generic ones. This supports the idea that personal touches (like using the user’s name) increase trust and engagement.

CTR by Email Text Length:
Short emails performed ~29% better than long ones. Users seem to prefer concise communication that quickly conveys value.

CTR by Past Purchases:
There's a strong positive correlation between past purchase behavior and CTR. Highly active buyers (10+) are ~14x more likely to click compared to non-buyers.

Focus future campaigns on personalized, short-form emails.

Segment the user base by country and purchase history to prioritize high-potential leads.

Consider localizing email content for low-performing geographies ES(Spain), FR(france).