In [None]:
# Libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.metrics import confusion_matrix, classification_report
import matplotlib.pyplot as plt

In [None]:
# Load the dataset
# Source (Kaggle): https://www.kaggle.com/datasets/zafarali27/digital-marketing-campaign?resource=download
DATA_PATH = '/content/digital_marketing_campaign_dataset.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows to confirm the file was parsed as expected.
df.head()

Unnamed: 0,CustomerID,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,AdvertisingPlatform,AdvertisingTool,Conversion
0,8000,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,IsConfid,ToolConfid,1
1,8001,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,IsConfid,ToolConfid,1
2,8002,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,IsConfid,ToolConfid,1
3,8003,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,IsConfid,ToolConfid,1
4,8004,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,IsConfid,ToolConfid,1


In [None]:
# Drop columns that are not used as model inputs: the customer identifier and
# the two advertising columns (which show only the placeholder values
# 'IsConfid' / 'ToolConfid' in the previewed rows).
irrelevant_cols = ['CustomerID', 'AdvertisingPlatform', 'AdvertisingTool']
df = df.drop(columns=irrelevant_cols)
df.head()

Unnamed: 0,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,Conversion
0,56,Female,136912,Social Media,Awareness,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,1
1,69,Male,41760,Email,Retention,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,1
2,46,Female,88456,PPC,Awareness,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,1
3,32,Female,44085,PPC,Conversion,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,1
4,60,Female,83964,PPC,Conversion,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,1


In [None]:
# Count missing values in every column (displayed as the cell output)
missing_per_column = df.isna().sum()
missing_per_column

Unnamed: 0,0
Age,0
Gender,0
Income,0
CampaignChannel,0
CampaignType,0
AdSpend,0
ClickThroughRate,0
ConversionRate,0
WebsiteVisits,0
PagesPerVisit,0


In [None]:
# Convert string values to numbers.
# Each categorical column gets its own LabelEncoder, kept in `label_encoders`,
# so the integer codes can be mapped back to the original labels later via
# label_encoders[col].classes_ or .inverse_transform(...). Refitting a single
# shared encoder would overwrite the mapping of every column but the last.
categorical_cols = ['Gender', 'CampaignChannel', 'CampaignType']
label_encoders = {}
for col in categorical_cols:
    # `le` ends up bound to the encoder of the last column ('CampaignType').
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Preview the encoded frame.
df.head()

Unnamed: 0,Age,Gender,Income,CampaignChannel,CampaignType,AdSpend,ClickThroughRate,ConversionRate,WebsiteVisits,PagesPerVisit,TimeOnSite,SocialShares,EmailOpens,EmailClicks,PreviousPurchases,LoyaltyPoints,Conversion
0,56,0,136912,4,0,6497.870068,0.043919,0.088031,0,2.399017,7.396803,19,6,9,4,688,1
1,69,1,41760,0,3,3898.668606,0.155725,0.182725,42,2.917138,5.352549,5,2,7,2,3459,1
2,46,0,88456,1,0,1546.429596,0.27749,0.076423,2,8.223619,13.794901,0,11,2,8,2337,1
3,32,0,44085,1,2,539.525936,0.137611,0.088004,47,4.540939,14.688363,89,2,2,0,2463,1
4,60,0,83964,1,2,1678.043573,0.252851,0.10994,0,2.046847,13.99337,6,6,6,8,4345,1


In [None]:
# Separate the feature matrix from the target column.
# The upper-case names X / X_train / X_test are the ones the grid-search,
# feature-importance, classification-report and tree-plot cells reference;
# defining them here keeps the notebook runnable top-to-bottom on a fresh kernel.
X = df.drop('Conversion', axis=1)
y = df['Conversion']

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [None]:
# Search space for the decision-tree grid search
# (4 depths x 3 split sizes x 3 leaf sizes x 2 criteria = 72 combinations).
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

In [None]:
# Exhaustive search over `param_grid`, scored by accuracy with 5-fold
# cross-validation on the training split.
base_tree = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=base_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Estimator fitted with the best-scoring parameter combination
best_tree = grid_search.best_estimator_

# Report the winning parameters, their CV score, and the held-out accuracy
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)
test_accuracy = best_tree.score(X_test, y_test)
print("Test set accuracy:", test_accuracy)

Best parameters: {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best cross-validation accuracy: 0.8778124999999999
Test set accuracy: 0.880625


In [None]:
# Pair every feature name with the importance the tuned tree assigned to it.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'importance': best_tree.feature_importances_,
    }
)

# Print the table with the most important features first;
# `feature_importance` itself stays in column order.
ranked_importance = feature_importance.sort_values(by='importance', ascending=False)
print("\nFeature Importance:")
print(ranked_importance)


Feature Importance:
              feature  importance
14  PreviousPurchases    0.264279
9       PagesPerVisit    0.253879
6    ClickThroughRate    0.208392
15      LoyaltyPoints    0.135826
5             AdSpend    0.091736
10         TimeOnSite    0.045888
0                 Age    0.000000
1              Gender    0.000000
2              Income    0.000000
3     CampaignChannel    0.000000
4        CampaignType    0.000000
7      ConversionRate    0.000000
8       WebsiteVisits    0.000000
11       SocialShares    0.000000
12         EmailOpens    0.000000
13        EmailClicks    0.000000


In [None]:
# Per-class precision / recall / F1 of the tuned tree on the held-out test split
y_pred = best_tree.predict(X_test)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(report_text)


Classification Report:
              precision    recall  f1-score   support

           0       0.55      0.08      0.14       194
           1       0.89      0.99      0.94      1406

    accuracy                           0.88      1600
   macro avg       0.72      0.54      0.54      1600
weighted avg       0.85      0.88      0.84      1600



In [None]:
# Visualize the tree
# Use one plain list of feature names for both the plot and the text export:
# some scikit-learn releases validate `feature_names` as a list and reject a
# pandas Index, and sharing the list keeps the two renderings consistent.
tree_feature_names = list(X.columns)

plt.figure(figsize=(20,10))
plot_tree(best_tree, feature_names=tree_feature_names, class_names=['0', '1'],
          filled=True, rounded=True, fontsize=10)
# The figure is written to disk and then closed, so it is not rendered inline.
plt.savefig('decision_tree.png')
plt.close()

# Print text representation of the tree
print("\nText representation of the tree:")
print(export_text(best_tree, feature_names=tree_feature_names))


Text representation of the tree:
|--- PreviousPurchases <= 1.50
|   |--- ClickThroughRate <= 0.09
|   |   |--- PagesPerVisit <= 3.28
|   |   |   |--- class: 0
|   |   |--- PagesPerVisit >  3.28
|   |   |   |--- class: 1
|   |--- ClickThroughRate >  0.09
|   |   |--- AdSpend <= 2837.00
|   |   |   |--- class: 1
|   |   |--- AdSpend >  2837.00
|   |   |   |--- class: 1
|--- PreviousPurchases >  1.50
|   |--- PagesPerVisit <= 2.99
|   |   |--- LoyaltyPoints <= 968.50
|   |   |   |--- class: 1
|   |   |--- LoyaltyPoints >  968.50
|   |   |   |--- class: 1
|   |--- PagesPerVisit >  2.99
|   |   |--- TimeOnSite <= 4.98
|   |   |   |--- class: 1
|   |   |--- TimeOnSite >  4.98
|   |   |   |--- class: 1

