In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
import plotly.express as px

In [None]:
# Load campaign data
df = pd.read_csv('marketing_campaign.csv', sep=';')

In [None]:
# Print Head
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,5,0,0,0,0,0,0,3,11,0


In [None]:
# Check for null data
df.isnull().sum()

ID                      0
Year_Birth              0
Education               0
Marital_Status          0
Income                 24
Kidhome                 0
Teenhome                0
Dt_Customer             0
Recency                 0
MntWines                0
MntFruits               0
MntMeatProducts         0
MntFishProducts         0
MntSweetProducts        0
MntGoldProds            0
NumDealsPurchases       0
NumWebPurchases         0
NumCatalogPurchases     0
NumStorePurchases       0
NumWebVisitsMonth       0
AcceptedCmp3            0
AcceptedCmp4            0
AcceptedCmp5            0
AcceptedCmp1            0
AcceptedCmp2            0
Complain                0
Z_CostContact           0
Z_Revenue               0
Response                0
segment                 0
dtype: int64

In [None]:
# Fill the null values with mean for Income column
df['Income'] = df['Income'].fillna(df['Income'].mean())

In [None]:
# Verify null values again
df.isnull().sum()

ID                     0
Year_Birth             0
Education              0
Marital_Status         0
Income                 0
Kidhome                0
Teenhome               0
Dt_Customer            0
Recency                0
MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
AcceptedCmp3           0
AcceptedCmp4           0
AcceptedCmp5           0
AcceptedCmp1           0
AcceptedCmp2           0
Complain               0
Z_CostContact          0
Z_Revenue              0
Response               0
segment                0
dtype: int64

In [None]:
# Percentage of people educated by category
x=df.Education.value_counts()
fig=px.pie(df,values=x,names=x.index,title='Education')
fig.show()

In [None]:
# Relationship status
x=df.Marital_Status.value_counts()
fig=px.pie(df,values=x,names=x.index,title='Martial Status')
fig.show()

In [None]:
# Response rate by age
fig = px.histogram(df, x="Year_Birth", color="Response")
fig.update_layout(title="Response Rate by Age")
fig.show()

In [None]:
# Purchase frequency by income level
fig = px.box(df, y="Income", x="NumStorePurchases")
fig.update_layout(title="Purchase Frequency by Income")
fig.show()

In [None]:
# Replace missing values with column mean and Predict response rate
X = df[['Year_Birth', 'Income', 'NumCatalogPurchases']]
X = X.fillna(X.mean())
y = df['Response']

model = LogisticRegression()
model.fit(X, y)
print('Predicted response rate:', model.predict(X).mean())

Predicted response rate: 0.0004464285714285714


In [None]:
# Predict number of campaigns for conversion
X = df[['Year_Birth', 'Income', 'NumStorePurchases']]
X = X.fillna(X.mean())
y = df['AcceptedCmp3']

model = LogisticRegression()
model.fit(X, y)
print('Expected campaigns till conversion:', 1/model.predict_proba(X)[:,1])

Expected campaigns till conversion: [11.69540231  9.99154167 20.07842727 ... 28.35041086 20.01447672
 11.81364181]


In [None]:
# Response prediction model performance
from sklearn.model_selection import cross_validate

X = df[['Year_Birth', 'Income', 'NumCatalogPurchases']]
X = X.fillna(X.mean())
y = df['Response']

model = LogisticRegression()
scores = cross_validate(model, X, y, scoring=['accuracy', 'precision', 'recall', 'f1'])

fig = px.line(x=list(range(1,6)), y=scores['test_accuracy'], title="Response Model Accuracy")
fig.update_layout(yaxis_title="Accuracy")
fig.show()


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
# Cluster customers
X = df[['Year_Birth', 'Income', 'Teenhome', 'NumCatalogPurchases', 'NumStorePurchases']]
X = X.fillna(X.mean())
kmeans = KMeans(n_clusters=5).fit(X)
df['segment'] = kmeans.labels_
print(df.groupby('segment').agg(['mean', 'count']))





                  ID         Year_Birth               Income         Kidhome  \
                mean count         mean count           mean count      mean   
segment                                                                        
0        5746.324000   500  1974.134000   500   24177.364000   500  0.792000   
1        5680.541284   654  1965.882263   654   63427.415902   654  0.188073   
2        9432.000000     1  1977.000000     1  666666.000000     1  1.000000   
3        5590.873418   395  1967.848101   395   82990.162025   395  0.073418   
4        5391.847826   690  1968.252174   690   43186.061562   666  0.646377   

               Teenhome        ... AcceptedCmp2        Complain        \
        count      mean count  ...         mean count      mean count   
segment                        ...                                      
0         500  0.228000   500  ...     0.000000   500  0.010000   500   
1         654  0.709480   654  ...     0.018349   654  0.007645   6





In [None]:
# Cluster segments
fig = px.parallel_categories(df, dimensions=["Year_Birth", "Income", "Teenhome"],
                             color="segment",
                             color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(title="Customer Segments")
fig.show()