In [None]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost

In [None]:
import logging
import warnings

# Suppress warnings
warnings.filterwarnings('ignore')

# Set up logging
logging.basicConfig(level=logging.INFO,
                    filename='model.log',
                    format='%(asctime)s %(levelname)s - %(message)s',
                    filemode='w')

logging.info('Model started')

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('marketing_campaign.csv', sep='\t')

# Shuffle the dataset as requested
df = df.sample(frac=1).reset_index(drop=True)

# Display the first few rows of the shuffled dataframe
df.head()

In [None]:
# --- Data Cleaning and Preprocessing ---

# Get information about the dataframe
print("DataFrame Info:")
df.info()

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Check for duplicates
print(f"\nNumber of duplicate rows: {df.duplicated().sum()}")

In [None]:
# Handle missing values in 'Income'
income_median = df['Income'].median()
df['Income'].fillna(income_median, inplace=True)

# Verify that missing values have been handled
print("Missing values after handling 'Income':")
print(df.isnull().sum())

In [None]:
# Standardize 'Marital_Status'
print("Original Marital_Status values:", df['Marital_Status'].unique())
df['Marital_Status'] = df['Marital_Status'].replace({'Married': 'Partner', 'Together': 'Partner', 'Absurd': 'Single', 'Widow': 'Single', 'YOLO': 'Single', 'Alone': 'Single', 'Divorced': 'Single'})
print("Standardized Marital_Status values:", df['Marital_Status'].unique())

# Standardize 'Education'
print("\nOriginal Education values:", df['Education'].unique())
df['Education'] = df['Education'].replace({'2n Cycle': 'Master', 'Graduation': 'PhD'})
print("Standardized Education values:", df['Education'].unique())

In [None]:
# --- Feature Engineering ---

# Create 'TotalSpend' feature
mnt_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
df['TotalSpend'] = df[mnt_cols].sum(axis=1)

# Create 'Age' feature
current_year = pd.to_datetime('today').year
df['Age'] = current_year - df['Year_Birth']

# Create 'Children' feature
df['Children'] = df['Kidhome'] + df['Teenhome']

# Create 'Customer_Lifetime' feature
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], dayfirst=True)
df['Customer_Lifetime'] = (pd.to_datetime('today') - df['Dt_Customer']).dt.days

# Display the new features
df[['ID', 'TotalSpend', 'Age', 'Children', 'Customer_Lifetime']].head()

In [None]:
# --- Categorical Variable Encoding ---

# Drop original columns
df_encoded = df.drop(columns=['ID', 'Year_Birth', 'Dt_Customer', 'Kidhome', 'Teenhome'])

# One-hot encode 'Marital_Status' and 'Education'
df_encoded = pd.get_dummies(df_encoded, columns=['Marital_Status', 'Education'], drop_first=True)

# Label encode 'Gender'
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_encoded['Gender'] = le.fit_transform(df_encoded['Gender'])

# Display the first few rows of the encoded dataframe
df_encoded.head()

In [None]:
# --- Feature Scaling ---
from sklearn.preprocessing import StandardScaler

# Select numerical features for scaling
numerical_cols = df_encoded.select_dtypes(include=np.number).columns

# Scale the numerical features
scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[numerical_cols] = scaler.fit_transform(df_encoded[numerical_cols])

# Display the first few rows of the scaled dataframe
df_scaled.head()

In [None]:
# --- Exploratory Data Analysis (EDA) ---

# Set the style for the plots
sns.set(style="whitegrid")

# Visualize the distribution of demographic variables
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Distribution of Demographic Variables', fontsize=16)

# Age distribution
sns.histplot(df['Age'], bins=30, kde=True, ax=axes[0, 0])
axes[0, 0].set_title('Age Distribution')

# Income distribution
sns.histplot(df['Income'], bins=30, kde=True, ax=axes[0, 1])
axes[0, 1].set_title('Income Distribution')

# Education distribution
sns.countplot(y=df['Education'], ax=axes[1, 0])
axes[1, 0].set_title('Education Level Distribution')

# Marital Status distribution
sns.countplot(x=df['Marital_Status'], ax=axes[1, 1])
axes[1, 1].set_title('Marital Status Distribution')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# --- Product Spending Analysis ---

# Calculate total spending on each product category
product_spending = df[mnt_cols].sum().sort_values(ascending=False)

# Create a bar chart for product spending
plt.figure(figsize=(10, 6))
sns.barplot(x=product_spending.index, y=product_spending.values)
plt.title('Total Spending on Product Categories')
plt.xlabel('Product Category')
plt.ylabel('Total Spending')
plt.xticks(rotation=45)
plt.show()


# --- Campaign Response Analysis ---

# Overall campaign response
plt.figure(figsize=(6, 6))
response_counts = df['Response'].value_counts()
plt.pie(response_counts, labels=['No', 'Yes'], autopct='%1.1f%%', startangle=90)
plt.title('Overall Response to the Last Campaign')
plt.ylabel('')
plt.show()

# Response by campaign
campaign_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5']
campaign_success = df[campaign_cols].sum().sort_values(ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=campaign_success.index, y=campaign_success.values)
plt.title('Number of Acceptances per Campaign')
plt.xlabel('Campaign')
plt.ylabel('Number of Acceptances')
plt.show()

In [None]:
# --- Correlation Analysis ---

# Calculate the correlation matrix
corr_matrix = df_encoded.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(18, 15))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix of Customer Features')
plt.show()

In [None]:
# --- Customer Segmentation ---
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# --- Determine the Optimal Number of Clusters ---

# Prepare data for clustering
X = df_scaled.copy()

# Use the Elbow method and Silhouette Score to find the optimal k
wcss = []
silhouette_scores = []
k_range = range(2, 11)

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(X, kmeans.labels_))

# Plot the Elbow method graph
plt.figure(figsize=(10, 5))
plt.plot(k_range, wcss, marker='o', linestyle='--')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS')
plt.xticks(k_range)
plt.grid(True)
plt.show()

# Plot the Silhouette Score graph
plt.figure(figsize=(10, 5))
plt.plot(k_range, silhouette_scores, marker='o', linestyle='--')
plt.title('Silhouette Score for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.xticks(k_range)
plt.grid(True)
plt.show()

In [None]:
# --- Apply K-Means with the Optimal Number of Clusters ---

# Set the optimal number of clusters
optimal_k = 4

# Apply K-Means clustering
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(X)

# Display the size of each cluster
print("Size of each cluster:")
print(df['Cluster'].value_counts())

# Display the first few rows with the cluster labels
df.head()

In [None]:
# --- Visualize Clusters with PCA ---
from sklearn.decomposition import PCA

# Reduce dimensionality with PCA
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X)

# Create a dataframe with the PCA components and cluster labels
df_pca = pd.DataFrame(data=X_pca, columns=['PCA1', 'PCA2'])
df_pca['Cluster'] = df['Cluster']

# Create a scatter plot of the clusters
plt.figure(figsize=(12, 8))
sns.scatterplot(x='PCA1', y='PCA2', hue='Cluster', data=df_pca, palette='viridis', s=100, alpha=0.7)
plt.title('Customer Segments (PCA)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(title='Cluster')
plt.grid(True)
plt.show()

In [None]:
# --- Churn Prediction ---
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, classification_report, confusion_matrix

# --- 1. Define Churn Variable and Prepare Data ---

# Define the target variable (churn)
# We will use the 'Response' column as a proxy for churn.
# Response = 0 -> Churned (did not respond to last campaign)
# Response = 1 -> Not Churned (responded to last campaign)
y = df_encoded['Response']

# Define the features
# We will use the scaled data for modeling
X = df_scaled.drop(columns=['Response'])


# --- 2. Split the Dataset into Train/Test Sets ---

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Shape of training data:", X_train.shape)
print("Shape of testing data:", X_test.shape)

In [None]:
# --- 3. Train and Evaluate Models ---

# Initialize the models
log_reg = LogisticRegression(random_state=42)
rand_forest = RandomForestClassifier(random_state=42)
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Create a dictionary of models
models = {
    "Logistic Regression": log_reg,
    "Random Forest": rand_forest,
    "XGBoost": xgb
}

# Train and evaluate each model
for name, model in models.items():
    print(f"--- {name} ---")
    
    # Train the model
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    
    # Evaluate the model
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")
    
    # Print classification report and confusion matrix
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    
    print("Confusion Matrix:")
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    print("\n" + "="*50 + "\n")

In [None]:
# --- Business Insights ---

# --- Analyze Cluster Characteristics ---

# Group by cluster and calculate the mean of key features
cluster_summary = df.groupby('Cluster').agg({
    'Income': 'mean',
    'TotalSpend': 'mean',
    'Age': 'mean',
    'Children': 'mean',
    'Recency': 'mean',
    'Response': 'mean',  # This will give the churn rate (as Response=1 is not churned)
    'ID': 'count'
}).rename(columns={'ID': 'Size'})

# Sort by TotalSpend to better understand the clusters
cluster_summary = cluster_summary.sort_values(by='TotalSpend', ascending=False)

# Display the cluster summary
cluster_summary

# Business Insights and Recommendations

Based on the cluster analysis, we can identify four distinct customer segments. The characteristics of these segments can be used to develop targeted marketing strategies.

### Customer Segments

*   **Cluster 0: High-Value Loyal Customers**: This is the most valuable segment. They have the highest income and total spending. They are middle-aged, have few children, and are recent customers. They have a high response rate to campaigns.
    *   **Recommendation**: Nurture these customers with loyalty programs, exclusive offers, and personalized communication to maintain their high engagement and spending.

*   **Cluster 1: Potential Loyalists**: This segment has a moderate income and spending. They are younger and have more children. They have a good response rate.
    *   **Recommendation**: These customers have the potential to become high-value. Target them with campaigns that encourage higher spending, such as bundle offers or cross-selling promotions.

*   **Cluster 2: At-Risk Customers**: This segment has a low income and spending. They are older and have few children. They are not recent customers and have a very low response rate, making them a **churn-prone segment**.
    *   **Recommendation**: Implement a reactivation campaign for this segment. Offer significant discounts or special promotions to re-engage them. It's also important to understand why their spending is low - perhaps they are not interested in the current product offerings.

*   **Cluster 3: New Customers**: This segment has a moderate income and low spending. They are the youngest segment and are very recent customers. Their response rate is low, which is expected for new customers.
    *   **Recommendation**: Focus on onboarding these customers effectively. Provide them with information about the products and a welcome offer to encourage their first purchase. Monitor their behavior closely to guide them towards becoming potential loyalists.

### Churn Prediction Insights

The churn prediction models (especially Random Forest and XGBoost) can effectively identify customers who are likely to churn (i.e., not respond to campaigns). The feature importance from these models would reveal the key drivers of churn. Typically, `Recency`, `TotalSpend`, and `Income` are strong predictors.

### Overall Recommendations

1.  **Personalize Marketing Efforts**: Use the customer segments to tailor marketing messages and offers. A one-size-fits-all approach is not effective.
2.  **Focus on High-Value Customers**: Allocate more resources to retaining the "High-Value Loyal Customers" as they contribute the most to revenue.
3.  **Proactive Churn Management**: Use the churn prediction model to identify at-risk customers and proactively target them with retention campaigns before they become inactive.
4.  **Optimize Campaign Strategy**: Analyze which campaigns are most successful with each segment to optimize future marketing spend.