# Phase 1: Data Preprocessing & Feature Engineering

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.impute import SimpleImputer

In [3]:
# Read the raw campaign file. A tab is passed as the field delimiter
# explicitly; change it here if the export uses a different separator.
df = pd.read_csv('marketing_campaign.csv', delimiter='\t')

# Show the first rows as this cell's output.
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


## Data Cleaning

In [4]:
# Fill missing Income values with the column median (computed before filling).
income_median = df['Income'].median()
df['Income'] = df['Income'].fillna(income_median)

# Treat incomes above the 99th percentile of the imputed column as outliers:
# only rows at or below that cut-off are kept.
income_threshold = df['Income'].quantile(0.99)
within_threshold = df['Income'].le(income_threshold)

# Apply the outlier filter and drop Z_CostContact / Z_Revenue, which are
# treated as redundant constant columns.
df = df.loc[within_threshold].drop(columns=['Z_CostContact', 'Z_Revenue'])

## Feature Encoding

In [5]:
# Ordinal encoding for Education. The list below is assumed to run from the
# lowest to the highest level; the position in the list becomes the code.
education_order = ['Basic', '2n Cycle', 'Graduation', 'Master', 'PhD']

# Fail loudly on labels that are not in `education_order` (including missing
# values). Without this check set_categories() would replace them with NaN and
# cat.codes would silently encode them as -1, which then looks like a valid
# "lowest" level to any downstream model.
unmapped_education = df.loc[~df['Education'].isin(education_order), 'Education']
if not unmapped_education.empty:
    raise ValueError(
        "Education contains values missing from education_order: "
        f"{sorted(unmapped_education.astype(str).unique())}"
    )

# Convert to an ordered categorical and store its integer codes (0..4).
df['Education'] = df['Education'].astype('category').cat.set_categories(education_order, ordered=True)
df['Education_encoded'] = df['Education'].cat.codes

# One-hot encoding for Marital_Status: the original column is replaced by one
# indicator column per status, each prefixed with "Marital_".
df = pd.get_dummies(df, columns=['Marital_Status'], prefix='Marital')

## New Feature Creation

In [6]:
# Parse the day-month-year enrollment strings into timestamps.
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format='%d-%m-%Y')

# Tenure is counted in whole days up to a reference date taken from the data
# itself rather than from today's date.
# NOTE: despite its name, `last_purchased_date` holds the latest enrollment
# date (the maximum of Dt_Customer), so the most recently enrolled customer
# gets a tenure of 0.
last_purchased_date = df['Dt_Customer'].max()
days_enrolled = last_purchased_date - df['Dt_Customer']
df['Customer_Tenure'] = days_enrolled.dt.days

# "Family_Size" is the sum of Kidhome and Teenhome only; no adult count is
# added to it.
df['Family_Size'] = df['Kidhome'].add(df['Teenhome'])

# "Total_Spend" is the row-wise sum of the six Mnt* product columns.
product_columns = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
df['Total_Spend'] = df.loc[:, product_columns].sum(axis='columns')

## Scaling & Correlation

In [7]:
# Standardize the numerical features (for clustering/autoencoder).
# NOTE: the scaled values overwrite the original columns in `df`, so the raw
# Income / Total_Spend / ... values are no longer available after this cell;
# `scaler` is kept so the transformation can be inverted if needed.
numerical_features = ['Income', 'Customer_Tenure', 'Total_Spend', 'Family_Size', 'Recency']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Correlation analysis (optional, for EDA). Pearson correlation is unaffected
# by the standardization above, so this matches the correlation of the
# unscaled features.
correlation_matrix = df[numerical_features].corr()

# Display the matrix as the cell output; previously it was computed but never
# shown, so the EDA step produced nothing visible.
correlation_matrix

## Documentation & Output 

In [8]:
# Write the processed frame to disk, leaving the index out of the file.
df.to_csv('preprocessed_campaign_data.csv', index=False)

# Report what the preprocessing steps did, one bullet per line.
summary_lines = [
    "Preprocessing Summary:",
    "- Missing values handled: Income imputed with median.",
    "- New features added: Customer_Tenure, Family_Size, Total_Spend.",
    "- Categorical features encoded: Education (ordinal), Marital_Status (one-hot).",
    f"- Numerical features scaled: {numerical_features}",
]
print("\n".join(summary_lines))

Preprocessing Summary:
- Missing values handled: Income imputed with median.
- New features added: Customer_Tenure, Family_Size, Total_Spend.
- Categorical features encoded: Education (ordinal), Marital_Status (one-hot).
- Numerical features scaled: ['Income', 'Customer_Tenure', 'Total_Spend', 'Family_Size', 'Recency']
