In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Plot styling: seaborn grid theme first, then the matplotlib stylesheet.
sns.set_style('whitegrid')
plt.style.use('seaborn-v0_8-whitegrid')

# --- Load, Clean, and Preprocess ---
# Build the working frame in a single chain:
#   1. read the semicolon-delimited CSV (hence sep=';'),
#   2. drop the two columns treated as irrelevant to the analysis,
#   3. impute missing 'Income' values with the column mean,
#   4. parse 'Dt_Customer' into datetime values.
# NOTE(review): pd.to_datetime is called without an explicit format=, so
# parsing relies on pandas' inference — confirm the file's date layout.
df = (
    pd.read_csv('Camp_Market.csv', sep=';')
    .drop(columns=['Z_CostContact', 'Z_Revenue'])
    .assign(
        Income=lambda frame: frame['Income'].fillna(frame['Income'].mean()),
        Dt_Customer=lambda frame: pd.to_datetime(frame['Dt_Customer']),
    )
)

In [5]:
# --- Feature Engineering ---

# Fixed snapshot date against which age and tenure are measured.
reference_date = pd.to_datetime('2015-01-01')

# Column groups that are rolled up into per-customer totals.
spending_cols = [
    'MntWines', 'MntFruits', 'MntMeatProducts',
    'MntFishProducts', 'MntSweetProducts', 'MntGoldProds'
]
purchase_cols = [
    'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumDealsPurchases'
]

# Derived columns, appended in this order:
#   Age             - snapshot year minus birth year
#   Tenure_Days     - whole days between enrollment and the snapshot date
#   Total_Spending  - row-wise sum over the Mnt* spending columns
#   Total_Purchases - row-wise sum over the Num*Purchases columns
#   Dependents      - Kidhome + Teenhome
df = df.assign(
    Age=reference_date.year - df['Year_Birth'],
    Tenure_Days=(reference_date - df['Dt_Customer']).dt.days,
    Total_Spending=df[spending_cols].sum(axis=1),
    Total_Purchases=df[purchase_cols].sum(axis=1),
    Dependents=df['Kidhome'] + df['Teenhome'],
)

# Keep only customers younger than 100 at the snapshot (born after 1915);
# anything older is treated as an implausible birth year. The original row
# index is preserved, so it has gaps after this filter.
df = df.loc[df['Age'] < 100]

# One-hot encode the two categorical columns for clustering; drop_first=True
# omits one level per feature.
df_encoded = pd.get_dummies(
    df,
    columns=['Education', 'Marital_Status'],
    drop_first=True,
)

print("Data Preparation Complete.")
print("Updated DataFrame Info:")
df_encoded.info()

Data Preparation Complete.
Updated DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2237 entries, 0 to 2239
Data columns (total 41 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   ID                       2237 non-null   int64         
 1   Year_Birth               2237 non-null   int64         
 2   Income                   2237 non-null   float64       
 3   Kidhome                  2237 non-null   int64         
 4   Teenhome                 2237 non-null   int64         
 5   Dt_Customer              2237 non-null   datetime64[ns]
 6   Recency                  2237 non-null   int64         
 7   MntWines                 2237 non-null   int64         
 8   MntFruits                2237 non-null   int64         
 9   MntMeatProducts          2237 non-null   int64         
 10  MntFishProducts          2237 non-null   int64         
 11  MntSweetProducts         2237 non-null   int64   