In [1]:
import numpy as np
import pandas as pd

# Synthetic customer table: 100 rows built from three consecutive segments
# (30 + 40 + 30 rows), each column drawn from per-segment integer ranges.
np.random.seed(42)  # fixed seed so every run regenerates the same file


def _draw_segments(bounds, sizes=(30, 40, 30)):
    """Draw one block of random integers per (low, high) pair and join them.

    Blocks are drawn left to right, so the sequence of calls into the global
    NumPy random state follows the order of ``bounds``. ``high`` is exclusive,
    as in ``np.random.randint``.
    """
    return np.concatenate([
        np.random.randint(low, high, size=count)
        for (low, high), count in zip(bounds, sizes)
    ])


# Sequential identifiers 1..100
customer_ids = np.arange(1, 101)

# Age: younger, middle-aged, then older customers
age = _draw_segments([(18, 35), (35, 55), (55, 70)])

# Annual income: lower, middle, then higher income group
annual_income = _draw_segments([(10, 40), (40, 80), (80, 120)])

# Spending score: moderate, higher, then lower spenders
spending_score = _draw_segments([(20, 80), (40, 90), (10, 70)])

# Assemble the columns into a single frame
data = {
    'CustomerID': customer_ids,
    'Age': age,
    'Annual_Income': annual_income,
    'Spending_Score': spending_score,
}
df_large = pd.DataFrame(data)

# Persist the frame to disk without the index column
csv_file_path_large = 'customer_data_100.csv'
df_large.to_csv(csv_file_path_large, index=False)

In [None]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.spatial import ConvexHull
import numpy as np
from econml.dml import LinearDML
from sklearn.ensemble import RandomForestRegressor

# Customer segmentation with a per-cluster effect line.
#
# Pipeline: load the customer table, standardize Age / Annual_Income /
# Spending_Score, cluster with K-means, draw income vs. spending with a convex
# hull per cluster, and overlay a dashed line whose slope is the LinearDML
# estimate of the effect of Annual_Income on Spending_Score (Age as the
# effect-modifier feature X).

# Step 0: Load the data explicitly so the cell works after Restart & Run All.
# The path is the CSV written by the data-generation cell of this notebook.
dataset = pd.read_csv('customer_data_100.csv')

# Step 1: Data Preprocessing - Standardize the features
feature_columns = ['Age', 'Annual_Income', 'Spending_Score']
scaler = StandardScaler()
scaled_features = scaler.fit_transform(dataset[feature_columns])

# Step 2: Apply K-means clustering with 3 clusters
# n_init is pinned because its default differs between scikit-learn releases;
# an explicit value keeps the clustering stable across versions.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
dataset['Cluster'] = kmeans.fit_predict(scaled_features)

# Step 3: Set up the color palette for clusters (palette[i] <-> cluster label i)
palette = sns.color_palette("Set1", n_colors=kmeans.n_clusters)

# Step 4: Plot the clusters on an explicit Axes so every later call targets it
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='Annual_Income', y='Spending_Score', hue='Cluster',
                data=dataset, palette=palette, s=100, alpha=0.7, ax=ax)

# Adding plot labels and title
ax.set_title('Customer Segmentation Based on Income and Spending Score')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')

# Step 5: Draw Convex Hulls around the clusters and plot causal regression lines
for i in range(kmeans.n_clusters):
    # Get the points for the current cluster
    cluster_data = dataset[dataset['Cluster'] == i]
    cluster_points = cluster_data[['Annual_Income', 'Spending_Score']].values

    # Calculate the ConvexHull for the cluster
    if len(cluster_points) > 2:  # ConvexHull needs at least 3 points
        hull = ConvexHull(cluster_points)

        # Get the vertices of the hull
        hull_points = cluster_points[hull.vertices]

        # Close the polygon by repeating the first point at the end
        hull_points = np.append(hull_points, [hull_points[0]], axis=0)

        # Plot the convex hull outline and a translucent fill
        ax.plot(hull_points[:, 0], hull_points[:, 1], color=palette[i], lw=2)
        ax.fill(hull_points[:, 0], hull_points[:, 1],
                color=palette[i], alpha=0.2)

    # Step 6: Causal Inference with econml for each cluster
    # Treatment = income, outcome = spending score, feature X = age
    T_cluster = cluster_data[['Annual_Income']].values  # 2D: (n, 1)
    Y_cluster = cluster_data['Spending_Score'].values   # 1D: (n,)
    X_cluster = cluster_data[['Age']].values            # 2D: (n, 1)

    # Nuisance models for treatment and outcome
    model_t = RandomForestRegressor(random_state=42)
    model_y = RandomForestRegressor(random_state=42)

    # Initialize the LinearDML estimator
    dml_estimator = LinearDML(
        model_y=model_y, model_t=model_t, random_state=42)

    # Fit the model to estimate the conditional average treatment effect
    dml_estimator.fit(Y=Y_cluster, T=T_cluster, X=X_cluster)

    # Income range spanned by this cluster, used as the x-axis of the line
    min_income = T_cluster.min()
    max_income = T_cluster.max()
    x_range = np.linspace(min_income, max_income, 100).reshape(-1, 1)

    # Marginal effect of income evaluated at the cluster's average age.
    # A single X row and a single treatment column yield one value.
    mean_age = X_cluster.mean(axis=0).reshape(1, -1)
    effect_slope = dml_estimator.const_marginal_effect(mean_age).item()

    # Anchor the line at the cluster's (median income, median spending score).
    # Multiplying the slope by raw income would anchor it at income 0 instead,
    # pushing the line away from the cluster whenever the slope is non-zero.
    median_income = np.median(T_cluster)
    median_spending_score = np.median(Y_cluster)
    cate_estimates = effect_slope * (x_range - median_income)
    cate_estimates_centered = cate_estimates + median_spending_score

    # Plot the causal regression line for each cluster centered within the cluster
    ax.plot(x_range.ravel(), cate_estimates_centered.ravel(),
            color=palette[i], lw=2, linestyle='--', label=f"Cluster {i} Causal Effect")

# Build the legend only after all labelled artists exist, so the dashed
# effect lines are listed alongside the cluster colors.
ax.legend(title='Cluster')

# Adjust axis limits to improve visualization
ax.set_xlim(dataset['Annual_Income'].min() - 10, dataset['Annual_Income'].max() + 10)
ax.set_ylim(dataset['Spending_Score'].min() - 10, dataset['Spending_Score'].max() + 10)

ax.grid(True)

# Show the plot
plt.show()