In [None]:
# Customer Segmentation Analysis

## 1. Setup and Data Preparation


In [None]:
pip install pandas numpy scikit-learn matplotlib seaborn scipy

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset. The untouched frame is kept as `raw_data` so the cleaning
# steps below can be re-run or inspected without re-reading the file.
raw_data = pd.read_csv('customer_data.csv')

# Display basic information.
# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
raw_data.info()
print(raw_data.describe())

# Data Cleaning: drop rows with missing values, then exact duplicate rows.
# The index is reset afterwards so `data` is labelled 0..n-1 again. Arrays
# derived from it (scaled features, PCA output, cluster labels) are positional,
# and any frame later built from them with a default index must line up with
# `data` by label as well as by position.
data = raw_data.dropna().drop_duplicates().reset_index(drop=True)

# Data Transformation: one-hot encode the non-numeric columns, dropping the
# first level of each to avoid a redundant indicator column.
data = pd.get_dummies(data, drop_first=True)

# Feature selection
features = data[['age', 'income', 'spending_score']]  # Example feature columns

# Normalization: standardise each feature so that distance-based clustering
# is not dominated by the feature with the largest numeric range.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)


In [None]:
## 2. Exploratory Data Analysis (EDA)


In [None]:
from sklearn.decomposition import PCA

# Univariate Analysis: income distribution with a KDE overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['income'], bins=30, kde=True, ax=ax)
ax.set(title='Income Distribution', xlabel='Income', ylabel='Frequency')
plt.show()

# Bivariate Analysis: age against income.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='age', y='income', data=data, ax=ax)
ax.set(title='Age vs Income', xlabel='Age', ylabel='Income')
plt.show()

# Multivariate Analysis (PCA for visualization): project the scaled features
# onto their first two principal components.
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_features)

# Reuse the index of `features` (row i of `scaled_features` is row i of
# `features`). With a default 0..n-1 index, `pca_df` would not line up with
# `data` whenever cleaning dropped rows, and pandas/seaborn align Series by
# index label, not by position.
pca_df = pd.DataFrame(data=pca_result, columns=['PC1', 'PC2'], index=features.index)

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='PC1', y='PC2', data=pca_df, ax=ax)
ax.set(
    title='PCA of Customer Data',
    xlabel='Principal Component 1',
    ylabel='Principal Component 2',
)
plt.show()


In [None]:
## 3. K-Means Clustering


In [None]:
# Elbow Method: fit K-Means for k = 1..10 and record the within-cluster sum of
# squares (the fitted model's inertia) for each k.
candidate_ks = list(range(1, 11))
kmeans_settings = dict(init='k-means++', max_iter=300, n_init=10, random_state=42)
wcss = [
    KMeans(n_clusters=k, **kmeans_settings).fit(scaled_features).inertia_
    for k in candidate_ks
]

# Plot WCSS against k; the "elbow" of this curve guides the choice of k.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(candidate_ks, wcss)
ax.set_title('Elbow Method for Optimal K')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('WCSS')
plt.show()

# Final model with the chosen number of clusters (e.g., 4).
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, **kmeans_settings)
clusters = kmeans.fit_predict(scaled_features)

# Store the K-Means label of every row alongside the original data.
data['Cluster'] = clusters

# Cluster Analysis: per-cluster mean of every column.
cluster_profile = data.groupby('Cluster').mean()
print(cluster_profile)


In [None]:
## 4. Hierarchical Clustering


In [None]:
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# Plot the dendrogram of a Ward linkage computed on the scaled features.
plt.figure(figsize=(10, 6))
dendrogram = sch.dendrogram(sch.linkage(scaled_features, method='ward'))
plt.title('Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Euclidean Distance')
plt.show()

# Fit Hierarchical Clustering.
# AgglomerativeClustering is a scikit-learn estimator; scipy.cluster.hierarchy
# has no such attribute, so it is imported from sklearn.cluster above.
# The `affinity` keyword is intentionally not passed: current scikit-learn
# releases no longer accept it, and Ward linkage only works with the default
# Euclidean metric anyway.
hc = AgglomerativeClustering(n_clusters=optimal_k, linkage='ward')
hc_clusters = hc.fit_predict(scaled_features)

# Add cluster information to the original data
data['HC_Cluster'] = hc_clusters

# Cluster Analysis: per-cluster mean of every column. Label columns written by
# the other clustering cells are excluded, since averaging cluster ids is
# meaningless.
other_label_columns = [c for c in ('Cluster', 'DBSCAN_Cluster') if c in data.columns]
print(data.drop(columns=other_label_columns).groupby('HC_Cluster').mean())


In [None]:
## 5. DBSCAN Clustering


In [None]:
from sklearn.cluster import DBSCAN

# Fit DBSCAN on the scaled features.
# eps and min_samples are example values and should be tuned for the dataset.
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_clusters = dbscan.fit_predict(scaled_features)

# Add cluster information to the original data.
# DBSCAN assigns the label -1 to points it treats as noise.
data['DBSCAN_Cluster'] = dbscan_clusters

# Cluster Analysis: per-cluster mean of every column. Label columns written by
# the other clustering cells are excluded, since averaging cluster ids is
# meaningless. The -1 group, if present, summarises the noise points.
other_label_columns = [c for c in ('Cluster', 'HC_Cluster') if c in data.columns]
print(data.drop(columns=other_label_columns).groupby('DBSCAN_Cluster').mean())


In [None]:
## 6. Visualization of Clusters


In [None]:
def _plot_pca_clusters(label_column, title, as_text=False):
    """Scatter the 2-D PCA projection coloured by one label column of `data`.

    The labels are attached to the PCA coordinates by position rather than by
    index label. `pca_df` and `data` can carry different indexes (for example
    when rows were dropped during cleaning), and seaborn aligns pandas Series
    by index, which would mismatch or drop points.

    Parameters
    ----------
    label_column : str
        Name of the column in `data` holding the cluster labels.
    title : str
        Title of the figure.
    as_text : bool, default False
        Convert the labels to strings before plotting so they are treated as
        categories rather than as a numeric scale.

    Raises
    ------
    ValueError
        If `data` and `pca_df` do not have the same number of rows.
    """
    labels = data[label_column].to_numpy()
    if len(labels) != len(pca_df):
        raise ValueError(
            f"'{label_column}' has {len(labels)} rows but the PCA projection "
            f"has {len(pca_df)}; re-run the preparation and PCA cells."
        )
    if as_text:
        labels = labels.astype(str)

    # Build a fresh frame from raw arrays so every row pairs the i-th PCA
    # point with the i-th label.
    plot_df = pd.DataFrame({
        'PC1': pca_df['PC1'].to_numpy(),
        'PC2': pca_df['PC2'].to_numpy(),
        label_column: labels,
    })

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.scatterplot(data=plot_df, x='PC1', y='PC2', hue=label_column, palette='viridis', ax=ax)
    ax.set(title=title, xlabel='Principal Component 1', ylabel='Principal Component 2')
    plt.show()


# Visualize K-Means Clusters
_plot_pca_clusters('Cluster', 'K-Means Clusters')

# Visualize Hierarchical Clusters
_plot_pca_clusters('HC_Cluster', 'Hierarchical Clusters')

# Visualize DBSCAN Clusters (labels shown as categories; -1 marks noise points)
_plot_pca_clusters('DBSCAN_Cluster', 'DBSCAN Clusters', as_text=True)


In [None]:
## 7. Reporting

- **Technical Report:** Document the methodology, results, and interpretations.
- **Executive Summary:** Summarize the key findings and actionable insights.
- **Dashboards:** Use tools like Tableau or Power BI for interactive visualizations if needed.
