<a href="https://colab.research.google.com/github/pavithraus/Task-8-Clustering/blob/main/clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# kmeans_clustering.py

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

# -----------------------------
# Configuration
# -----------------------------
# Global seaborn theme applied to every figure created below.
sns.set(style="whitegrid")

# Folder that receives every saved figure and CSV.
OUTPUT_DIR = "outputs"
# exist_ok=True keeps re-runs from failing when the folder is already there.
os.makedirs(name=OUTPUT_DIR, exist_ok=True)

# -----------------------------
# 1. Load Dataset
# -----------------------------
# The CSV path is relative, so it is resolved against the current working directory.
df = pd.read_csv("Mall_Customers.csv")

# -----------------------------
# 2. EDA - Basic Info
# -----------------------------
print(" Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\n Statistical Summary:")
# describe() returns a DataFrame of summary statistics, so it does need print().
print(df.describe())

# -----------------------------
# 3. Preprocessing
# -----------------------------
# Drop the ID column; it is not one of the clustering features below.
df = df.drop('CustomerID', axis=1)

# Convert Gender to numerical codes (Male -> 0, Female -> 1).
# Series.map() turns every value that is missing from the mapping into NaN,
# which would otherwise only surface later as an opaque "input contains NaN"
# failure in PCA / K-Means, so fail early with a message naming the culprits.
gender_codes = df['Gender'].map({'Male': 0, 'Female': 1})
unmapped = df.loc[gender_codes.isna(), 'Gender']
if not unmapped.empty:
    raise ValueError(
        "Unexpected Gender value(s), expected 'Male' or 'Female': "
        f"{sorted(unmapped.astype(str).unique())}"
    )
df['Gender'] = gender_codes

# Features for clustering
features = ['Gender', 'Age', 'Annual Income (k$)', 'Spending Score (1-100)']
X = df[features]

# Standardize: put all features on a comparable scale before the
# distance-based steps (PCA, K-Means) that follow.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# 4. Visual EDA
# -----------------------------
# Every figure is saved into OUTPUT_DIR and then closed so that figures do not
# accumulate in memory or leak into the next plot.

# Correlation heatmap of the clustering features
plt.figure(figsize=(6, 5))
sns.heatmap(df[features].corr(), annot=True, cmap='coolwarm')
plt.title(" Feature Correlation")
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/correlation_heatmap.png")
plt.close()

# Pairplot (pairplot creates its own figure, which becomes the current one)
sns.pairplot(df[features])
plt.suptitle(" Feature Pairplot", y=1.02)
# The title sits just above the figure canvas (y=1.02). Without
# bbox_inches="tight" savefig only writes the canvas area, so the title would
# be cropped out of the PNG.
plt.savefig(f"{OUTPUT_DIR}/pairplot.png", bbox_inches="tight")
plt.close()

# Distribution of features: one histogram (with KDE overlay) per feature.
# The raw column name is embedded in the file name,
# e.g. "dist_Annual Income (k$).png".
for col in features:
    plt.figure()
    sns.histplot(df[col], kde=True, bins=20)
    plt.title(f" Distribution of {col}")
    plt.savefig(f"{OUTPUT_DIR}/dist_{col}.png")
    plt.close()

# -----------------------------
# 5. PCA for 2D Visualization
# -----------------------------
# Reduce the standardized feature matrix to two principal components.
# In this script X_pca is only used for the scatter plot in section 8;
# the clustering itself is fitted on X_scaled, not on the PCA projection.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# -----------------------------
# 6. Elbow Method + Silhouette
# -----------------------------
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# "auto" in version 1.4 (and emits a FutureWarning on 1.2/1.3), so relying on
# the default makes the clustering depend on the installed library version.
KMEANS_N_INIT = 10

# Candidate cluster counts, shared by the sweep and by both plots so the x-axis
# can never drift out of sync with the collected scores.
k_values = range(2, 11)

wcss = []                # inertia (within-cluster sum of squares) per k
silhouette_scores = []   # mean silhouette coefficient per k

for k in k_values:
    kmeans = KMeans(n_clusters=k, n_init=KMEANS_N_INIT, random_state=42)
    labels = kmeans.fit_predict(X_scaled)
    wcss.append(kmeans.inertia_)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

# Plot Elbow & Silhouette Score side by side
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
ax[0].plot(k_values, wcss, 'bo--')
ax[0].set_title(" Elbow Method")
ax[0].set_xlabel("Number of Clusters")
ax[0].set_ylabel("WCSS")

ax[1].plot(k_values, silhouette_scores, 'go--')
ax[1].set_title(" Silhouette Score")
ax[1].set_xlabel("Number of Clusters")
ax[1].set_ylabel("Score")

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/elbow_silhouette.png")
plt.close()

# -----------------------------
# 7. Final KMeans Clustering
# -----------------------------
# NOTE(review): K is hard-coded rather than derived from the scores above;
# presumably it was read off the elbow/silhouette plot - confirm.
optimal_k = 5
# Same n_init / random_state as the sweep, so the final model matches the
# k=optimal_k run that was evaluated above.
kmeans = KMeans(n_clusters=optimal_k, n_init=KMEANS_N_INIT, random_state=42)
labels = kmeans.fit_predict(X_scaled)

# Attach the cluster assignment to each row for the summary further down.
df['Cluster'] = labels

# -----------------------------
# 8. Visualize Final Clusters in 2D PCA
# -----------------------------
# Scatter the two PCA coordinates, colouring each point by its K-Means label.
# Dedicated figure/axes names are used so the earlier `fig` / `ax` bindings
# are left untouched.
cluster_fig, cluster_ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=labels,
    palette='tab10',
    s=60,
    ax=cluster_ax,
)
cluster_ax.set_title(f"K-Means Clusters (K={optimal_k}) - PCA View")
cluster_ax.set_xlabel("PCA Component 1")
cluster_ax.set_ylabel("PCA Component 2")
cluster_ax.legend(title='Cluster')
cluster_fig.savefig(f"{OUTPUT_DIR}/pca_clusters.png")
plt.close(cluster_fig)

# -----------------------------
# 9. Evaluation: Silhouette Score
# -----------------------------
# Silhouette of the final labelling, computed on the standardized features.
final_score = silhouette_score(X=X_scaled, labels=labels)
print(f"\n Final Silhouette Score (K={optimal_k}): {final_score:.4f}")

# -----------------------------
# 10. Cluster Summary
# -----------------------------
# Per-cluster mean of every clustering feature, rounded to two decimals
# for display and export.
summary = df.groupby('Cluster')[features].mean()
summary = summary.round(2)
print("\n Cluster Summary (Means of Each Feature):", summary, sep="\n")
summary.to_csv(f"{OUTPUT_DIR}/cluster_summary.csv")


 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
None

 Statistical Summary:
       CustomerID         Age  Annual Income (k$)  Spending Score (1-100)
count  200.000000  200.000000          200.000000              200.000000
mean   100.500000   38.850000           60.560000               50.200000
std     57.879185   13.969007           26.264721               25.823522
min      1.000000   18.000000           15.000000                1.000000
25%     50.750000   28.750000           41.500000               34.75