In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the customer data from a CSV file in the working directory
df = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')

# Preview the first five rows
df.head(n=5)


In [1]:
# Count the missing entries in each column.
# Requires `df` from the data-loading cell; running this cell first raises NameError.
df.isna().sum()


NameError: name 'df' is not defined

In [None]:
# Use seaborn's white-grid style for the figures that follow
sns.set(style='whitegrid')

# One 8x4 figure per column: a 20-bin histogram with a KDE curve overlaid
for hist_column, hist_title in (
    ('Age', 'Age Distribution'),
    ('Annual Income (k$)', 'Annual Income Distribution'),
):
    plt.figure(figsize=(8, 4))
    sns.histplot(df[hist_column], bins=20, kde=True)
    plt.title(hist_title)
    plt.show()


In [None]:
# One 6x4 box plot per column, with the column's values on the x-axis
for box_column, box_title in (
    ('Age', 'Box Plot - Age'),
    ('Annual Income (k$)', 'Box Plot - Annual Income'),
):
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[box_column])
    plt.title(box_title)
    plt.show()


In [None]:
# Two-feature matrix (income and spending score) used for the first clustering
X = df.loc[:, ['Annual Income (k$)', 'Spending Score (1-100)']]
X.head(n=5)


In [None]:
from sklearn.cluster import KMeans

# KMeans with 3 clusters on the unscaled income/spending features in `X`.
# n_init is set explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it implicit makes the labels depend on the
# installed version (and emits a FutureWarning on 1.2/1.3).
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)

# Store each customer's cluster label as a new column on the data frame
df['Cluster'] = kmeans.fit_predict(X)


In [None]:
# Scatter the customers in income/spending space, coloured by their k=3 label
plt.figure(figsize=(8,6))
sns.scatterplot(x='Annual Income (k$)', y='Spending Score (1-100)',
                hue='Cluster', data=df, palette='Set1', s=100)
plt.title('K-Means Clustering (k=3)')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
# A bare plt.legend() rebuilds the legend seaborn already created for the hue
# variable and loses its title, so the title is passed explicitly here.
plt.legend(title='Cluster')
plt.show()


Intermediate Level - KMeans with 5 clusters

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns fed to the scaler: the three numeric customer attributes
numerical_features = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']

# Standardize those columns; the fitted scaler is kept in its own variable
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[numerical_features])


In [None]:
# Elbow method: fit KMeans for k = 1..10 on the standardized features and
# record each fit's inertia (within-cluster sum of squares).
K = range(1, 11)

# Each model is created inside the comprehension instead of being bound to
# `kmeans`, so this sweep does not overwrite the k=3 model fitted earlier.
# n_init is pinned because scikit-learn's default changed from 10 to 'auto'
# in version 1.4.
wcss = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(X_scaled).inertia_
    for k in K
]

# Plot WCSS vs k
plt.figure(figsize=(8,4))
plt.plot(K, wcss, 'bo-')
plt.xticks(list(K))  # one tick per candidate k
plt.xlabel('Number of clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.title('Elbow Method For Optimal k')
plt.show()


In [None]:
# k=5 is assumed to be the elbow of the WCSS curve; it is not computed here.
# n_init is set explicitly because scikit-learn's default changed from 10 to
# 'auto' in version 1.4, which would make the labels version-dependent.
kmeans_opt = KMeans(n_clusters=5, n_init=10, random_state=42)

# Store the k=5 labels, fitted on the standardized features, as a new column
df['Cluster_5'] = kmeans_opt.fit_predict(X_scaled)


In [None]:
# Per-cluster averages of the three numeric attributes, keyed by the k=5 label
cluster_profile = (
    df.groupby('Cluster_5')[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]
    .mean()
)
cluster_profile


Hard Mode - PCA, silhouette check across random seeds, and 3D cluster visualization

In [None]:
# Take the three numeric attributes (age, income, spending score) ...
X_all = df.loc[:, ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# ... and standardize them with a freshly fitted scaler
X_scaled_all = StandardScaler().fit_transform(X_all)


In [None]:
from sklearn.decomposition import PCA

# Two independent PCA models over the standardized features: one keeping
# 2 components and one keeping 3 (as many components as input columns).
pca_2d, pca_3d = PCA(n_components=2), PCA(n_components=3)

# Fit each model and keep the projected coordinates for plotting
X_pca_2d = pca_2d.fit_transform(X_scaled_all)
X_pca_3d = pca_3d.fit_transform(X_scaled_all)


In [None]:
from sklearn.metrics import silhouette_score

# Seeds to compare, to see how much the k=5 clustering quality depends on the seed
random_states = [0, 10, 42, 100, 200]

# Collect (seed, silhouette score) pairs. The model and labels use loop-local
# names so this sweep does not overwrite the global `kmeans` from an earlier
# cell. n_init is pinned because scikit-learn's default changed from 10 to
# 'auto' in version 1.4.
scores = []
for state in random_states:
    seed_labels = KMeans(n_clusters=5, n_init=10, random_state=state).fit_predict(X_scaled_all)
    scores.append((state, silhouette_score(X_scaled_all, seed_labels)))

# Show silhouette scores
pd.DataFrame(scores, columns=["Random State", "Silhouette Score"])


In [None]:
# Final k=5 model on the standardized features.
# n_init is set explicitly because scikit-learn's default changed from 10 to
# 'auto' in version 1.4, which would make the labels version-dependent.
kmeans_final = KMeans(n_clusters=5, n_init=10, random_state=42)

# Store the final cluster label for each customer as a new column
df['Final_Cluster'] = kmeans_final.fit_predict(X_scaled_all)


In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D scatter of the three PCA components, coloured by the final cluster label
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111, projection='3d')
cluster_points = ax.scatter(X_pca_3d[:,0], X_pca_3d[:,1], X_pca_3d[:,2],
                            c=df['Final_Cluster'], cmap='Set2', s=60)
ax.set_title('3D PCA Cluster Visualization')
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
# Without a legend the colours cannot be mapped back to cluster ids;
# legend_elements() builds one handle per distinct value passed to `c`.
ax.legend(*cluster_points.legend_elements(), title='Cluster')
plt.show()
