# Week 2 – Task 1: Customer Segmentation using K-Means Clustering


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans


In [None]:
# Column labels shared by every plot and selection below, kept in one place
# so the strings used to index the DataFrame never drift apart.
INCOME_COL, SCORE_COL = "Annual Income (k$)", "Spending Score (1-100)"


In [None]:
# Read the customer CSV (path is relative to the working directory) into a
# DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="Mall_Customers.csv")
df.head(n=5)


In [None]:
# Structural overview of the DataFrame (printed by pandas)
df.info()

# Count of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)


In [None]:
# Summary statistics
df.describe()


In [None]:
# One histogram (20 bins, KDE overlay) per column: age, annual income and
# spending score. Each entry is (column, bar colour, figure title).
histogram_specs = [
    ('Age', 'skyblue', "Age Distribution"),
    (INCOME_COL, 'orange', "Annual Income Distribution"),
    (SCORE_COL, 'green', "Spending Score Distribution"),
]

for column, colour, heading in histogram_specs:
    sns.histplot(df[column], kde=True, bins=20, color=colour)
    plt.title(heading)
    plt.show()


In [None]:
# Box plot of spending score for each value of the 'Genre' column.
# NOTE(review): newer seaborn releases reportedly warn when `palette` is
# passed without `hue` — confirm against the installed seaborn version
# before changing the call.
ax = sns.boxplot(x='Genre', y=SCORE_COL, data=df, palette='Set3')
ax.set_title("Spending Score by Gender")
plt.show()


In [None]:
# Scatter of annual income against spending score, the two columns that
# are selected as clustering features further down.
sns.scatterplot(data=df, x=INCOME_COL, y=SCORE_COL, color='purple')
# Build the title from the column constants so it shows the real column
# labels rather than the name of a Python variable.
plt.title(f"{INCOME_COL} vs {SCORE_COL}")
plt.show()


In [None]:
# Feature matrix for clustering: the annual-income and spending-score columns only
X = df.loc[:, [INCOME_COL, SCORE_COL]]


In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (within-cluster sum of squares) so the bend in the curve can be read off.
k_values = range(1, 11)
wcss = []

for i in k_values:
    # n_init is passed explicitly because its default differs between
    # scikit-learn releases; pinning it keeps the curve reproducible and
    # avoids the default-change warning.
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

# Plot elbow curve
plt.plot(k_values, wcss, marker='o', color='teal')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters")
plt.ylabel("WCSS (Within-Cluster Sum of Squares)")
plt.grid(True)
plt.show()


In [None]:
# Train final model with k = 5.
# n_init is passed explicitly because its default differs between
# scikit-learn releases; pinning it keeps the cluster assignment
# reproducible across environments.
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)

# Store each row's cluster label in a 'Cluster' column of the DataFrame
df['Cluster'] = y_kmeans


In [None]:
# Visualize the clusters: income vs. spending score, coloured by the
# 'Cluster' column.
plt.figure(figsize=(8,6))
sns.scatterplot(x=INCOME_COL, y=SCORE_COL,
                hue='Cluster', data=df, palette='Set2', s=100)
plt.title("Customer Segments by K-Means Clustering")
# Label the axes with the column constants so they show the real column
# labels rather than the names of Python variables.
plt.xlabel(INCOME_COL)
plt.ylabel(SCORE_COL)
plt.legend(title="Cluster")
plt.grid(True)
plt.show()


In [None]:
# Per-cluster mean of age, annual income and spending score
profile_cols = ['Age', INCOME_COL, SCORE_COL]
df.groupby('Cluster')[profile_cols].mean()


## Summary

In this task, I used unsupervised learning (K-Means Clustering) to segment customers based on their Annual Income and Spending Score.
I started by exploring the dataset and visualizing distributions, then used the elbow method to choose the number of clusters (k = 5).
After training the K-Means model, I assigned each customer to a segment and visualized the resulting clusters.
This exercise helped me understand how clustering works and how it can be used in marketing to target different types of customers.
