# Customer Segmentation using Unsupervised Machine Learning (K-Means)

This notebook is based on the [GeeksforGeeks tutorial](https://www.geeksforgeeks.org/machine-learning/customer-segmentation-using-unsupervised-machine-learning-in-python/) for customer segmentation using the K-Means clustering algorithm.


In [None]:
# Importing libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
%matplotlib inline

# Silence only deprecation-style notices from third-party libraries.
# A blanket filterwarnings('ignore') would also hide UserWarning/RuntimeWarning
# categories, i.e. genuine problems raised while fitting models or plotting,
# so those categories are deliberately left visible.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)


In [None]:
# Load the mall-customer table from a CSV path relative to the working
# directory, then preview the first rows as the cell output.
csv_path = 'Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()


In [None]:
# Per-column count of missing entries (isna performs the same check as isnull)
data.isna().sum()


In [None]:
# Basic statistics of data: the summary table returned by describe() is the
# cell output.
# NOTE(review): with default arguments describe() presumably summarises only
# the numeric columns, so 'Gender' would not appear in this table — confirm.
data.describe()


In [None]:
# Bar chart of the number of rows in each 'Gender' category
gender_ax = sns.countplot(data=data, x='Gender')
gender_ax.set_title('Gender Distribution')
plt.show()


In [None]:
# Histogram of the 'Age' column (20 bins) with a KDE curve overlaid
age_fig, age_ax = plt.subplots(figsize=(8, 4))
sns.histplot(data['Age'], bins=20, kde=True, ax=age_ax)
age_ax.set_title('Age Distribution')
plt.show()


In [None]:
# Histogram of the 'Annual Income (k$)' column (20 bins) with a KDE curve
annual_income = data['Annual Income (k$)']
income_fig, income_ax = plt.subplots(figsize=(8, 4))
sns.histplot(annual_income, bins=20, kde=True, ax=income_ax)
income_ax.set_title('Annual Income Distribution')
plt.show()


In [None]:
# Histogram of the 'Spending Score (1-100)' column (20 bins) with a KDE curve
spending_score = data['Spending Score (1-100)']
spending_fig, spending_ax = plt.subplots(figsize=(8, 4))
sns.histplot(spending_score, bins=20, kde=True, ax=spending_ax)
spending_ax.set_title('Spending Score Distribution')
plt.show()


In [None]:
# Restrict the clustering input to two columns: annual income and spending score
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_columns]


In [None]:
# Elbow Method to find the optimal number of clusters.
# For every candidate cluster count, fit K-Means on X and record the fitted
# model's inertia_ (within-cluster sum of squares, WCSS); the bend in the
# resulting curve is used to pick the number of clusters.
#
# n_init is pinned explicitly: scikit-learn changed its default between
# releases, so omitting it makes the WCSS values depend on the installed
# version. The throwaway models are built inside the comprehension so that no
# notebook-level `kmeans` name is left pointing at the last (10-cluster) fit.
cluster_counts = range(1, 11)
wcss = [
    KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42).fit(X).inertia_
    for k in cluster_counts
]

plt.plot(cluster_counts, wcss)
plt.title('Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()


In [None]:
# Applying KMeans with 5 clusters.
# fit_predict fits the model on X and returns one cluster label per row;
# the fitted model is kept in `kmeans` and the labels in `y_kmeans`.
# n_init is set explicitly because scikit-learn's default for it changed
# between releases; pinning it keeps the segmentation reproducible across
# installed versions (random_state fixes the seed).
kmeans = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
y_kmeans = kmeans.fit_predict(X)


In [None]:
# Scatter plot of the two clustering features, one colour per cluster label,
# with the fitted cluster centres drawn on top as larger yellow markers.
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']
points = X.values
centers = kmeans.cluster_centers_

plt.figure(figsize=(8, 6))
for cluster_id, color in enumerate(cluster_colors):
    # Rows whose predicted label equals this cluster id
    members = points[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color, label=f'Cluster {cluster_id + 1}')
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Customer Segments')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
