# Importing Libraries 

In [None]:
# Data handling, plotting and utility libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import os
import missingno as msno
# NOTE(review): hard-coded, machine-specific working directory. The dataset is
# later read with a relative path, so this must be adjusted on other machines.
os.chdir('C:\\Users\\Anwar\\Desktop') 
from sklearn.preprocessing import RobustScaler
import warnings
# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Load the mall-customer dataset (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv('Mall_Customers.csv')
df.head(n=5)

# EDA 

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column is a row.
df.describe().transpose()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Profiling Report 

In [None]:
from pandas_profiling import ProfileReport

# Build the automated profiling report; leaving it as the final expression
# of the cell makes the notebook render it.
profile_report = ProfileReport(df, title= "Mall Profiling Report")
profile_report

# Dropping Customer ID 

In [None]:
# Remove the CustomerID column in place.
df.drop(columns=["CustomerID"], inplace=True)

# K-Means Clustering 

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# 3D overview of the raw data: age x annual income x spending score,
# with the marker colour encoding the spending score.
sns.set_style("white")
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    df["Age"],
    df["Annual Income (k$)"],
    df["Spending Score (1-100)"],
    c=df["Spending Score (1-100)"],
    cmap='coolwarm',
    marker='o',
    s=60,
)

# Camera position: 30 degrees elevation, 185 degrees azimuth.
ax.view_init(elev=30, azim=185)
ax.set_xlabel("Age")
ax.set_ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')

cbar = plt.colorbar(scatter)
cbar.set_label('Spending Score')

plt.show()

# Elbow Method 
- The Elbow Method is a technique for choosing the number of clusters (k) in k-means clustering. We fit k-means for every k from 1 to n (n is an upper bound that we choose; here n = 10), record the within-cluster sum of squares (WCSS) for each k, and plot WCSS against k. The "elbow" of the curve, i.e. the point after which WCSS stops decreasing sharply, is taken as a suitable value of k.


In [None]:
from sklearn.cluster import KMeans

# Collect the within-cluster sum of squares (inertia) for k = 1..10.
# The models are fitted on every column except the first one.
wcss = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, init="k-means++").fit(df.iloc[:, 1:])
    wcss.append(kmeans.inertia_)

# Elbow curve: WCSS as a function of k.
plt.figure(figsize=(12, 6))
plt.grid()
plt.plot(range(1, 11), wcss, linewidth=2, color="red", marker="8")
plt.xticks(np.arange(1, 11))
plt.xlabel("K Value")
plt.ylabel("WCSS")
plt.show()

In [None]:
# Display the current contents of the DataFrame.
df

In [None]:
# Fit a 4-cluster K-Means model on every column except the first one
# (the Genre column) and keep the label assigned to each row.
km = KMeans(n_clusters=4)
km.fit(df.iloc[:, 1:])
clusters = km.labels_

In [None]:
# Display the cluster label assigned to each row by the k=4 model.
clusters

In [None]:
# List the column names currently present in the DataFrame.
df.columns

In [None]:
# 3D view of the customers, coloured by the labels stored in `clusters`.
sns.set_style("white")
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')

scatter = ax.scatter(
    df['Age'],
    df['Annual Income (k$)'],
    df['Spending Score (1-100)'],
    c=clusters,
    cmap='viridis',
    s=60,
)

# Camera position: 30 degrees elevation, 185 degrees azimuth.
ax.view_init(elev=30, azim=185)
ax.set_xlabel("Age")
ax.set_ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')

cbar = plt.colorbar(scatter)
cbar.set_label('Cluster Labels')

plt.show()

In [None]:

# Fit K-Means with k=5 on every column except the first one and visualise
# the resulting segments in 3D.
# random_state pins the centroid initialisation so the labels (and therefore
# the colours in the figure) are the same on every run.
km = KMeans(n_clusters=5, random_state=42)
clusters2 = km.fit_predict(df.iloc[:,1:])

sns.set_style("white")
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')

# Colour by the k=5 labels fitted in this cell. Passing `clusters` here would
# draw the labels of the earlier k=4 model instead of this one.
scatter = ax.scatter(
    df['Age'], df['Annual Income (k$)'], df['Spending Score (1-100)'],
    c=clusters2, cmap='viridis', s=60
)

ax.view_init(30, 185)
plt.xlabel("Age")
plt.ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')

cbar = plt.colorbar(scatter)
cbar.set_label('Cluster Labels')

plt.show()

In [None]:
# Display the labels stored in `clusters2` (the k=5 fit).
clusters2

### Changing colors 

In [None]:
# Fit K-Means with k=5 and draw each cluster in its own fixed colour.
# random_state keeps the label -> colour assignment stable between runs.
km = KMeans(n_clusters=5, random_state=42)
clusters3 = km.fit_predict(df.iloc[:,1:])
cluster_colors = ['r', 'g', 'b', 'c', 'm']  # one colour per cluster label 0..4

sns.set_style("white")
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')
# Map the labels fitted in this cell to colours. Using `clusters` here would
# plot the labels of a different, earlier model.
scatter = ax.scatter(
    df['Age'], df['Annual Income (k$)'], df['Spending Score (1-100)'],
    c=[cluster_colors[i] for i in clusters3], s=60
)

ax.view_init(30, 185)
plt.xlabel("Age")
plt.ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')

# Empty scatters exist only to create one legend entry per cluster colour.
for i, color in enumerate(cluster_colors):
    ax.scatter([], [], [], c=color, label=f'Cluster {i}', s=60)

ax.legend()

plt.show()

# Silhouette Score:
- Calculate the silhouette score for both k=4 and k=5. The silhouette score measures how similar each data point is to its own cluster compared to other clusters. Higher silhouette scores indicate better clustering. You can choose the k with the higher silhouette score.

In [None]:
from sklearn.metrics import silhouette_score

# Both models are fitted on every column except the first one.
# random_state fixes the centroid initialisation so the k=4 vs k=5 comparison
# gives the same scores on every run instead of depending on a random start.

# Calculate Silhouette Score for k=4
kmeans_4 = KMeans(n_clusters=4, random_state=42)
clusters_4 = kmeans_4.fit_predict(df.iloc[:, 1:])
silhouette_avg_4 = silhouette_score(df.iloc[:, 1:], clusters_4)
print(f"Silhouette Score for k=4: {silhouette_avg_4}")

# Calculate Silhouette Score for k=5
kmeans_5 = KMeans(n_clusters=5, random_state=42)
clusters_5 = kmeans_5.fit_predict(df.iloc[:, 1:])
silhouette_avg_5 = silhouette_score(df.iloc[:, 1:], clusters_5)
print(f"Silhouette Score for k=5: {silhouette_avg_5}")

## Plotting Centroids

In [None]:
# Fit K-Means with k=5, plot the customers coloured by cluster, and mark
# each cluster's centroid with a large 'X' in the same colour.
# random_state keeps the label -> colour assignment stable between runs.
km = KMeans(n_clusters=5, random_state=42)
clusters = km.fit_predict(df.iloc[:,1:])
cluster_colors = ['r', 'g', 'b', 'c', 'm']  # one colour per cluster label 0..4

sns.set_style("white")
fig = plt.figure(figsize=(20, 10))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    df['Age'], df['Annual Income (k$)'], df['Spending Score (1-100)'],
    c=[cluster_colors[i] for i in clusters], s=60
)

ax.view_init(30, 185)
plt.xlabel("Age")
plt.ylabel("Annual Income (k$)")
ax.set_zlabel('Spending Score (1-100)')

# Plot cluster centroids with labels and same color as the cluster
for i, color in enumerate(cluster_colors):
    cluster_mask = clusters == i
    # Mean of the cluster's members is used as the centroid. numeric_only=True
    # is required because the frame still holds a non-numeric first column;
    # without it, pandas >= 2.0 raises TypeError when averaging that column.
    centroid = df.iloc[cluster_mask].mean(numeric_only=True)
    ax.scatter(centroid['Age'], centroid['Annual Income (k$)'], centroid['Spending Score (1-100)'],
               c=color, marker='X', s=200, label=f'Cluster {i} Centroid')

ax.legend()

plt.show()

## Next, we cluster on Annual Income and Spending Score using Hierarchical Clustering

In [None]:

# Restrict the data to the two features used for hierarchical clustering
# and look at their joint distribution before clustering.
data_for_clustering = df[['Annual Income (k$)', 'Spending Score (1-100)']]

plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=data_for_clustering,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Scatter Plot of Annual Income vs Spending Score")
plt.grid(True)
plt.show()

In [None]:

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Hierarchical (Ward) clustering on standardised income and spending score,
# cut so that at most five flat clusters are produced.
data_for_clustering = df[['Annual Income (k$)', 'Spending Score (1-100)']]

# Standardise both features before distances are computed.
scaler = StandardScaler()
scaled_data = scaler.fit(data_for_clustering).transform(data_for_clustering)

linkage_matrix = linkage(scaled_data, 'ward')
num_clusters = 5
clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Store the flat cluster ids on the DataFrame so seaborn can colour by them.
df['Cluster'] = clusters

sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    s=60,
)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Clusters After Hierarchical Clustering")
plt.grid(True)
plt.legend(title='Cluster', loc='upper right')
plt.show()

In [None]:
# Dendrogram of the linkage matrix currently held in `linkage_matrix`.
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix)
plt.xlabel("Data Points")
plt.ylabel("Distance")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

# Deciding the best Linkage Type 

- Based on Silhouette score

In [None]:

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.metrics import silhouette_score

# Try several linkage criteria at a fixed cut of five clusters and keep the
# one whose flat clustering has the highest silhouette score.
data_for_clustering = df[['Annual Income (k$)', 'Spending Score (1-100)']]
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data_for_clustering)
linkage_methods = ['ward', 'single', 'complete', 'average', 'centroid']
num_clusters = 5

# Best result seen so far; -1 is the lower bound of the silhouette score.
best_silhouette_score, best_linkage, best_clusters = -1, None, None

for method in linkage_methods:
    linkage_matrix = linkage(scaled_data, method=method)
    clusters = fcluster(linkage_matrix, t=num_clusters, criterion='maxclust')
    silhouette_avg = silhouette_score(scaled_data, clusters)

    # Only a strictly better score replaces the current best, so the
    # earliest method wins a tie.
    if not silhouette_avg > best_silhouette_score:
        continue
    best_silhouette_score = silhouette_avg
    best_linkage = method
    best_clusters = clusters

df['BestCluster'] = best_clusters

sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='BestCluster',
    palette='viridis',
    s=60,
)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title(f"Clusters After Hierarchical Clustering (Best Linkage: {best_linkage.capitalize()})")
plt.grid(True)
plt.legend(title='Cluster', loc='upper right')
plt.show()

In [None]:

from sklearn.preprocessing import StandardScaler
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

# Repeat the Ward clustering on the standardised features, this time cutting
# the tree into at most three flat clusters.
data_for_clustering = df[['Annual Income (k$)', 'Spending Score (1-100)']]

scaler = StandardScaler()
scaled_data = scaler.fit(data_for_clustering).transform(data_for_clustering)

num_clusters = 3
linkage_matrix = linkage(scaled_data, 'ward')
clusters = fcluster(linkage_matrix, num_clusters, criterion='maxclust')

# Overwrites any existing 'Cluster' column with the new assignment.
df['Cluster'] = clusters

sns.set_style("whitegrid")
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    s=60,
)
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.title("Clusters After Hierarchical Clustering")
plt.grid(True)
plt.legend(title='Cluster', loc='upper right')
plt.show()

In [None]:
# Dendrogram of the linkage matrix currently held in `linkage_matrix`.
plt.figure(figsize=(12, 6))
dendrogram(linkage_matrix)
plt.ylabel("Distance")
plt.xlabel("Data Points")
plt.title("Hierarchical Clustering Dendrogram")
plt.show()

## Conclusion:

In this analysis, we have successfully segmented and analyzed mall customers based on their age, annual income, and spending behavior. The dataset provided valuable insights into customer preferences and allowed us to optimize marketing strategies and business operations.

Here are the key highlights of our analysis:

1. **Data Exploration:** We began by exploring the dataset, checking for missing values, and gaining a better understanding of the customer data.

2. **K-Means Clustering:** We applied K-Means Clustering to group customers into clusters. Using techniques like the Elbow Method and Silhouette Score, we determined that K=5 provided the best clustering results.

3. **Visualization:** We visualized the clusters in a 3D space, allowing us to see how customers from different clusters are distributed based on age, annual income, and spending score.

4. **Cluster Analysis:** We analyzed the characteristics of each cluster to gain insights into customer behavior. This information can be used to tailor marketing campaigns and services to specific customer segments.

5. **Hierarchical Clustering:** In addition to K-Means, we performed Hierarchical Clustering based on annual income and spending score. This approach provided further segmentation possibilities.

6. **Optimal Number of Clusters:** We used dendrograms to determine the optimal number of clusters, finding that K=3 provided a meaningful segmentation.