In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the raw customer CSV (Colab-style absolute path).
DATA_PATH = '/content/Mall_Customers.csv'

# Load the raw data into the working DataFrame used by every later cell.
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Number of rows that are exact repeats of an earlier row.
df.duplicated(keep="first").sum()

In [None]:
# Box plot of every numeric column (one box per column) as a quick outlier check.
num = df.select_dtypes(include='number')
plt.boxplot(num)

# boxplot places the boxes at x = 1..N by default; label those positions with
# the column names so each box can be identified.
plt.xticks(range(1, num.shape[1] + 1), num.columns, rotation=45, ha="right")
plt.title("Distribution of numeric columns")
plt.ylabel("Value")
plt.show()

In [None]:
# Replace the Gender column with the integer codes produced by a LabelEncoder.
# The fitted encoder is kept in `encoded` so the mapping can be inspected later.
encoded = LabelEncoder()
encoded.fit(df['Gender'])
df['Gender'] = encoded.transform(df['Gender'])

In [None]:
# Check the encoded Gender values on the first five rows.
df["Gender"].head(n=5)

In [None]:
# Preview the frame again now that Gender has been encoded.
df.head(n=5)

In [None]:
# The two columns used as clustering features.
feature_columns = ["Annual Income (k$)", "Spending Score (1-100)"]
X = df[feature_columns]

In [None]:
# Standardise the two features; the fitted scaler is kept in `scaler`
# and the standardised array in `X_scaled`.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Elbow method: collect the within-cluster sum of squares (KMeans.inertia_)
# for k = 1..10.
# NOTE(review): the models are fitted on the unscaled X, not on X_scaled.
k_values = range(1, 11)
wcss = []
for i in k_values:
    kmeans = KMeans(n_clusters=i, random_state=42, n_init=10).fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Plot WCSS against the number of clusters to look for the "elbow".
fig, ax = plt.subplots()
ax.plot(range(1, 11), wcss, marker="o")
ax.set_title("Elbow Method")
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("WCSS")
plt.show()

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette score of a K-Means fit for every k in 2..10, stored by k.
# Like the elbow loop, this runs on the unscaled X.
scores = {}
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X)
    s = silhouette_score(X, labels)
    print(f"score for k={k}: {s}")
    scores[k] = s

# Full k -> score mapping; the author used it to settle on k = 5.
print(scores)

In [None]:
# Fit K-Means with five clusters on the unscaled features and store each
# row's cluster index in df["Cluster"].
kmeans = KMeans(
    n_clusters=5,
    random_state=42,
    n_init=10,
)
y_kmeans = kmeans.fit_predict(X)
df["Cluster"] = y_kmeans

In [None]:
# Preview the frame with the new Cluster column.
df.head(n=5)

In [None]:
# Number of rows assigned to each cluster, largest first.
df["Cluster"].value_counts(sort=True)

In [None]:
# Scatter of the two features coloured by K-Means cluster, with the fitted
# cluster centres overlaid as red X markers.
fig, ax = plt.subplots(figsize=(8, 6))

income = X.iloc[:, 0]
spending = X.iloc[:, 1]
ax.scatter(income, spending, c=y_kmeans, cmap="viridis", s=50)

centroids = kmeans.cluster_centers_
ax.scatter(centroids[:, 0], centroids[:, 1],
           s=200, c="red", marker="X", label="Centroids")

ax.set_title("Customer Segmentation (Income vs Spending)")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Spending Score (1-100)")
ax.legend()
plt.show()

In [None]:
# Final K-Means model. k = 5 is the value the silhouette-score cell's comment
# settles on.
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels = kmeans.fit_predict(X)

df["Cluster"] = labels
# The DBSCAN cell further down reassigns df["Cluster"], so keep the K-Means
# assignments in a dedicated column as well.
df["KMeans_Cluster"] = labels

# Cluster centres in the original feature units (X is unscaled); the column
# order follows X: income first, spending second.
centers = kmeans.cluster_centers_
print("Centroids (Income, Spending):\n", centers)

In [None]:
# BONUS: DBSCAN clustering

In [None]:
from sklearn.cluster import DBSCAN

In [None]:
# Density-based clustering on the standardised features.
dbscan = DBSCAN(eps=0.5, min_samples=5).fit(X_scaled)
labels = dbscan.labels_

# NOTE: this replaces the K-Means assignments previously stored in df["Cluster"].
df["Cluster"] = labels

In [None]:
# Scatter of the two (unscaled) features coloured by the DBSCAN label of each row.
plt.figure(figsize=(8,6))
points = plt.scatter(X["Annual Income (k$)"], X["Spending Score (1-100)"],
                     c=labels, cmap="plasma", s=50)

# Add a colour key so the labels can be read off the plot; ticks are pinned to
# the labels actually present. DBSCAN marks noise samples with -1.
plt.colorbar(points, ticks=sorted(set(labels)), label="DBSCAN label (-1 = noise)")

plt.title("DBSCAN Clustering (Income vs Spending)")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1-100)")
plt.show()


In [None]:
# Summarise the label vector: -1 is treated as the outlier label and is
# excluded from the cluster count.
distinct_labels = set(labels)
n_clusters = len(distinct_labels - {-1})
n_outliers = sum(1 for label in labels if label == -1)

print("Number of clusters:", n_clusters)
print("Number of outliers:", n_outliers)

In [None]:
# Row count, mean and median spending score for every value in df["Cluster"],
# rounded to two decimals. When the notebook is run top to bottom, df["Cluster"]
# holds the DBSCAN labels at this point, so rows labelled -1 form their own group.
spending_by_cluster = df.groupby("Cluster")["Spending Score (1-100)"]
cluster_analysis = spending_by_cluster.agg(["count", "mean", "median"]).round(2)
print(cluster_analysis)


In [None]:
# Bar chart of the mean spending score per cluster, taken from the "mean"
# column of cluster_analysis. matplotlib.pyplot is already imported as plt in
# the notebook's first cell, so it is not re-imported here.
cluster_analysis["mean"].plot(kind="bar", color="skyblue", figsize=(8,5))
plt.title("Average Spending Score per Cluster")
plt.xlabel("Cluster")
plt.ylabel("Average Spending Score")
plt.show()