# Import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Load the dataset
# Make sure 'Mall_Customers.csv' is in the same folder (or uploaded if using Colab)
try:
    df = pd.read_csv('Mall_Customers.csv')
except FileNotFoundError:
    print(" Error: 'Mall_Customers.csv' not found.")
    print("Please upload the file or ensure it's in the correct directory.")
    # Every later step needs `df`. Re-raise so the run stops here with the
    # real cause instead of failing on the next line with a NameError.
    raise
else:
    # Only reached when the read succeeded, so `df` is guaranteed to exist.
    print(" Dataset loaded successfully!")

# Display the first 5 rows to verify the data
print("\nHere are the first 5 rows of the dataset:")
print(df.head())


Data cleaning and exploratory analysis:

# Structural overview: column names, dtypes and non-null counts
print("\n--- Dataset Information ---")
df.info()

# Count of missing entries in each column
missing_per_column = df.isnull().sum()
print("\n--- Missing Values Check ---")
print(missing_per_column)

# Count of rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print("\n--- Duplicate Rows Check ---")
print(f"Number of duplicate rows: {duplicate_row_count}")

# Give the two awkwardly named columns identifier-friendly names.
# The rename is done in place so `df` stays the same object.
column_aliases = {
    'Annual Income (k$)': 'Annual_Income',
    'Spending Score (1-100)': 'Spending_Score',
}
df.rename(columns=column_aliases, inplace=True)

print("\n--- First 5 Rows After Cleaning ---")
print(df.head())

# Apply seaborn's white-grid theme to the figures drawn from here on
sns.set(style="whitegrid")

# Plot 1: histogram of customer ages with a KDE curve overlaid
plt.figure(figsize=(10, 6))
age_ax = sns.histplot(data=df, x='Age', bins=30, kde=True, color='skyblue')
age_ax.set_title('Age Distribution of Customers')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Number of Customers')
plt.show()

# Plot 2: number of customers per gender
plt.figure(figsize=(8, 5))
gender_ax = sns.countplot(data=df, x='Gender', palette='pastel')
gender_ax.set_title('Gender Distribution of Customers')
gender_ax.set_xlabel('Gender')
gender_ax.set_ylabel('Count')
plt.show()

# Plot 3: side-by-side histograms of Annual Income and Spending Score
plt.figure(figsize=(12, 5))

# One entry per panel: (column, bar color, panel title, x-axis label)
histogram_panels = (
    ('Annual_Income', 'green', 'Annual Income Distribution', 'Annual Income (k$)'),
    ('Spending_Score', 'purple', 'Spending Score Distribution', 'Spending Score (1–100)'),
)

for panel_index, (column, bar_color, panel_title, x_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 2, panel_index)
    sns.histplot(df[column], bins=30, kde=True, color=bar_color)
    plt.title(panel_title)
    plt.xlabel(x_label)
    plt.ylabel('Number of Customers')

plt.tight_layout()
plt.show()

# Plot 4: Annual Income against Spending Score, one point per customer,
# colored by gender
plt.figure(figsize=(10, 7))
income_spend_ax = sns.scatterplot(
    data=df,
    x='Annual_Income',
    y='Spending_Score',
    hue='Gender',
    s=100,
    alpha=0.7,
)
income_spend_ax.set_title('Annual Income vs. Spending Score by Gender')
income_spend_ax.set_xlabel('Annual Income (k$)')
income_spend_ax.set_ylabel('Spending Score (1–100)')
income_spend_ax.legend(title='Gender')
plt.show()


K-Means clustering:


# --- Feature Selection and Scaling ---
# We start by selecting the 'Annual_Income' feature for clustering.
# The double brackets keep X1 as a one-column DataFrame rather than a Series.
X1 = df[['Annual_Income']]

# Standardize the data so that it has a mean of 0 and a standard deviation of 1
scaler1 = StandardScaler()
X1_scaled = scaler1.fit_transform(X1)

# --- Steps 1 & 2: Elbow Method and Silhouette Score ---
inertia = []
silhouette_scores = []
K_range = range(1, 11)
K_range_sil = range(2, 11)  # Silhouette Score requires at least 2 clusters

# Fit each k exactly once and read both metrics from that model. The fits are
# seeded (random_state=42), so refitting the same k for the silhouette score
# would only repeat identical work.
for k in K_range:
    kmeans1 = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans1.fit(X1_scaled)
    inertia.append(kmeans1.inertia_)  # within-cluster sum of squares
    if k in K_range_sil:
        labels = kmeans1.labels_
        silhouette_scores.append(silhouette_score(X1_scaled, labels))

# Plot the Elbow Curve to visually identify the best k value
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Annual Income Feature')
plt.show()

# Plot Silhouette Scores to confirm the optimal number of clusters
plt.figure(figsize=(10, 5))
plt.plot(K_range_sil, silhouette_scores, 'ro-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Annual Income Feature')
plt.show()





Interpretation:
Gender Distribution: The dataset shows a slight imbalance, with females representing a larger portion of the mall's customer base (as seen in the count plot).

Age Distribution: The customer base is heavily skewed towards younger adults, with a peak in the 20-35 age range. The number of customers declines steadily after age 40.

Income vs. Spending (Key Insight): The scatter plot of "Annual Income" vs. "Spending Score" does not show a simple linear correlation. Instead, it reveals distinct, blob-like clusters of customers, suggesting segmentation is possible. For example, there are clear groups of "low income, low spenders," "high income, low spenders," and "high income, high spenders."

K-Means Clustering
# Clustering Based on 'Annual_Income'

# Step 1: Select and scale the feature
# We focus on 'Annual_Income' and standardize it for better clustering performance.
# The double brackets keep X1 as a one-column DataFrame rather than a Series.
X1 = df[['Annual_Income']]
scaler1 = StandardScaler()
X1_scaled = scaler1.fit_transform(X1)

# Steps 2 & 3: Elbow Method and Silhouette Score
inertia = []
silhouette_scores = []
K_range = range(1, 11)
K_range_sil = range(2, 11)  # Silhouette Score requires at least 2 clusters

# Fit each k exactly once and read both metrics from that model. The fits are
# seeded (random_state=42), so a second fit of the same k for the silhouette
# score would only repeat identical work.
for k in K_range:
    kmeans1 = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans1.fit(X1_scaled)
    inertia.append(kmeans1.inertia_)  # within-cluster sum of squares
    if k in K_range_sil:
        labels = kmeans1.labels_
        silhouette_scores.append(silhouette_score(X1_scaled, labels))

# Plot the Elbow Curve to visualize where inertia starts to level off
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Annual Income Feature')
plt.show()

# Plot the Silhouette Scores to confirm the best k
plt.figure(figsize=(10, 5))
plt.plot(K_range_sil, silhouette_scores, 'ro-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Annual Income Feature')
plt.show()

# Interpretation
# Elbow Method: The "elbow" point (where the curve flattens) is not very sharp but appears around k = 3.
# Silhouette Score: The highest silhouette value also occurs at k = 3.
# Conclusion: The optimal number of clusters for 'Annual Income' alone is 3.

# Step 4: Apply K-Means with k = 3
kmeans1_final = KMeans(n_clusters=3, init='k-means++', n_init=10, random_state=42)
df['Cluster_1D'] = kmeans1_final.fit_predict(X1_scaled)

# Display the first few rows with the new cluster assignments
print("\n1D Clustering Results (First 5 Rows)")
print(df.head())


Clustering with 2 Features: "Annual Income" & "Spending Score"
# Clustering Based on 'Annual_Income' and 'Spending_Score'

# Step 1: Select and scale the two key features
# These features provide insight into customer purchasing behavior and spending capacity
X2 = df[['Annual_Income', 'Spending_Score']]
scaler2 = StandardScaler()
X2_scaled = scaler2.fit_transform(X2)

# Steps 2 & 3: Elbow Method and Silhouette Scores
inertia = []
silhouette_scores = []
K_range = range(1, 11)
K_range_sil = range(2, 11)  # Silhouette requires at least 2 clusters

# Fit each k exactly once and read both metrics from that model. The fits are
# seeded (random_state=42), so a second fit of the same k for the silhouette
# score would only repeat identical work.
for k in K_range:
    kmeans2 = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeans2.fit(X2_scaled)
    inertia.append(kmeans2.inertia_)  # within-cluster sum of squares
    if k in K_range_sil:
        # Silhouette measures how well each point fits within its own cluster
        labels = kmeans2.labels_
        silhouette_scores.append(silhouette_score(X2_scaled, labels))

# Plot the Elbow Curve to visualize where the curve begins to flatten
plt.figure(figsize=(10, 5))
plt.plot(K_range, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Annual Income and Spending Score')
plt.show()

# Plot Silhouette Scores to confirm the ideal k
plt.figure(figsize=(10, 5))
plt.plot(K_range_sil, silhouette_scores, 'ro-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Scores for Annual Income and Spending Score')
plt.show()

# Step 4: Interpretation
# Elbow Method: A clear and sharp elbow appears at k = 5, indicating the optimal point.
# Silhouette Score: The highest silhouette score also occurs at k = 5, confirming this choice.
# Conclusion: The optimal number of clusters for the 'Annual Income' and 'Spending_Score' combination is 5.

# Step 5: Apply K-Means Clustering with k = 5
kmeans2_final = KMeans(n_clusters=5, init='k-means++', n_init=10, random_state=42)
df['Cluster_2D'] = kmeans2_final.fit_predict(X2_scaled)

# Display a preview of the dataset with the new cluster assignments
print("\n--- 2D Clustering Results (First 5 Rows) ---")
print(df.head())


Visualisation and Comparison of Clusters



# One wide figure with the two clustering results side by side
plt.figure(figsize=(18, 8))

# Left panel: 1D clustering, income of each customer grouped by cluster label
income_ax = plt.subplot(1, 2, 1)
sns.stripplot(
    data=df,
    x='Cluster_1D',
    y='Annual_Income',
    palette='viridis',
    jitter=True,
    ax=income_ax,
)
income_ax.set_title('1D Clustering (k=3) based on Annual Income')
income_ax.set_xlabel('Cluster')
income_ax.set_ylabel('Annual Income (k$)')

# Right panel: 2D clustering, income vs. spending colored by cluster label
segment_ax = plt.subplot(1, 2, 2)
sns.scatterplot(
    data=df,
    x='Annual_Income',
    y='Spending_Score',
    hue='Cluster_2D',
    palette='bright',
    s=100,
    alpha=0.9,
    ax=segment_ax,
)
segment_ax.set_title('2D Clustering (k=5) based on Income and Spending')
segment_ax.set_xlabel('Annual Income (k$)')
segment_ax.set_ylabel('Spending Score (1-100)')
segment_ax.legend(title='Cluster')

plt.tight_layout()
plt.show()



The 1D clustering creates simple income-based groups—low, medium, and high—which offer limited marketing insight since spending behavior isn’t considered. In contrast, the 2D clustering provides deeper, behavior-based segmentation by analyzing both income and spending, resulting in five distinct customer personas that enable more targeted and effective marketing strategies.

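Actionable Suggestions:
Note: K-Means assigns cluster numbers arbitrarily, so the cluster numbers and colors listed below are illustrative; confirm which number corresponds to which segment by checking each cluster's average income and spending score before acting on these labels.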
Cluster 0 (e.g., Green): High Income, Low Spending -> "The Careful Rich"

Cluster 1 (e.g., Red): High Income, High Spending -> "The VIPs / Target"

Cluster 2 (e.g., Purple): Low Income, Low Spending -> "The Thrifty"

Cluster 3 (e.g., Orange): Low Income, High Spending -> "The Careless Spenders"

Cluster 4 (e.g., Blue): Medium Income, Medium Spending -> "The Standard Customer"

1. The VIPs (High Income, High Spending): These are the mall’s most valuable shoppers. Engage them through exclusive events, premium loyalty programs (e.g., “Gold” or “Platinum” tiers with perks like valet parking or VIP lounges), and luxury marketing featuring high-end brands.

2. The Careful Rich (High Income, Low Spending): Wealthy but cautious buyers who prioritize quality and exclusivity. Focus on value-added services like extended warranties and customization, promote products as long-term investments, and invite them to niche events such as art shows or wine tastings.

3. The Standard Customer (Mid Income, Mid Spending): The mall’s core group—budget-conscious but value-driven. Use standard loyalty programs, mall-wide discounts, and family-oriented events to attract and retain them.

4. The Careless Spenders (Low Income, High Spending): Often young, trend-driven consumers who buy impulsively. Reach them through social media campaigns, flash sales, and student discounts that appeal to their desire for instant gratification.

5. The Thrifty (Low Income, Low Spending): Highly price-sensitive shoppers who visit mainly for essentials. Use digital coupons, clearance alerts, and value bundles to attract them, while leveraging their presence to boost mall traffic for other tenants. Avoid luxury marketing, as it will not resonate with this group.