# In [None]:  (Jupyter cell marker left over from the notebook export)
# ------------------------------------------------------------
# K-MEANS CLUSTERING ON SALES DATA
# ------------------------------------------------------------
# Aim: Perform customer segmentation using K-Means clustering on
#      Sales and Quantity Ordered data and identify optimal clusters
#      using the Elbow Method.
# Steps: Load → Preprocess → Select Features → Elbow Method → K-Means → Visualize
# ------------------------------------------------------------

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# -----------------------------
# 1. LOAD & PRE-PROCESS DATA
# -----------------------------
'''This step loads the dataset and selects only the required numeric features
for clustering. We extract the 'SALES' and 'QUANTITYORDERED' columns because they
represent customer purchase behavior. Null values are removed to ensure clean input.

The cleaned selection is copied explicitly so that it is an independent DataFrame:
a 'Cluster' column is added to it later, and assigning a column to a frame that pandas
still tracks as a slice of df can raise SettingWithCopyWarning.'''

df = pd.read_csv("sales_data_sample.csv", encoding="ISO-8859-1")
# .copy() detaches the cleaned subset from df so later column assignment is safe.
data = df[['SALES', 'QUANTITYORDERED']].dropna().copy()

print("Dataset Shape:", data.shape)
print("\nSelected Data Preview:")
print(data.head())

# -----------------------------
# 2. ELBOW METHOD (FIND OPTIMAL K)
# -----------------------------
'''This code calculates the inertia values for different K values to apply the Elbow Method.

The two clustering columns are first selected explicitly into features, so the curve is computed on
'SALES' and 'QUANTITYORDERED' only, even if a 'Cluster' label column has already been added to data
(for example when this section is re-run after the clustering step).

An empty list named inertia is initialized to store inertia scores. K_values is defined as a range from 1 to 9,
representing the different numbers of clusters to test. Inside the for loop, KMeans is initialized with k clusters
and fitted on the features using model.fit(features). n_init=10 is passed explicitly because the default number of
initializations differs between scikit-learn versions; pinning it keeps the curve comparable across environments.
After training, model.inertia_ returns the sum of squared distances between data points and their assigned
cluster centers, which is appended to the inertia list.

The plot section visualizes K on the X-axis and inertia on the Y-axis using plt.plot(). The marker='o' highlights
each point clearly. The curve helps identify the "Elbow Point" where inertia stops decreasing significantly,
indicating the optimal number of clusters to choose.'''


# Cluster on the two purchase-behaviour columns only, never on derived columns.
features = data[['SALES', 'QUANTITYORDERED']]

inertia = []
K_values = range(1, 10)

for k in K_values:
    # Explicit n_init avoids the version-dependent default (and its FutureWarning).
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(features)
    inertia.append(model.inertia_)

plt.figure(figsize=(7,4))
plt.plot(K_values, inertia, marker='o')
plt.title("Elbow Method to Determine Optimal K")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.grid(True)
plt.show()


# -----------------------------
# 3. APPLY K-MEANS CLUSTERING
# -----------------------------
'''This code applies the K-Means algorithm using the optimal number of clusters (K=3).
KMeans(n_clusters=3) initializes the model to form 3 customer segments. The fit_predict() method
trains the model on the two feature columns ('SALES' and 'QUANTITYORDERED') and simultaneously assigns
a cluster label (0, 1, or 2) to every record.

The features are selected explicitly rather than passing the whole data frame, so re-running this step
after the 'Cluster' column exists does not feed the previous labels back in as a third feature.
n_init=10 is passed explicitly because the default number of initializations differs between
scikit-learn versions.

The assigned cluster values are first stored in the variable clusters, and then added as a new column named 'Cluster'
to the dataset for clear identification of each data point's cluster group. Printing the updated data confirms that
each row is now labeled with its respective cluster.'''

# Column order matters: centroid column 0 is SALES and column 1 is QUANTITYORDERED.
features = data[['SALES', 'QUANTITYORDERED']]

kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(features)   # store result first
data['Cluster'] = clusters                # now add column safely

print("\nCluster Assigned Data:")
print(data.head())

# -----------------------------
# 4. VISUALIZE CLUSTERS + CENTROIDS
# -----------------------------
'''This code visualizes the clusters formed by the K-Means model.

plt.scatter() plots a 2D scatter graph using Sales on the X-axis and Quantity Ordered on the Y-axis, with the 
c parameter coloring each point based on its assigned cluster. The cmap='viridis' applies a color gradient to distinctly 
highlight different clusters.

The second scatter plot overlays the cluster centroids using kmeans.cluster_centers_. The [:,0] and [:,1] index the 
centroid coordinates for Sales and Quantity Ordered respectively. The centroids are marked with a larger red 'X' 
(marker='X', s=200) to visually represent the center position of each cluster.

Labels, title, and legend are added for clarity, making it easy to interpret how customers are segmented based 
on their purchasing behavior.'''


# Split the fitted centroids into their two coordinates; the columns follow
# the order of the columns the model was fitted on (first, then second).
centroids = kmeans.cluster_centers_
centroid_x = centroids[:, 0]
centroid_y = centroids[:, 1]

plt.figure(figsize=(8, 5))

# One point per record, coloured by its assigned cluster label.
plt.scatter(
    x=data['SALES'],
    y=data['QUANTITYORDERED'],
    c=data['Cluster'],
    cmap='viridis',
)

# Overlay the cluster centres as large red crosses.
plt.scatter(centroid_x, centroid_y, marker='X', c='red', s=200, label='Centroids')

plt.xlabel("Sales")
plt.ylabel("Quantity Ordered")
plt.title("Customer Segmentation using K-Means")
plt.legend()
plt.show()

