In [None]:
from random import randrange
from sklearn.datasets import make_blobs
from sklearn.metrics import pairwise_distances_argmin
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Working With Dummy Data

In [None]:
# Synthetic data set: 300 two-feature points drawn around 4 blob centres.
# The seed is fixed so the same points are generated on every run.
dataset, true_labels = make_blobs(
    n_samples=300,
    n_features=2,
    centers=4,
    cluster_std=0.6,
    random_state=0,
)

# The generated `true_labels` contain the true cluster label of each data point. They will be useful for validating our solution later.

In [None]:
# Each column of `dataset` holds one feature; transposing lets us unpack
# one array per feature.
first_feature_vals, second_feature_vals = dataset.T

fig, ax = plt.subplots()
ax.scatter(first_feature_vals, second_feature_vals)
ax.set_title("Scatter Plot of Dummy Data Generated by make_blobs Utility")
ax.set_xlabel("First Feature")
ax.set_ylabel("Second Feature")
plt.show()

Looking at the generated scatter plot, setting $k = 4$ looks like a reasonable choice. We also know that this is the exact number of clusters, because we passed `centers=4` to `make_blobs` when we generated the blobs.

In [None]:
def k_means(dataset, k, max_iter=300, random_state=None):
    """Cluster ``dataset`` into ``k`` groups by alternating assignment and mean updates.

    Parameters
    ----------
    dataset : array-like, 2-D (one row per sample, one column per feature)
        Points to cluster. Converted to a float array internally.
    k : int
        Number of clusters; must satisfy ``1 <= k <= number of rows``.
    max_iter : int, default 300
        Upper bound on the number of assignment/update rounds, so the loop
        always terminates.
    random_state : None, int, or numpy.random.Generator, default None
        Seed (or generator) used to pick the initial centroids. ``None``
        gives a different initialisation on every call.

    Returns
    -------
    numpy.ndarray
        One integer cluster index in ``[0, k)`` per row of ``dataset``.

    Raises
    ------
    ValueError
        If ``k`` is outside ``[1, number of rows]`` or ``max_iter < 1``.
    """
    dataset = np.asarray(dataset, dtype=float)
    n_samples = dataset.shape[0]

    if not 1 <= k <= n_samples:
        raise ValueError(f"k must be between 1 and {n_samples}, got {k}")
    if max_iter < 1:
        raise ValueError(f"max_iter must be at least 1, got {max_iter}")

    rng = np.random.default_rng(random_state)
    # replace=False: sampling with replacement could select the same row twice,
    # which would start two identical centroids and leave one cluster empty.
    centroids = dataset[rng.choice(n_samples, size=k, replace=False)]

    labels = None
    for _ in range(max_iter):
        new_labels = pairwise_distances_argmin(dataset, centroids)

        # Unchanged assignments imply unchanged means, so this is an exact
        # convergence test that does not rely on comparing floats.
        if labels is not None and np.array_equal(new_labels, labels):
            break
        labels = new_labels

        updated = centroids.copy()
        for label in range(k):
            members = dataset[labels == label]
            # An empty cluster (e.g. duplicate data points) keeps its previous
            # centroid instead of becoming the NaN mean of an empty slice.
            if len(members):
                updated[label] = members.mean(axis=0)
        centroids = updated

    return labels

In [None]:
labels = k_means(dataset, 4)

# Left panel: clusters found by our implementation; right panel: ground truth.
fig, (ax_ours, ax_true) = plt.subplots(1, 2)

ax_ours.scatter(first_feature_vals, second_feature_vals, c=labels)
ax_ours.set_title("Our Implementation Generated Labels")

ax_true.scatter(first_feature_vals, second_feature_vals, c=true_labels)
ax_true.set_title("The True Labels of the Dataset")

# right=2 stretches the subplot area to twice the default figure width.
fig.subplots_adjust(right=2, wspace=0.2)
plt.show()

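We can see that the clusters found by our k-means implementation match the true labels generated by the `make_blobs` utility. Cluster ids are arbitrary, so the colours of corresponding clusters may differ between the two plots.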

# Working With Real Data

In [None]:
# Keep only the two numeric columns used as clustering features.
# NOTE(review): KMeans rejects NaN inputs - confirm these columns have no missing values.
df = pd.read_csv("data.csv").loc[:, ["ApplicantIncome", "LoanAmount"]]

In [None]:
# Raw view of the two features before clustering.
fig, ax = plt.subplots()
ax.scatter(df["ApplicantIncome"], df["LoanAmount"])
ax.set(
    title="Scatter Plot of Applicant Income and Loan Amount",
    xlabel="Applicant Income",
    ylabel="Loan Amount",
)
plt.show()

We can see that it is hard to decide on the number of clusters $k$ by eye, so we will use the Elbow Method to estimate it.

In [None]:
# Candidate cluster counts, defined once so the fits and the x-axis cannot drift apart.
k_values = range(1, 11)

# Inertia (within-cluster sum of squared distances) for each candidate k.
# The seed is fixed so the curve, and the elbow read from it, is the same on every run.
inertias = [KMeans(n_clusters=k, random_state=0).fit(df).inertia_ for k in k_values]

plt.plot(k_values, inertias, marker="o")
plt.title("Elbow Method: Inertia vs. Number of Clusters")
plt.xlabel("Number of Clusters $k$")
plt.ylabel("Inertia")
plt.xticks(k_values)
plt.show()

The elbow point is where the decrease in inertia starts to level off, i.e. where adding more clusters yields only a marginal improvement. Here it seems to be around $k = 4$.

In [None]:
labels = k_means(df.to_numpy(), 4)
# Reference clustering from scikit-learn; seeded so the comparison is reproducible.
sklearn_labels = KMeans(n_clusters=4, random_state=0).fit(df).labels_

fig, (ax_ours, ax_sklearn) = plt.subplots(1, 2)
panels = (
    (ax_ours, labels, "Our Implementation Generated Labels"),
    (ax_sklearn, sklearn_labels, "scikit-learn KMeans Generated Labels"),
)
for ax, cluster_ids, title in panels:
    ax.scatter(df["ApplicantIncome"], df["LoanAmount"], c=cluster_ids)
    ax.set_title(title)
    ax.set_xlabel("Applicant Income")
    ax.set_ylabel("Loan Amount")

# right=2 stretches the subplot area to twice the default figure width.
fig.subplots_adjust(right=2, wspace=0.2)
plt.show()