Reads the preprocessed data files stored in /Data/Preprocessed Data

In [2]:
import pandas as pd

preProcessedUttarPradesh = pd.read_csv('./../Data/Preprocessed Data/preprocessedUttarPradesh.csv',low_memory = False)
preProcessedMadhyaPradesh = pd.read_csv('./../Data/Preprocessed Data/preprocessedMadhyaPradesh.csv',low_memory=False)
preProcessedBihar = pd.read_csv('./../Data/Preprocessed Data/preprocessedBihar.csv',low_memory=False)
preProcessedUttarakhand = pd.read_csv('./../Data/Preprocessed Data/preprocessedUttarakhand.csv',low_memory=False)
preProcessedOdisha = pd.read_csv('./../Data/Preprocessed Data/preprocessedOdisha.csv',low_memory=False)
preProcessedJharkhand = pd.read_csv('./../Data/Preprocessed Data/preprocessedJharkhand.csv',low_memory=False)
preProcessedChhattisgarh = pd.read_csv('./../Data/Preprocessed Data/preprocessedChhattisgarh.csv',low_memory=False)

Plots unclustered data.
Takes as input a data frame consisting of a pair of attributes to be plotted
Output - Unclustered Scatter plot

In [0]:
import matplotlib.pyplot as plt

def plot_unclustered_data(data):
    # plt.figure(figsize=(15,15)) # Uncomment to set figure size
    plt.scatter(data.iloc[:,0],data.iloc[:,1],s=1,c='black',label='unclustered data')
    plt.xlabel(data.columns[0]) # first column of dataframe as x-axis 
    plt.ylabel(data.columns[1]) # second column of dataframe as y-axis
    plt.legend() # Comment to remove legend from the plot
    plt.title('Plot of data points') # set title of the plot
    plt.show()

Plots clustered Data.
Takes as input : 
1) A dictionary having key as the cluster number and value as array of data points belonging to that cluster.
2) String containing the attribute name of the x-axis.
3) String containing the attribute name of the y-axis.
4) Array containing the centroids of each cluster in order.
Outputs : 
Scatter Plot with different clusters in different colors

In [0]:
import matplotlib.pyplot as plt

def plot_clustered_data(clusters,labelx,labely,centroids):
    plt.figure(figsize=(10,6)) # Set the size of the figure
    K = len(clusters) # number of clusters
    # List of colors for different clusters. Add more colors if number of clusters increase (>10)
    color=['red','darkblue','darkgreen','magenta','lawngreen','cyan','yellow','grey','tan','chocolate']
    # Set the labels of various clusters
    labels=['cluster1','cluster2','cluster3','cluster4','cluster5','cluster6','cluster7','cluster8','cluster9','cluster10']
    # Plot Clusters
    for k in range(0,K):
        # plt.scatter(clusters[k][:,0],clusters[k][:,1],s=1,c=color[k],label=labels[k]) # Uncomment to print labels
        plt.scatter(clusters[k][:,0],clusters[k][:,1],s=8,c=color[k])
    # plt.scatter(centroids[:,0],centroids[:,1],s=4,c='black',label='Centroids')
    plt.scatter(centroids[:,0],centroids[:,1],s=12,c='black')
    plt.xlabel(labelx) # Set x-axis label
    plt.ylabel(labely) # Set y-axis label
    plt.legend() # Comment to hide legend
    plt.show()

Performs K-Means Clustering.
Takes Input : 
1) Number of Clusters.
2) DataFrame containing pair of attributes to be clustered.
3) max_iter - Number of iterations before stopping (you can change its value as per requirements)
Note : 18 minutes running time for max_iter=100 on a dataframe with 1.5 Lakh rows
Output : 
1) Numpy array containing Centroids of the clusters.
2) Dictionary having key as Cluster number and value as an array of datapoints belonging to that cluster
3) Inertia, i.e. within cluster sum of squared errors

In [0]:
from scipy import spatial
import numpy as np
import pandas as pd

def kmeans(k,data,max_iter=100):
    # print(k)
    data = data.astype(float)
    num_features = len(data.columns) # number of features
    data_arr = data.to_numpy() # coverting dataframe to numpy array
    data_arr = data_arr[~np.isnan(data_arr).any(axis=1)] # removing nan values if any
    data_arr = data_arr[~np.isinf(data_arr).any(axis=1)] # removing infinite values if any
    df = pd.DataFrame(data=data_arr,columns=data.columns) 
    centroids = data.sample(n=k) # randomly sampling k datapoints as initial centroids
    centroids = centroids.to_numpy() # converting them to numpy array

    # print(centroids.shape)
    # dist_matrix = np.array([]).reshape(len(data),0)
    # print(centroids) # Uncomment to print the initial centroids
    dist_matrix = []
    for i in range(0,max_iter): # iterating max_iter times
        dist_matrix = []
        for datapt in data_arr: # for each datapoint
            row = []
            if(np.isnan(datapt).any() or np.isinf(datapt).any()): # ignore if the datapoint is nan or infinite
                continue
            for centroid in centroids: # calculate distance of each datapoint with each centroid
                row.append(spatial.distance.euclidean(datapt,centroid))
            dist_matrix.append(row) # append the values in distance matrix
        dist_matrix = np.array(dist_matrix)
        clusters = {}
        for cluster in range(0,k):
            clusters[cluster] = []
        for row,num in zip(dist_matrix,range(0,len(data_arr))): # each datapoint to the cluster whose centroid it is closest to
            clusters[np.argmin(row)].append(data_arr[num])
        for cluster in range(0,k):
            clusters[cluster] = np.array(clusters[cluster])
            if(len(clusters[cluster]) > 0): # re calculate the centroids of each cluster
                centroids[cluster] = np.mean(clusters[cluster],axis=0)
        # print('------------------')
    # print(centroids)
    clusters = {}
    inertia = 0
    for cluster in range(0,k):
        clusters[cluster] = []
    for row,num in zip(dist_matrix,range(0,len(data_arr))): # calculate inertia
        c_num = np.argmin(row)
        clusters[c_num].append(data_arr[num])
        inertia += (row[c_num])**2
    
    return centroids,clusters,inertia

Used to determine the optimal value of 'k' to be given as input to k-means algorithm.
Take input : 
1) dataframe consisting of pair of attributes to be clustered
2) maximum number of clusters between which you want to find the optimal 'k' value.
Gives Output : 
1) array of centroids for each 'k' value.
2) array of clusters for each 'k' value.

In [0]:
def elbow_method(data,max_clusters):
    K = []    
    distortions = []
    centroids_arr = []
    clusters_arr = []
    for k in range(2,max_clusters): # find inertia for all 'k' values between (2,max_clusters) by calling kmeans function
        centroids,clusters,inertia = kmeans(k,data)
        centroids_arr.append(centroids)
        clusters_arr.append(clusters)
        distortions.append(inertia)
        K.append(k)
    
    # plot the inertia obtained for all 'k'
    plt.figure(figsize=(14,10))
    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()

    return centroids_arr,clusters_arr

In [0]:
# states to be clustered
states = [preProcessedBihar, preProcessedChhattisgarh, preProcessedMadhyaPradesh, preProcessedOdisha, preProcessedUttarakhand, preProcessedUttarPradesh, preProcessedJharkhand]

In [0]:
# attributes pairs to be clustered
continuous_attributes = ['Age','Weight_in_kg','Length_height_cm','Haemoglobin_level','BP_systolic','BP_Diastolic','Pulse_rate','fasting_blood_glucose_mg_dl','BMI']

In [0]:
# for each pair of attributes for each state find the optimal 'k' value by plotting elbow grapgh
for s in range(0,len(states)):
    for i in range(0,len(continuous_attributes)):
        for j in range(i+1,len(continuous_attributes)):
            df = states[s][[continuous_attributes[i],continuous_attributes[j]]]
            elbow_method(df,15)

In [0]:
# define your number of clusters for each pair of attributes for each state in this array from the elbow graphs obtained above
k_arr =  [] 

In [0]:
# perform kmeans with the optimal 'k' value obtained
import numpy as np

itr = 0
for s in range(0,len(states)):
    for i in range(0,len(continuous_attributes)):
        for j in range(i+1,len(continuous_attributes)):
            df = states[s][[continuous_attributes[i],continuous_attributes[j]]]
            centroids,clusters,_  = kmeans(k_arr[itr],df) # find centroids and clusters
            for cluster in range(0,len(clusters)):
                clusters[cluster] = np.row_stack(clusters[cluster])
            centroids = np.row_stack(centroids)
            labelx = continuous_attributes[i]
            labely = continuous_attributes[j]
            plot_clustered_data(clusters,labelx,labely,centroids) # plot clustered data
            itr += 1

Attached below is a google colab notebook where we have plotted the clusters obtained from our algorithm and that obtained from the standard sklearn kmeans algorithm for comparing the results.

https://colab.research.google.com/drive/1mnE7CXQq2oiccj58sthswK2PIt1m641w

As you can see our the optimum 'k' value obtained from the elbow method was 8 and clusters of our algorithm and the standard algorithm were almost identical.