### Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
import math
import os

### Importing the CSV files(non-encoded and encoded) into Pandas data frames


In [2]:
encoded_data = pd.read_csv("../Data/Processed/encoded.csv",low_memory=False)

In [3]:
processed_data = pd.read_csv("../Data/Processed/processed.csv",low_memory=False)

#### Specifying attributes to be considered for clustering

In [4]:
attr=['age','highest_qualification','no_of_dwelling_rooms','status_of_living']

### Initial preprocessing and sampling

In [5]:
clustering_data=encoded_data[attr] # Our dataset for clustering purposes

clustering_data['diagnosed_for']=processed_data['diagnosed_for'] # Including the column for diagnosed disease

clustering_sample = clustering_data.sample(n=100000) # Taking a random sample from the dataset

x=clustering_sample[['age','highest_qualification','no_of_dwelling_rooms','status_of_living']].values 

x=(x-x.mean())/x.std()
#normalizing the data to avoid scaling errors and issues

k=len(clustering_data['diagnosed_for'].unique()) #No of clusters defined = no of unique values of target attribute

Euclidean_Distance=np.zeros(shape=(len(clustering_sample),k))
# 2D array denoting distance of each point from each centroid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


#### Initialzing the centroids

In [6]:
Centroid=x[np.random.randint(x.shape[0], size=k), :]
# Centroid array

Computing distance of each point from every centroid

In [7]:
for i in range(k):
    Euclidean_Distance[:,i]=np.sum((x-Centroid[i])**2,axis=1)

Assigning the cluster based on minimum distance

In [8]:
C=np.argmin(Euclidean_Distance,axis=1)+1

Counting no of points in each cluster

In [9]:
n = np.zeros((k,1))
for i in range(k):
    n[i] = np.count_nonzero(C == i+1)

Running k means for 100 iterations

In [10]:
for i in range(100):
    for i in range(k):
        Euclidean_Distance[:,i]=np.sum((x-Centroid[i])**2,axis=1)
    C=np.argmin(Euclidean_Distance,axis=1)+1
    for i in range(k):
        n[i] = np.count_nonzero(C == i+1)
    #print(n)
    Centroid.fill(0)
    for i in range(len(clustering_sample)):
        Centroid[C[i]-1]+=x[i];
    for i in range(k):
        if(n[i]>0):
            Centroid[i]=Centroid[i]/n[i]

In [21]:
clustering_sample_val = clustering_sample.values

In [22]:
D = clustering_data['diagnosed_for'].unique()

In [23]:
D

array(['Not diagnosed', 'Cataract', 'Asthma/chronic respiratory failure',
       'Tuberculosis', 'Diabetes', 'Chronic heart disease',
       'Hypertension', 'Anaemia', 'Goitre/ Thyroid disorder',
       'Others(Hernia;Hydrocele;Peptic Ulcer etc)', 'Renal stone',
       'Chronic liver disease', 'Rheumatoid arthritis / osteoarthritis',
       'Cancer-Genitourinary system', 'Flourosis',
       'Stroke/Cerebrovascular accident',
       'Piles/Anal fissure & Anal fistula', 'Glaucoma', 'Skin Cancer',
       'Epilepsy', 'Tumour(any type)', 'Pyorrhea',
       'Gall stone /cholecystitis', 'Chronic skin diseases - psoriasis',
       'Cancer-Respiratory system', 'Blood cancer/leukemia',
       'Sinusitis,Tonsilitis', 'Cancer-Gastro-intestinal system',
       'Leprosy', 'Myocardial infarction/heart attack',
       'Chronic renal failure', 'Cancer-Breast',
       'Others (Hernia;hydrocele;peptic ulcer etc)',
       'Chronic heart disease/failure', 'Chronic liver failure',
       'Rheumatic fever/Rh

In [24]:
disease={}
for i in range(len(D)):
    disease[D[i]]=i+1
# dictionary to assign a unique number to each diagnosis

In [25]:
disease

{'Not diagnosed': 1,
 'Cataract': 2,
 'Asthma/chronic respiratory failure': 3,
 'Tuberculosis': 4,
 'Diabetes': 5,
 'Chronic heart disease': 6,
 'Hypertension': 7,
 'Anaemia': 8,
 'Goitre/ Thyroid disorder': 9,
 'Others(Hernia;Hydrocele;Peptic Ulcer etc)': 10,
 'Renal stone': 11,
 'Chronic liver disease': 12,
 'Rheumatoid arthritis / osteoarthritis': 13,
 'Cancer-Genitourinary system': 14,
 'Flourosis': 15,
 'Stroke/Cerebrovascular accident': 16,
 'Piles/Anal fissure & Anal fistula': 17,
 'Glaucoma': 18,
 'Skin Cancer': 19,
 'Epilepsy': 20,
 'Tumour(any type)': 21,
 'Pyorrhea': 22,
 'Gall stone /cholecystitis': 23,
 'Chronic skin diseases - psoriasis': 24,
 'Cancer-Respiratory system': 25,
 'Blood cancer/leukemia': 26,
 'Sinusitis,Tonsilitis': 27,
 'Cancer-Gastro-intestinal system': 28,
 'Leprosy': 29,
 'Myocardial infarction/heart attack': 30,
 'Chronic renal failure': 31,
 'Cancer-Breast': 32,
 'Others (Hernia;hydrocele;peptic ulcer etc)': 33,
 'Chronic heart disease/failure': 34,
 '

In [26]:
for i in range(len(clustering_sample_val)):
    clustering_sample_val[i][4]=disease[clustering_sample_val[i][4]]
# assigning numbers to each diagnosis

In [27]:
clustering_sample_val = clustering_sample_val.astype('float64')

In [28]:
cluster_to_disease={}
# Dictionary to assign disease to cluster based on max voting

In [29]:
C1=np.matrix(C)

C1=C1.T

C2=np.asarray(C1)

In [30]:
clustering_sample_val = np.append(clustering_sample_val, C2, axis = 1)

In [31]:
clustering_sample_val[:,[4, 5]] = clustering_sample_val[:,[5, 4]]

#### Assigning a diagnosis to each cluster based on max vote ignoring people who have not been diagnosed with anything

In [32]:
for i in range(1,len(D)+1):
    a=clustering_sample_val[clustering_sample_val[:,4]==i][:,5]
    (values,counts) = np.unique(a,return_counts=True)
    if(values.size>1):
        ind=np.argmax(np.delete(counts,0))
        #print(ind)
        cluster_to_disease[i]=values[ind+1]
    else:
        ind=np.argmax(counts)
        cluster_to_disease[i]=values[ind]
# assigning disease to cluster based on max vote

In [34]:
D[10]

'Renal stone'

In [35]:
D[13]

'Cancer-Genitourinary system'

In [36]:
count_renal_stone=0
count_cancer_genitourinary_system=0

In [37]:
for key in cluster_to_disease:
    if cluster_to_disease[key]==10:
        count_renal_stone+=1
        
    elif cluster_to_disease[key]==13:
        count_cancer_genitourinary_system+=1
        

#### Denormalizing the centroids of these clusters and then comparing their attribute values

In [41]:
Mean_renal_stone = np.zeros(shape=(1,4),dtype=float)
Mean_cancer_genitourinary_system = np.zeros(shape=(1,4),dtype=float)

In [42]:
for i in range(1,k+1):
    if cluster_to_disease[i]==10:
        Mean_renal_stone += Centroid[i-1]
    elif cluster_to_disease[i]==13:
        Mean_cancer_genitourinary_system += Centroid[i-1]

In [43]:
Mean_renal_stone/=count_renal_stone
Mean_cancer_genitourinary_system/=count_cancer_genitourinary_system

In [46]:
Mean_denorm=clustering_sample[['age','highest_qualification','no_of_dwelling_rooms','status_of_living']].values.mean()

In [47]:
Std_denorm=clustering_sample[['age','highest_qualification','no_of_dwelling_rooms','status_of_living']].values.std()

In [48]:
Mean_renal_stone_denorm=np.multiply(Mean_renal_stone,Std_denorm)+Mean_denorm

In [49]:
Mean_cancer_genitourinary_system_denorm=np.multiply(Mean_cancer_genitourinary_system,Std_denorm)+Mean_denorm

In [50]:
Mean_renal_stone_denorm

array([[34.97975286,  3.87952635,  3.22931999,  2.06376392]])

In [51]:
Mean_cancer_genitourinary_system_denorm

array([[26.6176507 ,  2.36312167,  1.97596528,  1.20382206]])