In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
diabetes = pd.read_csv('diabetes.csv')
print(diabetes.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [3]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
print('Dimensions:' , diabetes.shape)

Dimensions: (768, 9)


In [5]:
print(diabetes.groupby('Outcome').size())

Outcome
0    500
1    268
dtype: int64


In [6]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
#Cleaning the data
diabetes = diabetes[diabetes.Outcome > 0]
diabetes = diabetes[diabetes.Glucose > 0]
diabetes = diabetes[diabetes.BloodPressure > 0]
diabetes = diabetes[diabetes.SkinThickness > 0]
diabetes = diabetes[diabetes.BMI > 0]
diabetes = diabetes[diabetes.DiabetesPedigreeFunction > 0]
diabetes = diabetes[diabetes.Age > 0]
diabetes = diabetes.reset_index(drop=True)

In [8]:
#New dimensions
print("Dimensions:", diabetes.shape)

Dimensions: (177, 9)


In [9]:
print(diabetes.describe())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   177.000000  177.000000     177.000000     177.000000  177.000000   
mean      4.700565  143.118644      74.700565      32.977401  151.920904   
std       3.919017   31.265043      12.523870      10.395013  145.940035   
min       0.000000   78.000000      30.000000       7.000000    0.000000   
25%       1.000000  118.000000      68.000000      27.000000    0.000000   
50%       4.000000  144.000000      74.000000      32.000000  135.000000   
75%       8.000000  171.000000      84.000000      39.000000  207.000000   
max      17.000000  199.000000     110.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age  Outcome  
count  177.000000                177.000000  177.000000    177.0  
mean    35.819774                  0.616588   36.412429      1.0  
std      6.611560                  0.398935   10.837355      0.0  
min     22.900000                  0.127000   2

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(diabetes.loc[:, diabetes.columns != 'Outcome'], diabetes['Outcome'], stratify=diabetes['Outcome'], random_state=66)

## K-means

In [11]:
from sklearn.cluster import KMeans
X = np.array(diabetes.drop(['Outcome'], axis=1).astype(float))
y = np.array(diabetes['Outcome'])

In [12]:
kmeans = KMeans(n_clusters=4, max_iter=1000, algorithm = 'auto')
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [13]:
pred = []

for i in range(len(X)):
    predict_me = np.array(X[i].astype(float))
    predict_me = predict_me.reshape(-1, len(predict_me))
    prediction = kmeans.predict(predict_me)
    pred.append("a"+str(prediction[0]))

In [14]:
diabetes['Area'] = pred

In [15]:
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Area
0,6,148,72,35,0,33.6,0.627,50,1,a0
1,0,137,40,35,168,43.1,2.288,33,1,a2
2,3,78,50,32,88,31.0,0.248,26,1,a2
3,2,197,70,45,543,30.5,0.158,53,1,a1
4,1,189,60,23,846,30.1,0.398,59,1,a1
5,5,166,72,19,175,25.8,0.587,51,1,a2
6,0,118,84,47,230,45.8,0.551,31,1,a3
7,1,115,70,30,96,34.6,0.529,32,1,a2
8,9,119,80,35,0,29.0,0.263,29,1,a0
9,11,143,94,33,146,36.6,0.254,51,1,a2
