In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the CSV into a DataFrame, then separate it positionally:
# column 0 becomes the target vector, every remaining column is a feature.
dataset = pd.read_csv(
    'diabetes_binary_5050split_health_indicators_BRFSS2015.csv'
)
feature_frame = dataset.iloc[:, 1:]
target_series = dataset.iloc[:, 0]
X = feature_frame.values
y = target_series.values

In [3]:
# Inspect the raw feature matrix (NumPy abbreviates large arrays with '...').
print(X)

[[ 1.  0.  1. ...  4.  6.  8.]
 [ 1.  1.  1. ... 12.  6.  8.]
 [ 0.  0.  1. ... 13.  6.  8.]
 ...
 [ 1.  0.  1. ...  8.  5.  2.]
 [ 0.  0.  1. ...  8.  5.  4.]
 [ 1.  1.  1. ... nan nan nan]]


In [4]:
# Inspect the target vector taken from column 0 of the dataset.
print(y)

[0. 0. 0. ... 1. 1. 1.]


In [5]:
# Display-only: view y as a single column. reshape returns a new array and
# the result is not assigned, so y itself stays one-dimensional.
y.reshape(-1, 1)

array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [1.],
       [1.]])

In [6]:
from sklearn.impute import SimpleImputer

# Replace every NaN in the feature matrix with the mean of its column.
# X already excludes the target column, so ALL feature columns are imputed;
# slicing with X[:, 1:] would leave the first feature untouched and let any
# NaN in it propagate into scaling and the classifier.
# NOTE(review): the column means are computed on the full dataset, i.e. before
# the train/test split, so test rows influence the imputed values. Fit the
# imputer on the training rows only if strict separation is required.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=0,
)

In [8]:
# Training features as produced by the split, before any scaling.
print(X_train)

[[ 1.  0.  1. ... 13.  4.  2.]
 [ 1.  1.  1. ... 10.  6.  8.]
 [ 0.  0.  1. ... 11.  6.  8.]
 ...
 [ 0.  1.  1. ... 11.  2.  2.]
 [ 1.  1.  1. ... 12.  6.  6.]
 [ 0.  1.  1. ...  5.  6.  8.]]


In [9]:
# Training labels aligned row-for-row with X_train.
print(y_train)

[0. 1. 0. ... 1. 1. 0.]


In [10]:
# Held-out test features, before any scaling.
print(X_test)

[[ 0.  1.  1. ... 12.  6.  8.]
 [ 0.  1.  1. ...  7.  4.  7.]
 [ 1.  0.  1. ... 12.  3.  4.]
 ...
 [ 0.  0.  1. ... 11.  6.  6.]
 [ 0.  0.  1. ... 10.  6.  8.]
 [ 0.  0.  1. ... 11.  3.  5.]]


In [11]:
# Held-out test labels aligned row-for-row with X_test.
print(y_test)

[0. 0. 0. ... 1. 0. 0.]


In [12]:
from sklearn.preprocessing import StandardScaler

# Standardize the features. The scaler's statistics are learned from the
# training rows only and then reused unchanged on the test rows.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [13]:
# Training features after standardization.
print(X_train)

[[ 1.06280809 -0.90932667  0.18723423 ...  1.60233322 -1.01238636
  -1.87278162]
 [ 1.06280809  1.0997148   0.18723423 ...  0.60480912  0.98241441
   0.96822276]
 [-0.94090364 -0.90932667  0.18723423 ...  0.93731715  0.98241441
   0.96822276]
 ...
 [-0.94090364  1.0997148   0.18723423 ...  0.93731715 -3.00718713
  -1.87278162]
 [ 1.06280809  1.0997148   0.18723423 ...  1.26982519  0.98241441
   0.0212213 ]
 [-0.94090364  1.0997148   0.18723423 ... -1.05773104  0.98241441
   0.96822276]]


In [14]:
# Test features after being transformed with the scaler fitted on X_train.
print(X_test)

[[-0.94090364  1.0997148   0.18723423 ...  1.26982519  0.98241441
   0.96822276]
 [-0.94090364  1.0997148   0.18723423 ... -0.39271498 -1.01238636
   0.49472203]
 [ 1.06280809 -0.90932667  0.18723423 ...  1.26982519 -2.00978675
  -0.92578016]
 ...
 [-0.94090364 -0.90932667  0.18723423 ...  0.93731715  0.98241441
   0.0212213 ]
 [-0.94090364 -0.90932667  0.18723423 ...  0.60480912  0.98241441
   0.96822276]
 [-0.94090364 -0.90932667  0.18723423 ...  0.93731715 -2.00978675
  -0.45227943]]


In [15]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours with k = 5; the Minkowski metric with p = 2 is the
# ordinary Euclidean distance.
knn_settings = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier = KNeighborsClassifier(**knn_settings)
classifier.fit(X_train, y_train)

In [16]:
# Predict labels for the held-out rows and print them next to the true
# labels: left column = prediction, right column = ground truth.
y_pred = classifier.predict(X_test)
predicted_vs_actual = np.column_stack((y_pred, y_test))
print(predicted_vs_actual)

[[0. 0.]
 [0. 0.]
 [0. 0.]
 ...
 [0. 1.]
 [0. 0.]
 [0. 0.]]


In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix of true labels (first argument) against predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Overall accuracy as a percentage; left as the cell's last expression so the
# notebook displays it.
100 * accuracy_score(y_test, y_pred)

[[7816 1043]
 [1750 1083]]


76.11187136503592

In [18]:
# Classify one hand-entered observation. The 21 raw values must be given in
# the same column order as X, and are passed through the fitted scaler first
# because the classifier was trained on standardized features.
new_observation = [[1, 1, 1, 40, 1, 0, 0, 0, 0, 1, 0, 1, 0, 5, 18, 15, 1, 0, 9, 4, 3]]
scaled_observation = sc.transform(new_observation)
print(classifier.predict(scaled_observation))

[1.]
