In [1]:
# Import Modules
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.preprocessing import StandardScaler

# Display all of the columns
pd.set_option('display.max_columns', None)

In [2]:
# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(
    Path('Resources/diabetes_binary_5050split_health_indicators_BRFSS2015.csv')
)
df.head()

Unnamed: 0,Diabetes_binary,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,6.0
1,0.0,0.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,6.0,8.0
2,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,8.0
3,0.0,1.0,1.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,30.0,1.0,1.0,11.0,6.0,7.0
4,0.0,0.0,0.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,5.0,8.0


In [3]:
# Split target column from dataset
y = df['Diabetes_binary']
X = df.drop(columns='Diabetes_binary')

In [4]:
# Preview the data
X[:5]

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,6.0
1,0.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,6.0,8.0
2,1.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,8.0
3,1.0,1.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,30.0,1.0,1.0,11.0,6.0,7.0
4,0.0,0.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,5.0,8.0


In [5]:
# Print first five entries for target
y[:5]

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Diabetes_binary, dtype: float64

In [6]:
# Encode the categorical variables using get_dummies
X = pd.get_dummies(X)

In [7]:
# Preview the data
X.head()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,24.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,6.0
1,0.0,1.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,11.0,6.0,8.0
2,1.0,0.0,1.0,27.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,8.0,4.0,8.0
3,1.0,1.0,1.0,23.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,30.0,1.0,1.0,11.0,6.0,7.0
4,0.0,0.0,1.0,30.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,5.0,8.0


In [8]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=16)

In [9]:
X_train.shape

(53019, 21)

In [10]:
X_test.shape

(17673, 21)

In [11]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
X_scaler = scaler.fit(X_train)
# Scale the training data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [12]:
# Instantiate the model with k = 3 neighbors
model = KNeighborsClassifier(n_neighbors=3)

In [13]:
# Train the model
model.fit(X_train_scaled, y_train)

In [14]:
# Create predictions
y_pred = model.predict(X_test_scaled)

# Review the predictions
y_pred

array([1., 1., 0., ..., 1., 0., 1.])

In [15]:
# Print confusion matrix
confusion_matrix(y_pred,y_test)

array([[6023, 2571],
       [2809, 6270]])

In [16]:
# Print classification report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.68      0.70      0.69      8594
         1.0       0.71      0.69      0.70      9079

    accuracy                           0.70     17673
   macro avg       0.70      0.70      0.70     17673
weighted avg       0.70      0.70      0.70     17673

