In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier

## Diabetes Model

In [2]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("diabetes_data.csv")
df_diabetes_data = pd.read_csv(file_path)

# Review the DataFrame
df_diabetes_data.head()# Separate the data into labels and features


Unnamed: 0,Age,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes
0,4.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0
1,12.0,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0
2,13.0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0
3,11.0,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0
4,8.0,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
# Separate the y variable, the labels
y = df_diabetes_data['Diabetes']

# Separate the X variable, the features
X = df_diabetes_data.drop(columns='Diabetes')

In [4]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(X, y)

scaler = StandardScaler()
numerical_cols = ['Age', 'BMI', 'GenHlth', 'MentHlth', 'PhysHlth']
X_train_d[numerical_cols] = scaler.fit_transform(X_train_d[numerical_cols])
X_test_d[numerical_cols] = scaler.fit_transform(X_test_d[numerical_cols])


### Logistic Reg

In [5]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_diabetes_model = LogisticRegression(solver='lbfgs')

# Fit the model using training data
lr_diabetes_model.fit(X_train_d, y_train_d)

In [6]:
# Make a prediction using the testing data
diabetes_pred = lr_diabetes_model.predict(X_test_d)

In [7]:
diabetes_class_report = classification_report(y_test_d, diabetes_pred)
print(diabetes_class_report)

              precision    recall  f1-score   support

         0.0       0.76      0.73      0.74      8791
         1.0       0.74      0.77      0.75      8882

    accuracy                           0.75     17673
   macro avg       0.75      0.75      0.75     17673
weighted avg       0.75      0.75      0.75     17673



### KNN Model

In [8]:
X_test_d_c = np.ascontiguousarray(X_test_d)

In [9]:
knn_diabetes = KNeighborsClassifier(n_neighbors=3)
knn_diabetes.fit(X_train_d, y_train_d)
diabetes_pred_knn = knn_diabetes.predict(X_test_d_c)



In [10]:
print(classification_report(y_test_d, diabetes_pred_knn))

              precision    recall  f1-score   support

         0.0       0.70      0.68      0.69      8791
         1.0       0.69      0.72      0.70      8882

    accuracy                           0.70     17673
   macro avg       0.70      0.70      0.70     17673
weighted avg       0.70      0.70      0.70     17673



## Stroke Model

In [11]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
file_path = Path("stroke_data.csv")
df_stroke_data = pd.read_csv(file_path)

# Review the DataFrame
df_stroke_data = df_stroke_data.dropna()
print(df_stroke_data.isnull().sum())
df_stroke_data.head()

sex                  0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64


Unnamed: 0,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,1.0,63.0,0,1,1,4,1,228.69,36.6,1,1
1,1.0,42.0,0,1,1,4,0,105.92,32.5,0,1
2,0.0,61.0,0,0,1,4,1,171.23,34.4,1,1
3,1.0,41.0,1,0,1,3,0,174.12,24.0,0,1
4,1.0,85.0,0,0,1,4,1,186.21,29.0,1,1


In [12]:
# Separate the y variable, the labels
y2 = df_stroke_data['stroke']

# Separate the X variable, the features
X2 = df_stroke_data.drop(columns='stroke')

In [13]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X2, y2)

numerical_cols2 = ['age', 'work_type', 'avg_glucose_level', 'bmi']
X_train_s[numerical_cols2] = scaler.fit_transform(X_train_s[numerical_cols2])
X_test_s[numerical_cols2] = scaler.fit_transform(X_test_s[numerical_cols2])

### Logistic Reg

In [14]:
# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
lr_stroke_model = LogisticRegression(solver='lbfgs')

# Fit the model using training data
lr_stroke_model.fit(X_train_s, y_train_s)

In [15]:
# Make a prediction using the testing data
stroke_pred = lr_stroke_model.predict(X_test_s)

In [16]:
stroke_class_report = classification_report(y_test_s, stroke_pred)
print(stroke_class_report)

              precision    recall  f1-score   support

           0       0.66      0.75      0.70      5082
           1       0.71      0.62      0.66      5145

    accuracy                           0.68     10227
   macro avg       0.69      0.68      0.68     10227
weighted avg       0.69      0.68      0.68     10227



### KNN 

In [17]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_s, y_train_s)
stroke_pred_knn = knn.predict(X_test_s)

In [18]:
print(classification_report(y_test_s, stroke_pred_knn))

              precision    recall  f1-score   support

           0       0.98      0.82      0.89      5082
           1       0.85      0.98      0.91      5145

    accuracy                           0.90     10227
   macro avg       0.91      0.90      0.90     10227
weighted avg       0.91      0.90      0.90     10227

