# Group 4: Predicting Strokes

## Data Cleaning
First, we open the strokes dataset, then we rename the columns using a dictionary of names.

In [1]:
import pandas as pd

# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv("strokes.csv")

print(f"Total of {df.shape[0]} entries")
# Summing the boolean mask counts the stroke rows without materialising a
# filtered copy of the frame.
print(f"{(df['Stroke'] == 1).sum()} of these had strokes")

# Original CSV header -> snake_case name used by the later cells.
new_column_names = {
    "Diabetes_binary": "diabetes", "HighBP": "high_blood_pressure",
    "HighChol": "high_cholesterol", "CholCheck": "cholesterol_checked",
    "BMI": "bmi", "Smoker": "smokes", "Stroke": "stroke",
    "HeartDiseaseorAttack": "heart_disease", "PhysActivity": "physical_activity",
    "Fruits": "eats_fruits", "Veggies": "eats_veggies",
    "HvyAlcoholConsump": "drinks_alcohol", "AnyHealthcare": "has_healthcare",
    "NoDocbcCost": "medical_costs", "GenHlth": "general_health",
    "MentHlth": "mental_health", "PhysHlth": "physical_health",
    "DiffWalk": "difficulty_walking",
    "Sex": "gender", "Age": "age", "Education": "education", "Income": "income",
}

# Passing the dict through `columns=` renames every listed column and leaves
# any column that is absent from the mapping untouched. The previous lambda
# lookup raised KeyError as soon as the CSV carried one unexpected column
# (for example a saved index column).
df = df.rename(columns=new_column_names)

df.head()

Total of 70692 entries
4395 of these had strokes


Unnamed: 0,diabetes,high_blood_pressure,high_cholesterol,cholesterol_checked,bmi,smokes,stroke,heart_disease,physical_activity,eats_fruits,...,has_healthcare,medical_costs,general_health,mental_health,physical_health,difficulty_walking,gender,age,education,income
0,0.0,1.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,5.0,30.0,0.0,1.0,4.0,6.0,8.0
1,0.0,1.0,1.0,1.0,26.0,1.0,1.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,1.0,12.0,6.0,8.0
2,0.0,0.0,0.0,1.0,26.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,1.0,0.0,10.0,0.0,1.0,13.0,6.0,8.0
3,0.0,1.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,3.0,0.0,1.0,11.0,6.0,8.0
4,0.0,0.0,0.0,1.0,29.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,8.0,5.0,8.0


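Next, we split the data into those with a stroke and those without a stroke. Because strokes are rare in the dataset, we keep every stroke entry, randomly sample two non-stroke entries for each stroke entry, and combine them into a more balanced dataset.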

In [2]:
import random

# Number of non-stroke rows kept for every stroke row (2 -> one stroke row
# for every two non-stroke rows).
no_strokes_per_stroke = 2

with_stroke = df[df["stroke"] == 1]

# Seed the stdlib RNG so the same non-stroke rows are drawn on every run.
random.seed(1234)
# random.sample draws without replacement and raises ValueError if more rows
# are requested than there are non-stroke rows.
random_choices = random.sample(
    df.index[df["stroke"] == 0].tolist(),
    no_strokes_per_stroke * len(with_stroke),
)
# `random_choices` holds index *labels*, so select with .loc. DataFrame.take
# interprets its argument as *positions*, which only coincides with the labels
# while `df` still has its default 0..n-1 index.
without_stroke = df.loc[random_choices]

# reset_index() (without drop=True) keeps each row's original label in a new
# "index" column; later cells select feature columns by name, so that column
# is not used as a feature.
balanced_df = pd.concat([with_stroke, without_stroke], axis=0).reset_index()

Next, we split the data into the general and specific datasets, and write them to their corresponding CSVs.

In [4]:
# Predictors that make up the "general" feature set.
general_columns = [
    "gender",
    "age",
    "income",
    "education",
    "bmi",
    "smokes",
    "eats_fruits",
    "eats_veggies",
    "drinks_alcohol",
    "physical_activity",
]

# The "specific" feature set is the general set followed by these extra columns.
specific_columns = [
    *general_columns,
    "diabetes",
    "high_blood_pressure",
    "high_cholesterol",
    "cholesterol_checked",
    "heart_disease",
    "general_health",
    "mental_health",
    "physical_health",
    "difficulty_walking",
    "has_healthcare",
    "medical_costs",
]

# Target vector and the two feature matrices, all taken from the balanced frame.
Y = balanced_df["stroke"]
general_x = balanced_df[general_columns]
specific_x = balanced_df[specific_columns]

# Persist each table to its own CSV, omitting the pandas index column.
for export_table, export_path in (
    (Y, "Y.csv"),
    (general_x, "general_x.csv"),
    (specific_x, "specific_x.csv"),
):
    export_table.to_csv(export_path, index=False)

In [7]:
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
import time
import numpy as np



# (display label, kernel-specific SVC arguments) for every SVM variant compared
# in this cell. "poly" without an explicit degree uses SVC's default degree.
svm_kernel_params = [
    ("Poly SVM of degree 3", {"kernel": "poly"}),
    ("Poly SVM of degree 4", {"kernel": "poly", "degree": 4}),
    ("Linear SVM", {"kernel": "linear"}),
    ("rbf SVM", {"kernel": "rbf"}),
]

names = [label for label, _ in svm_kernel_params]

# Two independent, identically configured model lists: `classifiers` is fitted
# on the general features and `classifiers2` on the specific features.
# NOTE(review): the SVCs are fitted on unscaled features even though
# StandardScaler and make_pipeline are imported in this cell. SVMs are
# sensitive to feature scale, and every kernel reports the same general
# accuracy in the recorded output - consider
# make_pipeline(StandardScaler(), SVC(...)) and confirm the effect.
classifiers = [
    SVC(C=0.025, random_state=42, **params) for _, params in svm_kernel_params
]
classifiers2 = [
    SVC(C=0.025, random_state=42, **params) for _, params in svm_kernel_params
]

# Same test_size and random_state for both calls, so the general and specific
# splits contain the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    general_x, Y, test_size=0.3, random_state=42
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    specific_x, Y, test_size=0.3, random_state=42
)

# Per-model results, in the same order as `names`.
general_accuracies = []
specific_accuracies = []

# (true labels, predicted labels) pairs kept for later confusion matrices.
general_confusion = []
specific_confusion = []


# iterate over classifiers
for name, clf, clf2 in zip(names, classifiers, classifiers2):
    print(f"{name}:")

    # --- general feature set ---
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    t0 = time.perf_counter()
    clf.fit(X_train, y_train)
    t1 = time.perf_counter()
    y_pred = clf.predict(X_test)
    t2 = time.perf_counter()
    accuracy = accuracy_score(y_test, y_pred)
    general_accuracies.append(accuracy)
    general_confusion.append((y_test, y_pred))
    print(f"General Accuracy: {round(accuracy*100,1)}%")
    print(f"Took {round((t1-t0) * 1000, 1)} ms to train")
    print(f"Took {round((t2-t1) * 1000, 1)} ms to run\n")

    # --- specific feature set ---
    t0 = time.perf_counter()
    clf2.fit(X_train2, y_train2)
    t1 = time.perf_counter()
    y_pred2 = clf2.predict(X_test2)
    t2 = time.perf_counter()
    accuracy = accuracy_score(y_test2, y_pred2)
    specific_accuracies.append(accuracy)
    specific_confusion.append((y_test2, y_pred2))
    print(f"Specific Accuracy: {round(accuracy*100,1)}%")
    print(f"Took {round((t1-t0) * 1000, 1)} ms to train")
    print(f"Took {round((t2-t1) * 1000, 1)} ms to run\n\n")
    

    

Poly SVM of degree 3:
General Accuracy: 65.7%
Took 1068.3 ms to train
Took 221.4 ms to run

Specific Accuracy: 69.2%
Took 1059.4 ms to train
Took 274.4 ms to run


Poly SVM of degree 4:
General Accuracy: 65.7%
Took 1249.5 ms to train
Took 230.9 ms to run

Specific Accuracy: 69.9%
Took 1143.1 ms to train
Took 281.0 ms to run


Linear SVM:
General Accuracy: 65.7%
Took 855.7 ms to train
Took 192.2 ms to run

Specific Accuracy: 73.8%
Took 912.7 ms to train
Took 232.6 ms to run


rbf SVM:
General Accuracy: 65.7%
Took 986.9 ms to train
Took 756.9 ms to run

Specific Accuracy: 69.4%
Took 1190.8 ms to train
Took 858.6 ms to run




In [6]:
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
import time
import numpy as np



# Display labels, in the same order as the model lists below.
names = [
    "Random Forest",
    # This entry was labelled "Gaussian Process", but the model in this slot is
    # GaussianNB (Gaussian naive Bayes), not GaussianProcessClassifier.
    "Gaussian Naive Bayes",
    "AdaBoost",
    "Decision Tree",
    "Linear SVM",
    "Logistic Regression",
]

# Models fitted on the general feature set.
classifiers = [
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    GaussianNB(),
    AdaBoostClassifier(random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    SVC(kernel="linear", C=0.025, random_state=42),
    LogisticRegression(max_iter=10000, random_state=42),
]

# Identically configured, independent instances fitted on the specific
# feature set.
classifiers2 = [
    RandomForestClassifier(
        max_depth=5, n_estimators=10, max_features=1, random_state=42
    ),
    GaussianNB(),
    AdaBoostClassifier(random_state=42),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    SVC(kernel="linear", C=0.025, random_state=42),
    LogisticRegression(max_iter=10000, random_state=42),
]

# Same test_size and random_state for both calls, so the general and specific
# splits contain the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    general_x, Y, test_size=0.3, random_state=42
)

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    specific_x, Y, test_size=0.3, random_state=42
)

# Per-model results, in the same order as `names`.
general_accuracies = []
specific_accuracies = []

# (true labels, predicted labels) pairs kept for later confusion matrices.
general_confusion = []
specific_confusion = []


# iterate over classifiers
for name, clf, clf2 in zip(names, classifiers, classifiers2):
    print(f"{name}:")

    # --- general feature set ---
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump if the system
    # clock is adjusted.
    t0 = time.perf_counter()
    clf.fit(X_train, y_train)
    t1 = time.perf_counter()
    y_pred = clf.predict(X_test)
    t2 = time.perf_counter()
    accuracy = accuracy_score(y_test, y_pred)
    general_accuracies.append(accuracy)
    general_confusion.append((y_test, y_pred))
    print(f"General Accuracy: {round(accuracy*100,1)}%")
    print(f"Took {round((t1-t0) * 1000, 1)} ms to train")
    print(f"Took {round((t2-t1) * 1000, 1)} ms to run\n")

    # --- specific feature set ---
    t0 = time.perf_counter()
    clf2.fit(X_train2, y_train2)
    t1 = time.perf_counter()
    y_pred2 = clf2.predict(X_test2)
    t2 = time.perf_counter()
    accuracy = accuracy_score(y_test2, y_pred2)
    specific_accuracies.append(accuracy)
    specific_confusion.append((y_test2, y_pred2))
    print(f"Specific Accuracy: {round(accuracy*100,1)}%")
    print(f"Took {round((t1-t0) * 1000, 1)} ms to train")
    print(f"Took {round((t2-t1) * 1000, 1)} ms to run\n\n")
    
def ensemble(classifiers, classifiers2, X_test, X_test2, y_test, y_test2, general_accuracies, specific_accuracies):
    """Score an accuracy-weighted vote of already-fitted classifiers.

    Each model's prediction is multiplied by its weight, the weighted
    predictions are averaged, and the average is rounded to the nearest class.
    This assumes the models predict binary 0/1 labels - TODO confirm if the
    target ever changes.

    Args:
        classifiers: fitted models for the general feature set.
        classifiers2: fitted models for the specific feature set, paired
            position by position with ``classifiers``.
        X_test, X_test2: feature matrices to predict on (general / specific).
        y_test, y_test2: true labels matching ``X_test`` / ``X_test2``.
        general_accuracies, specific_accuracies: one weight per model, indexed
            in the same order as the classifier lists.

    Returns:
        None. The two ensemble accuracies are printed.

    Raises:
        ValueError: if the weights that were applied sum to zero (for example
            when no classifiers are supplied).

    Side effects:
        Appends ``(y_true, y_pred)`` pairs to the module-level
        ``general_confusion`` and ``specific_confusion`` lists, which are not
        passed in as parameters.

    NOTE(review): the visible call site passes accuracies that were measured
    on the same test split that is scored here, so the weights have already
    seen the evaluation data - consider weighting by a validation score.
    """
    y_pred = np.zeros(y_test.shape)
    y_pred2 = np.zeros(y_test2.shape)

    # Track the weights that are actually applied. zip() stops at the shorter
    # classifier list, so dividing by the sum of *all* supplied accuracies
    # would mis-normalise whenever the accuracy lists are longer than that.
    total_weight = 0
    total_weight2 = 0
    for i, (clf, clf2) in enumerate(zip(classifiers, classifiers2)):
        weight = general_accuracies[i]
        weight2 = specific_accuracies[i]
        y_pred += weight * clf.predict(X_test)
        y_pred2 += weight2 * clf2.predict(X_test2)
        total_weight += weight
        total_weight2 += weight2

    if total_weight == 0 or total_weight2 == 0:
        # Dividing by zero would silently yield NaN predictions.
        raise ValueError("ensemble needs at least one classifier with a non-zero weight")

    y_pred /= total_weight
    y_pred2 /= total_weight2

    # numpy rounds exact halves to the nearest even value, so a weighted mean
    # of exactly 0.5 becomes class 0.
    y_pred = y_pred.round(0)
    y_pred2 = y_pred2.round(0)

    accuracy = accuracy_score(y_test, y_pred)
    accuracy2 = accuracy_score(y_test2, y_pred2)
    general_confusion.append((y_test, y_pred))
    specific_confusion.append((y_test2, y_pred2))

    print("Ensemble:")
    print(f"General Accuracy: {round(accuracy*100,1)}%")
    print(f"Specific Accuracy: {round(accuracy2*100,1)}%")

# Score the weighted vote of the models fitted above, using each model's
# recorded accuracy as its weight.
ensemble(
    classifiers=classifiers,
    classifiers2=classifiers2,
    X_test=X_test,
    X_test2=X_test2,
    y_test=y_test,
    y_test2=y_test2,
    general_accuracies=general_accuracies,
    specific_accuracies=specific_accuracies,
)
    

Random Forest:
General Accuracy: 66.2%
Took 23.8 ms to train
Took 4.7 ms to run

Specific Accuracy: 70.8%
Took 20.4 ms to train
Took 2.5 ms to run


Gaussian Process:
General Accuracy: 66.7%
Took 2.1 ms to train
Took 0.7 ms to run

Specific Accuracy: 72.3%
Took 2.2 ms to train
Took 0.7 ms to run


AdaBoost:
General Accuracy: 68.4%
Took 151.8 ms to train
Took 12.6 ms to run

Specific Accuracy: 75.1%
Took 188.8 ms to train
Took 12.3 ms to run


Decision Tree:
General Accuracy: 67.4%
Took 6.7 ms to train
Took 0.7 ms to run

Specific Accuracy: 74.3%
Took 9.8 ms to train
Took 0.9 ms to run


Linear SVM:
General Accuracy: 65.7%
Took 898.4 ms to train
Took 195.7 ms to run

Specific Accuracy: 69.4%
Took 1203.1 ms to train
Took 859.5 ms to run


Logistic Regression:
General Accuracy: 68.3%
Took 140.0 ms to train
Took 7.6 ms to run

Specific Accuracy: 75.1%
Took 618.9 ms to train
Took 3.5 ms to run


Ensemble:
General Accuracy: 68.4%
Specific Accuracy: 75.1%
