In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [3]:
df = pd.read_csv("../../datasets/stroke/healthcare-dataset-stroke-data.csv")
df = df.dropna()
df_x = df.drop("stroke", axis=1)
df_y = df['stroke']

In [4]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
df_x['gender'] = df_x['gender'].astype("category").cat.codes
df_x['ever_married'] = df_x['ever_married'].astype("category").cat.codes
df_x['work_type'] = df_x['work_type'].astype("category").cat.codes
df_x['Residence_type'] = df_x['Residence_type'].astype("category").cat.codes
df_x['smoking_status'] = df_x['smoking_status'].astype("category").cat.codes

In [6]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=.2)

In [7]:
model_ada = AdaBoostClassifier()
model_forest = RandomForestClassifier()
model_tree = DecisionTreeClassifier()
model_neigh = KNeighborsClassifier()
model_svm = SVC()
model_xg = XGBClassifier()

In [8]:
model_ada.fit(x_train, y_train)
model_forest.fit(x_train, y_train)
model_tree.fit(x_train, y_train)
model_neigh.fit(x_train, y_train)
model_svm.fit(x_train, y_train)
model_xg.fit(x_train, y_train)





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [9]:
pred_ada = model_ada.predict(x_test)
pred_forest = model_forest.predict(x_test)
pred_tree = model_tree.predict(x_test)
pred_neigh = model_neigh.predict(x_test)
pred_svm = model_svm.predict(x_test)
pred_xg = model_xg.predict(x_test)


In [10]:
metrics = {}
def get_metrics(x, name, accuracy_score):
    metrics[name] = x['macro avg']
    metrics[name]['accuracy'] = accuracy_score

def print_metrics(metrics):
    print(pd.DataFrame(metrics))


In [11]:
get_metrics(classification_report(y_test, pred_ada, output_dict=1, zero_division=1), "ada", accuracy_score(y_test, pred_ada))
get_metrics(classification_report(y_test, pred_forest, output_dict=1, zero_division=1), "forest", accuracy_score(y_test, pred_forest))
get_metrics(classification_report(y_test, pred_tree, output_dict=1, zero_division=1), "tree", accuracy_score(y_test, pred_tree))
get_metrics(classification_report(y_test, pred_neigh, output_dict=1, zero_division=1), "neighbours", accuracy_score(y_test, pred_neigh))
get_metrics(classification_report(y_test, pred_svm, output_dict=1, zero_division=1), "svm", accuracy_score(y_test, pred_svm))
get_metrics(classification_report(y_test, pred_xg, output_dict=1, zero_division=1), "xg", accuracy_score(y_test, pred_xg))

In [12]:
print_metrics(metrics)

                  ada      forest        tree  neighbours         svm  \
precision    0.642663    0.475510    0.541370    0.975560    0.975560   
recall       0.509346    0.498929    0.540551    0.500000    0.500000   
f1-score     0.506801    0.486938    0.540954    0.487474    0.487474   
support    982.000000  982.000000  982.000000  982.000000  982.000000   
accuracy     0.950102    0.949084    0.915479    0.951120    0.951120   

                   xg  
precision    0.576337  
recall       0.516551  
f1-score     0.520317  
support    982.000000  
accuracy     0.945010  


# SVM WINS

In [147]:
# Lets ensemble the best ones.
def ensemble(x, y):
    preds = pd.Series([])
    for index, rows in x.iterrows():
        pred_tree = model_tree.predict(rows.to_numpy().reshape(1,-1))
        pred_svm = model_svm.predict(rows.to_numpy().reshape(1,-1))
        pred_xg = model_xg.predict(pd.DataFrame(rows).T)
        pred_f = np.round((pred_tree[0]+pred_svm[0]+pred_xg[0])/3)
        preds = preds.append(pd.Series(pred_f), ignore_index=1)
    return preds

In [148]:
preds = ensemble(x_test, y_test)

  preds = pd.Series([])


In [149]:
classification_report(y_test, preds, output_dict=1)["macro avg"]

{'precision': 0.5592554644808744,
 'recall': 0.5077400071377587,
 'f1-score': 0.5049059530734923,
 'support': 982}

In [150]:
accuracy_score(y_test, preds)

0.9470468431771895