In [56]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, classification_report, roc_curve, plot_roc_curve, auc, precision_recall_curve, plot_precision_recall_curve, average_precision_score
from sklearn.model_selection import cross_val_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier



In [57]:
# Read csv file
df = pd.read_csv("healthcare-dataset-stroke-data.csv")
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [58]:
# Drop ID column
df1 = df.drop('id', axis=1)
df1.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [59]:
df1.shape

(5110, 11)

In [60]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 439.3+ KB


bmi Column has 4909 null values out of 5110. We will take a closer look at the bmi column.

In [61]:
# Check for null values
df1.isnull().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [62]:
df1.mean()

age                   43.226614
hypertension           0.097456
heart_disease          0.054012
avg_glucose_level    106.147677
bmi                   28.893237
stroke                 0.048728
dtype: float64

In [63]:
df1.median()

age                  45.000
hypertension          0.000
heart_disease         0.000
avg_glucose_level    91.885
bmi                  28.100
stroke                0.000
dtype: float64

In [64]:
# Replace null values with the median
df1["bmi"].fillna(df1["bmi"].median(), inplace=True)
df1

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,28.1,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,28.1,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [65]:
# Check for unique values
df1['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [66]:
df1.gender.value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [67]:
# Check for unique values
df1['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [68]:
df1["smoking_status"].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [69]:
# Drop 'Other' values from Gender column
other_value = df1[df1['gender'] =='Other'].index
df1 = df1.drop(other_value)

In [70]:
# Define independent variables (X) and predictive variable (y)
X = df1.drop(["stroke"], axis=1)
X = pd.get_dummies(X)

y = df1["stroke"]

In [71]:
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0
1,61.0,0,0,202.21,28.1,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
2,80.0,0,1,105.92,32.5,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0


In [72]:
# Check the balance of our target values
y.value_counts()

0    4860
1     249
Name: stroke, dtype: int64

In [73]:
# Normalize data for numerical variables 
numerical_cols = X.select_dtypes(["float64","int64"])
scaler = StandardScaler()
X[numerical_cols.columns] = scaler.fit_transform(X[numerical_cols.columns])

In [74]:
X.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,1.051242,-0.328637,4.184599,2.70645,1.004893,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0
1,0.785889,-0.328637,-0.238972,2.121652,-0.099142,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0
2,1.626174,-0.328637,4.184599,-0.004867,0.472358,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0
3,0.255182,-0.328637,-0.238972,1.437473,0.719142,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,1.581949,3.042866,-0.238972,1.501297,-0.631677,1,0,0,1,0,0,0,1,0,1,0,0,0,1,0


In [75]:
# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((4087, 20), (4087,), (1022, 20), (1022,))

In [76]:
# Check the numbers of positive and negative predicted stroke in training set
print (sum(y_train == 1))
print (sum(y_train == 0))

187
3900


#### Only 4.7% of our data predicts a posivive outcome (stroke). In order to get accurate prediction we will have to resample with SMOTE.

In [77]:
from imblearn.over_sampling import SMOTE

In [78]:
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train.ravel())

In [79]:
print (X_train.shape)
print (y_train.shape)
print (sum(y_train == 1))
print (sum(y_train == 0))

(7800, 20)
(7800,)
3900
3900


## Suport Vector Machine

In [83]:
# Import model
model = SVC(probability=True)

# Fit the model
model.fit(X_train, y_train)

# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
acc_score = balanced_accuracy_score(y_test, y_pred)
acc_score

0.5395161290322581

In [85]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [87]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(f"Accuracy Score : {acc_score}")
print(classification_report_imbalanced(y_test, y_pred))

Accuracy Score : 0.5395161290322581
                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.95      0.13      0.95      0.35      0.13       960
          1       0.14      0.13      0.95      0.14      0.35      0.11        62

avg / total       0.90      0.90      0.18      0.90      0.35      0.13      1022



In [89]:
score = cross_val_score(model, X_train, y_train, cv = 6)
precision = precision_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print ('train score of SVC is', score.mean(),'%')
print ('Precision score is ', precision)
print ('ROC Score is', roc)
print ('Recall Score is ', recall)

train score of SVC is 0.9288461538461538 %
Precision score is  0.14285714285714285
ROC Score is 0.539516129032258
Recall Score is  0.12903225806451613
