In [2]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra

Import Libraries

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from scipy.stats import mode
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC

Read Dataset

In [4]:
# Load the stroke dataset; the path is relative to the notebook's working directory
data_set = pd.read_csv(filepath_or_buffer='data/healthcare-dataset-stroke-data.csv')

In [5]:
# Preview the first rows to sanity-check column names and value formats
data_set.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


Data Cleanup

In [6]:
# 'id' is a row identifier, not a predictor, so it is removed before modelling.
# The frame is rebound (no inplace mutation) and errors='ignore' lets this cell
# be re-run safely after the column has already been dropped.
data_set = data_set.drop(columns='id', errors='ignore')

Check for missing values

In [7]:
# Count the missing entries in every column
print(data_set.isna().sum())

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64


Fill missing values in the 'bmi' column with the column mean

In [8]:
# Impute missing BMI values with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection is a chained assignment that pandas warns about and
# that does not update the frame under Copy-on-Write.
# NOTE(review): the mean is computed on the full dataset, before the train/test
# split, so test rows contribute to the imputed value - confirm this is acceptable.
data_set['bmi'] = data_set['bmi'].fillna(data_set['bmi'].mean())

# Convert categorical variables to numerical using Label Encoding

In [9]:
# A single encoder is refit for each text column in turn, replacing the
# column's string categories with integer codes.
le = LabelEncoder()
for column in ('gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'):
    data_set[column] = le.fit_transform(data_set[column])

In [10]:
# Data Preprocessing

# Separate the 'stroke' label from the predictor columns
y = data_set['stroke']
X = data_set.drop(columns='stroke')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y - with a rare positive class,
# consider whether stratify=y is wanted before comparing models.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling
# The scaler's statistics are learned from the training split only and then
# applied to both splits, so nothing from the test rows enters the fit.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
# Handling Imbalanced Data using SMOTE
# Only the training split is resampled; the test split is left as-is.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train, y=y_train)

# Training the models

In [12]:
# Logistic Regression
# Trained on the resampled training data; max_iter is raised to 1000 iterations.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X=X_train_resampled, y=y_train_resampled)

In [13]:
# Random Forest
# RandomForestClassifier is imported with the other libraries at the top of the
# notebook, so all dependencies are visible in one place.
# Trained on the resampled training data; random_state keeps the fit reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_resampled, y_train_resampled)

In [14]:
# Support Vector Machine (SVM)
# SVC is imported with the other libraries at the top of the notebook, so all
# dependencies are visible in one place.
# NOTE(review): per the scikit-learn docs, SVC appears to use random_state only
# when probability=True, so it likely has no effect here - confirm.
svm = SVC(kernel='rbf', random_state=42)
svm.fit(X_train_resampled, y_train_resampled)

# Evaluating the models

In [15]:
# Logistic Regression
# Predict on the held-out test split, then print each metric in turn.
log_reg_pred = log_reg.predict(X_test)
print("Logistic Regression:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, log_reg_pred))

Logistic Regression:
Accuracy: 0.7583170254403131
Confusion Matrix:
 [[728 232]
 [ 15  47]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.76      0.85       960
           1       0.17      0.76      0.28        62

    accuracy                           0.76      1022
   macro avg       0.57      0.76      0.57      1022
weighted avg       0.93      0.76      0.82      1022



In [16]:
# Random Forest
# Predict on the held-out test split, then print each metric in turn.
rf_pred = rf.predict(X_test)
print("\nRandom Forest:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, rf_pred))


Random Forest:
Accuracy: 0.9187866927592955
Confusion Matrix:
 [[932  28]
 [ 55   7]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.96       960
           1       0.20      0.11      0.14        62

    accuracy                           0.92      1022
   macro avg       0.57      0.54      0.55      1022
weighted avg       0.90      0.92      0.91      1022



In [17]:

# Support Vector Machine (SVM)
# Predict on the held-out test split, then print each metric in turn.
svm_pred = svm.predict(X_test)
print("\nSupport Vector Machine:")
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(heading, metric(y_test, svm_pred))


Support Vector Machine:
Accuracy: 0.776908023483366
Confusion Matrix:
 [[761 199]
 [ 29  33]]
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.79      0.87       960
           1       0.14      0.53      0.22        62

    accuracy                           0.78      1022
   macro avg       0.55      0.66      0.55      1022
weighted avg       0.91      0.78      0.83      1022



# Hybrid Prediction

In [18]:

# Combine predictions from all models
# `mode` is imported with the other libraries at the top of the notebook.
# Stack the per-model test predictions as rows (one row per model), then take
# the most frequent label in each column, i.e. a per-sample majority vote.
combined_pred = np.array([log_reg_pred, rf_pred, svm_pred])
# axis=0 is stated explicitly so the vote is always taken across models.
# The counts go into a named throwaway rather than `_`, which IPython uses for
# the last cell output.
final_pred, _vote_counts = mode(combined_pred, axis=0)
# Depending on the SciPy version the mode array may keep a leading length-1
# axis; flattening here yields one label per test sample either way.
final_pred = np.ravel(final_pred)

In [19]:
# Calculate accuracy of the hybrid model
# Flatten the voted labels to one dimension before scoring against y_test
final_pred = final_pred.ravel()
print("\nHybrid Model Accuracy:", accuracy_score(y_test, final_pred))


Hybrid Model Accuracy: 0.8111545988258317
