# Death events by heart attack: prediction model
### The data is imbalanced and I have not treated that and the dataset contained also has its outliers removed.

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

In [3]:
df=pd.read_csv("removed_outliers_data.csv")
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
2,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
3,90.0,1,47,0,40,1,204000.0,2.1,132,1,1,8,1
4,75.0,1,246,0,15,0,127000.0,1.2,137,1,0,10,1


In [4]:
#Count check of classes in dependent variable
death_event_counts=df['DEATH_EVENT'].value_counts()
print(death_event_counts)

DEATH_EVENT
0    163
1     61
Name: count, dtype: int64


In [24]:
#Selecting  important features
filtered_df = df[['age', 'ejection_fraction', 'serum_creatinine','DEATH_EVENT']]
X=filtered_df.iloc[:,0:3]
y=filtered_df.iloc[:,3]
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [25]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

In [26]:
#Count of y variable's test set 
death_event_counts2 = y_test.value_counts()
print(death_event_counts2 )

DEATH_EVENT
0    33
1    12
Name: count, dtype: int64


### 1.  Logistic Regression

In [27]:
lr1 = LogisticRegression()
lr1.fit(X_train,y_train)

ytrain_pred_lr1=lr1.predict(X_train)
ytest_pred_lr1=lr1.predict(X_test)

classifiers = [lr1, lr1]
predictions = [ytest_pred_lr1, ytrain_pred_lr1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")

train_report1 = classification_report(y_train, ytrain_pred_lr1)
test_report1 = classification_report(y_test, ytest_pred_lr1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report1)
print("\nClassification Report for Testing Data:\n", test_report1)

Accuracy for Testing Data: 68.89%
Accuracy for Training Data: 82.68%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.85      0.93      0.89       130
           1       0.75      0.55      0.64        49

    accuracy                           0.83       179
   macro avg       0.80      0.74      0.76       179
weighted avg       0.82      0.83      0.82       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.77      0.82      0.79        33
           1       0.40      0.33      0.36        12

    accuracy                           0.69        45
   macro avg       0.59      0.58      0.58        45
weighted avg       0.67      0.69      0.68        45



In [28]:
# Hyperparametric tuning for Logistic Regression
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  # Different values of the regularization parameter
    'penalty': ['l1', 'l2']  # Types of regularization (L1 or L2)
}

# Create a GridSearchCV instance
grid_search = GridSearchCV(estimator=lr1, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform the grid search on the training data
grid_search.fit(X_train, y_train)

# Get the best hyperparameters from the grid search
best_params = grid_search.best_params_
best_score = grid_search.best_score_

# Print the best hyperparameters and the corresponding accuracy
print("Best Hyperparameters:", best_params)
print("Best Accuracy:", best_score)


Best Hyperparameters: {'C': 0.1, 'penalty': 'l2'}
Best Accuracy: 0.8100000000000002


In [29]:
# Create a LogisticRegression instance with the best hyperparameters
best_lr = LogisticRegression(C=best_params['C'], penalty=best_params['penalty'])

# Fit the best logistic regression model to the training data
best_lr.fit(X_train, y_train)

# Now you can use the best_lr model for predictions and evaluation
ytrain_pred_best_lr = best_lr.predict(X_train)
ytest_pred_best_lr = best_lr.predict(X_test)

# Calculate the classification report for the best_lr model
train_report_best_lr = classification_report(y_train, ytrain_pred_best_lr)
test_report_best_lr = classification_report(y_test, ytest_pred_best_lr)

# Print the classification reports
print("Classification Report for Training Data (Best LR):\n", train_report_best_lr)
print("\nClassification Report for Testing Data (Best LR):\n", test_report_best_lr)


Classification Report for Training Data (Best LR):
               precision    recall  f1-score   support

           0       0.82      0.98      0.89       130
           1       0.88      0.45      0.59        49

    accuracy                           0.83       179
   macro avg       0.85      0.71      0.74       179
weighted avg       0.84      0.83      0.81       179


Classification Report for Testing Data (Best LR):
               precision    recall  f1-score   support

           0       0.74      0.88      0.81        33
           1       0.33      0.17      0.22        12

    accuracy                           0.69        45
   macro avg       0.54      0.52      0.51        45
weighted avg       0.63      0.69      0.65        45



### Since the dataset is not balanced, i.e., its 0s are more in number than 1s, hence hyperparametric tuning is not going to do much good. As you can see above, after hpt, the precision and recall scores worsened as seen before hpt. Hence, I have not performed one for either models below. 

### 2. SVM

In [12]:
svm1=SVC(kernel='rbf',random_state=0)
svm1.fit(X_train,y_train)

ytrain_pred_svm1=svm1.predict(X_train)
ytest_pred_svm1=svm1.predict(X_test)

classifiers = [svm1, svm1]
predictions = [ytest_pred_svm1, ytrain_pred_svm1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")
    
train_report2 = classification_report(y_train, ytrain_pred_svm1)
test_report2 = classification_report(y_test, ytest_pred_svm1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report2)
print("\nClassification Report for Testing Data:\n", test_report2)


Accuracy for Testing Data: 73.33%
Accuracy for Training Data: 84.92%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.88      0.92      0.90       130
           1       0.76      0.65      0.70        49

    accuracy                           0.85       179
   macro avg       0.82      0.79      0.80       179
weighted avg       0.84      0.85      0.85       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.80      0.85      0.82        33
           1       0.50      0.42      0.45        12

    accuracy                           0.73        45
   macro avg       0.65      0.63      0.64        45
weighted avg       0.72      0.73      0.73        45



### 3. KNN

In [13]:
knn1=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
knn1.fit(X_train,y_train)

ytrain_pred_knn1=knn1.predict(X_train)
ytest_pred_knn1=knn1.predict(X_test)

classifiers = [knn1, knn1]
predictions = [ytest_pred_knn1, ytrain_pred_knn1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")
    
train_report3 = classification_report(y_train, ytrain_pred_knn1)
test_report3 = classification_report(y_test, ytest_pred_knn1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report3)
print("\nClassification Report for Testing Data:\n", test_report3)


Accuracy for Testing Data: 66.67%
Accuracy for Training Data: 83.80%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.85      0.94      0.89       130
           1       0.78      0.57      0.66        49

    accuracy                           0.84       179
   macro avg       0.82      0.75      0.78       179
weighted avg       0.83      0.84      0.83       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.78      0.76      0.77        33
           1       0.38      0.42      0.40        12

    accuracy                           0.67        45
   macro avg       0.58      0.59      0.58        45
weighted avg       0.68      0.67      0.67        45



### 4. Niave bayes

In [14]:
nb1=BernoulliNB()
nb1.fit(X_train,y_train)

ytrain_pred_nb1=nb1.predict(X_train)
ytest_pred_nb1=nb1.predict(X_test)

classifiers = [nb1, nb1]
predictions = [ytest_pred_nb1, ytrain_pred_nb1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")
    
train_report4 = classification_report(y_train, ytrain_pred_nb1)
test_report4 = classification_report(y_test, ytest_pred_nb1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report4)
print("\nClassification Report for Testing Data:\n", test_report4)


Accuracy for Testing Data: 68.89%
Accuracy for Training Data: 76.54%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.79      0.92      0.85       130
           1       0.62      0.37      0.46        49

    accuracy                           0.77       179
   macro avg       0.71      0.64      0.66       179
weighted avg       0.75      0.77      0.74       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.73      0.91      0.81        33
           1       0.25      0.08      0.12        12

    accuracy                           0.69        45
   macro avg       0.49      0.50      0.47        45
weighted avg       0.60      0.69      0.63        45



### 5. Decision Tree


In [16]:
dt1=DecisionTreeClassifier()
dt1.fit(X_train,y_train)

ytrain_pred_dt1=dt1.predict(X_train)
ytest_pred_dt1=dt1.predict(X_test)

classifiers = [dt1, dt1]
predictions = [ytest_pred_dt1, ytrain_pred_dt1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")
    
train_report5_a = classification_report(y_train, ytrain_pred_dt1)
test_report5_a = classification_report(y_test, ytest_pred_dt1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report5_a)
print("\nClassification Report for Testing Data:\n", test_report5_a)


Accuracy for Testing Data: 71.11%
Accuracy for Training Data: 98.88%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       130
           1       1.00      0.96      0.98        49

    accuracy                           0.99       179
   macro avg       0.99      0.98      0.99       179
weighted avg       0.99      0.99      0.99       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.83      0.76      0.79        33
           1       0.47      0.58      0.52        12

    accuracy                           0.71        45
   macro avg       0.65      0.67      0.66        45
weighted avg       0.74      0.71      0.72        45



### 6. Random Forest

In [19]:
rf1=RandomForestClassifier(n_estimators=100,random_state=0)
rf1.fit(X_train,y_train)

ytrain_pred_rf1=rf1.predict(X_train)
ytest_pred_rf1=rf1.predict(X_test)

classifiers = [rf1, rf1]
predictions = [ytest_pred_rf1, ytrain_pred_rf1]
data_names = ["Testing Data", "Training Data"]

for i in range(len(classifiers)):
    accuracy = accuracy_score(y_test, predictions[i]) if i == 0 else accuracy_score(y_train, predictions[i])
    data_name = data_names[i]
    print(f"Accuracy for {data_name}: {accuracy * 100:.2f}%")
    
train_report6_a = classification_report(y_train, ytrain_pred_rf1)
test_report6_a = classification_report(y_test, ytest_pred_rf1)

# Print the classification reports
print("Classification Report for Training Data:\n", train_report6_a)
print("\nClassification Report for Testing Data:\n", test_report6_a)


Accuracy for Testing Data: 71.11%
Accuracy for Training Data: 98.88%
Classification Report for Training Data:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       130
           1       1.00      0.96      0.98        49

    accuracy                           0.99       179
   macro avg       0.99      0.98      0.99       179
weighted avg       0.99      0.99      0.99       179


Classification Report for Testing Data:
               precision    recall  f1-score   support

           0       0.79      0.82      0.81        33
           1       0.45      0.42      0.43        12

    accuracy                           0.71        45
   macro avg       0.62      0.62      0.62        45
weighted avg       0.70      0.71      0.71        45



### I have observed that decision tree classifier is giving the best results of precision and recall scores as compared to others, hence we are going to go ahead with that model finally.