In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, plot_precision_recall_curve
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from pandas_profiling import ProfileReport
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

  from .autonotebook import tqdm as notebook_tqdm


# Predicting heart failures: Supporting clinical decisions for the health and wellness of cardiovascular disease patients

## Problem Overview

[Heart failure](https://www.uchealth.com/en/conditions/heart-failure) is a progressive condition in which the heart muscle gradually loses its ability to pump blood throughout the body. As the current leading cause of death worldwide, cardiovascular diseases (CVDs) account for over [\$20 billion in direct and indirect costs](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2722492/#:~:text=CVD%20was%20also%20the%20most,and%20physician%20care%20for%2012%25.) to the Canadian healthcare system each year. With no known cure for CVD, patients often require complex treatments to manage the condition.

However, CVDs are preventable in their early stages. Individuals can interfere with the progression of these diseases by controlling certain risk factors once those factors have been identified by their healthcare provider. By applying machine learning to clinical data, I hope to better identify individuals who are at risk of heart failure. The model's predictions could support healthcare providers by reinforcing early detection and intervention strategies.

The goal of this analysis is to predict whether a patient will die from heart failure or survive, based on demographics such as age and sex, along with various lab test results. My model outputs predictions for the `DEATH_EVENT` column: 0 means that the patient survived and 1 means that they died. The data is sourced from the 'Heart Failure Prediction' dataset on [Kaggle](https://www.kaggle.com/andrewmvd/heart-failure-clinical-data).

## Exploratory Data Analysis
First, I split the data into two sets: 75% for training my model and 25% for testing it. I then separate each set into two parts: `X`, the columns whose values I will use to make predictions, and `y`, the column I am trying to predict.

In [16]:
# Load the heart-failure clinical records (CSV expected in the working directory).
hf_df = pd.read_csv('heart_failure_clinical_records_dataset.csv')

# Hold out 25% of the rows for testing; 0.25 is also scikit-learn's default,
# spelled out here so the 75/25 split is visible in the code.
# The fixed random_state keeps the shuffle reproducible between runs.
hf_train, hf_test = train_test_split(hf_df, test_size=0.25, random_state=123)

# Within each partition, separate the predictor columns (X) from the
# DEATH_EVENT target column (y).
X_train, y_train = hf_train.drop(columns=['DEATH_EVENT']), hf_train['DEATH_EVENT']
X_test, y_test = hf_test.drop(columns=['DEATH_EVENT']), hf_test['DEATH_EVENT']

# Display the training partition as the cell's output.
hf_train

FileNotFoundError: [Errno 2] No such file or directory: 'heart_failure_clinical_records_dataset.csv'