# Mid Term Project - Heart Failure Detection

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.metrics import mean_squared_error
import xgboost as xgb

%matplotlib inline
plt.rcParams["figure.figsize"] = [12,7]

## Load Dataset

In [2]:
df = pd.read_csv("heart_failure_clinical_records_dataset.csv")
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


Here, the target variable is `DEATH_EVENT`. That means, it is a binary classification problem.

## Exploratory Data Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


From dataset description, we can see that:
- The shape of the dataset is (299, 13). 
- There is no missing value.
- The data type of `age` is float. We'll convert it to integer.

In [4]:
df["age"] = df["age"].astype(int)

In [5]:
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.829431,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894997,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


From dataset description, we can see that:
- `anaemia`, `diabetes`, `high_blood_pressure`, `sex`, `smoking` and `DEATH_EVENT` are qualitative variables. One easy way to find out is to check the min and max values of those variables. We need to decode these variables.
- The rest are quantitative variables.

Decode the qualitative variables:

In [6]:
binary_values = {
    0: "no",
    1: "yes"
}
df["diabetes"] = df["diabetes"].map(binary_values)
df["high_blood_pressure"] = df["high_blood_pressure"].map(binary_values)
df["smoking"] = df["smoking"].map(binary_values)
df["DEATH_EVENT"] = df["DEATH_EVENT"].map(binary_values)


anaemia_values = {
    0: "decreased",
    1: "not decreased"
}
df["anaemia"] = df["anaemia"].map(anaemia_values)


sex_values = {
    0: "female",
    1: "male"
}
df["sex"] = df["sex"].map(sex_values)

In [7]:
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75,decreased,582,no,20,yes,265000.0,1.9,130,male,no,4,yes
1,55,decreased,7861,no,38,no,263358.03,1.1,136,male,no,6,yes
2,65,decreased,146,no,20,no,162000.0,1.3,129,male,yes,7,yes
3,50,not decreased,111,no,20,no,210000.0,1.9,137,male,no,7,yes
4,65,not decreased,160,yes,20,no,327000.0,2.7,116,female,no,8,yes


In [10]:
# sns.histplot(np.log1p(df["creatinine_phosphokinase"]), bins=30)
df["DEATH_EVENT"].value_counts()

no     203
yes     96
Name: DEATH_EVENT, dtype: int64

In [None]:
- Do normalization
- train val test split
- dictvectorize
- run decision tree
- run random forest
- run xgboost