In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## **Import important packages**

In [None]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Apply seaborn's 'whitegrid' style to the figures drawn in the cells below.
sns.set_style('whitegrid')

In [None]:
# Read the heart-failure CSV into a DataFrame and preview the first rows.
DATA_PATH = "../input/heart-failure-prediction/heart.csv"
df = pd.read_csv(DATA_PATH)
df.head()

## **Features Description**
* **Age** : age of the patient [years]
* **Sex** : sex of the patient [M: Male, F: Female]
* **ChestPainType** : chest pain type

    - TA: Typical Angina, 
    - ATA: Atypical Angina, 
    - NAP: Non-Anginal Pain, 
    - ASY: Asymptomatic
    
* **RestingBP** : resting blood pressure [mm Hg]
* **Cholesterol**: serum cholesterol [mg/dl]
* **FastingBS** : fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
* **RestingECG** : resting electrocardiogram results 
     - Normal: Normal, 
     - ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), 
     - LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria
     
* **MaxHR** : maximum heart rate achieved [Numeric value between 60 and 202]
* **ExerciseAngina** : exercise-induced angina [Y: Yes, N: No]
* **Oldpeak** : ST depression induced by exercise relative to rest [numeric value]
* **ST_Slope** : the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
* **HeartDisease** : output class [1: heart disease, 0: Normal]

In [None]:
# Dimensions of the loaded frame as (number of rows, number of columns).
df.shape

In [None]:
# Flag rows that repeat an earlier row in every column, then count them.
duplicate_rows = df.duplicated()
duplicate_rows.sum()

In [None]:
# Pairwise correlation between the numeric columns.
# The frame also holds string columns (Sex, ChestPainType, ...); recent pandas
# versions raise on those instead of silently dropping them, so select the
# numeric columns explicitly. select_dtypes works on old and new pandas alike.
df.select_dtypes(include="number").corr()

In [None]:
# Print column names, non-null counts and dtypes.
# info is a method: without the parentheses the cell only shows the
# bound-method repr instead of the summary.
df.info()

Description

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

## **Missing Value**

In [None]:
# Number of missing entries in each column.
missing_mask = df.isna()
missing_mask.sum()

## **Feature Analysis**

1 Age

In [None]:
# --- Summary statistics for Age ---
age_mid = df["Age"].median()
age_mean = df["Age"].mean()
age_std = df["Age"].std()
print("the standard deviation of age is", age_std)

# Split the rows by the HeartDisease label (1: heart disease, 0: normal).
# group1 / group0 are read again by later cells, so these names are kept.
group = df.groupby(by="HeartDisease")
group1 = group.get_group(1)
group0 = group.get_group(0)
age1 = group1["Age"]
age0 = group0["Age"]
mean_age1 = age1.mean()
mean_age0 = age0.mean()
print("the mean of age with heart disease is", mean_age1)
print("the mean of age without heart disease is", mean_age0)

# --- Overall distribution ---
# axvline spans the full axis height, so the y-axis is scaled by the histogram
# itself rather than by a hard-coded line height. legend() is required for the
# 'Median' / 'Mean' labels to actually appear on the figure.
fig_all, ax_all = plt.subplots(figsize=(15, 5))
ax_all.hist(df["Age"])
ax_all.axvline(age_mid, color='#D3E4CD', ls='--', lw=2.5, label='Median')
ax_all.axvline(age_mean, color='#D3E4CD', lw=2.5, label='Mean')
ax_all.set_xlabel("Age")
ax_all.set_ylabel("Frequency")
ax_all.set_title("Age_hist")
ax_all.legend()

# --- Distribution per HeartDisease group, side by side ---
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
ax1 = age1.plot(kind='hist', ax=axs[0])
ax1.axvline(mean_age1, color='#D3E4CD', lw=2.5, label='Mean_age1')
ax1.set_title("With Heart Disease")
ax1.set_xlabel("Age")
ax1.legend()
ax2 = age0.plot(kind='hist', ax=axs[1])
ax2.axvline(mean_age0, color='#D3E4CD', lw=2.5, label='Mean_age0')
ax2.set_title("Without Heart Disease")
ax2.set_xlabel("Age")
ax2.legend()

fig.suptitle('Distribution of Ages by Heart Disease', size=16, c='#000')
plt.show()

**Analysis**: According to the histograms, the Age distribution is approximately Gaussian; the median and mean are approximately equal. After splitting the data set by HeartDisease, the distribution does not seem to change much, so we assume that the "Age" feature does not contribute much to HeartDisease.

2 Sex

In [None]:
import plotly.express as px

# Count patients per sex. The columns are named explicitly ('Sex', 'count')
# because the names produced by value_counts().reset_index() differ between
# pandas versions, which breaks a lookup by 'index' / 'Sex'.
sex_counts = df['Sex'].value_counts().rename_axis('Sex').reset_index(name='count')

# Donut chart (hole = 0.8) of the sex proportions over all patients.
fig = px.pie(sex_counts, values = 'count', names = 'Sex', width = 500, height = 500)
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.8, 
                  marker = dict(colors = ['#dd4124','#009473', '#336b87', '#b4b4b4'], line = dict(color = 'white', width = 2)),
                  hovertemplate = 'Clients: %{value}')

# The chart title is placed in the centre of the donut as an annotation.
fig.update_layout(annotations = [dict(text = 'Proportion of different gender', 
                                      x = 0.5, y = 0.5, font_size = 15, showarrow = False, 
                                      font_family = 'monospace',
                                      font_color = 'black')],
                  showlegend = False)
fig.show()

In [None]:
# Count patients per sex within the heart-disease group (group1).
# The columns are named explicitly ('Sex', 'count') because the names produced
# by value_counts().reset_index() differ between pandas versions, which breaks
# a lookup by 'index' / 'Sex'.
sex_counts_disease = group1['Sex'].value_counts().rename_axis('Sex').reset_index(name='count')

# Donut chart (hole = 0.8) of the sex proportions among patients with heart disease.
fig = px.pie(sex_counts_disease, values = 'count', names = 'Sex', width = 500, height = 500)
fig.update_traces(textposition = 'inside', 
                  textinfo = 'percent + label', 
                  hole = 0.8, 
                  marker = dict(colors = ['#dd4124','#009473', '#336b87', '#b4b4b4'], line = dict(color = 'white', width = 2)),
                  hovertemplate = 'Clients: %{value}')

# The chart title is placed in the centre of the donut as an annotation.
fig.update_layout(annotations = [dict(text = 'Proportion of different gender with disease', 
                                      x = 0.5, y = 0.5, font_size = 12, showarrow = False, 
                                      font_family = 'monospace',
                                      font_color = 'black')],
                  showlegend = False)
fig.show()

**Analysis**: From the pie charts, males make up approximately 90% of the patients with heart disease, so males appear more likely to have heart disease. This might be an important feature for classification.

3 ChestPainType

In [None]:
# Number of patients per chest pain type, split by the HeartDisease label.
chest_pain_fig, chest_pain_ax = plt.subplots(figsize=(15, 5))
ax = sns.countplot(data=df, x="ChestPainType", hue="HeartDisease", ax=chest_pain_ax)

**Analysis**: According to the count plot, patients with one particular chest pain type, ASY (asymptomatic), are more likely to have heart disease. For some of the other pain types, there is no clear evidence that ChestPainType is connected to heart disease.

4 RestingBP

In [None]:
# --- Summary statistics for RestingBP ---
# These must be computed from the RestingBP column; reading the Age column here
# would draw the Age distribution under a RestingBP title.
RestingBP_mid = df["RestingBP"].median()
RestingBP_mean = df["RestingBP"].mean()
RestingBP_std = df["RestingBP"].std()
print("the standard deviation of RestingBP is", RestingBP_std)

# group1 / group0 are the HeartDisease == 1 / == 0 subsets built in the Age cell.
RestingBP1 = group1["RestingBP"]
RestingBP0 = group0["RestingBP"]
mean_RestingBP1 = RestingBP1.mean()
mean_RestingBP0 = RestingBP0.mean()
print("the mean of RestingBP with heart disease is", mean_RestingBP1)
print("the mean of RestingBP without heart disease is", mean_RestingBP0)

# --- Overall distribution ---
# axvline spans the full axis height, so the y-axis is scaled by the histogram
# itself rather than by a hard-coded line height. legend() is required for the
# 'Median' / 'Mean' labels to actually appear on the figure.
fig_all, ax_all = plt.subplots(figsize=(15, 5))
ax_all.hist(df["RestingBP"])
ax_all.axvline(RestingBP_mid, color='#D3E4CD', ls='--', lw=2.5, label='Median')
ax_all.axvline(RestingBP_mean, color='#D3E4CD', lw=2.5, label='Mean')
ax_all.set_xlabel("RestingBP")
ax_all.set_ylabel("Frequency")
ax_all.set_title("RestingBP_hist")
ax_all.legend()

# --- Distribution per HeartDisease group, side by side ---
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
ax1 = RestingBP1.plot(kind='hist', ax=axs[0])
ax1.axvline(mean_RestingBP1, color='#D3E4CD', lw=2.5, label='Mean_RestingBP1')
ax1.set_title("With Heart Disease")
ax1.set_xlabel("RestingBP")
ax1.legend()
ax2 = RestingBP0.plot(kind='hist', ax=axs[1])
ax2.axvline(mean_RestingBP0, color='#D3E4CD', lw=2.5, label='Mean_RestingBP0')
ax2.set_title("Without Heart Disease")
ax2.set_xlabel("RestingBP")
ax2.legend()

fig.suptitle('Distribution of RestingBP by Heart Disease', size=16, c='#000')
plt.show()

**Analysis**: The mean resting blood pressure of people who have heart disease is slightly higher than that of people who don't. We can assume that people with heart disease tend to have higher resting blood pressure.

5 Cholesterol

In [None]:
# Histogram of Cholesterol, coloured by the HeartDisease label.
chol_fig, chol_ax = plt.subplots(figsize=(15, 5))
sns.histplot(x="Cholesterol", hue="HeartDisease", data=df, ax=chol_ax)

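**Analysis**: Most heart disease patients have an abnormally low cholesterol value. If these readings are recorded as 0, they are more likely missing measurements than real values and should be checked before modelling.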

6 FastingBS

In [None]:
# Number of patients per FastingBS flag, split by the HeartDisease label.
fbs_fig, fbs_ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="FastingBS", hue="HeartDisease", data=df, ax=fbs_ax)

**Analysis**:People with fasting blood sugar higher than 120 mg/dl mostly have heart disease

7 RestingECG

In [None]:
# Number of patients per resting ECG result, split by the HeartDisease label.
ecg_fig, ecg_ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="RestingECG", hue="HeartDisease", data=df, ax=ecg_ax);

nan

**Analysis**: For every resting ECG category, there are more patients with heart disease than without, especially for ST.

8 MaxHR

In [None]:
# --- Summary statistics for MaxHR ---
MaxHR_mid = df["MaxHR"].median()
MaxHR_mean = df["MaxHR"].mean()
MaxHR_std = df["MaxHR"].std()
print("the standard deviation of MaxHR is", MaxHR_std)

# Split the rows by the HeartDisease label (1: heart disease, 0: normal).
group = df.groupby(by="HeartDisease")
group1 = group.get_group(1)
group0 = group.get_group(0)
MaxHR1 = group1["MaxHR"]
MaxHR0 = group0["MaxHR"]
mean_MaxHR1 = MaxHR1.mean()
mean_MaxHR0 = MaxHR0.mean()
print("the mean of MaxHR with heart disease is", mean_MaxHR1)
print("the mean of MaxHR without heart disease is", mean_MaxHR0)

# --- Overall distribution ---
# axvline spans the full axis height, so the y-axis is scaled by the histogram
# itself rather than by a hard-coded line height. legend() is required for the
# 'Median' / 'Mean' labels to actually appear on the figure.
fig_all, ax_all = plt.subplots(figsize=(15, 5))
ax_all.hist(df["MaxHR"])
ax_all.axvline(MaxHR_mid, color='#D3E4CD', ls='--', lw=2.5, label='Median')
ax_all.axvline(MaxHR_mean, color='#D3E4CD', lw=2.5, label='Mean')
ax_all.set_xlabel("MaxHR")
ax_all.set_ylabel("Frequency")
ax_all.set_title("MaxHR_hist")
ax_all.legend()

# --- Distribution per HeartDisease group, side by side ---
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
ax1 = MaxHR1.plot(kind='hist', ax=axs[0])
ax1.axvline(mean_MaxHR1, color='#D3E4CD', lw=2.5, label='Mean_MaxHR1')
ax1.set_title("With Heart Disease")
ax1.set_xlabel("MaxHR")
ax1.legend()
ax2 = MaxHR0.plot(kind='hist', ax=axs[1])
ax2.axvline(mean_MaxHR0, color='#D3E4CD', lw=2.5, label='Mean_MaxHR0')
ax2.set_title("Without Heart Disease")
ax2.set_xlabel("MaxHR")
ax2.legend()

fig.suptitle('Distribution of MaxHR by Heart Disease', size=16, c='#000')
plt.show()

**Analysis**:the mean value of max heart rate for heart disease is lower. And the histogram is left skewed, that means most patients have low max heart rate

9 ExerciseAngina

In [None]:
# Number of patients with / without exercise-induced angina, split by the HeartDisease label.
angina_fig, angina_ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="ExerciseAngina", hue="HeartDisease", data=df, ax=angina_ax);

**Analysis**: It is obvious that people with heart disease are more likely to have exercise-induced angina than people without heart disease.

10 Oldpeak

In [None]:
# --- Summary statistics for Oldpeak ---
Oldpeak_mid = df["Oldpeak"].median()
Oldpeak_mean = df["Oldpeak"].mean()
Oldpeak_std = df["Oldpeak"].std()
print("the standard deviation of Oldpeak is", Oldpeak_std)

# Split the rows by the HeartDisease label (1: heart disease, 0: normal).
group = df.groupby(by="HeartDisease")
group1 = group.get_group(1)
group0 = group.get_group(0)
Oldpeak1 = group1["Oldpeak"]
Oldpeak0 = group0["Oldpeak"]
mean_Oldpeak1 = Oldpeak1.mean()
mean_Oldpeak0 = Oldpeak0.mean()
print("the mean of Oldpeak with heart disease is", mean_Oldpeak1)
print("the mean of Oldpeak without heart disease is", mean_Oldpeak0)

# --- Overall distribution ---
# axvline spans the full axis height, so the y-axis is scaled by the histogram
# itself rather than by a hard-coded line height. legend() is required for the
# 'Median' / 'Mean' labels to actually appear on the figure.
fig_all, ax_all = plt.subplots(figsize=(15, 5))
ax_all.hist(df["Oldpeak"])
ax_all.axvline(Oldpeak_mid, color='#D3E4CD', ls='--', lw=2.5, label='Median')
ax_all.axvline(Oldpeak_mean, color='#D3E4CD', lw=2.5, label='Mean')
ax_all.set_xlabel("Oldpeak")
ax_all.set_ylabel("Frequency")
ax_all.set_title("Oldpeak_hist")
ax_all.legend()

# --- Distribution per HeartDisease group, side by side ---
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 4))
ax1 = Oldpeak1.plot(kind='hist', ax=axs[0])
ax1.axvline(mean_Oldpeak1, color='#D3E4CD', lw=2.5, label='Mean_Oldpeak1')
ax1.set_title("With Heart Disease")
ax1.set_xlabel("Oldpeak")
ax1.legend()
ax2 = Oldpeak0.plot(kind='hist', ax=axs[1])
ax2.axvline(mean_Oldpeak0, color='#D3E4CD', lw=2.5, label='Mean_Oldpeak0')
ax2.set_title("Without Heart Disease")
ax2.set_xlabel("Oldpeak")
ax2.legend()

fig.suptitle('Distribution of Oldpeak by Heart Disease', size=16, c='#000')
plt.show()

**Analysis**: For patients with heart disease, the Oldpeak (ST) values vary from -2 to 4, whereas for people without heart disease the values are concentrated around 0.

11 ST_Slope

In [None]:
# Number of patients per ST_Slope category, split by the HeartDisease label.
slope_fig, slope_ax = plt.subplots(figsize=(15, 5))
sns.countplot(x="ST_Slope", hue="HeartDisease", data=df, ax=slope_ax);

**Analysis**: When the slope of the peak exercise ST segment is flat, a person is more likely to have heart disease.

Heat Map

In [None]:
# Correlation matrix of the numeric features, with the target column excluded.
# The frame still holds string columns (Sex, ChestPainType, ...); recent pandas
# versions raise on those in .corr(), so restrict to numeric columns first.
features_numeric = df.drop('HeartDisease', axis=1).select_dtypes(include="number")
cor = features_numeric.corr()

# Annotated heat map of the pairwise correlations.
plt.figure(figsize=(15,5))
sns.heatmap(cor, annot=True, cmap="Set3");

In [None]:
from sklearn.ensemble import RandomForestClassifier

**Analysis**:From the heat map, we can find 