In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the heart-disease table from the Kaggle input directory into `df`.
df = pd.read_csv(filepath_or_buffer='../input/heart-disease-uci/heart.csv')

# Exploratory Data Analysis

In [None]:
# Build a separate frame for the EDA so the relabelling below never touches
# `df`, which the Classification section still fits models on.
# `DataFrame.drop` returns a new frame rather than modifying in place, so the
# result must be kept; the original call discarded it and `target` silently
# stayed in `df_eda`.
df_eda = df.drop(columns='target')

# Swap the 0/1 codes for readable labels used on plot axes and legends.
df_eda = df_eda.assign(
    sex=df_eda['sex'].map({0: 'Female', 1: 'Male'}),
    fbs=df_eda['fbs'].map({0: 'False', 1: 'True'}),
    exang=df_eda['exang'].map({0: 'No', 1: 'Yes'}),
)

# Target feature : Chest Pain Type (cp)

In [None]:
# Name the column by keyword. Passing the vector positionally is deprecated in
# seaborn 0.11, and from 0.12 on the only positional argument is `data`, so the
# old call raises or is misinterpreted. The keyword form works on both.
sns.countplot(x='cp', data=df_eda)
plt.title('Distribution of Chest Pain Type in data');

We can see here that our data is quite unbalanced: many people with no heart disease (0) and only a few people with bad heart disease (3).

## Sex

In [None]:
# Title the count plot immediately: the crosstab below opens a new figure, and
# a plt.title issued after it would land on that figure instead.
# The column is passed by keyword because the positional vector form is
# deprecated in seaborn 0.11 and errors or is misinterpreted from 0.12 on.
sns.countplot(x='sex', data=df_eda)
plt.title('Distribution of Sex in data')

# Chest-pain type counts split by sex, drawn on a figure of its own.
pd.crosstab(df_eda['sex'], df_eda['cp']).plot(kind='bar')
plt.title('Chest Pain Type by Sex');

Here, we have unbalanced data too, but we get more information than before : 

   * We have more data about men than women
   * The data seems to indicate that men are more likely to be affected by heart disease.

## Fasting Blood Sugar

In [None]:
# Pass the column by keyword: the positional vector form is deprecated in
# seaborn 0.11 and errors or is misinterpreted from 0.12 on.
sns.countplot(x='fbs', data=df_eda)
plt.title('Fasting Blood Sugar > 120 mg/dl')

# Chest-pain type counts split by the fbs flag; this opens a separate figure,
# so it gets its own title.
pd.crosstab(df_eda['fbs'], df_eda['cp']).plot(kind='bar')
plt.title('Chest Pain Type by Fasting Blood Sugar > 120 mg/dl');

FBS doesn't seem to be a good indicator of the presence of heart disease:
* When fbs=True, a response of 0 is about as likely as a response of 2, even though these two outcomes are very different in reality.
* When fbs=False, we observe basically the same pattern; the difference can be explained by the amount of data, but the interpretation is not really clear.

## Exercise Induced Angina

In [None]:
# Pass the column by keyword: the positional vector form is deprecated in
# seaborn 0.11 and errors or is misinterpreted from 0.12 on.
sns.countplot(x='exang', data=df_eda)
plt.title('Distribution of Exercise Induced Angina')

# Chest-pain type counts split by exercise-induced angina; this opens a
# separate figure, so it gets its own title.
pd.crosstab(df_eda['exang'], df_eda['cp']).plot(kind='bar')
plt.title('Chest Pain Type by Exercise Induced Angina');

Here, Exercise Induced Angina is a better indicator: even though we don't have many samples with EIA, about 80% of them show no heart disease. So the fact that someone has EIA can be a good reason to think that he/she isn't developing a heart disease.

## Resting Electrocardiographic Results

In [None]:
# Pass the column by keyword: the positional vector form is deprecated in
# seaborn 0.11 and errors or is misinterpreted from 0.12 on.
sns.countplot(x='restecg', data=df_eda)
plt.title('Distribution of Resting Electrocardiographic Results')

# Chest-pain type counts split by resting ECG result; this opens a separate
# figure, so it gets its own title.
pd.crosstab(df_eda['restecg'], df_eda['cp']).plot(kind='bar')
plt.title('Chest Pain Type by Resting Electrocardiographic Results');

The data is fairly balanced between resting electrocardiographic results of 0 and 1, but there is not enough data for a result of 2.

## Slope, Ca, Thal

In [None]:
# Count chest-pain types within each level of slope, ca and thal.
cp_by_slope = pd.crosstab(df_eda['slope'], df_eda['cp'])
cp_by_ca = pd.crosstab(df_eda['ca'], df_eda['cp'])
cp_by_thal = pd.crosstab(df_eda['thal'], df_eda['cp'])

# One grouped bar chart per table, each on its own figure.
cp_by_slope.plot.bar()
cp_by_ca.plot.bar()
cp_by_thal.plot.bar()

* The slope=2 part is interesting, because the data is quite balanced between the different values of cp
* The ca data is quite unbalanced; we can focus on the ca=0 part, which is interesting because this segment of the data is quite balanced
* Same response for thal=2

In [None]:
# Preview the first five rows of the relabelled EDA frame.
df_eda.head(n=5)

## Age

In [None]:
# Overlay one age density curve per chest-pain type on a single wide axis.
# NOTE(review): recent seaborn deprecates `shade` in favour of `fill` — confirm
# against the installed version before switching.
print("Age distribution according to Chest Pain Type")
age_max = df_eda["age"].max()
facetgrid = sns.FacetGrid(data=df_eda, hue="cp", aspect=4)
facetgrid.map(sns.kdeplot, "age", shade=True)
facetgrid.set(xlim=(0, age_max))  # x-axis spans 0 .. largest observed age
facetgrid.add_legend()

This just highlights the fact that people over 50 are more likely to have a heart disease. Old news.

## Resting Blood Pressure

In [None]:
# Overlay one resting-blood-pressure density curve per chest-pain type.
# NOTE(review): recent seaborn deprecates `shade` in favour of `fill` — confirm
# against the installed version before switching.
print("Resting Blood Pressure distribution according to Chest Pain Type")
trestbps_max = df_eda["trestbps"].max()
facetgrid = sns.FacetGrid(data=df_eda, hue="cp", aspect=4)
facetgrid.map(sns.kdeplot, "trestbps", shade=True)
facetgrid.set(xlim=(0, trestbps_max))  # x-axis spans 0 .. largest observed value
facetgrid.add_legend()

This graph makes it possible to restrict the band in which we have the highest chance of being affected by heart diseases: [75,200]. The curve cp = 1 is particularly interesting, because we have an even more restricted band ([110,145]) where the values obtained are very high.

## Cholesterol

In [None]:
# Overlay one cholesterol density curve per chest-pain type.
# NOTE(review): recent seaborn deprecates `shade` in favour of `fill` — confirm
# against the installed version before switching.
print("Cholesterol distribution according to Chest Pain Type")
chol_max = df_eda["chol"].max()
facetgrid = sns.FacetGrid(data=df_eda, hue="cp", aspect=4)
facetgrid.map(sns.kdeplot, "chol", shade=True)
facetgrid.set(xlim=(0, chol_max))  # x-axis spans 0 .. largest observed value
facetgrid.add_legend()

Here, we don't get much more information, except the band [100,400].

## Maximum Heart Rate Achieved 

In [None]:
# Overlay one maximum-heart-rate density curve per chest-pain type.
# NOTE(review): recent seaborn deprecates `shade` in favour of `fill` — confirm
# against the installed version before switching.
print("Maximum Heart Rate Achieved distribution according to Chest Pain Type")
thalach_max = df_eda["thalach"].max()
facetgrid = sns.FacetGrid(data=df_eda, hue="cp", aspect=4)
facetgrid.map(sns.kdeplot, "thalach", shade=True)
facetgrid.set(xlim=(0, thalach_max))  # x-axis spans 0 .. largest observed value
facetgrid.add_legend()

Between 160 and 200, the chances of having a heart disease are high.

## ST depression induced by exercise relative to rest 

In [None]:
# Overlay one oldpeak density curve per chest-pain type.
# NOTE(review): recent seaborn deprecates `shade` in favour of `fill` — confirm
# against the installed version before switching.
print("Oldpeak distribution according to Chest Pain Type")
oldpeak_max = df_eda["oldpeak"].max()
facetgrid = sns.FacetGrid(data=df_eda, hue="cp", aspect=4)
facetgrid.map(sns.kdeplot, "oldpeak", shade=True)
facetgrid.set(xlim=(0, oldpeak_max))  # x-axis spans 0 .. largest observed value
facetgrid.add_legend()

Here, we can see that we must be cautious between 0 and 2 in terms of oldpeak.

# Classification

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the label. `columns=` already identifies
# the axis, so the redundant `axis=1` is dropped.
X = df.drop(columns='target')
y = df['target']

# Hold out 25% of the rows for evaluation. The seed is fixed so the same rows
# land in train and test on every Restart & Run All; without it the split
# changes on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# The features are fed in unscaled, so the solver may need more than the
# default 100 iterations to converge; raise the cap instead of accepting a
# possibly unconverged fit.
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Seed the forest so its random sampling is repeatable between runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# `score` returns mean accuracy on the held-out set as a fraction; scale it to
# a percentage for display.
print(f"Accuracy score of logistic regression is {100*lr.score(X_test, y_test)}")
print(f"Accuracy score of random forest classifier is {100*rfc.score(X_test, y_test)}")