# <b>1<span style='color:#0386f7de'>|</span> Importing libraries</b>

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LogisticRegression 
from sklearn.preprocessing import OneHotEncoder 
from sklearn import metrics

# <b>2<span style='color:#0386f7de'>|</span> Exploratory Data </b>


In [3]:
# Read the cleaned 2020 heart-disease survey CSV (path relative to the notebook)
# into a DataFrame and display it.
heart_database = pd.read_csv(filepath_or_buffer="Resources/heart_2020_cleaned.csv")
heart_database

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


# <b>3<span style='color:#0386f7de'>|</span> Explanation of the variables of the dataset </b> 

1. HeartDisease : Respondents that have ever reported having coronary heart disease (CHD) or myocardial infarction (MI).
2. BMI : Body Mass Index (BMI).
3. Smoking : Have you smoked at least 100 cigarettes in your entire life? ( The answer Yes or No ).
4. AlcoholDrinking : Heavy drinkers (adult men having more than 14 drinks per week and adult women having more than 7 drinks per week).
5. Stroke : (Ever told) (you had) a stroke?
6. PhysicalHealth : Now thinking about your physical health, which includes physical illness and injury, for how many days during the past 30 days was your physical health not good? (0-30 days).
7. MentalHealth : Thinking about your mental health, for how many days during the past 30 days was your mental health not good? (0-30 days).
8. DiffWalking : Do you have serious difficulty walking or climbing stairs?
9. Sex : Are you male or female?
10. AgeCategory: Fourteen-level age category (only 13 distinct levels are present in this dataset; see the unique-value counts in section 4).
11. Race : Imputed race/ethnicity value.
12. Diabetic : (Ever told) (you had) diabetes?
13. PhysicalActivity : Adults who reported doing physical activity or exercise during the past 30 days other than their regular job.
14. GenHealth : Would you say that in general your health is...
15. SleepTime : On average, how many hours of sleep do you get in a 24-hour period?
16. Asthma : (Ever told) (you had) asthma?
17. KidneyDisease : Not including kidney stones, bladder infection or incontinence, were you ever told you had kidney disease?
18. SkinCancer : (Ever told) (you had) skin cancer?


# <b>4<span style='color:#0386f7de'>|</span> Exploring and evaluating the variables of the dataset </b>


In [4]:
# Explore data types: list every column with its non-null count and dtype,
# to check for missing values and to tell numeric columns from string (object) ones.
heart_database.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

We have 319,795 records with 18 columns, according to the result. There are no null values, and there are 4 numeric (float64) and 14 categorical (object) features. We can convert the string features that have only two unique values to numeric form, but first let's make sure there aren't any aberrant values.

In [5]:
# Explore the sample size: prints (number of rows, number of columns).
print(heart_database.shape)

(319795, 18)


In [6]:
# Count respondents per HeartDisease class ('Yes' = positive cases, 'No' = negative cases)
# to see how balanced the target variable is.
heart_database['HeartDisease'].value_counts()

No     292422
Yes     27373
Name: HeartDisease, dtype: int64

In [9]:
# Number of distinct values per column: columns with exactly 2 values are binary
# candidates for 0/1 encoding; columns with more levels need a different encoding.
heart_database.nunique()

HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [12]:
# Replace Boolean-like string variables with 1 and 0.
# 'Diabetic' has four levels (see the unique-value counts above); its two extra
# levels are collapsed into the binary scheme:
#   'No, borderline diabetes' -> 0, 'Yes (during pregnancy)' -> 1.
# All targets are integers, so no column is left holding a mix of str and int.
binary_encoding = {
    'Yes': 1,
    'No': 0,
    'Male': 1,
    'Female': 0,
    'No, borderline diabetes': 0,
    'Yes (during pregnancy)': 1,
}
# replace() matches whole cell values only, so longer labels that merely contain
# 'Yes' or 'No' are not affected by the 'Yes'/'No' entries.
heart_database = heart_database.replace(binary_encoding)
# Ensure an integer dtype for 'Diabetic' regardless of how replace() infers the result.
heart_database['Diabetic'] = heart_database['Diabetic'].astype(int)

We'll utilize OneHotEncoder in the preprocessing stage later for some variables that have more than two unique values.

# <b>5<span style='color:#0386f7de'>|</span>Analysis of Numeric Variables</b>

In [7]:
# Summary statistics for the numeric columns, transposed so each variable is a
# row, rendered with a grey background and white text/borders.
# NOTE(review): the execution counts show this cell was run (In [7]) before the
# encoding cell above (In [12]); on a fresh top-to-bottom run the table may also
# list the columns encoded as 0/1 — confirm which view is intended.
(
    heart_database
    .describe()
    .transpose()
    .style
    .set_properties(**{'background-color': 'grey', 'color': 'white', 'border-color': 'white'})
)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,319795.0,28.325399,6.3561,12.02,24.03,27.34,31.42,94.85
PhysicalHealth,319795.0,3.37171,7.95085,0.0,0.0,0.0,2.0,30.0
MentalHealth,319795.0,3.898366,7.955235,0.0,0.0,0.0,3.0,30.0
SleepTime,319795.0,7.097075,1.436007,1.0,6.0,7.0,8.0,24.0


# <b>6<span style='color:#0386f7de'>|</span> Exploratory Analysis of the Variables</b>

In [None]:
# Analyze and compare the different variables against heart disease.

# <b>7<span style='color:#0386f7de'>|</span> Data Preprocessing</b>


In [None]:
# Standardize and encode the data so that it is ready for building the training and testing datasets.

# <b>8<span style='color:#0386f7de'>|</span> Split Dataset for Training and Testing</b>

In [None]:
# Split the dataset into training and testing sets for the machine learning process.

# <b>9<span style='color:#0386f7de'>|</span> Modelling</b>

In [None]:
# We are going to perform a classifier analysis and a decision tree model to evaluate which model best predicts the heart disease variable.

# <b>10<span style='color:#0386f7de'>|</span> Modelling Comparison and Conclusions</b>

In [None]:
# We are going to compare the models built in section 9 to reach the conclusions.