In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

## Import Cardiovascular CSV dataset

In [2]:
#Read and load csv files 
file_path = Path("Resources/cardio_train.csv")
df = pd.read_csv(file_path, sep = ';')
df.head(20)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [3]:
#calculating statistical data(mean,median,std, quartiles)
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [4]:
#Calculate the count of unique value in the index.
#NaN values are not included by default.
df.nunique()

id             70000
age             8076
gender             2
height           109
weight           287
ap_hi            153
ap_lo            157
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
dtype: int64

In [5]:
#Describe the data types of each column.
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [6]:
#Count non-NA cells for each column or row.
df.count()

id             70000
age            70000
gender         70000
height         70000
weight         70000
ap_hi          70000
ap_lo          70000
cholesterol    70000
gluc           70000
smoke          70000
alco           70000
active         70000
cardio         70000
dtype: int64

In [7]:

df.isnull()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [9]:
df.value_counts()

id     age    gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  active  cardio
0      18393  2       168     62.0    110    80     1            1     0      0     1       0         1
66623  18279  2       166     83.0    140    80     1            1     0      0     1       1         1
66631  23107  2       171     73.0    150    90     1            1     0      0     0       1         1
66630  18944  2       162     75.0    100    80     1            1     0      0     1       0         1
66628  18314  1       165     72.0    150    90     1            1     0      0     1       0         1
                                                                                                     ..
33338  16193  1       165     63.0    120    80     1            1     0      0     1       0         1
33339  20481  1       165     80.0    140    80     3            1     0      0     1       1         1
33340  23338  1       155     60.0    120    80     1            1   

In [10]:
max(df['age'])

23713

### Age Conversion

In [11]:
# convert age to years
df['age_year'] = (df['age']/365).round(0).astype('int')
cardio_df = df.drop('age', axis = 1)
cardio_df.head()


Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48


In [12]:
cardio_df.isnull()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
max(cardio_df['age_year'])

65

In [14]:
min(cardio_df['age_year'])

30

## Age-Bins

In [35]:
age_bins = [0,40,49,59,90]
group_names = ["<40","40-49", "50-59", "60-89"]

In [36]:
cardio_df["age_group"] = pd.cut(cardio_df["age_year"], age_bins, labels = group_names)
cardio_df

Unnamed: 0,id,age_year,age_group,gender,height,weight,bmi,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,50-59,2,168,62.0,21.97,18.6-24,110,80,1,1,0,0,1,0
1,1,55,50-59,1,156,85.0,34.93,>30.0,140,90,3,1,0,0,1,1
2,2,52,50-59,1,165,64.0,23.51,18.6-24,130,70,3,1,0,0,0,1
3,3,48,40-49,2,169,82.0,28.71,25.0-29,150,100,1,1,0,0,1,1
4,4,48,40-49,1,156,56.0,23.01,18.6-24,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,50-59,2,168,76.0,26.93,25.0-29,120,80,1,1,1,0,1,0
69996,99995,62,60-89,1,158,126.0,50.47,>30.0,140,90,2,2,0,0,1,1
69997,99996,52,50-59,2,183,105.0,31.35,>30.0,180,90,3,1,0,1,0,1
69998,99998,61,60-89,1,163,72.0,27.10,25.0-29,135,80,1,2,0,0,0,1


In [17]:
cardio_df.groupby("age_group").mean()["cholesterol"]


age_group
<40      1.174430
40-49    1.245459
50-59    1.374247
60-89    1.507547
Name: cholesterol, dtype: float64

In [18]:
cardio_df.groupby("age_group").mean()["gluc"]

age_group
<40      1.134597
40-49    1.152724
50-59    1.237985
60-89    1.291321
Name: gluc, dtype: float64

### Calculated BMI Feature based on Height and Weight

In [19]:
#converting height into meters
height_meters = cardio_df["height"]/100
cardio_df["bmi"] = round(cardio_df["weight"]/(height_meters**2),2)
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group,bmi
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59,21.97
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59,34.93
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59,23.51
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49,28.71
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49,23.01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59,26.93
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89,50.47
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59,31.35
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89,27.10


In [20]:
cardio_df.isnull()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group,bmi
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [40]:
#Create range for BMI
bmi_bins = [0,18.5,24,29,60]
bmi_weight_range = ["18.5 (Underweight)","18.6-24.9 (Healthy)", "25.0-29(Overweight)", ">30.0(Obesity)"]

In [41]:
cardio_df["bmi_range"] = pd.cut(cardio_df["bmi"], bmi_bins, labels = bmi_weight_range)
cardio_df

Unnamed: 0,id,age_year,age_group,gender,height,weight,bmi,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,50-59,2,168,62.0,21.97,18.6-24.9 (Healthy),110,80,1,1,0,0,1,0
1,1,55,50-59,1,156,85.0,34.93,>30.0 (Obesity),140,90,3,1,0,0,1,1
2,2,52,50-59,1,165,64.0,23.51,18.6-24.9 (Healthy),130,70,3,1,0,0,0,1
3,3,48,40-49,2,169,82.0,28.71,25.0-29( Overweight),150,100,1,1,0,0,1,1
4,4,48,40-49,1,156,56.0,23.01,18.6-24.9 (Healthy),100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,50-59,2,168,76.0,26.93,25.0-29( Overweight),120,80,1,1,1,0,1,0
69996,99995,62,60-89,1,158,126.0,50.47,>30.0 (Obesity),140,90,2,2,0,0,1,1
69997,99996,52,50-59,2,183,105.0,31.35,>30.0 (Obesity),180,90,3,1,0,1,0,1
69998,99998,61,60-89,1,163,72.0,27.10,25.0-29( Overweight),135,80,1,2,0,0,0,1


In [42]:
cardio_df = cardio_df[["id","age_year","age_group","gender","height","weight","bmi","bmi_range","ap_hi","ap_lo", "cholesterol","gluc","smoke","alco","active","cardio"]]
cardio_df

Unnamed: 0,id,age_year,age_group,gender,height,weight,bmi,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,50-59,2,168,62.0,21.97,18.6-24.9 (Healthy),110,80,1,1,0,0,1,0
1,1,55,50-59,1,156,85.0,34.93,>30.0 (Obesity),140,90,3,1,0,0,1,1
2,2,52,50-59,1,165,64.0,23.51,18.6-24.9 (Healthy),130,70,3,1,0,0,0,1
3,3,48,40-49,2,169,82.0,28.71,25.0-29( Overweight),150,100,1,1,0,0,1,1
4,4,48,40-49,1,156,56.0,23.01,18.6-24.9 (Healthy),100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,50-59,2,168,76.0,26.93,25.0-29( Overweight),120,80,1,1,1,0,1,0
69996,99995,62,60-89,1,158,126.0,50.47,>30.0 (Obesity),140,90,2,2,0,0,1,1
69997,99996,52,50-59,2,183,105.0,31.35,>30.0 (Obesity),180,90,3,1,0,1,0,1
69998,99998,61,60-89,1,163,72.0,27.10,25.0-29( Overweight),135,80,1,2,0,0,0,1


## Export new updated csv file

In [None]:
cardio_df.to_csv("./Resources/cardio.csv", index = False)