In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt

## Import Cardiovascular CSV dataset

In [2]:
#Read and load csv files 
file_path = Path("Resources/cardio_train.csv")
df = pd.read_csv(file_path, sep = ';')
df.head(20)

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0
5,8,21914,1,151,67.0,120,80,2,2,0,0,0,0
6,9,22113,1,157,93.0,130,80,3,1,0,0,1,0
7,12,22584,2,178,95.0,130,90,3,3,0,0,1,1
8,13,17668,1,158,71.0,110,70,1,1,0,0,1,0
9,14,19834,1,164,68.0,110,60,1,1,0,0,0,0


In [3]:
#calculating statistical data(mean,median,std, quartiles)
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [4]:
#Calculate the count of unique value in the index.
#NaN values are not included by default.
df.nunique()

id             70000
age             8076
gender             2
height           109
weight           287
ap_hi            153
ap_lo            157
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
dtype: int64

In [5]:
#Describe the data types of each column.
df.dtypes

id               int64
age              int64
gender           int64
height           int64
weight         float64
ap_hi            int64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
dtype: object

In [6]:
#Count non-NA cells for each column or row.
df.count()

id             70000
age            70000
gender         70000
height         70000
weight         70000
ap_hi          70000
ap_lo          70000
cholesterol    70000
gluc           70000
smoke          70000
alco           70000
active         70000
cardio         70000
dtype: int64

In [7]:

df.isnull()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [8]:
df.isna().sum()

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [9]:
df.value_counts()

id     age    gender  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  alco  active  cardio
0      18393  2       168     62.0    110    80     1            1     0      0     1       0         1
66623  18279  2       166     83.0    140    80     1            1     0      0     1       1         1
66631  23107  2       171     73.0    150    90     1            1     0      0     0       1         1
66630  18944  2       162     75.0    100    80     1            1     0      0     1       0         1
66628  18314  1       165     72.0    150    90     1            1     0      0     1       0         1
                                                                                                     ..
33338  16193  1       165     63.0    120    80     1            1     0      0     1       0         1
33339  20481  1       165     80.0    140    80     3            1     0      0     1       1         1
33340  23338  1       155     60.0    120    80     1            1   

In [10]:
max(df['age'])

23713

### Age Conversion

In [11]:
# convert age to years
df['age_year'] = (df['age']/365).round(0).astype('int')
cardio_df = df.drop('age', axis = 1)
cardio_df.head()


Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48


In [12]:
cardio_df.isnull()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False


In [13]:
max(cardio_df['age_year'])

65

In [14]:
min(cardio_df['age_year'])

30

## Age-Bins

In [15]:
age_bins = [0,40,49,59,90]
group_names = ["<40","40-49", "50-59", "60-89"]

In [16]:
cardio_df["age_group"] = pd.cut(cardio_df["age_year"], age_bins, labels = group_names)
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89


In [17]:
cardio_df.groupby("age_group").mean()["cholesterol"]


age_group
<40      1.174430
40-49    1.245459
50-59    1.374247
60-89    1.507547
Name: cholesterol, dtype: float64

In [18]:
cardio_df.groupby("age_group").mean()["gluc"]

age_group
<40      1.134597
40-49    1.152724
50-59    1.237985
60-89    1.291321
Name: gluc, dtype: float64

In [19]:
cardio_df.describe()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997,53.338686
std,28851.302323,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003,6.765294
min,0.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0,30.0
25%,25006.75,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,48.0
50%,50001.5,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0,54.0
75%,74889.25,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0,58.0
max,99999.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0,65.0


In [20]:
cardio_df.isnull()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [21]:
#Check for duplicates
cardio_df.duplicated(subset=['id']).value_counts()

False    70000
dtype: int64

In [22]:
cardio_df.isnull()

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69996,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69997,False,False,False,False,False,False,False,False,False,False,False,False,False,False
69998,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [23]:
#Calculated Frequency on ap_hi to find out negative and wrong values
api_hi_frequency = cardio_df["ap_hi"].value_counts()
api_hi_frequency

 120     27699
 140      9506
 130      8961
 110      8644
 150      4450
         ...  
 2000        1
-140         1
 207         1
-150         1
 93          1
Name: ap_hi, Length: 153, dtype: int64

In [24]:
#Calculated Frequency on ap_lo to find out negative and wrong values
api_lo_frequency = cardio_df["ap_lo"].value_counts()
api_lo_frequency

80      34847
90      14316
70      10245
100      4082
60       2727
        ...  
1007        1
8044        1
8500        1
880         1
7100        1
Name: ap_lo, Length: 157, dtype: int64

In [25]:
# Export api_lo_frequency csv files
api_lo_frequency.to_csv("./Resources/api_lo_frequency.csv", index = True)

In [26]:
#Dropping invalid values on ap_hi  and ap_lo 
cardio_df = cardio_df[(cardio_df["ap_lo"]>55)& (cardio_df["ap_lo"]<150) & (cardio_df["ap_hi"]>55)& (cardio_df["ap_hi"]<=220)]
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89


In [27]:
#Export api_hi_frequency csv files
cardio_df.to_csv("./Resources/api_hi_lo_frequency.csv", index = True)

In [28]:
#Dropping invalid values for weight
cardio_df = cardio_df[(cardio_df["weight"]>40)& (cardio_df["weight"]<=200)]
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89


In [29]:
#Export to csv files
cardio_df.to_csv("./Resources/cardio_weight.csv", index = True)

In [30]:
#Dropping invalid values for height
cardio_df = cardio_df[(cardio_df["height"]>120)& (cardio_df["height"]<=215)]
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89


In [31]:
cardio_df

Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89


### Calculated BMI Feature based on Height and Weight

In [66]:
#converting height into meters
height_meters = cardio_df["height"]/100
cardio_df["bmi"] = round(cardio_df["weight"]/(height_meters**2),2)
cardio_df

Unnamed: 0,id,age_year,age_group,gender,height,weight,bmi,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,50-59,2,168,62.0,21.97,Healthy (1),110,80,1,1,0,0,1,0
1,1,55,50-59,1,156,85.0,34.93,Obesity (3),140,90,3,1,0,0,1,1
2,2,52,50-59,1,165,64.0,23.51,Healthy (1),130,70,3,1,0,0,0,1
3,3,48,40-49,2,169,82.0,28.71,Overweight (2),150,100,1,1,0,0,1,1
4,4,48,40-49,1,156,56.0,23.01,Healthy (1),100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,50-59,2,168,76.0,26.93,Overweight (2),120,80,1,1,1,0,1,0
69996,99995,62,60-89,1,158,126.0,50.47,Obesity (3),140,90,2,2,0,0,1,1
69997,99996,52,50-59,2,183,105.0,31.35,Obesity (3),180,90,3,1,0,1,0,1
69998,99998,61,60-89,1,163,72.0,27.10,Overweight (2),135,80,1,2,0,0,0,1


In [33]:
#Create range for BMI
bmi_bins = [0,18.5,24,29,250]
bmi_weight_range = ["Underweight (0)","Healthy (1)", "Overweight (2)", "Obesity (3)"]

In [34]:
cardio_df["bmi_range"] = pd.cut(cardio_df["bmi"], bmi_bins, labels = bmi_weight_range)
cardio_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cardio_df["bmi_range"] = pd.cut(cardio_df["bmi"], bmi_bins, labels = bmi_weight_range)


Unnamed: 0,id,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_year,age_group,bmi,bmi_range
0,0,2,168,62.0,110,80,1,1,0,0,1,0,50,50-59,21.97,Healthy (1)
1,1,1,156,85.0,140,90,3,1,0,0,1,1,55,50-59,34.93,Obesity (3)
2,2,1,165,64.0,130,70,3,1,0,0,0,1,52,50-59,23.51,Healthy (1)
3,3,2,169,82.0,150,100,1,1,0,0,1,1,48,40-49,28.71,Overweight (2)
4,4,1,156,56.0,100,60,1,1,0,0,0,0,48,40-49,23.01,Healthy (1)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,2,168,76.0,120,80,1,1,1,0,1,0,53,50-59,26.93,Overweight (2)
69996,99995,1,158,126.0,140,90,2,2,0,0,1,1,62,60-89,50.47,Obesity (3)
69997,99996,2,183,105.0,180,90,3,1,0,1,0,1,52,50-59,31.35,Obesity (3)
69998,99998,1,163,72.0,135,80,1,2,0,0,0,1,61,60-89,27.10,Overweight (2)


In [35]:
cardio_df.groupby("bmi_range").count()["bmi"]

bmi_range
Underweight (0)      565
Healthy (1)        17915
Overweight (2)     28500
Obesity (3)        21502
Name: bmi, dtype: int64

In [36]:
cardio_df.dtypes

id                int64
gender            int64
height            int64
weight          float64
ap_hi             int64
ap_lo             int64
cholesterol       int64
gluc              int64
smoke             int64
alco              int64
active            int64
cardio            int64
age_year          int64
age_group      category
bmi             float64
bmi_range      category
dtype: object

### Reorder the columns

In [37]:
cardio_df = cardio_df[["id","age_year","age_group","gender","height","weight","bmi","bmi_range","ap_hi","ap_lo", "cholesterol","gluc","smoke","alco","active","cardio"]]
cardio_df

Unnamed: 0,id,age_year,age_group,gender,height,weight,bmi,bmi_range,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,50,50-59,2,168,62.0,21.97,Healthy (1),110,80,1,1,0,0,1,0
1,1,55,50-59,1,156,85.0,34.93,Obesity (3),140,90,3,1,0,0,1,1
2,2,52,50-59,1,165,64.0,23.51,Healthy (1),130,70,3,1,0,0,0,1
3,3,48,40-49,2,169,82.0,28.71,Overweight (2),150,100,1,1,0,0,1,1
4,4,48,40-49,1,156,56.0,23.01,Healthy (1),100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,53,50-59,2,168,76.0,26.93,Overweight (2),120,80,1,1,1,0,1,0
69996,99995,62,60-89,1,158,126.0,50.47,Obesity (3),140,90,2,2,0,0,1,1
69997,99996,52,50-59,2,183,105.0,31.35,Obesity (3),180,90,3,1,0,1,0,1
69998,99998,61,60-89,1,163,72.0,27.10,Overweight (2),135,80,1,2,0,0,0,1


In [38]:
cardio_df.isnull().sum()

id             0
age_year       0
age_group      0
gender         0
height         0
weight         0
bmi            0
bmi_range      0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [39]:
cardio_df.describe()

Unnamed: 0,id,age_year,gender,height,weight,bmi,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0,68482.0
mean,49966.048757,53.330729,1.349201,164.456573,74.168711,27.460545,126.641789,81.394673,1.364884,1.226103,0.088067,0.053576,0.803335,0.495108
std,28850.396294,6.764683,0.476721,7.840417,14.243992,5.212436,16.661567,9.447078,0.679148,0.572142,0.283394,0.225181,0.39748,0.49998
min,0.0,30.0,1.0,122.0,41.0,14.53,70.0,56.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,24991.5,48.0,1.0,159.0,65.0,23.88,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,49999.5,54.0,1.0,165.0,72.0,26.35,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74873.75,58.0,2.0,170.0,82.0,30.12,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,65.0,2.0,207.0,200.0,108.17,220.0,140.0,3.0,3.0,1.0,1.0,1.0,1.0


### Outliers for height

In [40]:
# Calculate quartiles for height
quartiles = cardio_df["height"].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

# Calculate upper and lower bounds for each drug
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

# Print lower and upper bounds for each drug
print(f'Lower Bound for : {lower_bound}')
print(f'Upper Bound for : {upper_bound}')

Lower Bound for : 142.5
Upper Bound for : 186.5


In [41]:
# Determine outliers using upper and lower bounds
outliers_count = (cardio_df.loc[(cardio_df['height'] >= upper_bound) | 
                                    (cardio_df['height'] <= lower_bound)]).count()

# Print outliers for each drug
print(f'Number of outliers: {outliers_count}')

Number of outliers: id             412
age_year       412
age_group      412
gender         412
height         412
weight         412
bmi            412
bmi_range      412
ap_hi          412
ap_lo          412
cholesterol    412
gluc           412
smoke          412
alco           412
active         412
cardio         412
dtype: int64


### Outliers for Weight

In [42]:
quartiles = cardio_df["weight"].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

# Calculate upper and lower bounds for each drug
lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)

# Print lower and upper bounds for each drug
print(f'Lower Bound for : {lower_bound}')
print(f'Upper Bound for : {upper_bound}')

# Determine outliers using upper and lower bounds
outliers_count = (cardio_df['weight'].loc[(cardio_df['weight'] >= upper_bound) | 
                                    (cardio_df['weight'] <= lower_bound)]).count()

# Print outliers for each drug
print(f'Number of outliers: {outliers_count}')

Lower Bound for : 39.5
Upper Bound for : 107.5
Number of outliers: 1689


In [43]:
#Export to csv files
cardio_df.to_csv("./Resources/cardio_height.csv", index = True)

### Export Updated CSV File 

In [44]:
cardio_df.to_csv("./Resources/cardio.csv", index = False)

### Create Patient Table for Postgress Database

In [45]:
patient_data = cardio_df[['id', 'gender', 'height', 'weight','cardio']]
patient_data = patient_data.set_index(['id'],drop='True')
patient_data

Unnamed: 0_level_0,gender,height,weight,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2,168,62.0,0
1,1,156,85.0,1
2,1,165,64.0,1
3,2,169,82.0,1
4,1,156,56.0,0
...,...,...,...,...
99993,2,168,76.0,0
99995,1,158,126.0,1
99996,2,183,105.0,1
99998,1,163,72.0,1


### Create Health Table for Postgress Database

In [46]:
health_data = cardio_df[["id","ap_hi", "ap_lo", "cholesterol", "gluc"]]
health_data = health_data.set_index(['id'],drop='True')
health_data

Unnamed: 0_level_0,ap_hi,ap_lo,cholesterol,gluc
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,110,80,1,1
1,140,90,3,1
2,130,70,3,1
3,150,100,1,1
4,100,60,1,1
...,...,...,...,...
99993,120,80,1,1
99995,140,90,2,2
99996,180,90,3,1
99998,135,80,1,2


### Create Lifestyle Table for Postgress Database

In [47]:
lifestyle_data = cardio_df[["id", "smoke", "alco","active"]]
lifestyle_data = lifestyle_data.set_index(['id'],drop='True')
lifestyle_data

Unnamed: 0_level_0,smoke,alco,active
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,0,1
1,0,0,1
2,0,0,0
3,0,0,1
4,0,0,0
...,...,...,...
99993,1,0,1
99995,0,0,1
99996,0,1,0
99998,0,0,0


### Create Calculated Table for Postgress Database

In [48]:
calculated_data = cardio_df[["id","age_year","age_group","bmi", "bmi_range"]]
calculated_data = calculated_data.set_index(['id'],drop='True')
calculated_data

Unnamed: 0_level_0,age_year,age_group,bmi,bmi_range
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,50,50-59,21.97,Healthy (1)
1,55,50-59,34.93,Obesity (3)
2,52,50-59,23.51,Healthy (1)
3,48,40-49,28.71,Overweight (2)
4,48,40-49,23.01,Healthy (1)
...,...,...,...,...
99993,53,50-59,26.93,Overweight (2)
99995,62,60-89,50.47,Obesity (3)
99996,52,50-59,31.35,Obesity (3)
99998,61,60-89,27.10,Overweight (2)


### Load data to PostgreSQL tables

In [49]:
# Import module from sqlalchemy 
from sqlalchemy import create_engine
from config import db_password

### Create database engine to connect to postgres

In [50]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/cardio"
engine = create_engine(db_string)

In [51]:
# Save the patient_data to postgreSQL table
patient_data.to_sql(name='patient_table', con=engine, if_exists='replace')

In [52]:
# Save the health_data to postgreSQL table
health_data.to_sql(name="health_table",con=engine, if_exists='replace')

In [53]:
# Save the lifestyle_data to postgreSQL table
lifestyle_data.to_sql(name="lifestyle_data", con=engine, if_exists='replace')

In [54]:
# Save the manipulated_data to postgreSQL table
calculated_data.to_sql(name="calculated_data", con=engine, if_exists='replace')

### Separate the Features (X) from the Target (y) and drop additional features for accuracy score

In [55]:
y = cardio_df["cardio"]
X = cardio_df.drop(columns=["cardio","age_group","bmi_range", "id","height","weight","gluc"])

### Split our data into training and testing

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(51361, 9)

### Create a Logistic Regression Model

In [57]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

### Fit (train) or model using the training data

In [58]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(max_iter=200, random_state=1)

### Make predictions

In [59]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,1
1,1,1
2,1,0
3,0,0
4,1,1
5,0,0
6,0,0
7,1,0
8,0,0
9,1,1


### Print accuracy score

In [60]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7282868991297238


### GridSearchCV implements a “fit” and a “score” method

In [61]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'penalty': ["l1", "l2"]}
model = LogisticRegression(solver='liblinear')
grid = GridSearchCV(model, param_grid, verbose=3)

In [62]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END ...................C=1, penalty=l1;, score=0.729 total time=   1.0s
[CV 2/5] END ...................C=1, penalty=l1;, score=0.724 total time=   0.9s
[CV 3/5] END ...................C=1, penalty=l1;, score=0.726 total time=   1.0s
[CV 4/5] END ...................C=1, penalty=l1;, score=0.726 total time=   1.0s
[CV 5/5] END ...................C=1, penalty=l1;, score=0.727 total time=   0.9s
[CV 1/5] END ...................C=1, penalty=l2;, score=0.729 total time=   0.1s
[CV 2/5] END ...................C=1, penalty=l2;, score=0.724 total time=   0.1s
[CV 3/5] END ...................C=1, penalty=l2;, score=0.727 total time=   0.1s
[CV 4/5] END ...................C=1, penalty=l2;, score=0.725 total time=   0.1s
[CV 5/5] END ...................C=1, penalty=l2;, score=0.726 total time=   0.1s
[CV 1/5] END ...................C=5, penalty=l1;, score=0.729 total time=   0.9s
[CV 2/5] END ...................C=5, penalty=l1;,

GridSearchCV(estimator=LogisticRegression(solver='liblinear'),
             param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']}, verbose=3)

In [63]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 1, 'penalty': 'l2'}
0.7262903251362418


In [64]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
# Build a classification task using 3 informative features
X, y = make_classification(n_samples=1000,
                           n_features=10,
                           n_informative=3,
                           n_redundant=0,
                           n_repeated=0,
                           n_classes=2,
                           random_state=0,
                           shuffle=False)


rfc = RandomForestClassifier(n_jobs=-1,max_features= 'sqrt' ,n_estimators=50, oob_score = True) 

param_grid = { 
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)



{'max_features': 'log2', 'n_estimators': 700}


In [65]:
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

{'max_features': 'log2', 'n_estimators': 700}
0.6834757896800523
