# Load data

In [1]:
import pandas as pd
cardio_df = pd.read_csv(r'cardio_train.csv', sep=';')

In [2]:
cardio_df = cardio_df.drop(columns='id')
cardio_df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


# Preparing Data

In [3]:
cardio_df.isna().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [4]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [5]:
cardio_df['cholesterol'].value_counts()

1    52385
2     9549
3     8066
Name: cholesterol, dtype: int64

In [6]:
cardio_df['gluc'].value_counts()

1    59479
3     5331
2     5190
Name: gluc, dtype: int64

In [7]:
cardio_df['cardio'].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [8]:
cardio_feature = cardio_df.iloc[:,:-1]
cardio_result = cardio_df.iloc[:,-1]

In [9]:
cardio_feature['gender'] = cardio_feature['gender'].replace({1:0, 2:1})
cardio_feature['cholesterol'] = cardio_feature['cholesterol'].replace({1:0, 2:1, 3:2})
cardio_feature['gluc'] = cardio_feature['gluc'].replace({1:0, 2:1, 3:2})

In [10]:
cardio_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
dtypes: float64(1), int64(10)
memory usage: 5.9 MB


In [11]:
cardio_feature["age"] = cardio_feature["age"].apply(lambda x : x/365)
cardio_feature

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,50.391781,1,168,62.0,110,80,0,0,0,0,1
1,55.419178,0,156,85.0,140,90,2,0,0,0,1
2,51.663014,0,165,64.0,130,70,2,0,0,0,0
3,48.282192,1,169,82.0,150,100,0,0,0,0,1
4,47.873973,0,156,56.0,100,60,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,52.712329,1,168,76.0,120,80,0,0,1,0,1
69996,61.920548,0,158,126.0,140,90,1,1,0,0,1
69997,52.235616,1,183,105.0,180,90,2,0,0,1,0
69998,61.454795,0,163,72.0,135,80,0,1,0,0,0


In [12]:
# standardizing
from sklearn import preprocessing

std_scaler = preprocessing.StandardScaler()
cardio_feature = pd.DataFrame(std_scaler.fit_transform(cardio_feature), columns = cardio_feature.columns)
cardio_feature

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,-0.436062,1.364055,0.443452,-0.847873,-0.122182,-0.088238,-0.539322,-0.395720,-0.310879,-0.238384,0.494167
1,0.307686,-0.733108,-1.018168,0.749831,0.072610,-0.035180,2.400793,-0.395720,-0.310879,-0.238384,0.494167
2,-0.247997,-0.733108,0.078047,-0.708942,0.007679,-0.141297,2.400793,-0.395720,-0.310879,-0.238384,-2.023607
3,-0.748152,1.364055,0.565254,0.541435,0.137541,0.017879,-0.539322,-0.395720,-0.310879,-0.238384,0.494167
4,-0.808543,-0.733108,-1.018168,-1.264666,-0.187113,-0.194356,-0.539322,-0.395720,-0.310879,-0.238384,-2.023607
...,...,...,...,...,...,...,...,...,...,...,...
69995,-0.092762,1.364055,0.443452,0.124642,-0.057251,-0.088238,-0.539322,-0.395720,3.216684,-0.238384,0.494167
69996,1.269492,-0.733108,-0.774565,3.597913,0.072610,-0.035180,0.930735,1.351719,-0.310879,-0.238384,0.494167
69997,-0.163286,1.364055,2.270477,2.139139,0.332333,-0.035180,2.400793,-0.395720,-0.310879,4.194906,-2.023607
69998,1.200589,-0.733108,-0.165556,-0.153219,0.040145,-0.088238,-0.539322,1.351719,-0.310879,-0.238384,-2.023607


# Splitting data

In [13]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(cardio_feature, cardio_result, test_size=0.3)

In [14]:
pd.crosstab(y_train, columns='N', normalize=True)

col_0,N
cardio,Unnamed: 1_level_1
0,0.503612
1,0.496388


In [15]:
pd.crosstab(y_test, columns='N', normalize=True)

col_0,N
cardio,Unnamed: 1_level_1
0,0.492571
1,0.507429


# Modeling

In [16]:
from sklearn.linear_model import LogisticRegression
Logres_classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
Logres_classifier.fit(x_train, y_train)

LogisticRegression(max_iter=10000)

In [17]:
from sklearn.ensemble import RandomForestClassifier
Randfor_classifier = RandomForestClassifier()
Randfor_classifier.fit(x_train, y_train)

RandomForestClassifier()

In [18]:
import xgboost
XGB_classifier = xgboost.XGBClassifier()
XGB_classifier.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [19]:
from sklearn.ensemble import GradientBoostingClassifier
GB_classifier = GradientBoostingClassifier()
GB_classifier.fit(x_train, y_train)

GradientBoostingClassifier()

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
#Voting_classifier = VotingClassifier()

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(n_estimators=50, random_state=1)
clf3 = xgboost.XGBClassifier()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('xgb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, x_train, y_train, scoring='accuracy', cv=5)
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.72 (+/- 0.00) [Logistic Regression]
Accuracy: 0.72 (+/- 0.01) [Random Forest]
Accuracy: 0.73 (+/- 0.00) [naive Bayes]


In [None]:
from sklearn.naive_bayes import GaussianNB
GNB_classifier = GaussianNB()
GNB_classifier.fit(x_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree_cardio = DecisionTreeClassifier()
tree_cardio.fit(x_train, y_train)

# Evaluation

In [None]:
from sklearn.metrics import classification_report

In [None]:
result = tree_cardio.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

In [None]:
result = GNB_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

In [None]:
result = Logres_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

In [None]:
result = Randfor_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

In [None]:
result = XGB_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

In [None]:
result = GB_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))