# Load data

In [3]:
import pandas as pd
cardio_df = pd.read_csv('cardio_train.csv', sep=';')

In [4]:
cardio_df = cardio_df.drop(columns='id')
cardio_df

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
69995,19240,2,168,76.0,120,80,1,1,1,0,1,0
69996,22601,1,158,126.0,140,90,2,2,0,0,1,1
69997,19066,2,183,105.0,180,90,3,1,0,1,0,1
69998,22431,1,163,72.0,135,80,1,2,0,0,0,1


# Preparing Data

In [5]:
cardio_df.isna().sum()

age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

In [6]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [7]:
cardio_df['cholesterol'].value_counts()

1    52385
2     9549
3     8066
Name: cholesterol, dtype: int64

In [8]:
cardio_df['gluc'].value_counts()

1    59479
3     5331
2     5190
Name: gluc, dtype: int64

In [9]:
cardio_df['cardio'].value_counts()

0    35021
1    34979
Name: cardio, dtype: int64

In [10]:
cardio_feature = cardio_df.iloc[:,:-1]
cardio_result = cardio_df.iloc[:,-1]

In [11]:
cardio_feature['gender'] = cardio_feature['gender'].replace({1:0, 2:1})
#cardio_feature['cholesterol'] = cardio_feature['cholesterol'].replace({1:0, 2:1, 3:2})
#cardio_feature['gluc'] = cardio_feature['gluc'].replace({1:0, 2:1, 3:2})

In [12]:
cardio_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
dtypes: float64(1), int64(10)
memory usage: 5.9 MB


In [13]:
cardio_feature["age"] = cardio_feature["age"].apply(lambda x : x/365)
cardio_feature

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active
0,50.391781,1,168,62.0,110,80,1,1,0,0,1
1,55.419178,0,156,85.0,140,90,3,1,0,0,1
2,51.663014,0,165,64.0,130,70,3,1,0,0,0
3,48.282192,1,169,82.0,150,100,1,1,0,0,1
4,47.873973,0,156,56.0,100,60,1,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
69995,52.712329,1,168,76.0,120,80,1,1,1,0,1
69996,61.920548,0,158,126.0,140,90,2,2,0,0,1
69997,52.235616,1,183,105.0,180,90,3,1,0,1,0
69998,61.454795,0,163,72.0,135,80,1,2,0,0,0


# Splitting data

In [14]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(cardio_feature, cardio_result, test_size=0.3)

In [15]:
pd.crosstab(y_train, columns='N', normalize=True)

col_0,N
cardio,Unnamed: 1_level_1
0,0.500918
1,0.499082


In [16]:
pd.crosstab(y_test, columns='N', normalize=True)

col_0,N
cardio,Unnamed: 1_level_1
0,0.498857
1,0.501143


# Modeling

In [17]:
from sklearn.linear_model import LogisticRegression
Logres_classifier = LogisticRegression(solver='lbfgs', max_iter=10000)
Logres_classifier.fit(x_train, y_train)

LogisticRegression(max_iter=10000)

In [18]:
from sklearn.ensemble import RandomForestClassifier
Randfor_classifier = RandomForestClassifier()
Randfor_classifier.fit(x_train, y_train)

RandomForestClassifier()

In [20]:
import xgboost
XGB_classifier = xgboost.XGBClassifier()
XGB_classifier.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, ...)

In [21]:
from sklearn.tree import DecisionTreeClassifier
tree_cardio = DecisionTreeClassifier()
tree_cardio.fit(x_train, y_train)

DecisionTreeClassifier()

### Randomforest_classifier Tuning

In [26]:
from sklearn.ensemble import RandomForestClassifier

Randforest_classifier = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=6, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Randforest_classifier.fit(x_train, y_train)

RandomForestClassifier(max_features=6)

# Evaluation

In [22]:
from sklearn.metrics import classification_report

In [23]:
result = tree_cardio.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

           0       0.64      0.64      0.64     10476
           1       0.64      0.63      0.64     10524

    accuracy                           0.64     21000
   macro avg       0.64      0.64      0.64     21000
weighted avg       0.64      0.64      0.64     21000



In [21]:
result = Logres_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

           0       0.70      0.76      0.73     10469
           1       0.74      0.68      0.71     10531

    accuracy                           0.72     21000
   macro avg       0.72      0.72      0.72     21000
weighted avg       0.72      0.72      0.72     21000



In [22]:
result = Randfor_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

           0       0.71      0.74      0.72     10469
           1       0.73      0.71      0.72     10531

    accuracy                           0.72     21000
   macro avg       0.72      0.72      0.72     21000
weighted avg       0.72      0.72      0.72     21000



In [27]:
# Randomforest_classifier Tuning
result = Randforest_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

           0       0.71      0.73      0.72     10476
           1       0.73      0.70      0.71     10524

    accuracy                           0.72     21000
   macro avg       0.72      0.72      0.72     21000
weighted avg       0.72      0.72      0.72     21000



In [23]:
result = XGB_classifier.predict(x_test)
print(classification_report(y_true=y_test, y_pred=result))

              precision    recall  f1-score   support

           0       0.72      0.77      0.74     10469
           1       0.76      0.70      0.73     10531

    accuracy                           0.74     21000
   macro avg       0.74      0.74      0.74     21000
weighted avg       0.74      0.74      0.74     21000

