In [8]:
## import all files
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


In [9]:
## laod data and data cleaning
df = pd.read_csv("cardio_train.csv", sep=";")

# Clean
df.drop(columns=['id'], inplace=True)
df['age_years'] = (df['age']/365).astype(int)
df.drop(columns=['age'], inplace=True)

df = df[df['ap_lo'] <= df['ap_hi']]
df = df[(df['height']>100) & (df['height']<220)]
df = df[(df['weight']>30) & (df['weight']<200)]

df['BMI'] = df['weight'] / ((df['height']/100)**2)


In [10]:
df

Unnamed: 0,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,age_years,BMI
0,2,168,62.0,110,80,1,1,0,0,1,0,50,21.967120
1,1,156,85.0,140,90,3,1,0,0,1,1,55,34.927679
2,1,165,64.0,130,70,3,1,0,0,0,1,51,23.507805
3,2,169,82.0,150,100,1,1,0,0,1,1,48,28.710479
4,1,156,56.0,100,60,1,1,0,0,0,0,47,23.011177
...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,2,168,76.0,120,80,1,1,1,0,1,0,52,26.927438
69996,1,158,126.0,140,90,2,2,0,0,1,1,61,50.472681
69997,2,183,105.0,180,90,3,1,0,1,0,1,52,31.353579
69998,1,163,72.0,135,80,1,2,0,0,0,1,61,27.099251


In [11]:
## split into x and y
X = df.drop(columns=["cardio"])
y = df["cardio"]


In [13]:
## split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

In [14]:
## scaling numeric data 

from sklearn.preprocessing import StandardScaler

# Columns you want to scale
numeric_cols = ['age_years', 'height', 'weight', 'ap_hi', 'ap_lo', 'BMI']

# 1️⃣ Create scaler
scaler = StandardScaler()

# 2️⃣ Fit scaler ONLY on training numeric data
scaler.fit(X_train[numeric_cols])

# 3️⃣ Transform both train + test numeric data
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

X_train_scaled[numeric_cols] = scaler.transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])


<h1>Train Model </h1>

In [15]:
## logistic regression
log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train_scaled, y_train)

y_pred_lr = log_reg.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.7239723535831212


In [16]:
## KNN 
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train)

y_pred_knn = knn.predict(X_test_scaled)
print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))


KNN Accuracy: 0.667806475081848


In [17]:
## desicion tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

y_pred_dt = dt.predict(X_test_scaled)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))


Decision Tree Accuracy: 0.6397962895598399


In [18]:
##random forest 
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train_scaled, y_train)

y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Random Forest Accuracy: 0.7084030556566024


In [19]:
## 10- fold cross validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

models = {
    "Logistic Regression": log_reg,
    "KNN": knn,
    "Decision Tree": dt,
    "Random Forest": rf
}

for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=kfold, scoring='accuracy')
    print(f"{name} 10-Fold CV Accuracy: {scores.mean():.4f}")


Logistic Regression 10-Fold CV Accuracy: 0.7246
KNN 10-Fold CV Accuracy: 0.6654
Decision Tree 10-Fold CV Accuracy: 0.6366
Random Forest 10-Fold CV Accuracy: 0.7089


In [21]:
## hyper parameter tunning
param_grid = {
    "n_estimators": [100, 200, 300, 400],
    "max_depth": [10, 15, 20, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_rf = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_rf.fit(X_train_scaled, y_train)

print("Best Random Forest Accuracy:", grid_rf.best_score_)
print("Best Parameters:", grid_rf.best_params_)


NameError: name 'GridSearchCV' is not defined