In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# Read the preprocessed cardio dataset from the working directory and preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='Preprocessed_cardio_dataset.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
0,0,50.391781,2,168,62.0,110.0,80,1,1,0,0,1,0,21.96712
1,1,55.419178,1,156,85.0,140.0,90,3,1,0,0,1,1,34.927679
2,2,51.663014,1,165,64.0,130.0,70,3,1,0,0,0,1,23.507805
3,3,48.282192,2,169,82.0,150.0,100,1,1,0,0,1,1,28.710479
4,4,47.873973,1,156,56.0,100.0,60,1,1,0,0,0,0,23.011177


In [3]:
# Preview the last five rows of the raw frame.
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
69971,69995,52.712329,2,168,76.0,120.0,80,1,1,1,0,1,0,26.927438
69972,69996,61.920548,1,158,126.0,140.0,90,2,2,0,0,1,1,39.742883
69973,69997,52.235616,2,183,105.0,170.0,90,3,1,0,1,0,1,31.353579
69974,69998,61.454795,1,163,72.0,135.0,80,1,2,0,0,0,1,27.099251
69975,69999,56.273973,1,170,72.0,120.0,80,2,1,0,0,1,0,24.913495


In [7]:
# Build the working frame without the "Unnamed: 0" column. `drop` returns a
# new DataFrame, so `df` itself is left untouched.
demo = df.drop(columns=["Unnamed: 0"])

In [9]:
# Preview the last five rows of the working frame after the column drop.
demo.tail(n=5)

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
69971,52.712329,2,168,76.0,120.0,80,1,1,1,0,1,0,26.927438
69972,61.920548,1,158,126.0,140.0,90,2,2,0,0,1,1,39.742883
69973,52.235616,2,183,105.0,170.0,90,3,1,0,1,0,1,31.353579
69974,61.454795,1,163,72.0,135.0,80,1,2,0,0,0,1,27.099251
69975,56.273973,1,170,72.0,120.0,80,2,1,0,0,1,0,24.913495


In [11]:
# Show the dtype of every column in `demo`.
demo.dtypes

age            float64
gender           int64
height           int64
weight         float64
ap_hi          float64
ap_lo            int64
cholesterol      int64
gluc             int64
smoke            int64
alco             int64
active           int64
cardio           int64
BMI            float64
dtype: object

In [13]:
from sklearn.model_selection import train_test_split

# Separate the target column ('cardio') from the predictor columns.
y = demo['cardio']
X = demo.drop(columns=['cardio'])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"training shape : {X_train.shape}")
print(f"testing shape : {X_test.shape} ")

training shape : (55980, 12)
testing shape : (13996, 12) 


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# Separate the target column ('cardio') from the predictor columns.
y = demo['cardio']
X = demo.drop(columns='cardio')

# 80/20 split with a fixed seed; `stratify=y` keeps the class proportions of
# the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Only these columns are standardised; every other column is left as-is.
numerical_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'BMI']

scaler = StandardScaler()

# Fit the scaler on the training split only, then reuse the fitted parameters
# for the test split so no test-set statistics are used during fitting.
X_train[numerical_cols] = scaler.fit_transform(
    X_train[numerical_cols]
)
X_test[numerical_cols] = scaler.transform(
    X_test[numerical_cols]
)

print("Data Scaling and Train/Test Split are complete!")
print(f"Final shape of Training Data: {X_train.shape}")

Data Scaling and Train/Test Split are complete!
Final shape of Training Data: (55980, 12)


In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 10 decision trees trained on the current training split.
#
# `random_state=42` is pinned (matching the seed used for the splits and the
# other models in this notebook) so that the bootstrap resampling -- and hence
# the reported accuracy -- is reproducible from run to run. Per the
# scikit-learn docs, BaggingClassifier derives a seed for each sub-estimator
# from its own `random_state` when the base estimator accepts one, so the
# base tree does not need a seed of its own.
base_model = DecisionTreeClassifier()
model = BaggingClassifier(
    estimator=base_model,
    n_estimators=10,
    random_state=42,
)
model.fit(X_train, y_train)

In [19]:
from sklearn.metrics import accuracy_score

# Predict on the held-out split with the current `model` and report the share
# of correct predictions as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy(%): {accuracy * 100}%")


Accuracy(%): 69.79136896256072%


In [23]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import SVC


# ------------------ DATA PREPARATION ------------------
# NOTE: a regex-based removal of "Unnamed" columns from `df` used to sit here
# but is disabled; this cell works from `demo` instead.

# Target and predictors
y = demo['cardio']
X = demo.drop(columns='cardio')

# 80/20 train-test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise every feature column: fit on the training split only, then apply
# the same fitted transform to the test split. Both become NumPy arrays.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# ------------------ MODELS ------------------
# Display name -> unfitted estimator. An RBF-kernel SVC entry is disabled.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(
        n_estimators=500,
        max_depth=12,
        random_state=42,
    ),
}


# ------------------ TRAIN & EVALUATE ------------------
for name, model in models.items():
    # `fit` returns the estimator itself, so training and prediction chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}")


Logistic Regression Accuracy: 0.7236
KNN Accuracy: 0.6900
Decision Tree Accuracy: 0.6353
Random Forest Accuracy: 0.7347


In [25]:
import pickle
# Accuracy (in percent) per model name.
results = {}

for name, model in models.items():
    # Refit on the training split and predict the held-out split; `fit`
    # returns the estimator itself, so the two calls chain.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Store accuracy as a percentage.
    acc = 100 * accuracy_score(y_test, y_pred)
    results[name] = acc

    # Persist the fitted model together with the fitted scaler in one pickle
    # file named after the model.
    with open(f"{name}.pkl", "wb") as file:
        pickle.dump({"model": model, "scaler": scaler}, file)

    print(f"{name} accuracy: {acc:.4f}")


Logistic Regression accuracy: 72.3635
KNN accuracy: 68.9983
Decision Tree accuracy: 63.5253
Random Forest accuracy: 73.4710
