In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score,recall_score,precision_score
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer

In [None]:
# Remote CSV holding the heart-disease records analysed in this notebook.
DATA_URL = "https://raw.githubusercontent.com/mujahidashraf/data/refs/heads/main/heart_disease_data.csv"

# Read the file into a DataFrame and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,48,1,0,130,256,1,0,150,1,0.0,2,2,3,0
1,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
2,44,0,2,118,242,0,1,149,0,0.3,1,1,2,1
3,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
4,56,1,3,120,193,0,0,162,0,1.9,1,0,3,1


In [None]:
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns=['target'])

# Standardised copy of the predictors.
# NOTE(review): the scaler is fitted on every row, before any train/test
# split, and none of the cells shown here read X_scaled -- confirm it is
# still needed before relying on it for modelling.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Baseline: a decision tree with default hyper-parameters and a fixed seed.
# fit() returns the estimator itself, so construction and training are chained.
tree_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out rows.
predicted = tree_model.predict(X_test)

In [None]:
# Confusion matrix of the baseline tree on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=predicted)

print('Confusion matrix\n\n', cm)

Confusion matrix

 [[83  8]
 [ 6 85]]


In [None]:
# Per-class precision / recall / F1 of the baseline tree on the test split.
baseline_report = classification_report(y_true=y_test, y_pred=predicted)
print(baseline_report)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92        91
           1       0.91      0.93      0.92        91

    accuracy                           0.92       182
   macro avg       0.92      0.92      0.92       182
weighted avg       0.92      0.92      0.92       182



In [None]:
# Fraction of test rows the baseline tree labels correctly.
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=predicted)
print("Accuracy:", baseline_accuracy)

Accuracy: 0.9230769230769231


In [None]:
# Show the hyper-parameters of the baseline tree. deep=True is already the
# default of get_params, so it is not spelled out.
tree_model.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': 42,
 'splitter': 'best'}

In [None]:
# Hyper-parameter grid explored for the decision tree.
param_grid = {
    # Splitting criteria. scikit-learn documents 'entropy' and 'log_loss' as
    # the same Shannon information gain, so the two give equivalent candidates.
    'criterion': ['gini', 'entropy', 'log_loss'],
    # Maximum tree depth (None = grow until leaves are pure / too small).
    'max_depth': [3, 5, 10, 15, None],
    # Minimum samples required to split a node. An integer value must be >= 2;
    # the previous grid also listed 1, which scikit-learn rejects. Those fits
    # failed, were scored NaN by GridSearchCV's default error_score, and the
    # warning was hidden by the notebook-wide warnings filter.
    'min_samples_split': [2, 5, 10],
    # Minimum samples required in each leaf.
    'min_samples_leaf': [1, 2, 4],
}

# Unfitted template estimator; GridSearchCV fits a fresh copy per candidate.
dt = DecisionTreeClassifier(random_state=42)

# Exhaustive search over the grid with 3-fold cross-validated accuracy,
# using all available cores.
grid_search = GridSearchCV(dt, param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X_train, y_train)

In [None]:
# Estimator carrying the winning parameter combination from the grid search.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

# Score the tuned tree on the held-out rows.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy:", accuracy)

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test Accuracy: 0.9120879120879121


In [None]:
# Confusion matrix of the tuned tree on the test split
# (rows: true class, columns: predicted class).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

print('Confusion matrix\n\n', cm)

Confusion matrix

 [[83  8]
 [ 8 83]]


In [None]:
# Per-class precision / recall / F1 of the tuned tree on the test split.
tuned_report = classification_report(y_true=y_test, y_pred=y_pred)
print(tuned_report)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        91
           1       0.91      0.91      0.91        91

    accuracy                           0.91       182
   macro avg       0.91      0.91      0.91       182
weighted avg       0.91      0.91      0.91       182



In [None]:
# Importances come from the baseline tree (tree_model), not the tuned one.
feature_importance = tree_model.feature_importances_

# Pair every predictor name with its importance and rank from most to least
# important; the original positional index is kept so rows stay traceable.
feature_df = pd.DataFrame(
    {
        'Feature': df.columns.drop('target'),
        'Importance': feature_importance,
    }
).sort_values(by="Importance", ascending=False)

print(feature_df)

     Feature  Importance
11        ca    0.240190
2         cp    0.152187
9    oldpeak    0.141180
4       chol    0.136221
12      thal    0.102472
0        age    0.094153
7    thalach    0.068437
3   trestbps    0.031441
10     slope    0.024373
5        fbs    0.006393
1        sex    0.002397
8      exang    0.000555
6    restecg    0.000000


In [None]:
# Keep only the features whose importance is strictly above this cut-off.
importance_cutoff = 0.01
keep_mask = feature_df['Importance'] > importance_cutoff
selected_features = feature_df.loc[keep_mask, 'Feature'].tolist()

# Reduced predictor table, columns ordered by descending importance.
X_selected = df[selected_features]
X_selected.head()

Unnamed: 0,ca,cp,oldpeak,chol,thal,age,thalach,trestbps,slope
0,2,0,0.0,256,3,48,150,130,2
1,1,0,0.0,203,3,61,161,148,2
2,1,2,0.3,242,2,44,149,118,1
3,1,0,1.0,275,2,47,118,110,1
4,0,3,1.9,193,3,56,162,120,1


In [None]:
# Re-split using only the selected features. This rebinds X_train / X_test /
# y_train / y_test; the same test_size and seed as the earlier split are used.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    y,
    test_size=0.3,
    random_state=10,
)

In [None]:
# Feature scalers to compare.
scalers = {
    "StandardScaler": StandardScaler(),
    "MinMaxScaler": MinMaxScaler(),
    "RobustScaler": RobustScaler()
}

# Classifiers to compare. Each key is the label used in the results, so it
# must name the estimator actually built: the naive Bayes entry is a
# GaussianNB (it was previously mislabelled "MultinomialNB").
models = {
    "GaussianNB": GaussianNB(),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "LogisticRegression": LogisticRegression(max_iter=500),
    "DecisionTreeClassifier": DecisionTreeClassifier(random_state=42)
}

# Test accuracy for every (model, scaler) pairing, keyed by
# "<model> with <scaler>".
results = {}

for scaler_name, scaler in scalers.items():
    for model_name, model in models.items():
        # The scaler is fitted inside the pipeline, i.e. on the training
        # split only, and then applied to the test split at predict time.
        pipeline = Pipeline([
            ("scaler", scaler),
            ("classifier", model)
        ])

        # Train on the training split.
        pipeline.fit(X_train, y_train)

        # Predict the held-out rows.
        y_pred = pipeline.predict(X_test)

        # Fraction of held-out rows labelled correctly.
        accuracy = accuracy_score(y_test, y_pred)

        results[f"{model_name} with {scaler_name}"] = accuracy

# Display accuracy scores. A dedicated loop name is used so that `model`
# is not rebound from an estimator to a label string.
for run_name, acc in results.items():
    print(f"{run_name}: {acc:.4f}")

MultinomialNB with StandardScaler: 0.8297
KNN with StandardScaler: 0.8352
LogisticRegression with StandardScaler: 0.8462
DecisionTreeClassifier with StandardScaler: 0.9231
MultinomialNB with MinMaxScaler: 0.8297
KNN with MinMaxScaler: 0.8516
LogisticRegression with MinMaxScaler: 0.8571
DecisionTreeClassifier with MinMaxScaler: 0.9231
MultinomialNB with RobustScaler: 0.8297
KNN with RobustScaler: 0.8297
LogisticRegression with RobustScaler: 0.8407
DecisionTreeClassifier with RobustScaler: 0.9231
