In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("data/cleveland_clean.csv")
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0,1
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0,0


In [3]:
# train, test split
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(242, 13) (61, 13) (242,) (61,)


In [4]:
# Stacking prediction

from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier


dtc = DecisionTreeClassifier(random_state=42)
rfc = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier()
xgb = XGBClassifier(random_state=42)
gc = GradientBoostingClassifier(random_state=42)
svc = SVC(kernel="rbf", random_state=42)
ad = AdaBoostClassifier(random_state=42)

classifiers = {
    "DecisionTree": dtc,
    "RandomForest": rfc,
    "KNN": knn,
    "XGBoost": xgb,
    "GradientBoost": gc,
    "SVC": svc,
    "AdaBoost": ad,
}

# Train all models
for key in classifiers.keys():
    classifier = classifiers[key]
    classifier.fit(X_train, y_train)

# Predict all models
y_pred_train = {}
y_pred_test = {}
for key in classifiers.keys():
    classifier = classifiers[key]
    y_pred_train[key] = classifier.predict(X_train)
    y_pred_test[key] = classifier.predict(X_test)

# Stacking: use np.column_stack and pass the list of predictions
X_train_stack = np.column_stack([y_pred_train[key] for key in classifiers.keys()])
X_test_stack = np.column_stack([y_pred_test[key] for key in classifiers.keys()])

# Train a model on the stacked data
stacked_model = XGBClassifier(random_state=42)
stacked_model.fit(X_train_stack, y_train)

# Predict on the stacked model
y_pred_train_stack = stacked_model.predict(X_train_stack)
y_pred_test_stack = stacked_model.predict(X_test_stack)



In [5]:
# confusion matrix
from sklearn.metrics import confusion_matrix

cm_train = confusion_matrix(y_train, y_pred_train_stack)
print(cm_train)

cm_test = confusion_matrix(y_test, y_pred_test_stack)
print(cm_test)

[[135   0]
 [  0 107]]
[[22  7]
 [ 8 24]]


In [6]:
accuracy_for_train = np.round((cm_train[0][0] + cm_train[1][1]) / len(y_train), 2)
accuracy_for_test = np.round((cm_test[0][0] + cm_test[1][1]) / len(y_test), 2)
print(
    "Accuracy for training set for KNeighborsClassifier = {}".format(accuracy_for_train)
)
print("Accuracy for test set for KNeighborsClassifier = {}".format(accuracy_for_test))

Accuracy for training set for KNeighborsClassifier = 1.0
Accuracy for test set for KNeighborsClassifier = 0.75
