In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.preprocessing import Normalizer
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# STEP 1 ,Preparing the Data:

In [2]:
# Read the CSV file
file_path = 'heart_disease_uci.csv'

df = pd.read_csv(file_path)
# df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


In [3]:
# show number of missing values
df.isna().sum()

id            0
age           0
sex           0
dataset       0
cp            0
trestbps     59
chol         30
fbs          90
restecg       2
thalch       55
exang        55
oldpeak      62
slope       309
ca          611
thal        486
num           0
dtype: int64

In [4]:
# Fill missing values
for label , content in df.items():
     if pd.api.types.is_numeric_dtype(content):
        if pd.isnull(content).sum():
            df[label] = content.fillna(content.median())

# Categorical data into numbers and fill missing values
for label , content in df.items():
     if not pd.api.types.is_numeric_dtype(content):
            df[label] = pd.Categorical(content).codes

In [5]:
# show number of missing values after replace
df.isna().sum()

id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64

In [6]:
df

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,1,0,3,145.0,233.0,1,0,150.0,0,2.3,0,0.0,0,0
1,2,67,1,0,0,160.0,286.0,0,0,108.0,1,1.5,1,3.0,1,2
2,3,67,1,0,0,120.0,229.0,0,0,129.0,1,2.6,1,2.0,2,1
3,4,37,1,0,2,130.0,250.0,0,1,187.0,0,3.5,0,0.0,1,0
4,5,41,0,0,1,130.0,204.0,0,0,172.0,0,1.4,2,0.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
915,916,54,0,3,0,127.0,333.0,1,2,154.0,0,0.0,-1,0.0,-1,1
916,917,62,1,3,3,130.0,139.0,0,2,140.0,-1,0.5,-1,0.0,-1,0
917,918,55,1,3,0,122.0,223.0,1,2,100.0,0,0.0,-1,0.0,0,2
918,919,58,1,3,0,130.0,385.0,1,0,140.0,-1,0.5,-1,0.0,-1,0


In [7]:
# column num to binary،
for i, item in enumerate(df['num']):
  if item != 0:
    df['num'][i]= 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num'][i]= 1


In [8]:
# Drop unnecessary columns
df.drop(['dataset','id'],axis= 'columns', inplace = True)

In [9]:
# Shuffle data and separate data and labels
df = df.sample(frac=1, random_state=42)
X = df.drop('num', axis='columns')
y = df['num']

In [10]:
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal
319,36,1,1,120.0,166.0,0,1,180.0,0,0.0,-1,0.0,-1
377,45,1,1,140.0,224.0,1,1,122.0,0,0.0,-1,0.0,-1
538,48,1,0,160.0,329.0,0,1,92.0,1,1.5,1,0.0,-1
296,59,1,0,164.0,176.0,1,0,90.0,0,1.0,1,2.0,0
531,40,0,0,150.0,392.0,0,1,130.0,0,2.0,1,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,59,1,0,140.0,177.0,0,1,162.0,1,0.0,2,1.0,2
270,61,1,0,140.0,207.0,0,0,138.0,1,1.9,2,1.0,2
860,75,1,0,160.0,310.0,1,1,112.0,1,2.0,0,0.0,2
435,53,0,1,140.0,216.0,0,1,142.0,1,2.0,1,0.0,-1


In [11]:
# Normalizer
transformer = Normalizer().fit(X)
X_normal = transformer.transform(X)

In [12]:
X_normal

array([[ 0.1308814 ,  0.00363559,  0.00363559, ..., -0.00363559,
         0.        , -0.00363559],
       [ 0.15283595,  0.00339635,  0.00339635, ..., -0.00339635,
         0.        , -0.00339635],
       [ 0.12622131,  0.00262961,  0.        , ...,  0.00262961,
         0.        , -0.00262961],
       ...,
       [ 0.20053118,  0.00267375,  0.        , ...,  0.        ,
         0.        ,  0.0053475 ],
       [ 0.17741934,  0.        ,  0.00334753, ...,  0.00334753,
         0.        , -0.00334753],
       [ 0.15415036,  0.        ,  0.        , ...,  0.00540878,
         0.00270439,  0.00270439]])

In [13]:
# Split data to test and train
X_train, X_test, y_train, y_test = train_test_split(X_normal,y,test_size=0.2)

In [14]:
# Split the train data into training and validation sets
X_train_base, X_val, y_train_base, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## STEP2 ,Model Selection:

In [15]:
# Initialize base models and fit
svm = SVC(probability=True)
dt = DecisionTreeClassifier()
gb = GradientBoostingClassifier()

# Initialize the meta-learner
meta_learner = LogisticRegression(max_iter=10000)

## STEP 3, Training the Base Models

In [16]:
svm.fit(X_train, y_train)
dt.fit(X_train, y_train)
gb.fit(X_train, y_train)

# First method:

## STEP 4 and 5, Developing a Meta Model and Predictions on the Validation Set

In [17]:
# Create a stacking classifier using the base models and the meta-learner
stacking_clf = StackingClassifier(
    estimators=[('svm', svm), ('dt', dt), ('gb', gb)],
    final_estimator=meta_learner,
    cv=5
)

## STEP 6, Training the Meta Model

In [18]:
# Train the stacking classifier on the base models
stacking_clf.fit(X_train_base, y_train_base)

# Make predictions on the validation set
val_predictions = stacking_clf.predict(X_val)

# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, val_predictions)
print("Accuracy of stacking on the validation set:", accuracy)

# Now, fit the stacking classifier on the entire training set
stacking_clf.fit(X_train, y_train)

Accuracy of stacking on the validation set: 0.8040540540540541


## STEP 7, Making Test Set Predictions

In [19]:
y_pred = stacking_clf.predict(X_test)

## STEP 8, Model Evaluation

In [20]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')

# Generate and print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 82.61%
Precision: 0.8673
Recall: 0.8173
F1-score: 0.8416
Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.84      0.81        80
           1       0.87      0.82      0.84       104

    accuracy                           0.83       184
   macro avg       0.82      0.83      0.82       184
weighted avg       0.83      0.83      0.83       184

Confusion Matrix:
[[67 13]
 [19 85]]


# Second method

## STEP 4, Predictions on the Validation Set

In [21]:
svm_pred = svm.predict(X_val)
dt_pred = dt.predict(X_val)
gb_pred = gb.predict(X_val)

## STEP 5, Developing a Meta Model

In [22]:
# Create a stacking classifier using the base models and the meta-learner
X_meta = np.column_stack((svm_pred, dt_pred, gb_pred))

## STEP 6, Training the Meta Model

In [23]:
# Train the stacking classifier on the base models
meta_learner.fit(X_train_base, y_train_base)
# Make predictions on the validation set
val_predictions = meta_learner.predict(X_val)

# Calculate accuracy on the validation set
accuracy = accuracy_score(y_val, val_predictions)
print("Accuracy of stacking on the validation set:", accuracy)

# Now, fit the stacking classifier on the entire training set
meta_learner.fit(X_train, y_train)

Accuracy of stacking on the validation set: 0.7702702702702703


## STEP 7, Making Test Set Predictions

In [24]:
y_pred = meta_learner.predict(X_test)

## STEP 8, Model Evaluation

In [25]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1-score: {f1:.4f}')

# Generate and print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Generate and print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 64.13%
Precision: 0.7065
Recall: 0.6250
F1-score: 0.6633
Classification Report:
              precision    recall  f1-score   support

           0       0.58      0.66      0.62        80
           1       0.71      0.62      0.66       104

    accuracy                           0.64       184
   macro avg       0.64      0.64      0.64       184
weighted avg       0.65      0.64      0.64       184

Confusion Matrix:
[[53 27]
 [39 65]]
