# **Model and evaluation Notebook 2**

## Objectives

- Fit and evaluate a classification model to predict if a patient will suffer from heart disease or not.
- Fulfil business requirement 2.

## Inputs

* outputs/datasets/cleaned/TargetTestSet.csv
* outputs/datasets/cleaned/TargetTrainSet.csv
* outputs/datasets/cleaned/TestSetCleaned.csv
* outputs/datasets/cleaned/TrainSetCleaned.csv
* outputs/datasets/collection/heart.csv

## Outputs

* Train and test sets (features and target)
* Data cleaning, feature engineering, and modelling pipeline
* Heatmap plot of the classification report



---

# Set up the Working Directory

Define and confirm the working directory.

In [None]:
import os
# Step up one level from the current working directory so that the relative
# "outputs/..." paths used below resolve against the parent folder.
# Note: each run of this cell moves one level further up.
parent_dir = os.path.dirname(os.getcwd())
os.chdir(parent_dir)

# Confirm where we ended up.
current_dir = os.getcwd()
current_dir

---

# Load data

In [None]:
import numpy as np
import pandas as pd
# Read the collected heart dataset.
df = pd.read_csv("outputs/datasets/collection/heart.csv")

# Target is the 'target' column; predictors are every other column.
y = df['target']
X = df.drop(columns=['target'])

print(X.shape)
X.head(3)

### Loading clean train and test set

In [None]:
import numpy as np
import pandas as pd

# Cleaned predictor sets.
X_train = pd.read_csv("outputs/datasets/cleaned/TrainSetCleaned.csv")
X_test = pd.read_csv("outputs/datasets/cleaned/TestSetCleaned.csv")

# Matching targets; read_csv returns these as DataFrames, not 1D arrays.
y_train = pd.read_csv("outputs/datasets/cleaned/TargetTrainSet.csv")
y_test = pd.read_csv("outputs/datasets/cleaned/TargetTestSet.csv")

An error occurred because `y_train` and `y_test` are loaded as 2D, single-column DataFrames rather than 1D arrays.

In [None]:
# Show the first two rows to confirm the target was loaded as a DataFrame.
y_train.head(2)

Use `ravel` to flatten both of them into 1D arrays.

In [None]:
# Flatten both target frames into 1D NumPy arrays.
# Not re-runnable as-is: after one run the names hold arrays, which have no `.values`.
y_train, y_test = (target.values.ravel() for target in (y_train, y_test))

# Finding the best combination of features

The following two feature combinations were studied and identified in previous notebooks:

- best_features = From the ModelAndEvaluation notebook, found with feature importance.
- best_correlation_features = From the FeatureSelection notebook, found with the correlation study.

In [None]:
# Feature list obtained from feature importance (see the notes above).
best_features = ['ca', 'cp', 'thal']
best_features

In [None]:
# Feature list obtained from the correlation study (see the notes above).
best_correlation_features = ['cp', 'chol', 'thalach','exang', 'oldpeak', 'ca']
best_correlation_features

In [None]:
# Compare the two candidate lists in both directions. List order is kept so the
# printed message is deterministic (a plain set difference has no fixed order).
dropped_features = [f for f in best_correlation_features if f not in best_features]
added_features = [f for f in best_features if f not in best_correlation_features]

# Kept under its original name: features that appear only in the correlation list.
difference = dropped_features

if dropped_features or added_features:
    # Describe the actual differences instead of a hard-coded sentence.
    changes = []
    if dropped_features:
        changes.append(f"dropping {', '.join(dropped_features)}")
    if added_features:
        changes.append(f"adding {', '.join(added_features)}")

    explanation = f"The original best features based on correlation were {', '.join(best_correlation_features)}."
    explanation += f" After additional analysis, the best features are {', '.join(best_features)}."
    explanation += f" The changes made were {' and '.join(changes)}."
else:
    explanation = "The best features based on correlation match the updated best features."

print(explanation)

In [None]:
# Feature set used for the rest of this notebook: the correlation-based list
# with 'thalach' swapped for 'thal'.
features_selected = ['cp', 'chol','exang', 'oldpeak', 'ca', 'thal']

In [None]:
# Keep only the selected columns in both predictor sets.
# DataFrame.filter ignores names that are not present rather than raising.
X_train_filtered = X_train.filter(items=features_selected)
X_test_filtered = X_test.filter(items=features_selected)

# Sanity check: shapes of predictors and targets for both splits.
print(X_train_filtered.shape, y_train.shape, X_test_filtered.shape, y_test.shape)
X_train_filtered.head(3)

# Re-examine performance of the pipeline

The pipeline below is taken from the ModelAndEvaluation notebook, section "We create a new pipeline, using the best model and the best hyperparameters from the research above."

From that research, we know that the following pipeline has the best model and the best hyperparameters.

In [None]:
from xgboost import XGBClassifier
from feature_engine.transformation import YeoJohnsonTransformer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

def xgbclassifier_pipeline(variables=None):
    """Build the (unfitted) XGBoost classification pipeline.

    The steps, in order, are a Yeo-Johnson transformation of the given
    variables, standard scaling, and an XGBClassifier with fixed
    hyperparameters.

    Parameters
    ----------
    variables : list of str, optional
        Column names passed to YeoJohnsonTransformer. When omitted, the six
        features selected in this notebook are used.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with the steps "YeoJohnsonTransformer", "scaler" and "model".
    """
    if variables is None:
        variables = ['cp', 'chol', 'exang', 'oldpeak', 'ca', 'thal']

    # Hyperparameters carried over from the ModelAndEvaluation notebook.
    hyperparameters = {
        'learning_rate': 0.01,
        'max_depth': 3,
        'n_estimators': 100
    }

    # Fixed seed so repeated fits give the same model.
    model = XGBClassifier(**hyperparameters, random_state=0)

    pipeline_base = Pipeline([
        ("YeoJohnsonTransformer", YeoJohnsonTransformer(variables=list(variables))),
        ("scaler", StandardScaler()),
        ('model', model)
    ])

    return pipeline_base

# Build the XGBoost pipeline with its fixed hyperparameters and fit it on the
# filtered training data; the fitted pipeline is displayed as the cell output.
xgb_pipeline = xgbclassifier_pipeline()
xgb_pipeline.fit(X_train_filtered, y_train)

Check feature importance after fitting data.

In [None]:
# The pipeline was already fitted when it was created, so the trained
# estimator can be read directly without fitting again.
model = xgb_pipeline.named_steps['model']

feature_importances = model.feature_importances_

# Both preprocessing steps are column-wise transforms, so the model's inputs
# should line up with the training columns; the length check guards that assumption.
feature_names = list(X_train_filtered.columns)
if len(feature_names) != len(feature_importances):
    raise ValueError(
        f"Expected {len(feature_names)} importances, got {len(feature_importances)}"
    )

feature_importance_dict = dict(zip(feature_names, feature_importances))

# Most important feature first.
sorted_feature_importance = dict(
    sorted(feature_importance_dict.items(), key=lambda item: item[1], reverse=True)
)

print(sorted_feature_importance)


In [None]:
from sklearn.metrics import classification_report, confusion_matrix

def confusion_matrix_and_report(X, y, pipeline, label_map):
    """Print a labelled confusion matrix and a classification report.

    The printed matrix has predictions on the rows and actual classes on
    the columns.

    Parameters
    ----------
    X : predictors passed to ``pipeline.predict``.
    y : true labels for ``X``.
    pipeline : fitted estimator exposing ``predict``.
    label_map : list of str, class names in label order.
    """
    y_hat = pipeline.predict(X)

    # confusion_matrix puts y_true on the rows; transposing puts the
    # predictions on the rows so the matrix matches the labels below.
    matrix = confusion_matrix(y_true=y, y_pred=y_hat).T
    actual_labels = [["Actual " + name for name in label_map]]
    predicted_labels = [["Prediction " + name for name in label_map]]

    print('---  Confusion Matrix  ---')
    print(pd.DataFrame(matrix, columns=actual_labels, index=predicted_labels))
    print("\n")

    report_text = classification_report(y, y_hat, target_names=label_map)
    print('---  Classification Report  ---')
    print(report_text, "\n")


def clf_performance(X_train_selected, y_train, X_test_filtered, y_test, pipeline, label_map):
    """Print the confusion matrix and classification report for both splits.

    Parameters
    ----------
    X_train_selected, y_train : training predictors and labels.
    X_test_filtered, y_test : test predictors and labels.
    pipeline : fitted estimator exposing ``predict``.
    label_map : list of str, class names in label order.
    """
    print("#### Train Set #### \n")
    # Use the argument passed by the caller, not the notebook-level
    # X_train_filtered, so the function reports on the data it was given.
    confusion_matrix_and_report(X_train_selected, y_train, pipeline, label_map)

    print("#### Test Set ####\n")
    confusion_matrix_and_report(X_test_filtered, y_test, pipeline, label_map)

In [None]:
# Text classification report for the fitted pipeline on the test set.
print(classification_report(y_pred=xgb_pipeline.predict(X_test_filtered), y_true=y_test))

Heatmap for classification report

In [None]:
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Output location for the saved version of this plot (used later in the notebook).
version = 'v3'
file_path = f'outputs/ml_pipeline/predict_heart_disease/{version}'
plot_filename_classification_report_1 = f'{file_path}/classification_report_3.png'

# Classification report for the test set as a nested dict.
test_predictions = xgb_pipeline.predict(X_test_filtered)
report = classification_report(y_true=y_test, y_pred=test_predictions, output_dict=True)

# One row per report entry, one column per metric.
df_report = pd.DataFrame(report).T

# Plot every row except the last, and only the first three columns.
metrics_view = df_report.iloc[:-1, :3]
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_view, annot=True, fmt='.2f', cmap='Blues')
plt.title('Final Classification Report')

plt.show()

This combination of features finally meets the business requirement metrics, with the following performance:

- **Accuracy of 0.88**
- **Precision on class 0 of 0.94**
- Precision on class 1 of 0.84
- **Recall of class 1 of 0.95**
- Recall of class 0 of 0.79

The developer chose this as the best combination of features.

---

# Save and push file to repo

We will generate the following files:

- Train set
- Test set
- Data cleaning, feature engineering, and modelling pipeline
- Heatmap plot of the classification report

In [None]:
import os

version = 'v3'
file_path = f'outputs/ml_pipeline/predict_heart_disease/{version}'

# exist_ok=True makes re-running this cell a quiet no-op when the folder is
# already there. Other OS-level failures are still reported without stopping
# the notebook, as before.
try:
    os.makedirs(file_path, exist_ok=True)
except OSError as e:
    print(e)

## Train Set

In [None]:
# Shape and first rows of the training predictors before they are narrowed
# to the selected features.
print(X_train.shape)
X_train.head()

In [None]:
# Rebind X_train / X_test to the filtered frames so only the selected features
# are saved. From here on these names no longer hold the full cleaned sets.
X_train = X_train_filtered
X_test = X_test_filtered

X_train.head(3)

In [None]:
# Write the training predictors to the versioned output folder, without the index.
X_train.to_csv(f"{file_path}/X_train.csv", index=False)

Convert y back to a DataFrame

In [None]:
import pandas as pd

# Wrap the training target in a single-column DataFrame named 'target' so it
# can be written with to_csv.
y_train = pd.DataFrame({'target': y_train})

In [None]:
# Check the first rows of the rebuilt target frame.
y_train.head()

In [None]:
# Write the training target to the versioned output folder, without the index.
y_train.to_csv(f"{file_path}/y_train.csv", index=False)

## Test set

In [None]:
# Shape and first rows of the test predictors that are about to be saved.
print(X_test.shape)
X_test.head()

In [None]:
# Write the test predictors to the versioned output folder, without the index.
X_test.to_csv(f"{file_path}/X_test.csv", index=False)

Convert y back to a DataFrame

In [None]:
# Wrap the test target in a single-column DataFrame named 'target' so it can
# be written with to_csv.
y_test = pd.DataFrame({'target' : y_test})

In [None]:
# Check the first rows of the rebuilt target frame.
y_test.head()

In [None]:
# Write the test target to the versioned output folder, without the index.
y_test.to_csv(f"{file_path}/y_test.csv", index=False)

## Save pipeline

In [None]:
# Alias the fitted pipeline under the name used when saving it below.
# NOTE(review): this rebinds `xgbclassifier_pipeline`, which earlier in the
# notebook is the factory function; after this cell the function can no longer
# be called unless its defining cell is re-run.
xgbclassifier_pipeline = xgb_pipeline
xgbclassifier_pipeline

In [None]:
import joblib

# Persist the fitted pipeline. `xgb_pipeline` is dumped directly so the saved
# object is always the fitted pipeline, even if the aliasing cell was skipped
# (in which case `xgbclassifier_pipeline` would still be the factory function).
joblib.dump(xgb_pipeline, f"{file_path}/xgbclassifier_pipeline.pkl")

## Save heatmap report on performance

In [None]:
# Classification report for the test set as a nested dict.
# y_test is a single-column DataFrame at this point in the notebook.
test_predictions = xgb_pipeline.predict(X_test_filtered)
report = classification_report(y_true=y_test, y_pred=test_predictions, output_dict=True)

# One row per report entry, one column per metric.
df_report = pd.DataFrame(report).T

# Plot every row except the last, and only the first three columns.
# NOTE(review): this title differs from the on-screen version of the same plot
# earlier in the notebook; confirm which one is intended for the saved file.
metrics_view = df_report.iloc[:-1, :3]
plt.figure(figsize=(8, 6))
sns.heatmap(metrics_view, annot=True, fmt='.2f', cmap='Blues')
plt.title('Classification Report 1')

# Write the figure to the path defined in the earlier heatmap cell, then show it.
plt.savefig(plot_filename_classification_report_1, bbox_inches='tight')
plt.show()