Build a random forest classifier to predict the risk of heart disease based on a dataset of patient 
information. The dataset contains 303 instances with 14 features, including age, sex, chest pain type,
resting blood pressure, serum cholesterol, and maximum heart rate achieved.
Dataset link: https://drive.google.com/file/d/1bGoIE4Z2kG5nyh-fGZAJ7LH0ki3UfmSJ/view?usp=share_link
Q1. Preprocess the dataset by handling missing values, encoding categorical variables, and scaling the
numerical features if necessary.
Q2. Split the dataset into a training set (70%) and a test set (30%).
Q3. Train a random forest classifier on the training set using 100 trees and a maximum depth of 10 for each
tree. Use the default values for other hyperparameters.
Q4. Evaluate the performance of the model on the test set using accuracy, precision, recall, and F1 score.
Q5. Use the feature importance scores to identify the top 5 most important features in predicting heart
disease risk. Visualise the feature importances using a bar chart.
Q6. Tune the hyperparameters of the random forest classifier using grid search or random search. Try
different values of the number of trees, maximum depth, minimum samples split, and minimum samples
leaf. Use 5-fold cross-validation to evaluate the performance of each set of hyperparameters.
Q7. Report the best set of hyperparameters found by the search and the corresponding performance
metrics. Compare the performance of the tuned model with the default model.
Q8. Interpret the model by analysing the decision boundaries of the random forest classifier. Plot the
decision boundaries on a scatter plot of two of the most important features. Discuss the insights and
limitations of the model for predicting heart disease risk.

Step 1: Preprocess the Dataset


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import numpy as np

# Load the dataset
df = pd.read_csv('heart.csv')

# Handle missing values
# Missing values are handled inside the pipelines below: numeric columns are
# mean-imputed and categorical columns take their most frequent value, so the
# script runs whether or not the listed columns contain gaps.

# Columns holding category codes (one-hot encoded below) and columns holding
# continuous measurements (standardised below).
categorical_features = ['sex', 'cp', 'restecg', 'slope', 'thal', 'ca']
numeric_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Preprocessing pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# remainder='passthrough' keeps every feature column that is not listed above
# (presumably the 0/1 flags 'fbs' and 'exang' in this dataset -- confirm
# against the CSV header). The ColumnTransformer default, remainder='drop',
# would silently discard those columns before the model ever sees them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)],
    remainder='passthrough')

# Preprocess the data
# NOTE: the transformers are fitted on all rows here, i.e. before the
# train/test split performed later in this script, so the imputation and
# scaling statistics also reflect the rows that end up in the test set.
X = df.drop('target', axis=1)
y = df['target']
X = preprocessor.fit_transform(X)



Step 2: Split the Dataset



# Hold out 30% of the rows for testing (70% remain for training); the fixed
# seed makes the shuffle, and therefore the split, reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


Step 3: Train a Random Forest Classifier


# Baseline forest: 100 trees, each limited to depth 10; every other
# hyperparameter keeps its scikit-learn default. The seed makes runs repeatable.
baseline_settings = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
rf = RandomForestClassifier(**baseline_settings).fit(X_train, y_train)


Step 4: Evaluate Model Performance


# Predict the held-out rows with the baseline forest
y_pred = rf.predict(X_test)

# Score the predictions with each metric, in the order they are reported
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

baseline_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for label, value in baseline_report:
    print(f"{label}: {value}")


Step 5: Feature Importance


# Feature Importance
# The forest was trained on the ColumnTransformer output, whose columns
# (scaled numerics plus one indicator column per category) do not line up with
# df.columns. Labels therefore have to come from the fitted preprocessor;
# indexing df.columns with these positions gives wrong names or an IndexError.
importances = rf.feature_importances_
feature_names = preprocessor.get_feature_names_out()
indices = np.argsort(importances)[-5:]  # ascending order: last entry is the most important

# Report the top 5, most important first. Importances are per transformed
# column, so a one-hot encoded feature appears once per category.
for rank, i in enumerate(indices[::-1], start=1):
    print(f"{rank}. {feature_names[i]}: {importances[i]:.4f}")

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importances")
plt.barh(range(len(indices)), importances[indices], align="center")
plt.yticks(range(len(indices)), [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()


Step 6: Hyperparameter Tuning


from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter; the search tries every combination
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive search scored by 5-fold cross-validated accuracy on the training
# set, using all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

# Winning combination
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")


Step 7: Report Best Hyperparameters


# Score the tuned forest on the same held-out rows as the baseline
best_rf = grid_search.best_estimator_
y_pred_best = best_rf.predict(X_test)

accuracy_best, precision_best, recall_best, f1_best = (
    scorer(y_test, y_pred_best)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

tuned_scores = {
    "Accuracy": accuracy_best,
    "Precision": precision_best,
    "Recall": recall_best,
    "F1 Score": f1_best,
}
default_scores = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1,
}

for metric_name, tuned_value in tuned_scores.items():
    print(f"Best Model {metric_name}: {tuned_value}")

# Side-by-side comparison with the baseline (default) model
for metric_name, tuned_value in tuned_scores.items():
    print(f"Default Model vs Best Model - {metric_name}: {default_scores[metric_name]} vs {tuned_value}")


Step 8: Interpret the Model


# Plot decision boundaries (using two most important features)
from matplotlib.colors import ListedColormap

def plot_decision_boundaries(clf, X, y):
    """Draw a classifier's predicted regions over a two-feature plane.

    Args:
        clf: Fitted estimator; its ``predict`` is called on two-column input.
        X: 2-D array whose columns 0 and 1 give the horizontal and vertical
            coordinates of the samples.
        y: Per-sample values used to colour the scatter points.
    """
    step = .02  # spacing between mesh points
    palette = ListedColormap(('orange', 'blue'))
    first, second = X[:, 0], X[:, 1]

    # Mesh covering the data range with a margin of 1 on every side
    grid_x = np.arange(first.min() - 1, first.max() + 1, step)
    grid_y = np.arange(second.min() - 1, second.max() + 1, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Classify every mesh point, then fold the labels back into the mesh shape
    mesh_points = np.column_stack((xx.ravel(), yy.ravel()))
    regions = clf.predict(mesh_points).reshape(xx.shape)

    # Filled regions first, samples on top
    plt.contourf(xx, yy, regions, alpha=0.8, cmap=palette)
    plt.scatter(first, second, c=y, edgecolors='k', marker='o', s=50, cmap=palette)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.title("Decision Boundary")
    plt.show()

# best_rf was fitted on every preprocessed column, so it cannot predict on a
# two-column mesh: scikit-learn raises a ValueError on the feature-count
# mismatch. Fit a separate forest with the same tuned hyperparameters on just
# the two most important columns and plot that model instead. Its boundaries
# illustrate how those two columns separate the classes; they are not the
# boundaries of the full model.
top_two = indices[-2:]  # argsort is ascending, so these hold the two largest importances
X_two_features = X[:, top_two]
rf_two_features = RandomForestClassifier(**best_rf.get_params())
rf_two_features.fit(X_train[:, top_two], y_train)
plot_decision_boundaries(rf_two_features, X_two_features, y)