# Heart Disease Prediction using Random Forest
This notebook trains a Random Forest classifier to predict heart disease. It covers exploratory visualization, feature scaling, cross-validated model training, evaluation on a held-out test set, and saving the fitted scaler and model for deployment.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the heart-disease records into a DataFrame.
# Point the path below at your own copy of the CSV if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer='heart.csv')
# Trailing expression: the notebook renders the first five rows as a preview.
df.head(n=5)

In [None]:
# Check class balance
# Count the labels once and align the result to [0, 1] (0 = Healthy,
# 1 = Disease) so a label that is absent from the data counts as 0 instead
# of raising KeyError on lookup.
class_counts = df['target'].value_counts().reindex([0, 1], fill_value=0)
healthy_count = int(class_counts.loc[0])
disease_count = int(class_counts.loc[1])

# Ratio of disease rows to healthy rows; undefined (NaN) when there are no
# healthy rows to divide by.
balance_ratio = disease_count / healthy_count if healthy_count else float('nan')
print(f"Class balance (Disease : Healthy) = {balance_ratio:.2f}:1")

In [None]:
# Visualizations
# 2x2 grid: three per-class feature histograms plus the overall class counts.
fig, axes = plt.subplots(2, 2, figsize=(14,8))

# Split the rows by label once (0 = Healthy, 1 = Disease) instead of
# re-filtering the frame for every panel.
healthy_rows = df[df['target'] == 0]
disease_rows = df[df['target'] == 1]

# (axes, feature column, panel title) for each histogram panel.
hist_panels = [
    (axes[0,0], 'age', 'Age Distribution'),
    (axes[0,1], 'chol', 'Cholesterol'),
    (axes[1,0], 'thalach', 'Max Heart Rate'),
]
for panel_ax, feature, panel_title in hist_panels:
    panel_ax.hist([healthy_rows[feature], disease_rows[feature]],
                  label=['Healthy','Disease'], bins=15, color=['green','red'])
    panel_ax.set_title(panel_title)
    panel_ax.legend()

# Target distribution
# value_counts() orders its result by frequency, so the first bar is whichever
# class is larger. The tick labels and colours below are fixed as
# Healthy-then-Disease, so force the counts into label order (0, 1) to keep
# bars, colours and labels aligned; a missing class is drawn as a zero bar.
target_counts = df['target'].value_counts().reindex([0, 1], fill_value=0)
target_counts.plot(kind='bar', ax=axes[1,1], color=['green','red'])
axes[1,1].set_title('Target Distribution')
axes[1,1].set_xticklabels(['Healthy','Disease'], rotation=0)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of every column (target included), drawn as an
# annotated heatmap.
corr_matrix = df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
)
plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

In [None]:
# Separate the label column from the predictor columns
y = df['target']
X = df.drop(columns=['target'])
print(X.shape, y.shape)

# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for reproducibility.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts
print(f"Training samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

In [None]:
# Standardize features
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so the same transform can be reapplied later
joblib.dump(scaler, 'heart_scaler.pkl')

In [None]:
# Random Forest configuration: 300 trees capped at depth 6, a fixed seed,
# and class weights balanced against label frequency.
rf_params = dict(
    n_estimators=300,
    max_depth=6,
    random_state=42,
    class_weight='balanced',
)
rf_model = RandomForestClassifier(**rf_params)

# 5-fold cross-validated accuracy on the scaled training split
cv_score = cross_val_score(
    estimator=rf_model,
    X=X_train_scaled,
    y=y_train,
    scoring='accuracy',
    cv=5,
)
print(f"CV Accuracy: {cv_score.mean():.4f} +/- {cv_score.std():.4f}")

In [None]:
# Fit the forest on the scaled training split
rf_model.fit(X_train_scaled, y_train)

# Predict labels for the training split and then the test split
y_train_pred, y_test_pred = (
    rf_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

# Report accuracy on both splits
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Per-class precision / recall / F1 on the test split
report_text = classification_report(
    y_test,
    y_test_pred,
    target_names=['Healthy','Disease'],
)
print(report_text)

In [None]:
# Confusion matrix of test-split predictions, drawn as an annotated heatmap
cm = confusion_matrix(y_test, y_test_pred)
class_labels = ['Healthy','Disease']

plt.figure(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Random Forest Confusion Matrix', fontsize=14, fontweight='bold')
# Rows are the true labels, columns are the model's predictions
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

In [None]:
# Bundle the fitted forest with the training column names, then write the
# bundle to disk.
model_data = dict(
    model=rf_model,
    feature_names=X.columns.tolist(),
)
joblib.dump(model_data, 'heart_disease_rf_model.pkl')