In [6]:
# Fit a 20-tree random forest (seeded via random_state=42) on the scaled
# training data.
# NOTE(review): in the visible part of this notebook, X_train_scaled, y_train
# and the RandomForestClassifier import are only defined in the cell labelled
# In [8] below, so this cell appears to rely on that cell having been run
# first -- confirm it still works after Restart Kernel -> Run All.
model = RandomForestClassifier(
    n_estimators=20,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

In [7]:
# Rebind `model` to a much smaller forest (2 trees, same seed) and fit it on
# the scaled training data.
# NOTE(review): in the visible part of this notebook, X_train_scaled, y_train
# and the RandomForestClassifier import are only defined in the cell labelled
# In [8] below, so this cell appears to rely on that cell having been run
# first -- confirm it still works after Restart Kernel -> Run All.
model = RandomForestClassifier(
    n_estimators=2,
    random_state=42,
)
model.fit(X_train_scaled, y_train)

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

# Define file name
# NOTE(review): this is an absolute, machine-specific path; the cell fails on
# any other machine until it is pointed at a local copy of the CSV.
file_name = "/Users/marileonard/Desktop/DS Capstone/heart disease classification dataset (2).csv"

df = pd.read_csv(file_name)

# 1. Handle unnamed column (likely an index)
# Drop the first column if it's an 'Unnamed' index. The explicit copy ensures
# the column assignments below write to an independent frame, not a slice.
if str(df.columns[0]).startswith('Unnamed'):
    df = df.iloc[:, 1:].copy()

# 2. Data Cleaning and Preparation
# Convert the categorical target ('yes'/'no') to binary (1/0).
# Series.map turns every label that is not in the dict into NaN, so check the
# result right away: a NaN target would otherwise only surface later as an
# opaque error from the stratified split / model fit.
target_raw = df['target']
df['target'] = target_raw.map({'yes': 1, 'no': 0})
if df['target'].isna().any():
    bad_labels = target_raw[df['target'].isna()].unique().tolist()
    raise ValueError(
        f"Unexpected or missing values in 'target' (expected 'yes'/'no'): {bad_labels}"
    )

# Convert the categorical sex ('male'/'female') to binary (1/0)
# NOTE(review): any other value (including missing) becomes NaN here and is
# then filled with 0 by the imputer below, i.e. it ends up encoded like
# 'female' -- confirm this is acceptable for the dataset.
df['sex'] = df['sex'].map({'male': 1, 'female': 0})

# Replace any non-standard missing values (like empty strings) with NaN
df = df.replace('', np.nan)

# Convert all columns to numeric, coercing any non-convertible values (if any) to NaN
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Separate features (X) and target (y)
X = df.drop('target', axis=1)
y = df['target']

# 3. Handle Missing Values (Imputation)
# Missing feature values are replaced with the constant 0.
# NOTE(review): an earlier comment here described median imputation, but the
# code uses strategy='constant' with fill_value=0. The behaviour is left
# unchanged so the recorded metrics stay valid; zero is a questionable fill
# for features whose plausible values are far from 0, so consider
# strategy='median' (fitted on the training split only) if that was the intent.
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0)
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# 4. Split the data into training (80%) and testing (20%) sets
# stratify=y keeps the class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42, stratify=y)

# 5. Standardize the data (Scale features to have mean=0 and variance=1)
# Standardizing is crucial for many models, though less critical for Random Forest, it's a good practice.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Train the Random Forest Classifier
# Uses 5 trees (n_estimators=5) with a fixed seed for reproducibility.
rf_classifier = RandomForestClassifier(n_estimators=5, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

# 7. Evaluate the model on the held-out test split
y_pred = rf_classifier.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)

# Recall for the positive class (target == 1)
recall = recall_score(y_test, y_pred)

# Calculate F1 score
f1 = f1_score(y_test, y_pred)

print(f"Random Forest Classifier Accuracy on the Test Set: {accuracy:.4f}")
print(f"Recall Score: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Random Forest Classifier Accuracy on the Test Set: 0.7541
Recall Score: 0.8182
F1 Score: 0.7826


In [9]:
# Refit a 5-tree random forest (seed 42) on the scaled training split and
# report its accuracy on the scaled test split. These are the same
# hyperparameters as rf_classifier in the cell above.
new_model = RandomForestClassifier(
    n_estimators=5,
    random_state=42,
)
new_model.fit(X_train_scaled, y_train)

# Predict on the held-out data and score against the true labels.
y_pred_new = new_model.predict(X_test_scaled)
new_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_new)

print(f"New Accuracy with 5 Estimators: {new_accuracy:.4f}")

New Accuracy with 5 Estimators: 0.7541


In [10]:
# Refit with 10 trees (seed 42) on the scaled training split and report the
# accuracy on the scaled test split. Rebinds new_model, y_pred_new and
# new_accuracy.
new_model = RandomForestClassifier(
    n_estimators=10,
    random_state=42,
)
new_model.fit(X_train_scaled, y_train)

# Predict on the held-out data and score against the true labels.
y_pred_new = new_model.predict(X_test_scaled)
new_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_new)

print(f"New Accuracy with 10 Estimators: {new_accuracy:.4f}")

New Accuracy with 10 Estimators: 0.8033


In [11]:
# Refit with 100 trees (seed 42) on the scaled training split and report the
# accuracy on the scaled test split. Rebinds new_model, y_pred_new and
# new_accuracy.
new_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
new_model.fit(X_train_scaled, y_train)

# Predict on the held-out data and score against the true labels.
y_pred_new = new_model.predict(X_test_scaled)
new_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_new)

print(f"New Accuracy with 100 Estimators: {new_accuracy:.4f}")

New Accuracy with 100 Estimators: 0.8525
