In [19]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

In [21]:
# Load the Breast Cancer dataset object, then unpack its feature matrix
# and target vector into X and y in one step.
data = load_breast_cancer()
X, y = data.data, data.target

In [23]:
# Announce which dataset this notebook works with
print("Dataset Name:", "Breast Cancer Wisconsin (Diagnostic)")

# Hold out 20% of the samples as a test set. The split is stratified on y
# and uses a fixed seed so it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

Dataset Name: Breast Cancer Wisconsin (Diagnostic)


In [25]:
# Reference model: a random forest with only the seed pinned; every other
# hyperparameter is left at the library default.
baseline_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Score the held-out split: overall accuracy first, then the per-class report.
y_pred_baseline = baseline_clf.predict(X_test)
baseline_acc = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)
print("\nBaseline Accuracy:", baseline_acc)
print(classification_report(y_true=y_test, y_pred=y_pred_baseline))


Baseline Accuracy: 0.956140350877193
              precision    recall  f1-score   support

           0       0.95      0.93      0.94        42
           1       0.96      0.97      0.97        72

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [27]:
# Variant: a forest of 300 trees (same seed, no other arguments passed)
clf_more_trees = RandomForestClassifier(random_state=42, n_estimators=300)
clf_more_trees.fit(X=X_train, y=y_train)

# Evaluate on the same held-out split
y_pred_more_trees = clf_more_trees.predict(X=X_test)
acc_more_trees = accuracy_score(y_true=y_test, y_pred=y_pred_more_trees)
print("\nAccuracy with more trees:", acc_more_trees)


Accuracy with more trees: 0.9473684210526315


In [28]:
# Variant: cap every tree at depth 3 (same seed, no other arguments passed)
clf_shallow = RandomForestClassifier(random_state=42, max_depth=3).fit(
    X_train, y_train
)

# Evaluate on the same held-out split
y_pred_shallow = clf_shallow.predict(X_test)
acc_shallow = accuracy_score(y_true=y_test, y_pred=y_pred_shallow)
print("\nAccuracy with shallow trees:", acc_shallow)


Accuracy with shallow trees: 0.956140350877193


In [31]:
# Variant: min_samples_split raised to 10 (same seed, no other arguments passed)
clf_more_split = RandomForestClassifier(
    random_state=42,
    min_samples_split=10,
)
clf_more_split.fit(X=X_train, y=y_train)

# Evaluate on the same held-out split
y_pred_more_split = clf_more_split.predict(X=X_test)
acc_more_split = accuracy_score(y_true=y_test, y_pred=y_pred_more_split)
print("\nAccuracy with higher min_samples_split:", acc_more_split)


Accuracy with higher min_samples_split: 0.956140350877193


In [33]:

# Collect each experiment's test accuracy into one table, one row per model,
# so the four configurations can be read side by side.
results_df = pd.DataFrame(
    [
        ("Baseline", baseline_acc),
        ("More Trees (300)", acc_more_trees),
        ("Shallow Depth (3)", acc_shallow),
        ("Higher min_samples_split (10)", acc_more_split),
    ],
    columns=["Model", "Accuracy"],
)

print("\nComparison of Model Accuracies:")
print(results_df)


Comparison of Model Accuracies:
                           Model  Accuracy
0                       Baseline  0.956140
1               More Trees (300)  0.947368
2              Shallow Depth (3)  0.956140
3  Higher min_samples_split (10)  0.956140
