# In [None]:
# Decision Tree & Random Forest with Evaluation and Visualization

# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# 2. Load dataset
# Wrap the loader's arrays in pandas containers: a DataFrame whose columns are
# the dataset's feature names, and a Series of class labels named "target".
data = load_iris()
X = pd.DataFrame(data=data.data, columns=data.feature_names)
y = pd.Series(data=data.target, name="target")

# 3. Train/Test split
# Hold out 30% of the rows for testing. The split is stratified on y and uses
# a fixed seed so that it is reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# 4. Train Decision Tree (no depth limit to observe overfitting)
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# 5. Visualize Decision Tree
# The label arguments are passed as plain lists of str: some scikit-learn
# releases validate feature_names/class_names strictly and reject array-likes
# (NOTE: data.target_names is expected to be a NumPy array for this loader --
# confirm against the installed scikit-learn version).
plt.figure(figsize=(12, 8))
plot_tree(
    dt,
    filled=True,
    feature_names=[str(name) for name in data.feature_names],
    class_names=[str(name) for name in data.target_names],
)
plt.title("Decision Tree Classifier")
plt.show()

# 6. Evaluate Decision Tree
# Predict on the held-out test split and report accuracy plus the per-class
# classification report.
y_pred_dt = dt.predict(X_test)
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))
# sep="" stops print() from inserting a space in front of the report text,
# which would push its header row one column out of line with the rows below.
print(
    "\nClassification Report (Decision Tree):\n",
    classification_report(y_test, y_pred_dt),
    sep="",
)

# 7. Analyze Overfitting: Control Tree Depth
# Fit one tree per max_depth value from 1 through 10 and record its score on
# both the training split and the test split.
depth_values = range(1, 11)
train_acc, test_acc = [], []

for depth in depth_values:
    dt_depth = DecisionTreeClassifier(random_state=42, max_depth=depth).fit(
        X_train, y_train
    )
    train_acc.append(dt_depth.score(X_train, y_train))
    test_acc.append(dt_depth.score(X_test, y_test))

# Plot training vs test accuracy
# Both curves share the depth axis; they are drawn in the order train, test.
plt.figure(figsize=(8, 5))
for scores, curve_label in (
    (train_acc, "Train Accuracy"),
    (test_acc, "Test Accuracy"),
):
    plt.plot(depth_values, scores, marker='o', label=curve_label)
plt.title("Decision Tree Depth vs Accuracy")
plt.xlabel("Tree Depth")
plt.ylabel("Accuracy")
plt.grid(True)
plt.legend()
plt.show()

# 8. Train Random Forest
# 100 trees, seeded so the fitted ensemble is reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# 9. Compare accuracy
# Score the forest on the same held-out test split used for the decision tree.
y_pred_rf = rf.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# sep="" stops print() from inserting a space in front of the report text,
# which would push its header row one column out of line with the rows below.
print(
    "\nClassification Report (Random Forest):\n",
    classification_report(y_test, y_pred_rf),
    sep="",
)

# 10. Interpret Feature Importances
# Label the forest's importance scores with the feature names, then draw them
# as a horizontal bar chart sorted in ascending order of importance value.
importances = pd.Series(data=rf.feature_importances_, index=data.feature_names)
ranked_importances = importances.sort_values()
ranked_importances.plot.barh(figsize=(8, 5))
plt.title("Random Forest Feature Importances")
plt.show()

# 11. Cross-validation evaluation
# Score both estimators with cv=5 over the full X, y (not just the training
# split) and report the mean of the per-fold scores.
cv_scores_dt = cross_val_score(estimator=dt, X=X, y=y, cv=5)
cv_scores_rf = cross_val_score(estimator=rf, X=X, y=y, cv=5)

print("Decision Tree CV Mean Accuracy:", cv_scores_dt.mean())
print("Random Forest CV Mean Accuracy:", cv_scores_rf.mean())
