<a href="https://colab.research.google.com/github/nikhildhavale/pythonLearning/blob/main/decisiontreeclassificationandregression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Decision Tree Classification (Breast Cancer) and Regression (Diabetes)
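# Run in Google Colab / Jupyter: the script calls display(), which is not imported
# here and is supplied by the IPython/notebook environment.
# Requires scikit-learn, matplotlib, pandas, numpy.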

# 1. IMPORTS
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer, load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree, export_text
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay,
    mean_absolute_error, mean_squared_error, r2_score
)

# Session defaults: seed NumPy's global RNG and set the default figure size
# used by every plot that does not pass its own figsize.
np.random.seed(42)
plt.rcParams.update({"figure.figsize": (8, 5)})


# -----------------------------
# PART A — Classification Demo
# -----------------------------
print("\n--- Decision Tree Classifier on Breast Cancer dataset ---\n")

# 2. LOAD DATA
# The loader returns the feature matrix in .data and the class labels in .target.
bc = load_breast_cancer()
X_bc = bc.data
y_bc = bc.target
print("Breast cancer features shape:", X_bc.shape)
print("Target names:", bc.target_names)

# Optional DataFrame view: one column per feature plus the label as "target".
df_bc = pd.DataFrame(X_bc, columns=bc.feature_names).assign(target=y_bc)
class_counts = df_bc["target"].value_counts()
print("\nClass distribution (0=malignant, 1=benign):")
print(class_counts)

# 3. TRAIN-TEST SPLIT
# 20% of the rows are held out for testing; stratify=y_bc asks the splitter
# to preserve the class proportions in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y_bc}
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X_bc, y_bc, **split_options)
print("\nShapes after split:", Xb_train.shape, Xb_test.shape)

# 4. TRAIN DecisionTreeClassifier
# max_depth and min_samples_leaf limit how far the tree can grow; both are
# demo values and can be tuned.
dt_clf = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=42)
dt_clf.fit(Xb_train, yb_train)
print("Trained DecisionTreeClassifier.")

# 5. PREDICTIONS & EVAL
# Predict on both splits so train and test accuracy can be compared side by side.
y_train_pred, y_test_pred = map(dt_clf.predict, (Xb_train, Xb_test))

train_acc, test_acc = (
    accuracy_score(yb_train, y_train_pred),
    accuracy_score(yb_test, y_test_pred),
)
print(f"\nAccuracy -> Train: {train_acc:.4f}  Test: {test_acc:.4f}")

test_report = classification_report(yb_test, y_test_pred, target_names=bc.target_names)
print("\nClassification report (Test):")
print(test_report)

# Confusion matrix on the held-out split, drawn on its own small figure.
fig, ax = plt.subplots(figsize=(5, 4))
cm = confusion_matrix(yb_test, y_test_pred)
disp = ConfusionMatrixDisplay(cm, display_labels=bc.target_names)
disp.plot(ax=ax, cmap="Blues", values_format="d")
plt.title("Decision Tree — Confusion Matrix (Test)")
plt.show()

# 6. FEATURE IMPORTANCE (top-k)
# Single knob for how many features the table, the bar chart and the chart
# title report, so the three always agree.
topk = 10

# One row per feature, sorted from most to least important.
importances = dt_clf.feature_importances_
feat_df = pd.DataFrame({
    "feature": bc.feature_names,
    "importance": importances
}).sort_values("importance", ascending=False).reset_index(drop=True)

print("\nTop features by importance:")
# display() is not imported in this file; presumably provided by the notebook session.
display(feat_df.head(topk))

# Plot feature importances (top-k)
ax = feat_df.head(topk).plot.barh(x="feature", y="importance", legend=False)
ax.invert_yaxis()  # flip the axis so the first (most important) row is drawn at the top
plt.title(f"Top {topk} Feature Importances (Decision Tree Classifier)")
plt.xlabel("Importance")
plt.show()

# 7. SMALL TREE PLOT (limited depth to keep plots readable)
# Names are passed as plain lists, matching the export_text call below.
fig, ax = plt.subplots(figsize=(16,6))
plot_tree(
    dt_clf,
    feature_names=list(bc.feature_names),
    class_names=list(bc.target_names),
    filled=True,
    max_depth=5,
    ax=ax,
)
plt.title("Decision Tree (first 5 levels shown)")
plt.show()

# Optional: textual representation of rules, limited to a short preview.
tree_rules = export_text(dt_clf, feature_names=list(bc.feature_names))
preview_chars = 400
print(f"\nExtracted rules (text) — first {preview_chars} characters:\n")
# Slice to the announced length; the marker is appended only when text was cut.
print(tree_rules[:preview_chars] + ("\n... (truncated) ..." if len(tree_rules) > preview_chars else ""))


# -----------------------------
# PART B — Regression Demo
# -----------------------------
print("\n\n--- Decision Tree Regressor on Diabetes dataset ---\n")

# 1) LOAD DATA
# Features come from .data; the target is cast to float before use.
db = load_diabetes()
X_db = db.data
y_db = db.target.astype(float)
print("Diabetes features shape:", X_db.shape)
print("Feature names:", list(db.feature_names))

# Optional DataFrame: feature columns plus the regression target as "target".
df_db = pd.DataFrame(X_db, columns=db.feature_names).assign(target=y_db)
print("\nFirst five rows (diabetes):")
display(df_db.head(5))

# 2) TRAIN-TEST SPLIT
# 80/20 split with a fixed random_state so the partition is reproducible.
Xr_train, Xr_test, yr_train, yr_test = train_test_split(
    X_db,
    y_db,
    test_size=0.2,
    random_state=42,
)
print("\nShapes after split:", Xr_train.shape, Xr_test.shape)

# 3) TRAIN DecisionTreeRegressor
# Tree growth is bounded by max_depth and min_samples_leaf.
dt_reg = DecisionTreeRegressor(max_depth=10, min_samples_leaf=3, random_state=42)
dt_reg.fit(Xr_train, yr_train)
print("Trained DecisionTreeRegressor.")

# 4) PREDICT & EVALUATE
# Predictions for both splits feed the metric report and diagnostic plots.
y_pred_train, y_pred_test = map(dt_reg.predict, (Xr_train, Xr_test))

def regression_report(y_true, y_pred, label=""):
    """Print MAE, MSE, RMSE and R^2 for a set of predictions.

    Args:
        y_true: Observed target values, passed to the metric functions.
        y_pred: Predicted target values, passed to the metric functions.
        label: Text prepended to every printed line (e.g. "Train ").

    Returns:
        None. The four metrics are written to standard output, one per line.
    """
    abs_err = mean_absolute_error(y_true, y_pred)
    sq_err = mean_squared_error(y_true, y_pred)
    # Each row: padded metric name, value, and the format spec used to print it.
    rows = (
        ("MAE ", abs_err, ".3f"),
        ("MSE ", sq_err, ".3f"),
        ("RMSE", np.sqrt(sq_err), ".3f"),  # RMSE is the square root of the MSE
        ("R^2 ", r2_score(y_true, y_pred), ".4f"),
    )
    for metric_name, value, spec in rows:
        print(f"{label}{metric_name}: {value:{spec}}")

# Print the same metric set for the training split and then the test split.
for split_name, line_prefix, actual, predicted in (
    ("Train", "Train ", yr_train, y_pred_train),
    ("Test", "Test  ", yr_test, y_pred_test),
):
    print(f"\nRegression performance ({split_name}):")
    regression_report(actual, predicted, label=line_prefix)

# 5) DIAGNOSTIC PLOTS
# Predicted vs Actual (Test): points on the dashed y = x line are exact predictions.
pva_fig, pva_ax = plt.subplots()
pva_ax.scatter(yr_test, y_pred_test, alpha=0.7)
# The diagonal spans the combined range of actual and predicted values.
min_val = min(yr_test.min(), y_pred_test.min())
max_val = max(yr_test.max(), y_pred_test.max())
diagonal = [min_val, max_val]
pva_ax.plot(diagonal, diagonal, linestyle="--")
pva_ax.set_xlabel("Actual")
pva_ax.set_ylabel("Predicted")
pva_ax.set_title("Decision Tree Regressor — Predicted vs Actual (Test)")
pva_ax.grid(True)
plt.show()

# Residuals: actual minus predicted, plotted against the prediction with a
# dashed zero line for reference.
residuals = yr_test - y_pred_test
res_fig, res_ax = plt.subplots()
res_ax.scatter(y_pred_test, residuals, alpha=0.7)
res_ax.axhline(0.0, linestyle="--")
res_ax.set_xlabel("Predicted")
res_ax.set_ylabel("Residual (Actual - Predicted)")
res_ax.set_title("Decision Tree Regressor — Residuals (Test)")
res_ax.grid(True)
plt.show()

# 6) FEATURE IMPORTANCES
# Pair each feature name with its importance, then order by descending
# importance and renumber the rows.
imp_reg = pd.DataFrame(
    {"feature": db.feature_names, "importance": dt_reg.feature_importances_}
)
imp_reg = imp_reg.sort_values("importance", ascending=False)
imp_reg = imp_reg.reset_index(drop=True)

# The same ten-row slice is shown as a table and as a horizontal bar chart.
top_reg = imp_reg.head(10)
print("\nTop features by importance (regression):")
display(top_reg)

ax = top_reg.plot.barh(x="feature", y="importance", legend=False)
ax.invert_yaxis()  # put the first (largest) row at the top of the chart
plt.title("Top 10 Feature Importances (Decision Tree Regressor)")
plt.xlabel("Importance")
plt.show()

# 7) SMALL TREE PLOT (regressor)
# plot_depth limits only how much of the tree is drawn; it also feeds the
# title so the caption cannot drift from the depth actually plotted.
plot_depth = 3
fig, ax = plt.subplots(figsize=(24,12))
plot_tree(dt_reg, feature_names=list(db.feature_names), filled=True, max_depth=plot_depth, ax=ax)
plt.title(f"Decision Tree Regressor (first {plot_depth} levels shown)")
plt.show()

print("\nDone.")
