In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error

# Create fake data

In [None]:

# Seed NumPy's global RNG so every run draws the same synthetic houses
np.random.seed(42)

# Size of the synthetic dataset
n_houses = 50

# NOTE: the three random draws below (square footage, age, noise) must stay in
# this order; reordering them changes the values produced by the seeded RNG.

# Feature: square footage, uniform between 1000 and 4000 sq ft
square_footage = np.random.uniform(low=1000, high=4000, size=n_houses)

# Feature: age of the house, uniform between 0 and 50 years
age = np.random.uniform(low=0, high=50, size=n_houses)

# Gaussian noise (mean 0, standard deviation 50,000) added to each price
price_noise = np.random.normal(loc=0, scale=50000, size=n_houses)

# Target: base of 100,000 plus 150 per square foot, plus 2,000 for each year
# the house is younger than 50, plus the noise term
size_component = 100000 + (square_footage * 150)
newness_component = 2000 * (50 - age)
price = size_component + newness_component + price_noise

# Floor every price at 100,000 so a large negative noise draw cannot produce
# an implausibly low (or negative) value
price = np.clip(price, 100000, None)

# Assemble the features and the regression target into one table
df = pd.DataFrame({
    'square_footage': square_footage,
    'age': age,
    'price': price,
})

# Classification target: 1 = strictly above the median price, 0 = otherwise
median_price = df['price'].median()
df = df.assign(expensive=df['price'].gt(median_price).astype(int))

# Quick look at the generated data
print("Sample of data:")
print(df.head(10))
print(f"\nMedian price: ${median_price:,.0f}")
print(f"Price range: ${df['price'].min():,.0f} to ${df['price'].max():,.0f}")


# Classification Tree Version...

In [None]:

# ============================================================================
# CLASSIFICATION TREE
# ============================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "="*70, "CLASSIFICATION TREE: Is this house expensive?", "="*70, sep="\n")

# Features shared by both trees, and the binary target for this one
X = df.loc[:, ['square_footage', 'age']]
y_classify = df['expensive']

# Hold out 30% of the houses for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_classify, test_size=0.3, random_state=42
)

# Fit a shallow tree (depth 3) so the plotted diagram stays readable
clf = DecisionTreeClassifier(max_depth=3, random_state=42)
clf.fit(X_train, y_train)

# Accuracy on the data the tree was fit on vs. the held-out data
train_predictions = clf.predict(X_train)
test_predictions = clf.predict(X_test)
train_acc = accuracy_score(y_train, train_predictions)
test_acc = accuracy_score(y_test, test_predictions)

print(f"\nTraining accuracy: {train_acc:.2%}")
print(f"Testing accuracy: {test_acc:.2%}")

# Show the first five test houses next to their predicted label (Yes/No)
print("\nExample predictions on test data:")
test_preview = X_test.head(5)
preview_labels = pd.Series(clf.predict(test_preview), index=test_preview.index)
sample_predictions = test_preview.assign(
    predicted_expensive=preview_labels.map({1: 'Yes', 0: 'No'})
)
print(sample_predictions)

# Draw the fitted tree; class_names are listed in class order (0, then 1)
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    clf,
    feature_names=['Square Footage', 'Age'],
    class_names=['Not Expensive', 'Expensive'],
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title("Classification Tree: Predicting Expensive vs Not Expensive", fontsize=16)
fig.tight_layout()
plt.show()


# Regression Tree Version...

In [None]:

# ============================================================================
# REGRESSION TREE
# ============================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "="*70, "REGRESSION TREE: What will this house sell for?", "="*70, sep="\n")

# Same feature table X as the classification cell; the target is now the price
y_regress = df['price']

# Hold out 30% of the houses for testing (fixed seed for a repeatable split)
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(
    X, y_regress, test_size=0.3, random_state=42
)

# Depth 3 again, so the two trees are comparable in size
reg = DecisionTreeRegressor(max_depth=3, random_state=42)
reg.fit(X_train_r, y_train_r)

# RMSE (square root of MSE) is in dollars, which is easier to interpret than MSE
train_mse = mean_squared_error(y_train_r, reg.predict(X_train_r))
test_mse = mean_squared_error(y_test_r, reg.predict(X_test_r))
train_rmse = np.sqrt(train_mse)
test_rmse = np.sqrt(test_mse)

print(f"\nTraining RMSE: ${train_rmse:,.0f}")
print(f"Testing RMSE: ${test_rmse:,.0f}")

# Show the first five test houses next to their predicted price,
# formatted as a dollar string
print("\nExample predictions on test data:")
test_preview_r = X_test_r.head(5)
preview_prices = pd.Series(reg.predict(test_preview_r), index=test_preview_r.index)
sample_predictions_r = test_preview_r.assign(
    predicted_price=preview_prices.map(lambda x: f"${x:,.0f}")
)
print(sample_predictions_r)

# Draw the fitted tree
fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    reg,
    feature_names=['Square Footage', 'Age'],
    filled=True,
    rounded=True,
    ax=ax,
)
ax.set_title("Regression Tree: Predicting House Price", fontsize=16)
fig.tight_layout()
plt.show()


Let's compare the classification tree and the regression tree...

In [None]:

# ============================================================================
# COMPARISON
# ============================================================================

# Section banner: blank line, rule, title, rule
print("\n" + "="*70, "KEY DIFFERENCES", "="*70, sep="\n")

# Static text summary contrasting the two tree types; nothing is computed here
comparison_summary = """
CLASSIFICATION TREE:
- Outputs: Category labels ("Expensive" or "Not Expensive")
- Leaf values: Class labels (discrete)
- Quality metric: Gini impurity or entropy (measure of purity)
- Goal: Minimize misclassification

REGRESSION TREE:
- Outputs: Numbers (predicted price: $450,000)
- Leaf values: Average of training values (continuous)
- Quality metric: Variance reduction (how tight are the values?)
- Goal: Minimize prediction error (RMSE, MSE)

SAME STRUCTURE:
- Both split features recursively (e.g., "if square footage > 2,500")
- Both use binary left/right decisions
- Both make predictions by following the path to a leaf
"""
print(comparison_summary)