# Decision Tree Regressor
* A Decision Tree Regressor is not a linear model, so it does not share the assumptions of OLS. This makes decision trees more versatile across different types of data and lets them model complex, non-linear relationships between the features and the target. The downside is that decision trees are prone to overfitting if they are not properly pruned or regularized; use techniques such as limiting tree depth and pruning to improve the model's generalizability.

In [1]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Fetch the California housing dataset object
california_housing = fetch_california_housing()

# Pull out the feature matrix (X) and the regression target (y) in one step
X, y = california_housing.data, california_housing.target

In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Create a decision tree regressor (fixed seed so repeated runs give the same tree)
model = DecisionTreeRegressor(random_state=42)

# Define hyperparameters for tuning.
# 6 * 6 * 4 = 144 candidate combinations; each one constrains tree growth
# through its maximum depth or the minimum number of samples per node.
param_grid = {
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10, 15, 20, 25],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Use GridSearchCV for hyperparameter tuning: 5-fold cross-validation per candidate.
# The scorer is the *negated* MSE because GridSearchCV always maximizes its score.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit GridSearchCV on the training data only, so the test set stays unseen during tuning
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model.
# With the default refit=True, GridSearchCV has already refit a clone of `model`
# (same random_state) with the best parameters on the full training set, so
# best_estimator_ is the model we want -- building and fitting a second
# DecisionTreeRegressor by hand would only repeat that work.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")

# Evaluate the best regressor on the test data
y_pred = best_model.predict(X_test)

# For a regressor, .score() returns R-squared; a large gap between the training
# and testing values indicates the tree is still overfitting.
print(f"Training R-Squared: {best_model.score(X_train, y_train)}")
print(f"Testing R-Squared: {best_model.score(X_test, y_test)}")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

Best Parameters: {'max_depth': None, 'min_samples_leaf': 8, 'min_samples_split': 25}
Training R-Squared: 0.8529686232485982
Testing R-Squared: 0.7229773320846045
Mean Squared Error: 0.3630129912125816
