In [None]:
# Data Loading

import pandas as pd

# Load the processed training set. The original note says it holds 24
# features; that count is not checked here -- TODO confirm against the
# preprocessing step.
TRAIN_PATH = "../data/train"
PROCESSED_TRAIN_FILE = f"{TRAIN_PATH}/housing_train_processed.csv"
TARGET_COLUMN = "median_house_value"

housing = pd.read_csv(PROCESSED_TRAIN_FILE)

# Split features/labels: X is every column except the target, y is the target.
X = housing.drop(columns=TARGET_COLUMN)
y = housing[TARGET_COLUMN]

# A cell only renders its final expression, so a bare `X.head()` followed by
# `y.head()` silently discards the feature preview. display() (available
# without import inside a Jupyter/IPython kernel) renders both.
display(X.head(), y.head())

In [None]:
# Model Fitting

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Build a decision tree with default hyperparameters (only the seed is fixed)
# and train it on the full feature matrix; fit() returns the estimator itself.
tree_reg = DecisionTreeRegressor(random_state=42).fit(X, y)

# Score the fitted tree on the same rows it was trained on, so this number is
# a training error, not an estimate of generalisation error.
y_pred_train = tree_reg.predict(X)
mse_train = mean_squared_error(y, y_pred_train)
rmse_train = np.sqrt(mse_train)
print(f"Training RMSE: {rmse_train:.2f}")

In [None]:
# Cross-Validation

from sklearn.model_selection import cross_val_score

# 10-fold cross-validation. The scorer returns negated MSE values, so the sign
# is flipped before taking the square root to obtain one RMSE per fold.
scores = cross_val_score(
    tree_reg,
    X,
    y,
    cv=10,
    scoring="neg_mean_squared_error",
)
rmse_scores = np.sqrt(np.negative(scores))

# Summarise the per-fold errors.
rmse_mean = np.mean(rmse_scores)
rmse_std = np.std(rmse_scores)

print("Cross-validation RMSE scores:", rmse_scores)
print("Mean:", rmse_mean)
print("Standard deviation:", rmse_std)

In [None]:
# Hyperparameter Tuning

from sklearn.model_selection import GridSearchCV

# Grid search over max_depth, min_samples_split and min_samples_leaf
# (4 x 3 x 3 = 36 candidate settings, each scored with 5-fold CV).
param_grid = {
    "max_depth": [None, 5, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42),
                           param_grid, scoring="neg_mean_squared_error", cv=5)
grid_search.fit(X, y)

print("Best parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Report the cross-validated error of the winning setting: this is the metric
# the search actually optimised. best_score_ is the mean negated MSE across
# folds, so this is sqrt(mean fold MSE), which can differ slightly from the
# mean of per-fold RMSEs.
rmse_cv_best = np.sqrt(-grid_search.best_score_)
print(f"Cross-validated RMSE with best parameters: {rmse_cv_best:.2f}")

# Evaluate best model on the training data. With GridSearchCV's default refit
# the best estimator has been refit on all of X, so this figure is optimistic
# and should only be read alongside the cross-validated RMSE above.
y_pred_best = best_model.predict(X)
rmse_best = np.sqrt(mean_squared_error(y, y_pred_best))
print(f"Training RMSE with best parameters: {rmse_best:.2f}")

In [None]:
# Model Saving

import joblib
import os

# Persist the estimator selected by the grid search to disk.
MODELS_PATH = "../models"
model_file = os.path.join(MODELS_PATH, "decision_tree_model.pkl")

# Create the output directory on first run; a no-op when it already exists.
os.makedirs(MODELS_PATH, exist_ok=True)

joblib.dump(value=best_model, filename=model_file)
print(f"Model saved to {model_file}")