In [1]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import root_mean_squared_error

In [2]:
path = '../data/raw/'
df = pd.read_csv(os.path.join(path, 'housing.csv'))
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


---

### Data Splitting

In [3]:
X = df.drop('median_house_value', axis=1)
y = df['median_house_value']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2 ,random_state=42)

In [5]:
X_train.head(3)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,NEAR OCEAN
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,NEAR OCEAN
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,NEAR OCEAN


---

### Preprocessing and Data Preparation

In [6]:
X_test.isna().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
dtype: int64

In [7]:
imputer = SimpleImputer(strategy='median')
X_train['total_bedrooms'] = imputer.fit_transform(X_train[['total_bedrooms']])
X_test['total_bedrooms'] = imputer.transform(X_test[['total_bedrooms']])

In [8]:
X_test.isna().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
ocean_proximity       0
dtype: int64

In [9]:
encoder = OneHotEncoder(handle_unknown='ignore' , sparse_output=False)

encoded_train = encoder.fit_transform(X_train[['ocean_proximity']])
encoded_test = encoder.transform(X_test[['ocean_proximity']])

encoded_cols = encoder.get_feature_names_out()
print(encoded_cols)

['ocean_proximity_<1H OCEAN' 'ocean_proximity_INLAND'
 'ocean_proximity_ISLAND' 'ocean_proximity_NEAR BAY'
 'ocean_proximity_NEAR OCEAN']


In [10]:
encoded_train_df = pd.DataFrame(encoded_train, columns=encoded_cols, index=X_train.index)
encoded_test_df = pd.DataFrame(encoded_test, columns=encoded_cols, index=X_test.index)

In [11]:
# Drop original ocean_proximity and add encoded columns
X_train = X_train.drop("ocean_proximity", axis=1).join(encoded_train_df)
X_test = X_test.drop("ocean_proximity", axis=1).join(encoded_test_df)

print("Final shapes:")
print("X_train_encoded:", X_train.shape)
print("X_test_encoded:", X_test.shape)

print(encoded_train)

Final shapes:
X_train_encoded: (16512, 13)
X_test_encoded: (4128, 13)
[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 ...
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]]


In [12]:
X_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
14196,-117.03,32.71,33.0,3126.0,627.0,2300.0,623.0,3.2596,0.0,0.0,0.0,0.0,1.0
8267,-118.16,33.77,49.0,3382.0,787.0,1314.0,756.0,3.8125,0.0,0.0,0.0,0.0,1.0
17445,-120.48,34.66,4.0,1897.0,331.0,915.0,336.0,4.1563,0.0,0.0,0.0,0.0,1.0
14265,-117.11,32.69,36.0,1421.0,367.0,1418.0,355.0,1.9425,0.0,0.0,0.0,0.0,1.0
2271,-119.8,36.78,43.0,2382.0,431.0,874.0,380.0,3.5542,0.0,1.0,0.0,0.0,0.0


---

### Model Training and Evaluation

In [13]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [14]:
dt = DecisionTreeRegressor(
    random_state=42
)

dt.fit(X_train, y_train)

0,1,2
,"criterion  criterion: {""squared_error"", ""friedman_mse"", ""absolute_error"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in the half mean Poisson deviance to find splits. .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 0.24  Poisson deviance criterion.",'squared_error'
,"splitter  splitter: {""best"", ""random""}, default=""best"" The strategy used to choose the split at each node. Supported strategies are ""best"" to choose the best split and ""random"" to choose the best random split.",'best'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. For an example of how ``max_depth`` influences the model, see :ref:`sphx_glr_auto_examples_tree_plot_tree_regression.py`.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: int, float or {""sqrt"", ""log2""}, default=None The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None, then `max_features=n_features`. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",
,"random_state  random_state: int, RandomState instance or None, default=None Controls the randomness of the estimator. The features are always randomly permuted at each split, even if ``splitter`` is set to ``""best""``. When ``max_features < n_features``, the algorithm will select ``max_features`` at random at each split before finding the best split among them. But the best found split may vary across different runs, even if ``max_features=n_features``. That is the case, if the improvement of the criterion is identical for several splits and one split has to be selected at random. To obtain a deterministic behaviour during fitting, ``random_state`` has to be fixed to an integer. See :term:`Glossary ` for details.",42
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow a tree with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0


In [15]:
y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

dt_train_rmse = root_mean_squared_error(
    y_train, y_train_pred
)

dt_test_rmse = root_mean_squared_error(
    y_test, y_test_pred
)

print("Decision Tree Train RMSE:", dt_train_rmse)
print("Decision Tree Test RMSE :", dt_test_rmse)

Decision Tree Train RMSE: 0.0
Decision Tree Test RMSE : 69136.03126855678


In [16]:
dt_train_r2 = dt.score(X_train, y_train)
dt_test_r2 = dt.score(X_test, y_test)

print("Decision Tree Train R2:", dt_train_r2)
print("Decision Tree Test R2 :", dt_test_r2)

Decision Tree Train R2: 1.0
Decision Tree Test R2 : 0.6352443474568316


#### Save the Unconstrained Decision Tree Metrics

In [17]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/decision_tree/metrics.json")

metrics = {}

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Base structure
metrics.setdefault("model_family", "decision_tree")
metrics.setdefault("models", {})

# Unconstrained Decision Tree
metrics["models"]["decision_tree_unconstrained"] = {
    "train": {
        "rmse": dt_train_rmse,
        "r2": dt_train_r2
    },
    "test": {
        "rmse": dt_test_rmse,
        "r2": dt_test_r2
    }
}

# Save
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)


Loaded existing metrics from ../artifacts/decision_tree/metrics.json


### Add Constraints to the Decision Tree

In [18]:
dt_limited = DecisionTreeRegressor(
    max_depth=8,
    min_samples_leaf=20,
    random_state=42
)

dt_limited.fit(X_train, y_train)

y_test_pred = dt_limited.predict(X_test)
y_train_pred = dt_limited.predict(X_train)

dtl_train_rmse = root_mean_squared_error(y_test, y_test_pred)
dtl_test_rmse = root_mean_squared_error(y_train, y_train_pred)

dt1_train_r2 = dt_limited.score(X_train, y_train)
dt1_test_r2 = dt_limited.score(X_test, y_test)

print("Limited Tree Test RMSE:", dtl_train_rmse)
print("Limited Train Test RMSE:", dtl_test_rmse)

print("Limited Tree Train R2:", dt1_train_r2)
print("Limited Tree Test R2 :", dt1_test_r2)

Limited Tree Test RMSE: 62340.244165104
Limited Train Test RMSE: 58730.94597264978
Limited Tree Train R2: 0.7419673671801547
Limited Tree Test R2 : 0.7034280055237578


In [19]:
### Save the model
import joblib

model_path = Path("../artifacts/decision_tree/decision_tree_model.joblib")
joblib.dump(dt_limited, model_path)
print(f"Saved Decision Tree model to {model_path}")

Saved Decision Tree model to ../artifacts/decision_tree/decision_tree_model.joblib


#### Save the Constrained Decision Tree Metrics

In [20]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/decision_tree/metrics.json")

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Add constrained tree
metrics["models"]["decision_tree_constrained"] = {
    "params": {
        "max_depth": 8,
        "min_samples_leaf": 20
    },
    "train": {
        "rmse": dtl_train_rmse,
        "r2": dt1_train_r2
    },
    "test": {
        "rmse": dtl_test_rmse,
        "r2": dt1_test_r2
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)


Loaded existing metrics from ../artifacts/decision_tree/metrics.json


----

## Random Forest Model

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

rf = RandomForestRegressor(
    n_estimators=200,      # number of trees
    random_state=42,
    n_jobs=-1              # use all cores
)

rf.fit(X_train, y_train)

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",200
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",1.0
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [34]:
sum(tree.tree_.node_count for tree in rf.estimators_)

4010144

In [22]:
y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

rf_train_rmse = root_mean_squared_error(
    y_train, y_train_pred,
)

rf_test_rmse = root_mean_squared_error(
    y_test, y_test_pred
)

print("Random Forest Train RMSE:", rf_train_rmse)
print("Random Forest Test RMSE :", rf_test_rmse)

Random Forest Train RMSE: 17877.84765250773
Random Forest Test RMSE : 48823.452434549225


In [23]:
rf_train_r2 = r2_score(y_train, y_train_pred)
rf_test_r2 = r2_score(y_test, y_test_pred)

print("Random Forest Train R²:", rf_train_r2)
print("Random Forest Test R² :", rf_test_r2)

Random Forest Train R²: 0.9760904663266143
Random Forest Test R² : 0.818092706383268


In [24]:
### Saving the model
import joblib
model_path = Path("../artifacts/random_forest/rf_model.joblib")
joblib.dump(rf, model_path)
print(f"Saved Random Forest model to {model_path}")

Saved Random Forest model to ../artifacts/random_forest/rf_model.joblib


In [25]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/random_forest/metrics.json")

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Base structure
metrics.setdefault("model_family", "random_forest")
metrics.setdefault("models", {})


# Add constrained tree
metrics["models"]["random_forest_constrained"] = {
    "params": {
        "n_estimators": 200,
        "random_state": 42,
        "n_jobs": -1
    },
    "train": {
        "rmse": rf_train_rmse,
        "r2": rf_train_r2
    },
    "test": {
        "rmse": rf_test_rmse,
        "r2": rf_test_r2
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)


Loaded existing metrics from ../artifacts/random_forest/metrics.json


### Tunning Hyperparameters

In [26]:
rf_tuned = RandomForestRegressor(
    n_estimators=300,
    max_depth=20,
    min_samples_leaf=5,
    max_features="sqrt",
    random_state=42,
    n_jobs=-1
)

rf_tuned.fit(X_train, y_train)

y_test_pred = rf_tuned.predict(X_test)
y_train_pred = rf_tuned.predict(X_train)

rf_tuned_test_rmse = root_mean_squared_error(y_test, y_test_pred)
rf_tuned_train_rmse = root_mean_squared_error(y_train, y_train_pred)

rf_tuned_train_r2 = r2_score(y_train, y_train_pred)
rf_tuned_test_r2 = r2_score(y_test, y_test_pred)

print("Tuned RF Test RMSE:", rf_tuned_test_rmse)
print("Tuned RF Train RMSE:", rf_tuned_train_rmse)

print("\nTuned Test RF R²:", rf_tuned_test_r2)
print("Tuned RF Train R²:", rf_tuned_train_r2)

Tuned RF Test RMSE: 51756.54763352121
Tuned RF Train RMSE: 38908.311701515304

Tuned Test RF R²: 0.7955798329055327
Tuned RF Train R²: 0.8867533367830849


In [27]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/random_forest/metrics.json")

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Add constrained tree
metrics["models"]["random_forest_tuned"] = {
    "params": {
        "n_estimators": 300,
        "max_depth": 20,
        "min_samples_leaf": 5,
        "max_features": "sqrt",
        "random_state": 42,
        "n_jobs": -1
    },
    "train": {
        "rmse": rf_tuned_train_rmse,
        "r2": rf_tuned_train_r2
    },
    "test": {
        "rmse": rf_tuned_test_rmse,
        "r2": rf_tuned_test_r2
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)


Loaded existing metrics from ../artifacts/random_forest/metrics.json


---

## Random Forest Validation & Baseline Locking

In [28]:
## Performing CV - To check whether the ~50k RMSE is stable or just lucky/unlucky split
from sklearn.model_selection import cross_val_score

In [29]:
# Use the baseline Random Forest (the one you selected)
rf_cv = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

In [30]:
# 5-fold cross-validation using RMSE
cv_scores = cross_val_score(
    rf_cv,
    X_train,
    y_train,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

In [31]:
# Convert to positive RMSE
cv_rmse_scores = -cv_scores

print("CV RMSE scores:", cv_rmse_scores)
print("Mean CV RMSE :", cv_rmse_scores.mean())
print("Std CV RMSE  :", cv_rmse_scores.std())

CV RMSE scores: [49007.74235718 49072.69656354 49148.01942817 48300.83519482
 50324.89957053]
Mean CV RMSE : 49170.838622848365
Std CV RMSE  : 651.9993262385468


#### Save the CV Results

In [32]:
import json
from pathlib import Path

metrics_path = Path("../artifacts/random_forest/metrics.json")

# Load existing OR create new
if metrics_path.exists():
    with open(metrics_path, "r") as f:
        metrics = json.load(f)
    print(f"Loaded existing metrics from {metrics_path}")
else:
    metrics = {}
    print(f"Created new metrics file at {metrics_path}")

# Add constrained tree
metrics["models"]["random_forest_constrained"]["cross_validation"] = {
    "cross_validation": {
        "folds": 5,
        "rmse_scores": [
            cv_rmse_scores.tolist()
        ],
        "rmse_mean": cv_rmse_scores.mean(),
        "rmse_std": cv_rmse_scores.std()
    }
}

# Save back
with open(metrics_path, "w") as f:
    json.dump(metrics, f, indent=4)


Loaded existing metrics from ../artifacts/random_forest/metrics.json


---