In [63]:
# Cell 1: Import libraries
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import matplotlib.pyplot as plt

%matplotlib inline


In [64]:
# Cell 2: Load dataset
# Read the housing CSV, then separate the "MedHouseVal" target column (y)
# from the remaining columns (X).
data = pd.read_csv("../data/cleaned_california_housing.csv")
y = data["MedHouseVal"]
X = data.drop("MedHouseVal", axis=1)

# Quick sanity check on the dimensions of both pieces.
print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)


Features (X) shape: (19794, 8)
Target (y) shape: (19794,)


In [65]:
# Cell 3: Split data
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (15835, 8)
Testing set size: (3959, 8)


In [66]:
# Cell 4: Linear Regression
# Fit a linear regression on the training split and predict the test split.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)

# Score the test-set predictions with MSE, MAE and R², in that order.
mse_linear, mae_linear, r2_linear = (
    metric(y_test, y_pred_linear)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(
    "Linear Regression Performance:",
    f"MSE: {mse_linear:.4f}, MAE: {mae_linear:.4f}, R²: {r2_linear:.4f}",
    sep="\n",
)


Linear Regression Performance:
MSE: 0.4731, MAE: 0.4988, R²: 0.6162


In [67]:
# Cell 5: Random Forest Regressor with feature importance extraction
# Grid-search a small random forest (3-fold CV, selected by negative MSE),
# score the best estimator on the held-out test split, and save its feature
# importances to CSV.
rf_model = RandomForestRegressor(random_state=42)
param_grid = {"n_estimators": [50, 100], "max_depth": [None, 10]}
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

best_rf_model = grid_search.best_estimator_
y_pred_rf = best_rf_model.predict(X_test)

# Extracting feature importances
# Label them with the columns of the frame the model was actually fit on, so
# names and importances cannot drift apart if X is redefined in another cell.
feature_importances = pd.DataFrame({
    "Feature": X_train.columns,
    "Importance": best_rf_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Save feature importances
# to_csv does not create missing directories; make sure the target folder
# exists so a fresh checkout does not fail after the grid search has run.
feature_importances_path = Path("../results/feature_importances/rf_feature_importances.csv")
feature_importances_path.parent.mkdir(parents=True, exist_ok=True)
feature_importances.to_csv(feature_importances_path, index=False)

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Performance:")
print(f"MSE: {mse_rf:.4f}, MAE: {mae_rf:.4f}, R²: {r2_rf:.4f}")


Fitting 3 folds for each of 4 candidates, totalling 12 fits
Random Forest Performance:
MSE: 0.2579, MAE: 0.3272, R²: 0.7908


In [68]:
# Cell 6: Save regression results
# Write the per-row test-set predictions of both models and a summary table
# of their metrics as CSV files under ../results/regression.
regression_dir = Path("../results/regression")
# to_csv does not create missing directories, so ensure the folder exists
# before writing either file.
regression_dir.mkdir(parents=True, exist_ok=True)

regression_results = pd.DataFrame({
    "Actual": y_test,
    "Linear_Predictions": y_pred_linear,
    "RF_Predictions": y_pred_rf
})
regression_results.to_csv(regression_dir / "regression_predictions.csv", index=False)

metrics = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest"],
    "MSE": [mse_linear, mse_rf],
    "MAE": [mae_linear, mae_rf],
    "R²": [r2_linear, r2_rf]
})
metrics.to_csv(regression_dir / "regression_metrics.csv", index=False)

print("Regression results, metrics, and feature importances saved.")


Regression results, metrics, and feature importances saved.
