In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from GenerateAttributes import GenerateAttributes
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Load the dataset
data = pd.read_csv('dataset_full.csv')
data.drop(data[data['ocean_proximity'].isin(['ISLAND'])].index, inplace=True)

# Split the data into features and target variable
X = data.drop('median_house_value', axis=1)
y = data['median_house_value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=X['ocean_proximity'])

# Generate features after splitting to prevent data leakage
X_train['income_to_value_ratio'] = X_train['median_income'] / (y_train + 1)  # +1 to avoid division by zero
X_test['income_to_value_ratio'] = X_test['median_income'] / (y_test + 1)

# Define feature categories
categories = ['ocean_proximity']
num_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
                'total_bedrooms', 'population', 'households', 'median_income','income_to_value_ratio']
features_to_generate = ['total_rooms', 'households', 'population', 'total_bedrooms','median_icome']

# Define transformers
numerical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline([
    ("onehot", OneHotEncoder(sparse_output=False))
])

# Define column transformer
ct1 = ColumnTransformer([
    ("num", numerical_transformer, num_features),
    ("cat", categorical_transformer, categories)
], remainder='passthrough')

ct1.set_output(transform='pandas')
ct1.verbose_feature_names_out = False

# Define preprocessing pipeline
predprocesor = Pipeline([
    ('ct1', ct1),
    ('generate', GenerateAttributes(columns=features_to_generate))
])

# Define the model with regularization parameters
model = RandomForestRegressor(
    n_estimators=100,         # Number of trees in the forest
    max_depth=20,             # Maximum depth of the tree
    min_samples_split=20,     # Minimum number of samples required to split an internal node
    min_samples_leaf=5,       # Minimum number of samples required to be at a leaf node
    random_state=42
)


# Define full pipeline
full_pipeline = make_pipeline(
    predprocesor,
    model
)

# Fit the model
full_pipeline.fit(X_train, y_train)

# Predict on training set
y_train_pred = full_pipeline.predict(X_train)

# Predict on testing set
y_test_pred = full_pipeline.predict(X_test)

# Calculate performance metrics
def print_metrics(y_true, y_pred, dataset_name):
    print(f"Metrics for {dataset_name} set:")
    print(f"R²: {r2_score(y_true, y_pred):.4f}")
    print(f"Mean Absolute Error: {mean_absolute_error(y_true, y_pred):.4f}")
    print(f"Mean Squared Error: {mean_squared_error(y_true, y_pred):.4f}")
    print(f"Root Mean Squared Error: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")
    print("")

print_metrics(y_train, y_train_pred, "Training")
print_metrics(y_test, y_test_pred, "Testing")

Metrics for Training set:
R²: 0.9978
Mean Absolute Error: 1661.2488
Mean Squared Error: 28682958.6529
Root Mean Squared Error: 5355.6474

Metrics for Testing set:
R²: 0.9972
Mean Absolute Error: 2020.3898
Mean Squared Error: 38302707.6267
Root Mean Squared Error: 6188.9181



- Underfitting vs Overfitting

Interpretation:
- R² (Coefficient of Determination): Closer to 1 indicates better model performance.
- Mean Absolute Error (MAE): Lower values indicate better model performance.
- Mean Squared Error (MSE): Lower values indicate better model performance.
- Root Mean Squared Error (RMSE): Lower values indicate better model performance.

By comparing these metrics for the training and testing sets, you can determine if the model is underfitting:

- If the errors (MAE, MSE, RMSE) are high on both the training and testing sets, and R² is low, the model is likely underfitting.
- If the training error is significantly lower than the testing error, it indicates overfitting.
- Ideally, you want both training and testing errors to be low and similar, indicating good generalization.