In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [21]:
# =========================================================
# I. DATA LOADING AND PREPARATION
# =========================================================
# Attempt to load the file, handling potential path/index issues from previous runs
from sklearn.discriminant_analysis import StandardScaler


try:
    # Assuming the structure from the last successful read (Median_House_Value as a column)
    california_houses = pd.read_csv("datasets/California_Houses.csv")
except FileNotFoundError:
    california_houses = pd.read_csv("California_Houses.csv")

# Separate features (X) and target (T)
X = california_houses.drop(columns=['Median_House_Value'])
T = california_houses['Median_House_Value']


# Shuffle the data to ensure randomness
shuffled_data = pd.concat([X, T], axis=1).sample(frac=1, random_state=42).reset_index(drop=True)
X_shuffled = shuffled_data.drop(columns=['Median_House_Value'])
T_shuffled = shuffled_data['Median_House_Value']

# Define the split points (70% train, 15% validation, 15% test)
total_rows = X_shuffled.shape[0]
train_end = int(total_rows * 0.7)
validation_end = int(total_rows * 0.85)

# Assign the training data portion (0% to 70%)
X_train_raw = X_shuffled.iloc[:train_end]
T_train = T_shuffled.iloc[:train_end]

# Assign the validation data portion (70% to 85%)
X_validation_raw = X_shuffled.iloc[train_end:validation_end]
T_validation = T_shuffled.iloc[train_end:validation_end]

# Assign the test data portion (85% to 100%)
X_test_raw = X_shuffled.iloc[validation_end:]
T_test = T_shuffled.iloc[validation_end:]


# --- Feature Scaling (Crucial for Gradient Descent) ---

# Initialize the scaler
scaler = StandardScaler()

# Fit the scaler ONLY on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train_raw)

# Use the SAME fitted scaler to transform the validation and test data
X_validation_scaled = scaler.transform(X_validation_raw)
X_test_scaled = scaler.transform(X_test_raw)


# --- Add Bias Term ---

# Add bias term (column of ones) to all three scaled sets
X_train_b_scaled = np.c_[np.ones((len(X_train_scaled), 1)), X_train_scaled]
X_validation_b_scaled = np.c_[np.ones((len(X_validation_scaled), 1)), X_validation_scaled]
X_test_b_scaled = np.c_[np.ones((len(X_test_scaled), 1)), X_test_scaled]


# --- Reshape Target Variables ---

# Reshape target variables into column vectors
T_train_col = T_train.values.reshape(-1, 1)
T_validation_col = T_validation.values.reshape(-1, 1)
T_test_col = T_test.values.reshape(-1, 1)



In [47]:
# =========================================================
# II. DIRECT SOLUTION (NORMAL EQUATION)
# =========================================================

print("="*60)
print("             1. DIRECT SOLUTION (NORMAL EQUATION)")
print("="*60)

# --- 1. Train (Using UN-SCALED features for the Normal Equation) ---
# The Normal Equation is not sensitive to feature scaling.
X_train_b_raw = np.c_[np.ones((len(X_train_raw), 1)), X_train_raw.values]

# W* = (X_T * X)^-1 * X_T * T
W_normal = np.linalg.pinv(X_train_scaled.T @ X_train_scaled) @ X_train_scaled.T @ T_train_col

# --- 2. Predict on Test Set ---
X_test_b_raw = np.c_[np.ones((len(X_test_raw), 1)), X_test_raw.values]
T_validation_predict_normal = X_validation_scaled @ W_normal
T_test_predict_normal = X_train_scaled @ W_normal

             1. DIRECT SOLUTION (NORMAL EQUATION)


In [48]:
# =========================================================
# III. GRADIENT DESCENT (BATCH GD)
# =========================================================

print("\n" + "="*60)
print("             2. GRADIENT DESCENT (BATCH GD)")
print("="*60)

def gradient_descent(X, y, learning_rate=0.01, n_iterations=2000):
    m = len(y)
    # Initialize theta (coefficients) to zeros
    theta = np.zeros((X.shape[1], 1))

    for _ in range(n_iterations):
        # Calculate Error
        error = (X @ theta) - y
        
        # Calculate Gradient (Vectorized Partial Derivative)
        gradients = (1/m) * X.T @ error

        # Update theta
        theta = theta - learning_rate * gradients

    return theta

# --- 1. Train (Using SCALED features) ---
learning_rate = 0.01
n_iterations = 2000
W_gd = gradient_descent(X_train_b_scaled, T_train_col, learning_rate, n_iterations)

# --- 2. Predict on Test Set ---
# T_validation_predict_gd = X_validation_b_scaled @ W_gd
T_test_predict_gd = X_test_b_scaled @ W_gd



             2. GRADIENT DESCENT (BATCH GD)


In [50]:
# =========================================================
# IV. COMPARISON AND EVALUATION
# =========================================================

def compare_metrics(T_true, T_pred_normal, T_pred_gd, model_type):
    """Calculates and compares metrics for both models on a given set."""
    mse_normal = mean_squared_error(T_true, T_pred_normal)
    rmse_normal = np.sqrt(mse_normal)
    r2_normal = r2_score(T_true, T_pred_normal)
    
    mse_gd = mean_squared_error(T_true, T_pred_gd)
    rmse_gd = np.sqrt(mse_gd)
    r2_gd = r2_score(T_true, T_pred_gd)
    
    metrics_df = pd.DataFrame({
        'Metric': ['RMSE', 'R-squared ($R^2$)'],
        'Normal Equation': [f'{rmse_normal:,.2f}', f'{r2_normal:.4f}'],
        'Gradient Descent': [f'{rmse_gd:,.2f}', f'{r2_gd:.4f}']
    })
    
    print(f"\n--- Model Performance Comparison on the {model_type} ---")
    # print(metrics_df.to_markdown(index=False))


# --- 1. Compare Coefficients ---
feature_names = ['Intercept'] + list(X_train_raw.columns)
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Normal Eq. (Raw Scale)': W_normal.flatten(),
    'GD (Scaled)': W_gd.flatten()
})

print("\n" + "="*60)
print("           COEFFICIENT COMPARISON")
print("="*60)
print("Note: Normal Eq. weights apply to raw features, GD weights to scaled features.")
# print(coefficients_df.to_markdown(floatfmt=".4f"))


# --- 2. Compare Metrics on Test Set ---
print("\n" + "="*60)
print("         TEST SET METRICS COMPARISON")
print("="*60)
com = compare_metrics(T_test_col, T_test_predict_normal, T_test_predict_gd, "Test Set")
coefficients_df

ValueError: All arrays must be of the same length