Trained coefficients: [ 0.20492331 -1.76041769  2.59150772 -1.72845641  1.44219696  0.22161989]
Trained intercept: -0.08854931784837666
MAE: 56812.04
RMSE: 77607.49
R² Score: 0.5637
Predicted median_house_value for custom input: 255036.20


In [44]:
# Gradient descent with enhanced features using scikit-learn Ridge regression
# Loading data from updated_ocean.csv only

import numpy as np
import time
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load data
with open('updated_ocean.csv', 'r') as file:
    lines = file.readlines()

x_data = []
y_data = []

for line in lines[1:]:
    parts = line.strip().split(',')
    if len(parts) < 10:
        continue
    try:
        longitude = float(parts[0])
        latitude = float(parts[1])
        age = float(parts[2])
        total_rooms = float(parts[3])
        total_bedrooms = float(parts[4])
        population = float(parts[5])
        households = float(parts[6])
        income = min(float(parts[7]), 15)
        value = float(parts[8])
        ocean_proximity = float(parts[9])

        if total_rooms == 0 or households == 0:
            continue

        rooms_per_person = total_rooms / population if population != 0 else 0
        bedrooms_per_room = total_bedrooms / total_rooms
        income_per_household = income / households
        rooms_per_household = total_rooms / households
        population_per_household = population / households

        features = [
            longitude,
            latitude,
            age,
            income,
            rooms_per_person,
            bedrooms_per_room,
            rooms_per_household,
            population_per_household,
            income_per_household,
            ocean_proximity
        ]

        x_data.append(features)
        y_data.append(value)
    except:
        continue

x_data = np.array(x_data)
y_data = np.array(y_data)

# Clip target outliers at 99th percentile
y_data = np.clip(y_data, a_min=None, a_max=np.percentile(y_data, 99))

# Normalize target (min-max scaling)
y_min = y_data.min()
y_max = y_data.max()
y_data_norm = (y_data - y_min) / (y_max - y_min)

# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data_norm, test_size=0.2, random_state=42)

# Standardize features (fit on train, transform both)
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Ridge regression (L2 regularization)
alpha = 0.01
model = Ridge(alpha=alpha, fit_intercept=True, max_iter=20000)

start_time = time.time()
model.fit(x_train_scaled, y_train)
fit_duration = time.time() - start_time
print("Fitting time (seconds):", fit_duration)




Fitting time (seconds): 0.007634878158569336


In [45]:
 #Predictions and evaluation
y_pred_norm = model.predict(x_test_scaled)

mae = mean_absolute_error(y_test, y_pred_norm)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_norm))
r2 = r2_score(y_test, y_pred_norm)

# Predict custom input (remember to scale features)
custom_input = np.array([
    -122.23,
    37.88,
    41,
    8.3252,
    88 / 322,
    129 / 880,
    880 / 126,
    322 / 126,
    8.3252 / 123,
    259212
])

custom_input_scaled = scaler.transform(custom_input.reshape(1, -1))
predicted_normalized = model.predict(custom_input_scaled)
predicted_normalized = np.clip(predicted_normalized, 0, 1)
predicted_value = predicted_normalized * (y_max - y_min) + y_min

print("Predicted median_house_value:", predicted_value.item())
print("MAE:", mae * (y_max - y_min))
print("RMSE:", rmse * (y_max - y_min))
print("R² Score:", r2)
print("Trained coefficients:", model.coef_)
print("Intercept:", model.intercept_)

Predicted median_house_value: 326340.94596507214
MAE: 48512.84090043857
RMSE: 67730.10644496528
R² Score: 0.6499285799152235
Trained coefficients: [-0.11897212 -0.12450676  0.02193537  0.16825466  0.09429322  0.02834084
 -0.07015727 -0.00289767 -0.00560579  0.02975974]
Intercept: 0.39627814676614104
