In [45]:
# Add comments before import
# Gradient descent with enhanced features, regularization, and standardization

import numpy as np

# Step 1: Load and parse data
with open('updated_housing.csv', 'r') as file:
    lines = file.readlines()

x_data = []
y_data = []

for i, line in enumerate(lines[1:], start=2):
    parts = line.strip().split(',')
    if len(parts) < 10:
        continue
    try:
        longitude = float(parts[0])
        latitude = float(parts[1])
        age = float(parts[2])
        total_rooms = float(parts[3])
        total_bedrooms = float(parts[4])
        population = float(parts[5])
        households = float(parts[6])
        income = min(float(parts[7]), 15)  # clip high income
        value = float(parts[8])

        if total_rooms == 0 or households == 0:
            continue

        rooms_per_person = total_rooms / population if population != 0 else 0
        bedrooms_per_room = total_bedrooms / total_rooms
        income_per_household = income / households
        rooms_per_household = total_rooms / households
        population_per_household = population / households

        features = [
            longitude,
            latitude,
            age,
            income,
            rooms_per_person,
            bedrooms_per_room,
            rooms_per_household,
            population_per_household,
            income_per_household
        ]

        x_data.append(features)
        y_data.append(value)
    except:
        continue

x_data = np.array(x_data)
y_data = np.array(y_data)

# Step 2: Clip target outliers
y_data = np.clip(y_data, a_min=None, a_max=np.percentile(y_data, 99))

# Step 3: Standardize features
x_mean = x_data.mean(axis=0)
x_std = x_data.std(axis=0)
x_data = (x_data - x_mean) / x_std
x_data = np.nan_to_num(x_data)

# Add bias term
x_data = np.hstack([np.ones((x_data.shape[0], 1)), x_data])

# Normalize target (min-max)
y_min = y_data.min()
y_max = y_data.max()
y_data = (y_data - y_min) / (y_max - y_min)
y_data = y_data.reshape(-1, 1)

# Step 4: Train-test split
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

# Step 5: Gradient Descent with L2 regularization
weights = np.random.randn(x_train.shape[1], 1) * 0.01
alpha = 0.001
epochs = 20000
lambda_reg = 0.1
n = x_train.shape[0]

for epoch in range(epochs):
    predictions = x_train @ weights
    errors = predictions - y_train
    gradient = (x_train.T @ errors) / n + lambda_reg * weights
    gradient[0] -= lambda_reg * weights[0]  # don't regularize bias
    weights -= alpha * gradient

# Step 6: Evaluation
def evaluate(x, y_true, weights):
    preds = x @ weights
    errors = y_true - preds
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    y_mean = np.mean(y_true)
    total_var = np.sum((y_true - y_mean)**2)
    residual_var = np.sum(errors**2)
    r2 = 1 - residual_var / total_var
    return mae, rmse, r2

mae, rmse, r2 = evaluate(x_test, y_test, weights)

# Step 7: Predict custom input
custom_input = np.array([
    -122.23,     # longitude
    37.88,       # latitude
    41,          # housing_median_age
    8.3252,      # median_income
    880 / 322,   # rooms_per_person
    129 / 880,   # bedrooms_per_room
    880 / 126,   # rooms_per_household
    322 / 126,   # population_per_household
    8.3252 / 126 # income_per_household
])

custom_input = (custom_input - x_mean) / x_std
custom_input = np.nan_to_num(custom_input)
custom_input = np.insert(custom_input, 0, 1.0)
predicted_normalized = custom_input @ weights
predicted_normalized = np.clip(predicted_normalized, 0, 1)
predicted_value = predicted_normalized * (y_max - y_min) + y_min

# Step 8: Output
print("Trained weights:", weights.flatten())
print("Predicted median_house_value:", predicted_value.item())
print("MAE:", mae * (y_max - y_min))
print("RMSE:", rmse * (y_max - y_min))
print("R² Score:", r2)

Trained weights: [ 0.38798288 -0.05588284 -0.07095369  0.03397746  0.16080368  0.06126588
  0.02889495 -0.04479205 -0.00894681 -0.00349121]
Predicted median_house_value: 391313.3049810674
MAE: 53550.71750967181
RMSE: 73574.56164320438
R² Score: 0.628508772566651


Trained weights: [ 0.38815284 -0.05900047 -0.0741089   0.03359259  0.1602105   0.06207382
  0.02843193 -0.04538673 -0.00861879 -0.00343335]
Predicted median_house_value: 391274.4483565571
MAE: 53279.0088844788
RMSE: 73242.57419108084
R² Score: 0.6318537378356985


In [25]:
0.6318537378356985

0.6318537378356985

In [2]:
# Gradient descent with enhanced features, regularization, standardization, and ocean_proximity one-hot encoding

import numpy as np

# Step 1: Load and parse data
with open('updated_housing.csv', 'r') as file:
    lines = file.readlines()

x_data = []
y_data = []

# Collect unique ocean_proximity categories (strings of numbers)
categories = set()
for line in lines[1:]:
    parts = line.strip().split(',')
    if len(parts) < 10:
        continue
    categories.add(parts[9])

categories = sorted(categories)
cat_to_index = {cat: i for i, cat in enumerate(categories)}

for line in lines[1:]:
    parts = line.strip().split(',')
    if len(parts) < 10:
        continue
    try:
        longitude = float(parts[0])
        latitude = float(parts[1])
        age = float(parts[2])
        total_rooms = float(parts[3])
        total_bedrooms = float(parts[4])
        population = float(parts[5])
        households = float(parts[6])
        income = min(float(parts[7]), 15)  # clip high income
        value = float(parts[8])
        ocean_proximity = parts[9]

        if total_rooms == 0 or households == 0:
            continue

        rooms_per_person = total_rooms / population if population != 0 else 0
        bedrooms_per_room = total_bedrooms / total_rooms
        income_per_household = income / households
        rooms_per_household = total_rooms / households
        population_per_household = population / households

        features = [
            longitude,
            latitude,
            age,
            income,
            rooms_per_person,
            bedrooms_per_room,
            rooms_per_household,
            population_per_household,
            income_per_household
        ]

        # One-hot encode ocean_proximity
        one_hot = [0] * len(categories)
        one_hot[cat_to_index[ocean_proximity]] = 1
        features.extend(one_hot)

        x_data.append(features)
        y_data.append(value)
    except:
        continue

x_data = np.array(x_data)
y_data = np.array(y_data)

# Step 2: Clip target outliers
y_data = np.clip(y_data, a_min=None, a_max=np.percentile(y_data, 99))

# Step 3: Standardize features (except one-hot encoded ocean_proximity columns)
num_features = 9
num_categories = len(categories)
x_num = x_data[:, :num_features]
x_cat = x_data[:, num_features:]

x_mean = x_num.mean(axis=0)
x_std = x_num.std(axis=0)
x_num = (x_num - x_mean) / x_std
x_num = np.nan_to_num(x_num)

# Combine standardized numerical features and one-hot categorical features
x_data = np.hstack([x_num, x_cat])

# Add bias term
x_data = np.hstack([np.ones((x_data.shape[0], 1)), x_data])

# Normalize target (min-max)
y_min = y_data.min()
y_max = y_data.max()
y_data = (y_data - y_min) / (y_max - y_min)
y_data = y_data.reshape(-1, 1)

# Step 4: Train-test split
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

# Step 5: Gradient Descent with L2 regularization
weights = np.random.randn(x_train.shape[1], 1) * 0.01
alpha = 0.001
epochs = 20000
lambda_reg = 0.1
n = x_train.shape[0]

for epoch in range(epochs):
    predictions = x_train @ weights
    errors = predictions - y_train
    gradient = (x_train.T @ errors) / n + lambda_reg * weights
    gradient[0] -= lambda_reg * weights[0]  # don't regularize bias
    weights -= alpha * gradient

# Step 6: Evaluation
def evaluate(x, y_true, weights):
    preds = x @ weights
    errors = y_true - preds
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    y_mean = np.mean(y_true)
    total_var = np.sum((y_true - y_mean)**2)
    residual_var = np.sum(errors**2)
    r2 = 1 - residual_var / total_var
    return mae, rmse, r2

mae, rmse, r2 = evaluate(x_test, y_test, weights)

# Step 7: Predict custom input
# Custom input must have 9 numerical features + one-hot ocean_proximity vector
custom_input_num = np.array([
    -122.23,     # longitude
    37.88,       # latitude
    41,          # housing_median_age
    8.3252,      # median_income
    880 / 322,   # rooms_per_person
    129 / 880,   # bedrooms_per_room
    880 / 126,   # rooms_per_household
    322 / 126,   # population_per_household
    8.3252 / 126 # income_per_household
])

custom_input_num = (custom_input_num - x_mean) / x_std
custom_input_num = np.nan_to_num(custom_input_num)

# Example ocean_proximity value as string, e.g. '3'
custom_ocean_proximity = '3'
custom_one_hot = [0] * len(categories)
if custom_ocean_proximity in cat_to_index:
    custom_one_hot[cat_to_index[custom_ocean_proximity]] = 1

custom_input = np.hstack([custom_input_num, custom_one_hot])
custom_input = np.insert(custom_input, 0, 1.0)  # add bias

predicted_normalized = custom_input @ weights
predicted_normalized = np.clip(predicted_normalized, 0, 1)
predicted_value = predicted_normalized * (y_max - y_min) + y_min

# Step 8: Output
print("Trained weights:", weights.flatten())
print("Predicted median_house_value:", predicted_value.item())
print("MAE:", mae * (y_max - y_min))
print("RMSE:", rmse * (y_max - y_min))
print("R² Score:", r2)

Trained weights: [ 0.37458268 -0.03583697 -0.04272122  0.0260748   0.14837575  0.0608504
  0.02041791 -0.04154916 -0.00828297 -0.00274418  0.04383553 -0.04736629
  0.00038598  0.04131973  0.02975843]
Predicted median_house_value: 374143.0011870252
MAE: 50036.146528997655
RMSE: 70257.7502138597
R² Score: 0.6612481516373423


In [3]:
# Gradient descent with enhanced features, regularization, and standardization
# Using numeric ocean_proximity from updated_ocean.csv

import numpy as np

# Load housing data
with open('updated_housing.csv', 'r') as f_housing:
    housing_lines = f_housing.readlines()

# Load ocean_proximity numeric data
with open('updated_ocean.csv', 'r') as f_ocean:
    ocean_lines = f_ocean.readlines()

x_data = []
y_data = []

for i, line in enumerate(housing_lines[1:], start=1):  # align indexes with ocean file (skip header)
    parts = line.strip().split(',')
    ocean_parts = ocean_lines[i].strip().split(',')
    if len(parts) < 9 or len(ocean_parts) < 10:
        continue
    try:
        # Original features
        longitude = float(parts[0])
        latitude = float(parts[1])
        age = float(parts[2])
        total_rooms = float(parts[3])
        total_bedrooms = float(parts[4])
        population = float(parts[5])
        households = float(parts[6])
        income = min(float(parts[7]), 15)  # clip high income
        value = float(parts[8])

        # Skip invalid
        if total_rooms == 0 or households == 0:
            continue

        # Derived features
        rooms_per_person = total_rooms / population if population != 0 else 0
        bedrooms_per_room = total_bedrooms / total_rooms
        income_per_household = income / households
        rooms_per_household = total_rooms / households
        population_per_household = population / households

        # Numeric ocean_proximity from 10th column of updated_ocean.csv
        ocean_proximity = float(ocean_parts[9])

        features = [
            longitude,
            latitude,
            age,
            income,
            rooms_per_person,
            bedrooms_per_room,
            rooms_per_household,
            population_per_household,
            income_per_household,
            ocean_proximity  # new 10th feature
        ]

        x_data.append(features)
        y_data.append(value)
    except:
        continue

x_data = np.array(x_data)
y_data = np.array(y_data)

# Clip target outliers
y_data = np.clip(y_data, a_min=None, a_max=np.percentile(y_data, 99))

# Standardize features
x_mean = x_data.mean(axis=0)
x_std = x_data.std(axis=0)
x_data = (x_data - x_mean) / x_std
x_data = np.nan_to_num(x_data)

# Add bias term
x_data = np.hstack([np.ones((x_data.shape[0], 1)), x_data])

# Normalize target (min-max)
y_min = y_data.min()
y_max = y_data.max()
y_data = (y_data - y_min) / (y_max - y_min)
y_data = y_data.reshape(-1, 1)

# Train-test split
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

# Gradient Descent with L2 regularization
weights = np.random.randn(x_train.shape[1], 1) * 0.01
alpha = 0.001
epochs = 20000
lambda_reg = 0.1
n = x_train.shape[0]

for epoch in range(epochs):
    predictions = x_train @ weights
    errors = predictions - y_train
    gradient = (x_train.T @ errors) / n + lambda_reg * weights
    gradient[0] -= lambda_reg * weights[0]  # don't regularize bias
    weights -= alpha * gradient

# Evaluation function
def evaluate(x, y_true, weights):
    preds = x @ weights
    errors = y_true - preds
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    y_mean = np.mean(y_true)
    total_var = np.sum((y_true - y_mean)**2)
    residual_var = np.sum(errors**2)
    r2 = 1 - residual_var / total_var
    return mae, rmse, r2

mae, rmse, r2 = evaluate(x_test, y_test, weights)

# Predict custom input (include ocean_proximity at last)
custom_input = np.array([
    -122.23,     # longitude
    37.88,       # latitude
    41,          # housing_median_age
    8.3252,      # median_income
    880 / 322,   # rooms_per_person
    129 / 880,   # bedrooms_per_room
    880 / 126,   # rooms_per_household
    322 / 126,   # population_per_household
    8.3252 / 126, # income_per_household
    259212         # example ocean_proximity numeric (adjust accordingly)
])

custom_input = (custom_input - x_mean) / x_std
custom_input = np.nan_to_num(custom_input)
custom_input = np.insert(custom_input, 0, 1.0)
predicted_normalized = custom_input @ weights
predicted_normalized = np.clip(predicted_normalized, 0, 1)
predicted_value = predicted_normalized * (y_max - y_min) + y_min

print("Trained weights:", weights.flatten())
print("Predicted median_house_value:", predicted_value.item())
print("MAE:", mae * (y_max - y_min))
print("RMSE:", rmse * (y_max - y_min))
print("R² Score:", r2)

Trained weights: [ 0.38908313 -0.02495756 -0.03160015  0.02297935  0.14504154  0.05840965
  0.01804127 -0.03869517 -0.00800746 -0.00262673  0.05541898]
Predicted median_house_value: 401006.07756051864
MAE: 48581.512215039125
RMSE: 68656.660837491
R² Score: 0.6765117199229292


In [8]:
# Gradient descent with enhanced features, regularization, and standardization
# Loading all data from updated_ocean.csv only

import numpy as np

with open('updated_ocean.csv', 'r') as file:
    lines = file.readlines()

x_data = []
y_data = []

for i, line in enumerate(lines[1:], start=2):  # skip header
    parts = line.strip().split(',')
    if len(parts) < 10:
        continue
    try:
        longitude = float(parts[0])
        latitude = float(parts[1])
        age = float(parts[2])
        total_rooms = float(parts[3])
        total_bedrooms = float(parts[4])
        population = float(parts[5])
        households = float(parts[6])
        income = min(float(parts[7]), 15)  # clip high income
        value = float(parts[8])
        ocean_proximity = float(parts[9])  # numeric ocean proximity feature

        if total_rooms == 0 or households == 0:
            continue

        rooms_per_person = total_rooms / population if population != 0 else 0
        bedrooms_per_room = total_bedrooms / total_rooms
        income_per_household = income / households
        rooms_per_household = total_rooms / households
        population_per_household = population / households

        features = [
            longitude,
            latitude,
            age,
            income,
            rooms_per_person,
            bedrooms_per_room,
            rooms_per_household,
            population_per_household,
            income_per_household,
            ocean_proximity
        ]

        x_data.append(features)
        y_data.append(value)
    except:
        continue

x_data = np.array(x_data)
y_data = np.array(y_data)

# Clip target outliers
y_data = np.clip(y_data, a_min=None, a_max=np.percentile(y_data, 99))

# Standardize features
x_mean = x_data.mean(axis=0)
x_std = x_data.std(axis=0)
x_data = (x_data - x_mean) / x_std
x_data = np.nan_to_num(x_data)

# Add bias term
x_data = np.hstack([np.ones((x_data.shape[0], 1)), x_data])

# Normalize target (min-max)
y_min = y_data.min()
y_max = y_data.max()
y_data = (y_data - y_min) / (y_max - y_min)
y_data = y_data.reshape(-1, 1)

# Train-test split
split = int(0.8 * len(x_data))
x_train, x_test = x_data[:split], x_data[split:]
y_train, y_test = y_data[:split], y_data[split:]

# Gradient Descent with L2 regularization
weights = np.random.randn(x_train.shape[1], 1) * 0.01
#these value of alpha , epochs gaves result with time taken if we change 0.01->0.001 , 10k ->20k,30k
alpha = 0.01
epochs = 10000
lambda_reg = 0.1
n = x_train.shape[0]

for epoch in range(epochs):
    predictions = x_train @ weights
    errors = predictions - y_train
    gradient = (x_train.T @ errors) / n + lambda_reg * weights
    gradient[0] -= lambda_reg * weights[0]  # no regularization on bias
    weights -= alpha * gradient

def evaluate(x, y_true, weights):
    preds = x @ weights
    errors = y_true - preds
    mae = np.mean(np.abs(errors))
    rmse = np.sqrt(np.mean(errors**2))
    y_mean = np.mean(y_true)
    total_var = np.sum((y_true - y_mean)**2)
    residual_var = np.sum(errors**2)
    r2 = 1 - residual_var / total_var
    return mae, rmse, r2

mae, rmse, r2 = evaluate(x_test, y_test, weights)

# Predict custom input example (include ocean_proximity)
custom_input = np.array([
    -122.23,     # longitude
    37.88,       # latitude
    41,          # housing_median_age
    8.3252,      # median_income
    880 / 322,   # rooms_per_person
    129 / 880,   # bedrooms_per_room
    880 / 126,   # rooms_per_household
    322 / 126,   # population_per_household
    8.3252 / 126, # income_per_household
    259212         # ocean_proximity numeric (example)
])

custom_input = (custom_input - x_mean) / x_std
custom_input = np.nan_to_num(custom_input)
custom_input = np.insert(custom_input, 0, 1.0)  # bias term
predicted_normalized = custom_input @ weights
predicted_normalized = np.clip(predicted_normalized, 0, 1)
predicted_value = predicted_normalized * (y_max - y_min) + y_min

print("Trained weights:", weights.flatten())
print("Predicted median_house_value:", predicted_value.item())
print("MAE:", mae * (y_max - y_min))
print("RMSE:", rmse * (y_max - y_min))
print("R² Score:", r2)
#MAE: 48515.653783799724
#RMSE: 68558.2030025164
#R² Score: 0.6774388584676602

Trained weights: [ 0.38914547 -0.02669392 -0.03344972  0.02290499  0.14496131  0.05915814
  0.01791219 -0.03939036 -0.0077711  -0.00260295  0.05492466]
Predicted median_house_value: 400959.50446354994
MAE: 48515.653783799724
RMSE: 68558.2030025164
R² Score: 0.6774388584676602


Trained weights: [ 0.38888469 -0.01962883 -0.02584124  0.02302891  0.14387209  0.05349689
  0.01765066 -0.03373562 -0.00898169 -0.00265787  0.05775227]
Predicted median_house_value: 400339.2131064219
MAE: 48854.977193454615
RMSE: 69021.69236713304
R² Score: 0.6730627513501537


Trained weights: [ 0.38914547 -0.02669397 -0.03344977  0.02290499  0.1449613   0.05915814
  0.01791219 -0.03939036 -0.00777109 -0.00260295  0.05492465]
Predicted median_house_value: 400959.5027396754
MAE: 48515.65224212291
RMSE: 68558.20065714387
R² Score: 0.6774388805372594
