In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# =========================================================
# I. DATA LOADING AND PREPARATION
# =========================================================
# Read the California housing CSV. The copy under datasets/ is tried first;
# if that file does not exist, the copy in the working directory is read.
PRIMARY_CSV_PATH = "datasets/California_Houses.csv"
FALLBACK_CSV_PATH = "California_Houses.csv"

try:
    california_houses = pd.read_csv(PRIMARY_CSV_PATH)
except FileNotFoundError:
    # Only the first lookup is guarded: if the fallback is missing as well,
    # its FileNotFoundError propagates and stops the cell.
    california_houses = pd.read_csv(FALLBACK_CSV_PATH)

# Split the frame into the target series T and the feature matrix X.
T = california_houses['Median_House_Value']
X = california_houses.drop(columns=['Median_House_Value'])


# Shuffle features and target as a single frame so every row keeps its own
# target value; the fixed random_state makes the permutation repeatable.
shuffled_data = (
    pd.concat([X, T], axis=1)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
T_shuffled = shuffled_data['Median_House_Value']
X_shuffled = shuffled_data.drop(columns=['Median_House_Value'])


# Row boundaries for the split: training rows stop at 70% of the data,
# validation rows stop at 85%, and the test set takes whatever is left.
total_rows = len(X_shuffled)
train_end, validation_end = int(total_rows * 0.7), int(total_rows * 0.85)

# Slice features and targets with the same boundaries so rows stay aligned.
X_train, T_train = X_shuffled.iloc[:train_end], T_shuffled.iloc[:train_end]
X_validation, T_validation = (
    X_shuffled.iloc[train_end:validation_end],
    T_shuffled.iloc[train_end:validation_end],
)
X_test, T_test = X_shuffled.iloc[validation_end:], T_shuffled.iloc[validation_end:]

# normalization
# Min-max scale every split using statistics from the training rows only.
# The statistics are captured BEFORE X_train is overwritten: once X_train has
# been scaled its own min/max are 0 and 1, so recomputing them afterwards
# would leave the validation and test features effectively unscaled.
train_min = X_train.min()
# A feature that is constant in the training rows has a range of 0; divide by
# 1 instead so that column becomes 0 rather than NaN/inf.
train_range = (X_train.max() - train_min).replace(0, 1)

X_train = (X_train - train_min) / train_range
X_validation = (X_validation - train_min) / train_range
X_test = (X_test - train_min) / train_range

# Design matrices: each split gets a leading column of ones (the bias term)
# placed in front of its feature columns.
train_ones = np.ones((X_train.shape[0], 1))
X_train_b = np.c_[train_ones, X_train]

validation_ones = np.ones((X_validation.shape[0], 1))
X_validation_b = np.c_[validation_ones, X_validation]

test_ones = np.ones((X_test.shape[0], 1))
X_test_b = np.c_[test_ones, X_test]

# Targets as (n, 1) column vectors, one per split, so they line up with the
# row count of the matching design matrix.
T_train_col = T_train.values[:, np.newaxis]
T_validation_col = T_validation.values[:, np.newaxis]
T_test_col = T_test.values[:, np.newaxis]



array([[1.00000000e+00, 4.58930000e+00, 3.90000000e+01, ...,
        7.32571538e+05, 7.00855737e+04, 3.72706993e+04],
       [1.00000000e+00, 3.98530000e+00, 2.40000000e+01, ...,
        7.69657977e+05, 1.56611953e+05, 1.33968764e+05],
       [1.00000000e+00, 2.21670000e+00, 1.80000000e+01, ...,
        4.55949596e+05, 2.16156065e+05, 2.84095244e+05],
       ...,
       [1.00000000e+00, 2.93440000e+00, 3.60000000e+01, ...,
        1.84842756e+05, 4.85477346e+05, 5.53467812e+05],
       [1.00000000e+00, 5.71920000e+00, 1.50000000e+01, ...,
        6.94699029e+05, 2.78245773e+04, 4.67503151e+04],
       [1.00000000e+00, 2.57550000e+00, 5.20000000e+01, ...,
        7.37236735e+05, 6.70261570e+04, 1.08674196e+03]])

In [76]:
# Split the frame into the target series T and the feature matrix X.
T = california_houses['Median_House_Value']
X = california_houses.drop(columns=['Median_House_Value'])

# Per-feature mean and standard deviation of the raw (unscaled) features.
# Nothing else in this cell reads them.
mean, std = X.mean(), X.std()

# Min-max scale every feature column into [0, 1].
# NOTE(review): the min/max here come from ALL rows, before the
# train/validation/test split further down, so held-out rows influence the
# scaling - confirm this is intended.
column_min = X.min()
column_range = X.max() - column_min
X = (X - column_min) / column_range

# Preview the first ten scaled feature rows and their targets.
display(X.head(10), T.head(10))

Unnamed: 0,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
0,0.539668,0.784314,0.022331,0.019863,0.008941,0.020556,0.567481,0.211155,0.027398,0.546362,0.61434,0.079961,0.023023
1,0.538027,0.392157,0.180503,0.171477,0.06721,0.186976,0.565356,0.212151,0.030283,0.544152,0.612446,0.077112,0.022614
2,0.466028,1.0,0.03726,0.02933,0.013818,0.028943,0.564293,0.210159,0.02439,0.544477,0.612688,0.076894,0.020323
3,0.354699,1.0,0.032352,0.036313,0.015555,0.035849,0.564293,0.209163,0.022918,0.54505,0.613164,0.077396,0.019459
4,0.230776,1.0,0.04133,0.043296,0.015752,0.042427,0.564293,0.209163,0.022918,0.54505,0.613164,0.077396,0.019459
5,0.243921,1.0,0.023323,0.032899,0.011491,0.031574,0.564293,0.209163,0.022918,0.54505,0.613164,0.077396,0.019459
6,0.217873,1.0,0.064423,0.075729,0.030578,0.084361,0.563231,0.209163,0.020146,0.544235,0.612457,0.076234,0.018913
7,0.180694,1.0,0.078895,0.106456,0.032344,0.106233,0.563231,0.209163,0.020146,0.544235,0.612457,0.076234,0.018913
8,0.108998,0.803922,0.064932,0.103042,0.033717,0.097681,0.563231,0.208167,0.01871,0.54481,0.612935,0.076755,0.018035
9,0.220087,1.0,0.090213,0.109559,0.043387,0.11725,0.563231,0.209163,0.020146,0.544235,0.612457,0.076234,0.018913


0    452600.0
1    358500.0
2    352100.0
3    341300.0
4    342200.0
5    269700.0
6    299200.0
7    241400.0
8    226700.0
9    261100.0
Name: Median_House_Value, dtype: float64

In [77]:
# Shuffle features and target together as one frame (not X and T separately)
# so every row keeps its own target; random_state fixes the permutation.
shuffled_data = (
    pd.concat([X, T], axis=1)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)



In [78]:
# Pull the shuffled frame apart again: target column first, then the
# remaining columns as the feature matrix.
T_shuffled = shuffled_data.loc[:, 'Median_House_Value']
X_shuffled = shuffled_data.drop(columns=['Median_House_Value'])

In [79]:
# Row boundaries for the split: training rows stop at 70% of the data,
# validation rows stop at 85%, and the test set takes whatever is left.
total_rows = len(X_shuffled)
train_end, validation_end = int(total_rows * 0.7), int(total_rows * 0.85)

# Slice features and targets with the same boundaries so rows stay aligned.
X_train, T_train = X_shuffled.iloc[:train_end], T_shuffled.iloc[:train_end]
X_validation, T_validation = (
    X_shuffled.iloc[train_end:validation_end],
    T_shuffled.iloc[train_end:validation_end],
)
X_test, T_test = X_shuffled.iloc[validation_end:], T_shuffled.iloc[validation_end:]



In [80]:
# Direct (closed-form) least-squares solution

# Design matrix: a leading column of ones (bias term) followed by the
# training features.
bias_column = np.ones((len(X_train), 1))
X_Design_Matrix = np.c_[bias_column, X_train.values]

# Training targets as an (n, 1) column vector.
T_Column_Vector = T_train.values.reshape(-1, 1)

# Normal Equation: w* = (X^T X)^-1 X^T T.
# The pseudo-inverse is used in place of a plain inverse so a result is still
# produced when X^T X is singular or badly conditioned.
gram_pseudo_inverse = np.linalg.pinv(X_Design_Matrix.T @ X_Design_Matrix)
W_Star_Column_Vector = gram_pseudo_inverse @ X_Design_Matrix.T @ T_Column_Vector

# Label each learned weight with its feature name; row 0 is the bias weight.
feature_names = ['Intercept', *X_train.columns]
coefficients_df = pd.DataFrame(
    W_Star_Column_Vector,
    index=feature_names,
    columns=['Coefficient Value'],
)

print("\n--- Learned Regression Coefficients (Normal Equation) ---")
print(coefficients_df)

# Fitted values on the training set, using the same bias-plus-features layout.
X_train_b = np.c_[np.ones((len(X_train), 1)), X_train.values]
T_train_predict = X_train_b @ W_Star_Column_Vector


--- Learned Regression Coefficients (Normal Equation) ---
                          Coefficient Value
Intercept                      2.751753e+05
Median_Income                  5.610514e+05
Median_Age                     4.350051e+04
Tot_Rooms                     -1.780353e+05
Tot_Bedrooms                   5.934739e+05
Population                    -1.604342e+06
Households                     3.983280e+05
Latitude                      -4.223777e+05
Longitude                     -2.894427e+05
Distance_to_coast             -7.705033e+04
Distance_to_LA                -1.522214e+05
Distance_to_SanDiego           2.827004e+05
Distance_to_SanJose            1.232811e+05
Distance_to_SanFrancisco      -1.114679e+05


In [81]:
# Same metrics as the notebook's import cell; repeated here so this cell
# resolves its own names.
from sklearn.metrics import mean_squared_error, r2_score

# Prepare validation data: prepend the bias column of ones so the layout
# matches the training design matrix the weights were fitted on.
X_validation_b = np.c_[np.ones((len(X_validation), 1)), X_validation.values]

# Do NOT rescale X_validation_b here. The features were already min-max scaled
# before the split, and the weights were fitted on that scaling. Rescaling the
# whole matrix (bias column included) by its own global min/max would feed the
# model inputs transformed differently from the ones it was trained on.

# Predict on validation set
T_validation_predict = X_validation_b @ W_Star_Column_Vector

# Evaluate: mean squared error and coefficient of determination on the
# held-out validation rows.
mse = mean_squared_error(T_validation, T_validation_predict)
r2 = r2_score(T_validation, T_validation_predict)

print(f"Validation MSE: {mse:.4f}")
print(f"Validation R²: {r2:.4f}")


Validation MSE: 4573978531.2095
Validation R²: 0.6506
