In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [28]:
california_houses = pd.read_csv("datasets/California_Houses.csv")
california_houses.head(10) #display 1st 10 rows

Unnamed: 0,Median_House_Value,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
0,452600.0,8.3252,41,880,129,322,126,37.88,-122.23,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767
1,358500.0,8.3014,21,7099,1106,2401,1138,37.86,-122.22,10225.733072,554279.850069,733236.88436,65049.908574,20880.6004
2,352100.0,7.2574,52,1467,190,496,177,37.85,-122.24,8259.085109,554610.717069,733525.682937,64867.289833,18811.48745
3,341300.0,5.6431,52,1274,235,558,219,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
4,342200.0,3.8462,52,1627,280,565,259,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
5,269700.0,4.0368,52,919,213,413,193,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
6,299200.0,3.6591,52,2535,489,1094,514,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972
7,241400.0,3.12,52,3104,687,1157,647,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972
8,226700.0,2.0804,42,2555,665,1206,595,37.84,-122.26,6363.948013,554949.749519,733820.824419,64751.463863,16744.406584
9,261100.0,3.6912,52,3549,707,1551,714,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972


In [29]:
# Separate features (X) and target (T)
X = california_houses.drop(columns=['Median_House_Value'])
T = california_houses['Median_House_Value']
X.head(10)
# T.head(10)

Unnamed: 0,Median_Income,Median_Age,Tot_Rooms,Tot_Bedrooms,Population,Households,Latitude,Longitude,Distance_to_coast,Distance_to_LA,Distance_to_SanDiego,Distance_to_SanJose,Distance_to_SanFrancisco
0,8.3252,41,880,129,322,126,37.88,-122.23,9263.040773,556529.158342,735501.806984,67432.517001,21250.213767
1,8.3014,21,7099,1106,2401,1138,37.86,-122.22,10225.733072,554279.850069,733236.88436,65049.908574,20880.6004
2,7.2574,52,1467,190,496,177,37.85,-122.24,8259.085109,554610.717069,733525.682937,64867.289833,18811.48745
3,5.6431,52,1274,235,558,219,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
4,3.8462,52,1627,280,565,259,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
5,4.0368,52,919,213,413,193,37.85,-122.25,7768.086571,555194.266086,734095.290744,65287.138412,18031.047568
6,3.6591,52,2535,489,1094,514,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972
7,3.12,52,3104,687,1157,647,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972
8,2.0804,42,2555,665,1206,595,37.84,-122.26,6363.948013,554949.749519,733820.824419,64751.463863,16744.406584
9,3.6912,52,3549,707,1551,714,37.84,-122.25,6843.020847,554364.406936,733249.808578,64315.991742,17538.209972


In [30]:
# Shuffle the data as a whole not x alone and T alone
shuffled_data = pd.concat([X, T], axis=1).sample(frac=1, random_state=42).reset_index(drop=True)



In [31]:
# Separate shuffled X and T
X_shuffled = shuffled_data.drop(columns=['Median_House_Value'])
T_shuffled = shuffled_data['Median_House_Value']

In [32]:
#Splitting data 70% train
total_rows = X_shuffled.shape[0]
train_end = int(total_rows * 0.7)
validation_end = int(total_rows * 0.85)

#assign data portions to train validation and test
X_train = X_shuffled.iloc[:train_end]
T_train = T_shuffled.iloc[:train_end]

X_validation = X_shuffled.iloc[train_end:validation_end]
T_validation = T_shuffled.iloc[train_end:validation_end]

X_test = X_shuffled.iloc[validation_end:]
T_test = T_shuffled.iloc[validation_end:]



In [None]:
#apply direct sol

# The design matrix X_Design_Matrix includes a column of ones for the bias term 
X_Design_Matrix = np.c_[np.ones((len(X_train), 1)), X_train.values]

# Convert T_train to a numpy arr
T_Column_Vector = T_train.values.reshape(-1, 1)

# 3. Apply Normal Equation: w* = (X_T * X)^-1 * X_T * T
# Calculate the optimal weight vector w* using the Normal Equation
# Note: np.linalg.pinv is safer for inversion in real-world data than np.linalg.inv
W_Star_Column_Vector = np.linalg.pinv(X_Design_Matrix.T @ X_Design_Matrix) @ X_Design_Matrix.T @ T_Column_Vector

# Create a DataFrame to display the coefficients
feature_names = ['Intercept'] + list(X_train.columns)
coefficients_df = pd.DataFrame(W_Star_Column_Vector, index=feature_names, columns=['Coefficient Value'])

print("\n--- Learned Regression Coefficients (Normal Equation) ---")
print(coefficients_df)

# Prediction on the training set
X_train_b = np.c_[np.ones((len(X_train), 1)), X_train.values]
T_train_predict = X_train_b @ W_Star_Column_Vector 
# print("\nFirst 5 Predicted House Values on Training Set:", T_train_predict[:5].flatten())


--- Learned Regression Coefficients (Normal Equation) ---
                          Coefficient Value
Intercept                       -286.853401
Median_Income                  38717.982131
Median_Age                       850.470609
Tot_Rooms                         -4.548680
Tot_Bedrooms                      85.658054
Population                       -45.339914
Households                        73.571900
Latitude                      -57687.638954
Longitude                     -16526.166308
Distance_to_coast                 -0.275192
Distance_to_LA                    -0.166111
Distance_to_SanDiego               0.413439
Distance_to_SanJose                0.152918
Distance_to_SanFrancisco          -0.144267

First 5 Predicted House Values on Training Set: [ 33690.62046449 173752.09012459 336524.47366343 280298.96294782
 258079.42989222]
