In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
# Read the housing CSV, discard the columns that are not used as features
# (row id, sale date, zip code) and rescale the target.
df = (
    pd.read_csv("kc_house_data.csv")
    .drop(columns=["id", "date", "zipcode"])
    .assign(price=lambda frame: frame["price"] / 1000)  # price divided by 1000
)

# Separate the regression target from the feature columns.
y = df["price"]
X = df.drop(columns=["price"])

In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The scaler is fitted on the training rows only and then applied unchanged
# to both splits, so no test-set statistics enter the transformation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [4]:
# Targets as (n, 1) column vectors.
y_train_np = y_train.to_numpy().reshape(-1, 1)
y_test_np = y_test.to_numpy().reshape(-1, 1)

# Design matrices: a leading column of ones followed by the scaled features,
# so the first coefficient acts as the intercept.
X_train_b = np.column_stack((np.ones((len(X_train_scaled), 1)), X_train_scaled))
X_test_b = np.column_stack((np.ones((len(X_test_scaled), 1)), X_test_scaled))

# Closed-form least-squares coefficients obtained from the pseudo-inverse of
# the training design matrix.
theta = np.linalg.pinv(X_train_b).dot(y_train_np)

In [5]:
def predict(X_scaled, theta):
    """Return linear-model predictions for a matrix of scaled features.

    A column of ones is prepended to ``X_scaled`` before multiplying by
    ``theta``, so the first entry of ``theta`` is applied to a constant 1
    in every row (i.e. it plays the role of the intercept).

    Parameters
    ----------
    X_scaled : array with a ``shape`` attribute
        Feature matrix with one row per sample; must not already contain
        the ones column.
    theta : array
        Coefficients, one more than the number of feature columns.

    Returns
    -------
    The matrix product of the augmented feature matrix and ``theta``.
    """
    ones_column = np.ones((X_scaled.shape[0], 1))
    design = np.column_stack((ones_column, X_scaled))
    return design @ theta

# Score the training and test features with the closed-form coefficients.
y_pred_train, y_pred_test = (
    predict(features, theta) for features in (X_train_scaled, X_test_scaled)
)

In [6]:
# Mean squared error and R^2 of the closed-form predictions, per split.
mse_train_cf, r2_train_cf = (
    metric(y_train_np, y_pred_train) for metric in (mean_squared_error, r2_score)
)
mse_test_cf, r2_test_cf = (
    metric(y_test_np, y_pred_test) for metric in (mean_squared_error, r2_score)
)

print("Closed-Form Model Results")
for label, value in (
    ("Train MSE:", mse_train_cf),
    ("Train R^2:", r2_train_cf),
    ("Test MSE:", mse_test_cf),
    ("Test R^2:", r2_test_cf),
):
    print(label, value)

Closed-Form Model Results
Train MSE: 39834.2534976257
Train R^2: 0.6951038946870625
Test MSE: 45998.56287706282
Test R^2: 0.6957298370207377


In [7]:
# Cross-check against scikit-learn's LinearRegression. The design matrices
# already contain a column of ones, so the estimator is told not to fit a
# separate intercept; its coefficients then line up one-to-one with theta.
lr = LinearRegression(fit_intercept=False).fit(X_train_b, y_train_np)

y_pred_train_sk, y_pred_test_sk = (
    lr.predict(design) for design in (X_train_b, X_test_b)
)

# Same metrics as for the closed-form model, per split.
mse_train_sk, r2_train_sk = (
    metric(y_train_np, y_pred_train_sk) for metric in (mean_squared_error, r2_score)
)
mse_test_sk, r2_test_sk = (
    metric(y_test_np, y_pred_test_sk) for metric in (mean_squared_error, r2_score)
)

print("Sklearn Model Results")
for label, value in (
    ("Train MSE:", mse_train_sk),
    ("Train R^2:", r2_train_sk),
    ("Test MSE:", mse_test_sk),
    ("Test R^2:", r2_test_sk),
):
    print(label, value)

Sklearn Model Results
Train MSE: 39834.25349762571
Train R^2: 0.6951038946870625
Test MSE: 45998.56287706282
Test R^2: 0.6957298370207377


In [8]:
# Largest absolute element-wise gap between the pseudo-inverse coefficients
# and the ones fitted by scikit-learn (both flattened to 1-D first).
print(
    "Maximum coefficient difference:",
    np.abs(np.ravel(theta) - np.ravel(lr.coef_)).max(),
)

Maximum coefficient difference: 7.958078640513122e-13
