In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv("kc_house_data.csv")
df["price"] = df["price"] / 1000
X = df[["sqft_living"]]
y = df["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [3]:
def fit_closed_form(X_design, y):
    y = np.asarray(y)
    if y.ndim == 1:
        y = y.reshape(-1, 1)

    X_design = np.asarray(X_design)
    X_b = np.c_[np.ones((X_design.shape[0], 1)), X_design]
    theta = np.linalg.pinv(X_b) @ y
    return theta

In [4]:
def predict_closed_form(X_design, theta):
    """
    X_design: (n, d) WITHOUT intercept column
    theta: (d+1, 1) WITH intercept
    Returns y_pred as (n,)
    """
    X_b = np.c_[np.ones((X_design.shape[0], 1)), X_design]
    y_pred = X_b @ theta
    return y_pred.ravel()

In [5]:
def make_poly_features(x, p):
    """
    x: (n, 1) numpy array
    returns: (n, p) matrix [x^1, x^2, ..., x^p]
    """
    x = np.asarray(x).reshape(-1, 1)
    return np.hstack([x**k for k in range(1, p+1)])

In [6]:
results = []

for p in [1, 2, 3, 4, 5]:
    X_train_poly = make_poly_features(X_train.values, p)
    X_test_poly = make_poly_features(X_test.values, p)

    scaler = StandardScaler()
    X_train_poly_scaled = scaler.fit_transform(X_train_poly)
    X_test_poly_scaled = scaler.transform(X_test_poly)
    theta = fit_closed_form(X_train_poly_scaled, y_train.values)

    y_pred_train = predict_closed_form(X_train_poly_scaled, theta)
    y_pred_test = predict_closed_form(X_test_poly_scaled, theta)
    mse_train = mean_squared_error(y_train, y_pred_train)
    r2_train = r2_score(y_train, y_pred_train)
    mse_test = mean_squared_error(y_test, y_pred_test)
    r2_test = r2_score(y_test, y_pred_test)

    results.append([p, mse_train, r2_train, mse_test, r2_test])

In [7]:
results_df = pd.DataFrame(
    results, 
    columns=["degree p", "MSE_train", "R2_train", "MSE_test", "R2_test"]
)
results_df

Unnamed: 0,degree p,MSE_train,R2_train,MSE_test,R2_test
0,1,66319.347785,0.492384,76484.977062,0.494069
1,2,58871.855127,0.549388,82113.931184,0.456835
2,3,58862.529017,0.549459,83663.526745,0.446585
3,4,58818.304351,0.549798,88922.167903,0.4118
4,5,58798.9524,0.549946,88307.646878,0.415865
