In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

# Preprocessing

## Center data

- Set the mean of each feature of $X$ at $0$
- Set the mean of $y$ at $0$
- To get predictions, subtract the mean of $X$ from the input and add the mean of $y$ to the output

In [21]:
# Synthetic regression problem: y is a linear function of the first two
# features only (the third is irrelevant), plus an intercept and Gaussian noise.
X = np.random.randn(313, 3) * 2.56 + 3.6
y = 1.34 * X[:, 0] + 2.67 * X[:, 1] + 7.89 + 0.3 * np.random.randn(len(X))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Centering statistics are estimated on the training split only and then
# reused unchanged for the test split.
Xc = np.mean(X_train, axis=0, keepdims=True)
yc = np.mean(y_train)
X_train2 = X_train - Xc
y_train2 = y_train - yc

# Ordinary least squares without an intercept column: with centered X and y
# the intercept is absorbed by the means.
# lstsq solves the same problem as inv(X.T @ X) @ X.T @ y, but without forming
# and explicitly inverting the Gram matrix, which squares the condition number
# and fails outright when the features are collinear.
beta = np.linalg.lstsq(X_train2, y_train2, rcond=None)[0]

# Predictions: center the input with the training mean, then add back the
# training mean of y.
preds_train = (X_train - Xc) @ beta + yc
preds_test = (X_test - Xc) @ beta + yc

print('coeffs', beta)
print('train error:', np.mean((y_train - preds_train)**2))
print('test error:', np.mean((y_test - preds_test)**2))

coeffs [ 1.35030633  2.67236391 -0.0095658 ]
train error: 0.09316112498130295
test error: 0.0977621066645689


## Standardize data

- Center data
- Set the standard deviation of each feature of $X$ at $1$
- Set the standard deviation of $y$ at $1$
- To transform the input, subtract the mean of $X$ and then divide by the std of $X$
- To get predictions, multiply the output by the std of $y$ and then add the mean of $y$

In [31]:
# Synthetic regression problem: y is a linear function of the first two
# features only (the third is irrelevant), plus an intercept and Gaussian noise.
X = np.random.randn(313, 3) * 2.56 + 3.6
y = 1.34 * X[:, 0] + 2.67 * X[:, 1] + 7.89 + 0.3 * np.random.randn(len(X))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Mean and standard deviation are estimated on the training split only and
# then reused unchanged for the test split.
Xc = np.mean(X_train, axis=0, keepdims=True)
Xs = np.std(X_train, axis=0, keepdims=True)
yc = np.mean(y_train)
ys = np.std(y_train)
X_train2 = (X_train - Xc) / Xs
y_train2 = (y_train - yc) / ys

# Ordinary least squares without an intercept column: with centered X and y
# the intercept is absorbed by the means.
# lstsq solves the same problem as inv(X.T @ X) @ X.T @ y, but without forming
# and explicitly inverting the Gram matrix, which squares the condition number
# and fails outright when the features are collinear.
beta = np.linalg.lstsq(X_train2, y_train2, rcond=None)[0]

# Predictions: standardize the input with the training statistics, then undo
# the standardization of y (scale by its std, shift by its mean).
preds_train = (((X_train - Xc) / Xs) @ beta) * ys + yc
preds_test = (((X_test - Xc) / Xs) @ beta) * ys + yc

# Note: beta is expressed in standardized units, so it is not directly
# comparable to the generating coefficients 1.34 and 2.67.
print('coeffs', beta)
print('train error:', np.mean((y_train - preds_train)**2))
print('test error:', np.mean((y_test - preds_test)**2))

coeffs [ 4.26439519e-01  9.16017824e-01 -5.91241243e-04]
train error: 0.07780094924852021
test error: 0.1042436721663027
