In [1]:
import numpy as np
from sklearn.model_selection import train_test_split

# Preprocessing

## Center data

- Set the mean of each feature of $X$ at $0$
- Set the mean of $y$ at $0$
- Need to substruct he mean of $X$ from the input and add mean of $y$ to output to get predictions

In [37]:
X = np.random.randn(313, 3) * 2.56 + 3.6
y = 1.34 * X[:, 0] + 2.67 * X[:, 1] + 7.89 + 0.01 * np.random.randn(len(X))
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2)

Xc = np.mean(X_train, axis=0, keepdims=True)
yc = np.mean(y_train)
X_train2 = X_train - Xc 
y_train2 = y_train - yc
beta = np.linalg.inv(X_train2.T @ X_train2) @ X_train2.T @ y_train2
 
intercept = float(- Xc @ beta + yc)

preds_train = X_train @ beta + intercept
preds_test = X_test @ beta + intercept

print('coeffs', beta)
print('intercept', intercept)
print('train error:', np.mean((y_train - preds_train)**2))
print('test error:', np.mean((y_test - preds_test)**2))

coeffs [1.33970580e+00 2.67030622e+00 1.78469930e-04]
intercept 7.889832792362837
train error: 0.00010383926536641751
test error: 9.720628982733466e-05


## Standardize data

- Center data
- Set the standard deviation of each feature of x at $1$
- Set the standard deviation of $y$ at $1$
- Need to substract and divide by mean and std of $X$ from the input
- Need to multilply and add mean and std of $y$ to output to get predictions

In [42]:
X = np.random.randn(313, 3) * 2.56 + 3.6
y = 1.34 * X[:, 0] + 2.67 * X[:, 1] + 7.89 + 0.01 * np.random.randn(len(X))
X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=0.2)

Xc = np.mean(X_train, axis=0, keepdims=True)
Xs = np.std(X_train, axis=0, keepdims=True)
yc = np.mean(y_train)
ys = np.std(y_train)
X_train2 = (X_train - Xc) / Xs
y_train2 = (y_train - yc) / ys

beta = np.linalg.inv(X_train2.T @ X_train2) @ X_train2.T @ y_train2

coeffs = ys * beta/Xs[0]
intercept = - (Xc[0]*ys/Xs[0])@beta + yc

preds_train = X_train @ coeffs + intercept
preds_test  = X_test @ coeffs + intercept

print('coeffs', coeffs)
print('intercept', intercept)
print('train error:', np.mean((y_train - preds_train)**2))
print('test error:', np.mean((y_test - preds_test)**2))

coeffs [ 1.34020876e+00  2.67013143e+00 -1.27480707e-04]
intercept 7.8886505790953
train error: 9.135660150473298e-05
test error: 0.0001014917480331836


# Dataset Augmentation

## Scale Jitering

This technique is applied to ConvNet that take 2D images as input.  
Let's suppose the input is of size $N*N$, we take $S$ the rescaled image size, with $S > N$.  
We rescale the image to size $S*S$ and take a random crop of size $N*N$.  

### Single-scale training

Fix $S$ at a specific value during the whole training process.

### Multi-Scale training

Each training image is rescaled individually with $S$ sampled from a range $[S_\min, S_\max]$


### Testing

At test time, all images are resized to a fixed $Q$, that might be different from the $S$ used during training.