In [None]:
%load_ext jupyter_black

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

We start by generating a toy 2D linear problem: $y = 10x_1 + 30x_2$.

The second feature has a dynamic range 10 times larger than the first one (each distribution is written as $\mathcal{N}(\text{mean}, \text{standard deviation})$):
- $x_1 \sim \mathcal{N}(0,1)$
- $x_2 \sim \mathcal{N}(10,10)$

In [None]:
def init_problem(
    scaler: "StandardScaler | None" = None, random_seed=0
) -> "tuple[np.ndarray, ...]":
    """Build the seeded toy 2D regression problem and a perturbed initial guess.

    Args:
        scaler: Optional object exposing ``fit_transform`` (e.g. a
            ``StandardScaler``). When given, it is fitted on the generated
            features and the returned ``X`` is the transformed version.
        random_seed: Seed of the random draws; the same seed always yields the
            same data and the same initial weights.

    Returns:
        A tuple ``(X, y_true, y_hat, e, w_true, w_hat)`` where

        - ``X`` is the ``(100, 2)`` feature matrix (transformed if ``scaler``
          is given),
        - ``y_true`` are the targets ``X @ w_true`` computed on the
          *untransformed* features,
        - ``y_hat`` are the predictions of the initial guess, also computed on
          the untransformed features,
        - ``e`` is the ``(100, 1)`` column of residuals ``y_true - y_hat``,
        - ``w_true`` are the true weights ``[10, 30]``,
        - ``w_hat`` is the initial guess: ``w_true`` plus Gaussian noise of
          standard deviation 10.
    """
    # A private generator produces exactly the same draws as the legacy
    # ``np.random.seed(random_seed)`` + ``np.random.randn`` pair, but leaves the
    # process-wide NumPy RNG untouched for the rest of the notebook.
    rng = np.random.RandomState(random_seed)
    w_true = np.asarray([10, 30])
    w_hat = w_true + 10 * rng.randn(2)

    # Generate data with features of different scales: column 0 has mean 0 and
    # standard deviation 1, column 1 has mean 10 and standard deviation 10.
    X = rng.randn(100, 2) * np.asarray([1, 10]) + np.asarray([0, 10])

    # Targets, initial predictions and residuals all use the raw features.
    y_true = X @ w_true
    y_hat = X @ w_hat
    e = (y_true - y_hat)[:, np.newaxis]
    if scaler is not None:
        # https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        X = scaler.fit_transform(X)
    return X, y_true, y_hat, e, w_true, w_hat

Helper function for plotting the iterations

In [None]:
def plot_iterations(
    W: list, w_true: np.ndarray, scaler: "StandardScaler | None" = None
) -> None:
    """Scatter-plot the successive weight estimates against the true weights.

    A new 5x5 figure is created; each point is one entry of ``W``, coloured and
    annotated with its position in the history, and the true weights are drawn
    as a star. The axes are fixed to ``w_1`` in [9, 30] and ``w_2`` in [3, 60].

    Args:
        W: History of weight vectors, one 2-element entry per iteration
            (initial guess first).
        w_true: The true weights, shown as the star marker and in the title.
        scaler: Scaler used on the features during training, if any. When
            given, the weights are divided by ``scaler.scale_`` before
            plotting.
    """
    Wa = np.asarray(W)
    if scaler is not None:
        # Descale the parameters so they are in the same dynamic as the
        # original features. The division is done out of place:
        # ``np.asarray`` returns the caller's own array when ``W`` is already
        # an ndarray, and an in-place true division also fails on integer data.
        Wa = Wa / scaler.scale_
    plt.figure(figsize=(5, 5))
    # Colour by iteration index, derived from the history itself rather than
    # from a notebook-level global, so any history length can be plotted.
    plt.scatter(Wa[:, 0], Wa[:, 1], c=range(len(Wa)))
    plt.scatter(w_true[0], w_true[1], marker="*")
    for step, (w_1, w_2) in enumerate(Wa):
        plt.annotate(
            str(step),
            (w_1, w_2),
            textcoords="offset points",
            xytext=(0, 10),
            ha="center",
        )
    plt.xlabel("w_1")
    plt.ylabel("w_2")
    plt.title(f"True W: {w_true}")
    plt.xlim([9, 30])
    plt.ylim([3, 60])
    plt.grid()

We can now look at the iteration process with different learning rates.

In [None]:
# Number of gradient-descent steps performed for each learning rate.
n_epochs = 50
# Set to True to standardize the features with a StandardScaler before training.
scale = False
scaler = StandardScaler() if scale else None

# One full training run, and one figure, per learning rate.
for learning_rate in [0.001, 0.01, 0.05]:
    # init_problem is called with its default seed, so every learning rate
    # starts from the same data and the same initial weights.
    X, y_true, y_hat, e, w_true, w_hat = init_problem(scaler=scaler)
    # History of the weights, starting with the initial guess.
    W = [w_hat.copy()]
    for epoch in range(n_epochs):
        # Residuals as a column vector so they broadcast against the rows of X.
        e = (y_true - X @ w_hat)[:, np.newaxis]
        # Gradient of the half mean squared error with respect to the weights.
        grad = -np.mean(e * X, axis=0)
        # In-place step; a copy is stored so history entries stay independent.
        w_hat -= learning_rate * grad
        W.append(w_hat.copy())
    plot_iterations(W, w_true, scaler=scaler)

You can also have a look at the gradient values: they should exhibit different dynamics depending on whether the features are scaled. This is left as homework :)