# Cycle 1 - Introduction to neural networks exercises

## Rodolphe Cledassou school

> Alexandre Boucaud and Marc Huertas-Company

In [None]:
# Native PyTorch setup (replaces TF/Keras)
import math, random, numpy as np
import torch, torch.nn as nn, torch.nn.functional as F
import matplotlib.pyplot as plt
import seaborn as sns

# Global seaborn look applied to the figures drawn below
sns.set_theme(
    context='notebook',
    style='whitegrid',
    font='sans-serif',
    font_scale=1.2,
    rc={"lines.linewidth": 2},
)

# Pick the compute device: Apple MPS first, then CUDA, otherwise the CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# Seed the torch, numpy and stdlib random generators with the same fixed value
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

# Small helpers
def to_tensor(x):
    """Convert array-like data into a float32 column tensor on ``device``.

    Parameters
    ----------
    x : array-like
        Anything numpy can turn into an array (scalar, list, ndarray, ...).

    Returns
    -------
    torch.Tensor
        ``float32`` tensor of shape ``(n, 1)`` placed on the module-level
        ``device``, where ``n`` is the total number of elements in ``x``.
    """
    # ascontiguousarray (rather than asarray) guarantees a C-contiguous buffer
    # with positive strides. torch.from_numpy rejects negatively-strided
    # arrays such as ``a[::-1]``, which asarray passes through untouched when
    # the dtype is already float32. Contiguous float32 input is still not copied.
    x = np.ascontiguousarray(x, dtype=np.float32)
    # The buffer is contiguous at this point, so view(-1, 1) cannot fail.
    return torch.from_numpy(x).to(device).view(-1, 1)

@torch.no_grad()
def predict(model, x_np):
    """Evaluate ``model`` on 1-D inputs and return the outputs as a numpy array.

    Parameters
    ----------
    model : torch.nn.Module
        Network mapping a ``(n, 1)`` tensor to a ``(n, 1)`` tensor; it is
        expected to already live on ``device``.
    x_np : array-like
        Input values; converted with ``to_tensor``.

    Returns
    -------
    numpy.ndarray
        Model outputs with the trailing size-1 dimension squeezed away.

    Notes
    -----
    Gradients are disabled by the decorator. The model is switched to
    evaluation mode for the forward pass, because ``train_regressor`` leaves
    it in training mode and layers such as Dropout or BatchNorm would
    otherwise behave as during training. The previous mode is restored
    before returning.
    """
    was_training = model.training
    model.eval()
    try:
        x_t = to_tensor(x_np)
        y_t = model(x_t).squeeze(1)
    finally:
        # Leave the model exactly as the caller handed it to us
        model.train(was_training)
    return y_t.detach().cpu().numpy()

class History:
    """Minimal stand-in for the Keras ``History`` object.

    Attributes
    ----------
    epoch : list
        Epoch indices.
    history : dict
        Maps a metric name (e.g. ``"loss"``) to its list of per-epoch values.

    Both attributes exist on every instance (empty by default), so reading
    them on a freshly created object no longer raises ``AttributeError``.
    Callers that create ``History()`` and assign the attributes afterwards
    keep working unchanged.
    """

    def __init__(self, epoch=None, history=None):
        # Fresh containers per instance: mutable defaults must not be shared
        self.epoch = [] if epoch is None else epoch
        self.history = {} if history is None else history

    def __repr__(self):
        return (
            f"{type(self).__name__}"
            f"(epochs={len(self.epoch)}, metrics={list(self.history)})"
        )

def train_regressor(model, x_np, y_np, epochs=200, lr=1e-2, bs=None, verbose=False):
    """Fit ``model`` to 1-D regression data using Adam and a mean-squared-error loss.

    Parameters
    ----------
    model : torch.nn.Module
        Network mapping a ``(n, 1)`` tensor to a ``(n, 1)`` tensor. It is moved
        to ``device`` and left in training mode.
    x_np, y_np : array-like
        Inputs and targets; both are converted with ``to_tensor`` and must
        contain the same number of elements.
    epochs : int
        Number of passes over the data.
    lr : float
        Adam learning rate.
    bs : int or None
        Mini-batch size. ``None`` means one full-batch step per epoch.
    verbose : bool
        When true, print the loss roughly ten times over the run.

    Returns
    -------
    History
        Object with ``epoch`` (``list(range(epochs))``) and
        ``history["loss"]`` (one value per epoch). In full-batch mode the value
        is the loss of that epoch's single step. In mini-batch mode it is the
        sample-weighted mean of the batch losses of the epoch, not just the
        loss of the last (possibly tiny) batch.

    Raises
    ------
    ValueError
        If ``bs`` is not positive, or if ``x_np`` and ``y_np`` differ in length.
    """
    if bs is not None and bs <= 0:
        raise ValueError(f"bs must be a positive integer or None, got {bs!r}")

    model.to(device).train()
    x_t, y_t = to_tensor(x_np), to_tensor(y_np).squeeze(1)
    N = x_t.size(0)
    if y_t.size(0) != N:
        # Without this check a length-1 target would silently broadcast
        raise ValueError(
            f"x and y must have the same length, got {N} and {y_t.size(0)}"
        )

    opt = torch.optim.Adam(model.parameters(), lr=lr)
    loss_fn = nn.MSELoss()
    losses = []
    for ep in range(epochs):
        if bs is None:
            # full-batch: a single optimisation step per epoch
            opt.zero_grad()
            pred = model(x_t).squeeze(1)
            loss = loss_fn(pred, y_t)
            loss.backward()
            opt.step()
            epoch_loss = loss.item()
        else:
            # mini-batch: the permutation is drawn on the CPU (same seeded
            # stream as before) and moved to the data's device once per epoch
            idx = torch.randperm(N).to(x_t.device)
            weighted_sum = torch.zeros((), device=x_t.device)
            for s in range(0, N, bs):
                b = idx[s:s+bs]
                xb, yb = x_t[b], y_t[b]
                opt.zero_grad()
                pred = model(xb).squeeze(1)
                loss = loss_fn(pred, yb)
                loss.backward()
                opt.step()
                # Weight by batch size so a short final batch does not dominate
                weighted_sum += loss.detach() * b.numel()
            # Empty data yields NaN, mirroring what the full-batch path reports
            epoch_loss = weighted_sum.item() / N if N else float("nan")
        losses.append(epoch_loss)
        if verbose and (ep+1) % max(1, epochs//10) == 0:
            print(f"epoch {ep+1:4d}  mse={losses[-1]:.4f}")
    h = History(); h.epoch = list(range(epochs)); h.history = {"loss": losses}
    return h

## Let's first generate some data...

In [None]:
# Toy dataset: 100 points on the line y = 0.1 x, blurred by Gaussian noise (sigma = 0.025)
x = np.random.uniform(low=-1, high=1, size=100)
y = 0.1 * x + np.random.normal(loc=0, scale=0.025, size=100)

# Quick look at the raw points
plt.figure(figsize=(4, 3))
plt.scatter(x, y, label="data")
plt.legend()
plt.show()

## The standard way to deal with this, is through linear regression

In [None]:
# Baseline: least-squares straight line (degree-1 polynomial) fitted with numpy
res = np.polyfit(x, y, deg=1)
print("polyfit degree-1 coefficients [slope, intercept]:", res)

# Evaluate the fitted line on a dense grid covering [-1, 1]
x_plot = np.linspace(-1, 1, num=400)
y_poly = np.polyval(res, x_plot)

# Overlay the fit on the data
plt.figure(figsize=(4, 3))
plt.scatter(x, y, s=20, label="data")
plt.plot(x_plot, y_poly, color="C1", label="polyfit deg=1")
plt.legend()
plt.show()

## Now, let's try to write the linear regression in a different way (more complicated way)

The Dense command (here `nn.Linear` in PyTorch) only says that the input is multiplied by a parameter $w$ and shifted by a bias $b$. We are effectively writing a simple model for our data: $y = w \cdot x + b$, where $w$ and $b$ are unknown.
![alt](https://drive.google.com/uc?id=1Rt2bNPCxaHXdjzmVS7TCw_u_Ur-WIqlW)

We can visualize the model we just created.

In [None]:
# Simplest possible network: one input, one output, no activation (y = W x + b)
ann = nn.Linear(in_features=1, out_features=1)
ann = ann.to(device)
print(ann)

### We then compile

In [None]:
# PyTorch has no compile() step: the Keras "compile" stage boils down to
# choosing a loss function and an optimizer by hand.
# NOTE: train_regressor() builds its own MSELoss/Adam pair internally; in the
# cells visible here the two objects below are only printed, never passed to it.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=ann.parameters(), lr=1e-2)
print("Criterion:", criterion, "| Optimizer:", optimizer)

We are simply saying that we want to minimize the mean square error (mse) between the model output and the data. We call this the "loss function". So we are looking for the values of $w$ and $b$ that minimize the following expression: $$\sum_i\left(y_i-(w \cdot x_i + b)\right)^2$$

### And fit the model ...

In [None]:
# Fit the one-neuron model: 200 full-batch epochs (bs=None) at learning rate 1e-2
history = train_regressor(
    ann,
    x,
    y,
    epochs=200,
    lr=1e-2,
    bs=None,
    verbose=False,
)

In [None]:
# Loss per epoch, read from the Keras-style history object
plt.figure(figsize=(4, 3))
plt.plot(history.epoch, history.history["loss"])
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("Training loss (linear model)")
plt.show()

### Let's see what we got here...

In [None]:
# Evaluate the trained network on the dense grid
y_ann = predict(ann, x_plot)

# Data, network prediction and least-squares line on the same axes
plt.figure(figsize=(4, 3))
plt.scatter(x, y, s=20, label="data")
plt.plot(x_plot, y_ann, color="red", label="ANN")
plt.plot(x_plot, np.polyval(res, x_plot), color="C1", label="polyfit deg=1")
plt.legend()
plt.show()

# Parity plot: noise-free model value (horizontal axis) against the network
# output (vertical axis), both evaluated inside the training range
model_true = lambda u: 0.1 * u
y_true = model_true(x_plot)
plt.figure(figsize=(4, 3))
plt.scatter(y_true, y_ann, s=10)
m = [y_true.min(), y_true.max()]
plt.plot(m, m, color="red")  # identity line: points on it agree perfectly
plt.gca().set_aspect('equal', adjustable='box')
plt.xlabel("true y")
plt.ylabel("ann y")
plt.title("Parity (linear data)")
plt.show()

We have performed a linear regression with an artificial neural network! So, yes, linear regression IS also Machine Learning...

### Why is this useful ?

Let's suppose we have a more complex dataset...

In [None]:
# Harder dataset: a sine wave on top of the linear trend, with Gaussian noise
# whose standard deviation grows proportionally to |x|
x = np.random.uniform(low=-1, high=1, size=100)
model_true = lambda u: 0.1 * u + np.sin(5 * u)
y = model_true(x) + np.random.normal(loc=0, scale=0.45 * np.abs(x), size=100)

plt.figure(figsize=(4, 3))
plt.scatter(x, y, label="data")
plt.legend()
plt.show()

# Dense grid used to draw smooth curves
x_plot = np.linspace(-1, 1, num=400)

## We can try again simple polynomial regression ...

In [None]:
# Least-squares polynomial fits of degree 1, 3 and 5, kept as (degree, coefficients) pairs
poly = [
    (deg, np.polyfit(x, y, deg))
    for deg in (1, 3, 5)
]

# One curve per degree, drawn over the data
plt.figure(figsize=(5, 4))
plt.scatter(x, y, s=15, label="data")
for deg, coeff in poly:
    fitted = np.polyval(coeff, x_plot)
    plt.plot(x_plot, fitted, label=f"polyfit deg={deg}")
plt.legend()
plt.show()

but, as expected, that will not work very well...

# Let's go back to our network...

In [None]:
# Same one-neuron linear model (no hidden layer), now trained on the wavy data
ann_lin = nn.Linear(in_features=1, out_features=1).to(device)
hist_lin = train_regressor(
    ann_lin, x, y,
    epochs=400, lr=1e-2, bs=None, verbose=False,
)

If I do not change anything, I will obtain the same result. My model is simply linear...

In [None]:
# Prediction of the linear network over the dense grid, drawn on top of the data
y_lin = predict(ann_lin, x_plot)

plt.figure(figsize=(5, 4))
plt.scatter(x, y, s=15, label="data")
plt.plot(x_plot, y_lin, color="red", label="ANN linear")
plt.legend()
plt.show()

## and add a bit of non-linearity ...

In [None]:
# One hidden layer of 16 tanh units between the input and the output
ann_shallow = nn.Sequential(
    nn.Linear(in_features=1, out_features=16),
    nn.Tanh(),
    nn.Linear(in_features=16, out_features=1),
)
ann_shallow = ann_shallow.to(device)

# 600 epochs of mini-batch training, 32 samples per batch
hist_shallow = train_regressor(
    ann_shallow, x, y,
    epochs=600, lr=1e-2, bs=32, verbose=False,
)

# Training curve
plt.figure(figsize=(4, 3))
plt.plot(hist_shallow.epoch, hist_shallow.history["loss"])
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("Training loss (1 hidden layer)")
plt.show()

# Network output compared with the data, the polynomial fits and the true model
y_shallow = predict(ann_shallow, x_plot)
plt.figure(figsize=(6, 4))
plt.scatter(x, y, s=12, label="data")
for deg, coeff in poly:
    fitted = np.polyval(coeff, x_plot)
    plt.plot(x_plot, fitted, label=f"polyfit deg={deg}")
plt.plot(x_plot, y_shallow, color="red", label="ANN (1 hidden)")
plt.plot(x_plot, model_true(x_plot), color="black", label="true model")
plt.legend()
plt.show()

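The sigmoid function is given by this expression: $$ \sigma(x) = \frac{1}{1+e^{-x}}$$ (The PyTorch cell above uses `nn.Tanh`, which is just a rescaled and shifted sigmoid: $\tanh(x) = 2\,\sigma(2x) - 1$.)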
So our model is now like this:![alt](https://drive.google.com/uc?id=1-2VbatzRnqGJMKCga-tppiTo6iPRBr9s)
This is what we call a perceptron. The non-linear function added after the linear combination is also called the activation function, because "it fires the unit".

Still not great, but there is some potential !?

## We can add another layer

In [None]:
# Two hidden ReLU layers (32 then 16 units) followed by a linear output
ann_two = nn.Sequential(
    nn.Linear(in_features=1, out_features=32),
    nn.ReLU(),
    nn.Linear(in_features=32, out_features=16),
    nn.ReLU(),
    nn.Linear(in_features=16, out_features=1),
)
ann_two = ann_two.to(device)

# 800 epochs of mini-batch training, 32 samples per batch
hist_two = train_regressor(
    ann_two, x, y,
    epochs=800, lr=1e-2, bs=32, verbose=False,
)

# Network output compared with the data and the noise-free model
y_two = predict(ann_two, x_plot)
plt.figure(figsize=(6, 4))
plt.scatter(x, y, s=12, label="data")
plt.plot(x_plot, y_two, color="red", label="ANN (2 hidden)")
plt.plot(x_plot, model_true(x_plot), color="black", label="true model")
plt.legend()
plt.show()

We have added "a layer". Our model is now: $$ y = w_2 \cdot \frac{1}{1+e^{-w_1 \cdot x}} $$
![alt](https://drive.google.com/uc?id=1E0iobni7jhUI2jfGKPb081OM_QDB5Hjg)

Not fantastic, but you get the idea...You have just created your first ANN for regression!

In fact, it turns out that there exists a mathematical theorem that proves that NNs are universal approximators:


> _For any continuous function from a hypercube $[0,1]^d$ to the real numbers, and every positive epsilon, there exists a sigmoid-based 1-hidden-layer neural network that obtains at most epsilon error in functional space_  
> Cybenko+89

> _A big enough network can approximate, but not represent, any smooth function. The mathematical demonstration amounts to showing that networks are dense in the space of target functions_

So, the approximation theorem tells me that there exists a NN that can approximate any function. It does not tell me which one: this is the alchemy of ML. It does not tell me how to find it (i.e. how to minimize the loss) either!

## Let's make the model more complex

In [None]:
# Draw a fresh sample of the wavy dataset: linear trend plus sine, with
# Gaussian noise whose standard deviation is proportional to |x|
x = np.random.uniform(low=-1, high=1, size=100)
model_true = lambda u: 0.1 * u + np.sin(5 * u)
y = model_true(x) + np.random.normal(loc=0, scale=0.45 * np.abs(x), size=100)

plt.figure(figsize=(4, 3))
plt.scatter(x, y, label="data")
plt.legend()
plt.show()

# Polynomial fits (degree 1, 3, 5) on the new sample, plus the dense plotting grid
poly = [
    (deg, np.polyfit(x, y, deg))
    for deg in (1, 3, 5)
]
x_plot = np.linspace(-1, 1, num=400)

In [None]:
# Two small hidden ReLU layers (10 then 5 units) followed by a linear output
ann = nn.Sequential(
    nn.Linear(in_features=1, out_features=10),
    nn.ReLU(),
    nn.Linear(in_features=10, out_features=5),
    nn.ReLU(),
    nn.Linear(in_features=5, out_features=1),
)
ann = ann.to(device)

# 800 epochs of mini-batch training, 16 samples per batch
history = train_regressor(
    ann, x, y,
    epochs=800, lr=1e-2, bs=16, verbose=False,
)

# Training curve
plt.figure(figsize=(4, 3))
plt.plot(history.epoch, history.history["loss"])
plt.xlabel("epoch")
plt.ylabel("loss")
plt.title("Training loss (nonlinear model)")
plt.show()

# Network output compared with the data, the polynomial fits and the true model
y_ann = predict(ann, x_plot)

plt.figure(figsize=(5, 4))
plt.scatter(x, y, s=15, label="data")
for deg, coeff in poly:
    fitted = np.polyval(coeff, x_plot)
    plt.plot(x_plot, fitted, label=f"polyfit deg={deg}")
plt.plot(x_plot, y_ann, color="red", label="ANN")
plt.plot(x_plot, model_true(x_plot), color="black", label="true model")
plt.legend()
plt.show()

Which is not that far from the real underlying model...

### What if I go beyond my training set?

In [None]:
# Evaluate on [-2, 2], twice as wide as the [-1, 1] interval the data was drawn from
x_plot_large = np.linspace(-2, 2, num=400)
y_ann_large = predict(ann, x_plot_large)

plt.figure(figsize=(5, 4))
plt.plot(x_plot_large, y_ann_large, color="red", label="ANN")
plt.plot(x_plot_large, model_true(x_plot_large), color="black", label="model")
plt.scatter(x, y, s=15, label="data")
plt.legend()
plt.show()

# Parity plot over the widened range; the red diagonal marks perfect agreement
y_predict = y_ann_large
plt.figure(figsize=(4, 3))
plt.scatter(model_true(x_plot_large), y_predict, s=8)
plt.plot([-2, 2], [-2, 2], color="red")
plt.gca().set_aspect('equal', adjustable='box')
plt.xlabel("true y")
plt.ylabel("ann y")
plt.title("Parity (extrapolation)")
plt.show()

## What about errors? Can we capture the uncertainties in the data?