<center><h1>2-ab: Introduction to Neural Networks</h1></center>

<center><h2><a href="https://rdfia.github.io/">Course link</a></h2></center>

# Warning : 
# Do "File -> Save a copy in Drive" before you start modifying the notebook, otherwise your modifications will not be saved.


In [None]:
!wget https://github.com/rdfia/rdfia.github.io/raw/master/data/2-ab.zip
!unzip -j 2-ab.zip
!wget https://github.com/rdfia/rdfia.github.io/raw/master/code/2-ab/utils-data.py

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
%run 'utils-data.py'

# Part 1 : Forward and backward passes "by hand"

In [None]:
def init_params(nx, nh, ny):
    """
    Build the initial parameters of the one-hidden-layer network.

    nx, nh, ny: integers (input, hidden and output sizes)
    out params: dictionary with keys "Wh", "Wy", "bh", "by"
    """
    # Weights: normal samples scaled by 0.3 (hidden layer drawn first, then the
    # output layer). Biases: zero row vectors.
    hidden_weights = 0.3 * torch.randn(nh, nx)
    output_weights = 0.3 * torch.randn(ny, nh)
    return {
        "Wh": hidden_weights,
        "Wy": output_weights,
        "bh": torch.zeros(1, nh),
        "by": torch.zeros(1, ny),
    }

In [None]:
def forward(params, X):
    """
    Forward pass of the network: affine -> tanh -> affine -> softmax.

    params: dictionary holding "Wh", "Wy" (weights) and "bh", "by" (bias rows)
    X: (n_batch, dimension)
    returns: (yhat, outputs) where outputs stores the intermediate values
             "X", "htilde", "h", "ytilde" (scores before the softmax) and
             "yhat" (softmax of the scores, each row sums to 1)
    """
    outputs = {}

    outputs["X"] = X
    # The bias rows are broadcast over the batch dimension by the addition.
    outputs["htilde"] = torch.mm(X, params["Wh"].t()) + params["bh"]
    outputs["h"] = torch.tanh(outputs["htilde"])
    # "ytilde" keeps the raw scores; it is not overwritten by their exponential.
    outputs["ytilde"] = torch.mm(outputs["h"], params["Wy"].t()) + params["by"]
    # torch.softmax is numerically stable: a hand-written exp(x) / sum(exp(x))
    # overflows to inf / inf = NaN as soon as one score is large.
    outputs["yhat"] = torch.softmax(outputs["ytilde"], dim=1)

    return outputs["yhat"], outputs

In [None]:
def loss_accuracy(Yhat, Y):
    """
    Cross-entropy loss and accuracy of a batch of predictions.

    Yhat: (n_batch, ny) predicted class probabilities
    Y: (n_batch, ny) targets, one row per example and one column per class
       (the arg-max of a row is taken as the true class)
    returns: (L, acc) with L the cross-entropy averaged over the batch and
             acc the percentage of rows whose arg-max matches the target's
    """
    # Clamp before the log so a probability of exactly 0 yields a large finite
    # value instead of -inf (0 * -inf would turn the whole loss into NaN).
    log_probs = torch.log(Yhat.clamp_min(torch.finfo(Yhat.dtype).tiny))
    # Sum over the classes (dim 1) to get one loss per example, then average
    # over the batch. Summing over dim 0 would rescale the loss by n_batch / ny.
    L = -torch.mean(torch.sum(Y * log_probs, 1))

    _, indsY = torch.max(Y, 1)
    _, indsYhat = torch.max(Yhat, 1)
    acc = torch.sum(torch.eq(indsY, indsYhat)) / Yhat.size(0) * 100

    return L, acc

In [None]:
def backward(params, outputs, Y):
    """
    Manual backward pass of the network.

    params: dictionary of parameters (only "Wy" is read here)
    outputs: dictionary returned by forward ("X", "h" and "yhat" are read)
    Y: (n_batch, ny) targets
    returns: dictionary of gradients with keys "ytilde", "Wy", "htilde", "Wh",
             "by", "bh"; the parameter gradients are divided by the batch size
    """
    n_batch = Y.shape[0]
    hidden = outputs["h"]

    # error signal at the output scores
    delta_y = outputs["yhat"] - Y
    # propagated through Wy, then through tanh (derivative 1 - h^2)
    delta_h = torch.mm(delta_y, params["Wy"]) * (1 - hidden ** 2)

    grads = {}
    grads["ytilde"] = delta_y
    grads["Wy"] = torch.mm(delta_y.t(), hidden) / n_batch
    grads["htilde"] = delta_h
    grads["Wh"] = torch.mm(delta_h.t(), outputs["X"]) / n_batch
    # bias gradients: column sums of the error signals (1-D tensors)
    grads["by"] = torch.sum(delta_y, 0).t() / n_batch
    grads["bh"] = torch.sum(delta_h, 0).t() / n_batch
    return grads

In [None]:
def sgd(params, grads, eta):
    """
    One gradient-descent step, applied in place.

    params: dictionary of parameters ("Wh", "Wy", "bh", "by")
    grads: dictionary of gradients with the same keys
    eta: learning rate
    returns: the same params dictionary, with each entry decreased by
             eta * gradient
    """
    for key in ("Wh", "Wy", "bh", "by"):
        params[key] -= eta * grads[key]
    return params

In [None]:
# init: dataset, layer sizes and hyper-parameters
data = CirclesData()
data.plot_data()
N, nx = data.Xtrain.shape[0], data.Xtrain.shape[1]
ny = data.Ytrain.shape[1]
Nbatch = 10
nh = 10
eta = 0.05

params = init_params(nx, nh, ny)

# per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[], [], [], []]

# epoch
for iteration in range(300):
    # reshuffle the training set at the start of every epoch
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # mini-batches (the integer division drops a trailing incomplete batch)
    for j in range(N // Nbatch):
        indsBatch = range(j * Nbatch, (j + 1) * Nbatch)
        X_train = Xtrain[indsBatch, :]
        Y_train = Ytrain[indsBatch, :]

        # forward -> loss -> manual backward -> parameter update
        Yhat_train, outputs = forward(params, X_train)
        Ltrain, acctrain = loss_accuracy(Yhat_train, Y_train)
        grads = backward(params, outputs, Y_train)
        params = sgd(params, grads, eta)

    # end-of-epoch evaluation on the full train / test sets and on the grid
    Yhat_train, _ = forward(params, data.Xtrain)
    Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
    Yhat_test, _ = forward(params, data.Xtest)
    Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)
    Ygrid, _ = forward(params, data.Xgrid)

    title = "Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})".format(
        iteration, acctrain, Ltrain, acctest, Ltest
    )
    print(title)
    data.plot_data_with_grid(Ygrid, title)

    for curve, value in zip(curves, (acctrain, acctest, Ltrain, Ltest)):
        curve.append(value)

# learning curves
fig = plt.figure(figsize=(10, 8))
plt.xlabel("Epoch", fontsize=16)
plt.ylabel("Accuracy / loss", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
for curve, label in zip(curves, ("acc. train", "acc. test", "loss train", "loss test")):
    plt.plot(curve, label=label)
plt.legend(fontsize=16)
plt.show()

## Global learning procedure "by hand"

# Part 2 : Simplification of the backward pass with `torch.autograd`



In [None]:
def init_params(nx, nh, ny):
    """
    Build the initial parameters of the network as autograd leaf tensors.

    nx, nh, ny: integers (input, hidden and output sizes)
    out params: dictionary with keys "Wh", "Wy", "bh", "by"; every tensor has
                requires_grad=True so that backward() fills its .grad
    """
    # The 0.3 scaling is applied BEFORE gradients are enabled. Writing
    # `0.3 * torch.randn(..., requires_grad=True)` would make the parameter the
    # output of a multiplication (a non-leaf tensor): it needs retain_grad() to
    # expose .grad, and its creation graph is shared by every later backward().
    params = {
        "Wh": (0.3 * torch.randn(nh, nx)).requires_grad_(),
        "Wy": (0.3 * torch.randn(ny, nh)).requires_grad_(),
        "bh": torch.zeros(1, nh, requires_grad=True),
        "by": torch.zeros(1, ny, requires_grad=True),
    }
    return params

The function `forward` remains unchanged from the previous part.

The function `backward` is no longer used because of "autograd". 

In [None]:
def sgd(params, eta):
    """
    One gradient-descent step using the gradients stored by autograd.

    params: dictionary of tensors ("Wh", "Wy", "bh", "by") whose .grad has
            been filled by a backward pass
    eta: learning rate
    returns: the same dictionary; each tensor is updated in place and its
             gradient accumulator is reset to zero
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for key in ("Wh", "Wy", "bh", "by"):
            params[key] -= eta * params[key].grad
            params[key].grad.zero_()
    return params

## Global learning procedure with autograd

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.03

params = init_params(nx, nh, ny)

# per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[], [], [], []]

# epoch
for iteration in range(300):
    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):
        indsBatch = range(j * Nbatch, (j + 1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]
        Yhat_train, outputs = forward(params, X)
        Ltrain, acctrain = loss_accuracy(Yhat_train, Y)
        # retain_graph=True is kept on purpose: it is required whenever the
        # parameters are non-leaf tensors whose creation graph is shared by
        # all iterations, and it is harmless (only wasteful) when they are leaves.
        Ltrain.backward(retain_graph=True)
        params = sgd(params, eta)

    # Evaluation only: under no_grad() autograd does not record the passes over
    # the full train / test / grid sets, so no graph is built and the results
    # can be formatted and plotted without detach().
    with torch.no_grad():
        Yhat_train, _ = forward(params, data.Xtrain)
        Yhat_test, _ = forward(params, data.Xtest)
        Ltrain, acctrain = loss_accuracy(Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(Yhat_test, data.Ytest)
        Ygrid, _ = forward(params, data.Xgrid)

    title = "Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})".format(
        iteration, acctrain, Ltrain, acctest, Ltest
    )
    print(title)
    data.plot_data_with_grid(Ygrid, title)

    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain)
    curves[3].append(Ltest)

fig = plt.figure(figsize=(10, 8))
plt.xlabel("Epoch", fontsize=16)
plt.ylabel("Accuracy / loss", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend(fontsize=16)
plt.show()

# Part 3 : Simplification of the forward pass with `torch.nn`

`init_params` and `forward` are replaced by the `init_model` function which defines the network architecture and the loss.

In [None]:
def init_model(nx, nh, ny):
    """
    Define the network architecture and the loss.

    nx, nh, ny: integers (input, hidden and output sizes)
    returns: (model, loss) where model maps (n_batch, nx) inputs to
             (n_batch, ny) raw class scores and loss is a cross-entropy
    """
    # No Softmax layer at the end: torch.nn.CrossEntropyLoss applies the
    # log-softmax itself and expects unnormalized scores, so a Softmax here
    # would be applied twice. Code that needs probabilities (e.g. for plotting)
    # applies softmax to the model output explicitly.
    model = torch.nn.Sequential(
        torch.nn.Linear(nx, nh),
        torch.nn.Tanh(),
        torch.nn.Linear(nh, ny),
    )
    loss = torch.nn.CrossEntropyLoss()

    return model, loss

In [None]:
def loss_accuracy(loss, Yhat, Y):
    """
    Evaluate a loss function and the accuracy on a batch.

    loss: callable taking (Yhat, Y) and returning the loss value
    Yhat: (n_batch, ny) model outputs
    Y: (n_batch, ny) targets, one column per class
    returns: (L, acc) with L = loss(Yhat, Y) and acc the percentage of rows
             where the arg-max of Yhat equals the arg-max of Y
    """
    # call the loss function
    L = loss(Yhat, Y)
    # The arg-max over the class dimension is all the accuracy needs; no log of
    # Yhat is taken here (it is undefined for negative raw scores).
    _, indsY = torch.max(Y, 1)
    _, indsYhat = torch.max(Yhat, 1)
    acc = torch.sum(torch.eq(indsY, indsYhat)) / Yhat.size(0) * 100

    return L, acc

In [None]:
def sgd(model, eta):
    """
    One gradient-descent step on every parameter of a torch.nn model.

    model: torch.nn.Module whose parameter gradients were filled by backward()
    eta: learning rate
    returns: the same model, updated in place, with its gradients reset
    """
    # The update itself must not be recorded by autograd.
    with torch.no_grad():
        for param in model.parameters():
            # Parameters that received no gradient (frozen or unused in the
            # forward pass) are skipped, as torch.optim.SGD does, instead of
            # failing on `eta * None`.
            if param.grad is None:
                continue
            param -= eta * param.grad
        # reset the gradient accumulators for the next backward pass
        model.zero_grad()
    return model

## Global learning procedure with autograd and `torch.nn`

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.03

model, loss = init_model(nx, nh, ny)

# per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[], [], [], []]

# epoch
for iteration in range(300):
    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):
        indsBatch = range(j * Nbatch, (j + 1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]
        Yhat = model(X)
        Ltrain, acctrain = loss_accuracy(loss, Yhat, Y)
        # Each model(X) call builds a fresh graph, so it can be freed right
        # after the backward pass: retain_graph is not needed.
        Ltrain.backward()
        model = sgd(model, eta)

    # Evaluation only: under no_grad() autograd does not record the passes over
    # the full train / test / grid sets.
    with torch.no_grad():
        Yhat_train = model(data.Xtrain)
        Yhat_test = model(data.Xtest)
        Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)
        Ygrid = model(data.Xgrid)

    title = "Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})".format(
        iteration, acctrain, Ltrain, acctest, Ltest
    )
    print(title)
    # softmax over the class dimension is applied to the grid outputs for plotting
    data.plot_data_with_grid(torch.nn.Softmax(dim=1)(Ygrid.detach()), title)

    # losses are scaled by 100 to share the accuracy axis (see the y-label)
    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain.detach() * 100)
    curves[3].append(Ltest.detach() * 100)

fig = plt.figure(figsize=(10, 8))
plt.xlabel("Epoch", fontsize=16)
plt.ylabel("Accuracy / Loss (X 100)", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend(fontsize=16)
plt.show()

# Part 4 : Simplification of the SGD with `torch.optim`

In [None]:
def init_model(nx, nh, ny, eta):
    """
    Define the network, the loss and the optimizer.

    nx, nh, ny: integers (input, hidden and output sizes)
    eta: learning rate given to the SGD optimizer
    returns: (model, loss, optim)
    """
    nn = torch.nn
    # Linear -> Tanh -> Linear; the layers are created in this order.
    layers = [nn.Linear(nx, nh), nn.Tanh(), nn.Linear(nh, ny)]
    model = nn.Sequential(*layers)
    loss = nn.CrossEntropyLoss()
    # plain SGD over all the model parameters
    optim = torch.optim.SGD(model.parameters(), lr=eta)
    return model, loss, optim

The `sgd` function is replaced by calling `optim.zero_grad()` before the backward pass and `optim.step()` after it.

## Global learning procedure with autograd, `torch.nn` layers and `torch.optim`

In [None]:
# init
data = CirclesData()
data.plot_data()
N = data.Xtrain.shape[0]
Nbatch = 10
nx = data.Xtrain.shape[1]
nh = 10
ny = data.Ytrain.shape[1]
eta = 0.03

model, loss, optim = init_model(nx, nh, ny, eta)

# per-epoch history: [train accuracy, test accuracy, train loss, test loss]
curves = [[], [], [], []]

# epoch
for iteration in range(300):
    # permute
    perm = np.random.permutation(N)
    Xtrain = data.Xtrain[perm, :]
    Ytrain = data.Ytrain[perm, :]

    # batches
    for j in range(N // Nbatch):
        indsBatch = range(j * Nbatch, (j + 1) * Nbatch)
        X = Xtrain[indsBatch, :]
        Y = Ytrain[indsBatch, :]

        Yhat = model(X)
        Ltrain, acctrain = loss_accuracy(loss, Yhat, Y)
        # reset the accumulated gradients, back-propagate, then update
        optim.zero_grad()
        # Each model(X) call builds a fresh graph, so it can be freed right
        # after the backward pass: retain_graph is not needed.
        Ltrain.backward()
        optim.step()

    # Evaluation only: under no_grad() autograd does not record the passes over
    # the full train / test sets.
    with torch.no_grad():
        Yhat_train = model(data.Xtrain)
        Yhat_test = model(data.Xtest)
        Ltrain, acctrain = loss_accuracy(loss, Yhat_train, data.Ytrain)
        Ltest, acctest = loss_accuracy(loss, Yhat_test, data.Ytest)

    title = "Iter {}: Acc train {:.1f}% ({:.2f}), acc test {:.1f}% ({:.2f})".format(
        iteration, acctrain, Ltrain, acctest, Ltest
    )
    print(title)

    # losses are scaled by 100 to share the accuracy axis (see the y-label)
    curves[0].append(acctrain)
    curves[1].append(acctest)
    curves[2].append(Ltrain.detach() * 100)
    curves[3].append(Ltest.detach() * 100)

fig = plt.figure(figsize=(10, 8))
plt.xlabel("Epoch", fontsize=16)
plt.ylabel("Accuracy / Loss (X 100)", fontsize=16)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.plot(curves[0], label="acc. train")
plt.plot(curves[1], label="acc. test")
plt.plot(curves[2], label="loss train")
plt.plot(curves[3], label="loss test")
plt.legend(fontsize=16)
plt.show()

# Part 5 : MNIST

Apply the code from the previous part to the MNIST dataset.

In [None]:
# init: MNIST dataset; the sizes are read from the data, the rest is chosen by hand
data = MNISTData()
N, nx = data.Xtrain.shape[:2]  # number of training examples, input dimension
ny = data.Ytrain.shape[1]  # number of columns of the target matrix
nh = 100  # hidden layer size
Nbatch = 100  # mini-batch size
eta = 0.03  # learning rate

# Part 6: Bonus: SVM


Train a SVM model on the Circles dataset.

Ideas : 
- First try a linear SVM (`sklearn.svm.LinearSVC` in scikit-learn). Does it work well? Why?
- Then try more complex kernels (sklearn.svm.SVC). Which one is the best ? why ?
- Does the parameter C of regularization have an impact? Why ?

In [None]:
# data: NumPy copies of the Circles dataset for scikit-learn.
# Only the first column of the target matrices is kept as the label vector.
data = CirclesData()
Xtrain, Ytrain = data.Xtrain.numpy(), data.Ytrain[:, 0].numpy()
Xgrid = data.Xgrid.numpy()
Xtest, Ytest = data.Xtest.numpy(), data.Ytest[:, 0].numpy()


def plot_svm_predictions(data, predictions):
    """
    Show grid predictions as a 40 x 40 image with the data points on top.

    data: dataset object exposing _Xtrain, _Ytrain, _Xtest, _Ytest
    predictions: 1600 values, reshaped to (40, 40) for display
    """

    def to_pixels(coords):
        # NOTE(review): presumably maps data coordinates onto the 40 x 40 image
        # grid - confirm against the grid definition in CirclesData.
        return coords * 10 + 20

    plt.figure(2)
    plt.clf()
    plt.imshow(np.reshape(predictions, (40, 40)))

    # (points, targets, target column, marker, extra plot arguments);
    # only the first series of each split carries a legend label.
    series = (
        (data._Xtrain, data._Ytrain, 0, "bo", {"label": "Train"}),
        (data._Xtrain, data._Ytrain, 1, "ro", {}),
        (data._Xtest, data._Ytest, 0, "b+", {"label": "Test"}),
        (data._Xtest, data._Ytest, 1, "r+", {}),
    )
    for points, targets, column, marker, extra in series:
        selected = targets[:, column] == 1
        plt.plot(
            to_pixels(points[selected, 0]),
            to_pixels(points[selected, 1]),
            marker,
            **extra,
        )

    plt.xlim(0, 39)
    plt.ylim(0, 39)
    plt.clim(0.3, 0.7)
    plt.draw()
    plt.pause(1e-3)

In [None]:
import sklearn.svm

# Train one SVM per kernel on the Circles training set.
# See https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
# and https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# fit() returns the estimator itself, so construction and training are chained.
svmlin = sklearn.svm.LinearSVC(C=0.01).fit(Xtrain, Ytrain)
svmsig = sklearn.svm.SVC(C=0.1, kernel="sigmoid").fit(Xtrain, Ytrain)
svmrbf = sklearn.svm.SVC(C=0.01, kernel="rbf").fit(Xtrain, Ytrain)
svmpoly = sklearn.svm.SVC(C=100, kernel="poly").fit(Xtrain, Ytrain)
# last expression of the cell: displays the fitted polynomial SVM
svmpoly

In [None]:
## Print results
# Same report for each model, in training order: linear, sigmoid, RBF, polynomial.
for svm_model in (svmlin, svmsig, svmrbf, svmpoly):
    # test accuracy: fraction of test labels predicted correctly
    Ytest_pred = svm_model.predict(Xtest)
    accuracy = np.sum(Ytest == Ytest_pred) / len(Ytest)
    print(f"Accuracy : {100 * accuracy:.2f}")
    # predictions on the grid, drawn together with the data points
    Ygrid_pred = svm_model.predict(Xgrid)
    plot_svm_predictions(data, Ygrid_pred)