<a href="https://colab.research.google.com/github/CIS-522/course-content/blob/main/tutorials/W3_MLPs/W3_Tutorial2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CIS-522 Week 3 Part 2
# Multi-Layer Perceptrons (MLPs)

__Instructor__: Konrad Kording

__Content creators:__ Arash Ash

---
# Tutorial Objectives
In this tutorial, we delve deeper by using one of the most famous deep learning models of all!

MLPs are arguably one of the most tractable models that we can use to study deep learning fundamentals. Here we will learn why MLPs are: 

* similar to biological networks
* good at function approximation
* can evolve linearly in weights 
* the case of deep vs. wide
* dependent on transfer functions
* sensitive to initialization

In [None]:
#@markdown What is your Pennkey and pod? (text, not numbers, e.g. bfranklin)
# Colab form fields: the `#@param` annotations render these two assignments as
# a text box and a drop-down; the literals below are the placeholder defaults.
my_pennkey = 'value' #@param {type:"string"}
my_pod = 'Select' #@param ['Select', 'euclidean-wombat', 'sublime-newt', 'buoyant-unicorn', 'lackadaisical-manatee','indelible-stingray','superfluous-lyrebird','discreet-reindeer','quizzical-goldfish','astute-jellyfish','ubiquitous-cheetah','nonchalant-crocodile','fashionable-lemur','spiffy-eagle','electric-emu','quotidian-lion']


In [None]:
# @title Slides
from IPython.display import HTML
# Embed the slide deck as an inline iframe; being the last expression of the
# cell, the HTML object is what the cell displays.
HTML('<iframe src="https://docs.google.com/presentation/d/e/2PACX-1vSPvHqDTmMq4GyQy6lieNEFxq4qz1SmqC2RNoeei3_niECH53zneh8jJVYOnBIdk0Uaz7y2b9DK8V1t/embed?start=false&loop=false&delayms=3000" frameborder="0" width="480" height="299" allowfullscreen="true" mozallowfullscreen="true" webkitallowfullscreen="true"></iframe>')


Meet with your pod for 10 minutes to discuss what you learned, what was clear, and what you hope to learn more about.

In [None]:
#@markdown Tell us your thoughts about what you have learned.
# Free-text Colab form field; empty by default.
# NOTE(review): the variable says "w2" although this is the Week 3 notebook - confirm the name expected downstream.
my_w2_upshot = '' #@param {type:"string"}

---
# Setup

In [None]:
# @title Imports
import random
import pathlib

import torch
import numpy as np
import matplotlib.pyplot as plt

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, TensorDataset
from torchvision.utils import make_grid
from IPython.display import HTML, display

# Run on the GPU when CUDA is available, otherwise on the CPU. Models and
# batches in the cells below are moved to this device.
dev = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Cell output: the chosen device and the thread count reported by PyTorch.
dev, torch.get_num_threads()

In [None]:
# @title Seeding for reproducibility
import os

# One fixed seed for every random number generator the notebook draws from.
seed = 522
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # no-op without CUDA; covers every GPU otherwise

# Ask cuDNN for deterministic kernels and disable its autotuner, whose choice
# of algorithm can differ from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Deterministic cuBLAS matmuls on CUDA >= 10.2 need this workspace setting; it
# has to be in the environment before the first cuBLAS call. A value already
# chosen by the user is left alone.
os.environ.setdefault('CUBLAS_WORKSPACE_CONFIG', ':4096:8')

# `torch.set_deterministic` was deprecated and later removed in favour of
# `torch.use_deterministic_algorithms`; fall back only on old PyTorch releases.
if hasattr(torch, 'use_deterministic_algorithms'):
    torch.use_deterministic_algorithms(True)
else:
    torch.set_deterministic(True)
def seed_worker(worker_id):
    """Seed NumPy and `random` inside a DataLoader worker process.

    Intended to be passed as `worker_init_fn` to `torch.utils.data.DataLoader`.
    Inside a worker, `torch.initial_seed()` is already distinct per worker, so
    deriving the other generators from it gives every worker its own
    reproducible stream. The previous `seed % (worker_id + 1)` mapped many
    workers to the same value (0 for workers 0, 1 and 2 with seed 522), which
    made their random augmentations identical.

    Args:
        worker_id: index of the worker, supplied by the DataLoader. Unused
            here because the torch seed already encodes it.
    """
    # NumPy only accepts 32-bit seeds, hence the modulo.
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

In [None]:
# @title Dataset download
%%capture
# Remove any earlier clone so that `git clone` does not fail on an existing directory.
!rm -r AnimalFaces32x32/
!git clone https://github.com/arashash/AnimalFaces32x32
# Remove the previously unpacked image folder before unzipping again
# (presumably the archive extracts into `afhq/`, which later cells read - confirm).
!rm -r afhq/
!unzip ./AnimalFaces32x32/afhq_32x32.zip 

In [None]:
# @title Figure Settings
import ipywidgets as widgets
# Render matplotlib figures inline in the notebook output.
%matplotlib inline 
# Default figure size (width, height) applied to every later plot.
fig_w, fig_h = (8, 6)
plt.rcParams.update({'figure.figsize': (fig_w, fig_h)})
# Higher-resolution inline figures.
%config InlineBackend.figure_format = 'retina'
# Default widget layout object; not referenced elsewhere in the visible cells.
my_layout = widgets.Layout()

In [None]:
# @title Helper functions
def imshow(img):
    """Show a channel-first image tensor that was normalized to [-1, 1].

    Args:
        img: tensor of shape (C, H, W). It may live on any device and may be
            attached to an autograd graph; it is detached and copied to the
            CPU before conversion, which a bare `.numpy()` would refuse to do.
    """
    img = img.detach().cpu() / 2 + 0.5     # unnormalize: [-1, 1] -> [0, 1]
    npimg = img.numpy()
    # matplotlib expects channel-last (H, W, C)
    plt.imshow(np.transpose(npimg, (1, 2, 0)))
    plt.axis('off')
    plt.show()

def progress(epoch, loss, epochs=100):
    """Build the HTML progress bar shown while training.

    Args:
        epoch: current epoch, used as the bar's value.
        loss: training loss printed in the label above the bar.
        epochs: total number of epochs, used as the bar's maximum.

    Returns:
        An `IPython.display.HTML` object; pass it to `display(...)` or to the
        `update(...)` method of an existing display handle.
    """
    # The stray comma that used to follow the `max` attribute is removed; it
    # was parsed by browsers as a bogus extra attribute.
    return HTML("""
        <label for="file">Training loss: {loss}</label>
        <progress
            value='{epoch}'
            max='{epochs}'
            style='width: 100%'
        >
            {epoch}
        </progress>
    """.format(loss=loss, epoch=epoch, epochs=epochs))

In [None]:
# @title Part 1 Code
class Net(nn.Module):
    """Multi-layer perceptron with a configurable activation.

    Args:
        actv: activation specification. Either a string naming a `torch.nn`
            module class with optional literal constructor arguments, e.g.
            'ReLU()', 'LeakyReLU(0.1)' or 'ELU(alpha=0.5)' (a bare class name
            such as 'Tanh' is accepted too), or an already constructed
            `nn.Module`.
        num_inputs: number of input features after flattening.
        hidden_units: sequence with the width of each hidden layer; an empty
            sequence yields a purely linear model.
        num_outputs: number of output units.

    Raises:
        ValueError: if `actv` is a string that is not a plain call to a
            `torch.nn` module class with literal arguments.
    """

    def __init__(self, actv, num_inputs, hidden_units, num_outputs):
        super(Net, self).__init__()

        self.actv = self._make_activation(actv)

        self.layers = nn.ModuleList()
        for width in hidden_units:
            width = int(width)  # widths may arrive as numpy integers
            self.layers.append(nn.Linear(num_inputs, width))
            num_inputs = width

        self.out = nn.Linear(num_inputs, num_outputs)

    @staticmethod
    def _make_activation(actv):
        """Turn an activation spec into a module without executing arbitrary code."""
        import ast

        if isinstance(actv, nn.Module):
            return actv

        # Parse the spec instead of exec-ing it: only `Name` or `Name(literals...)`
        # is accepted, and the name must resolve to an nn.Module subclass.
        try:
            node = ast.parse(actv.strip(), mode='eval').body
            if isinstance(node, ast.Name):
                name, args, kwargs = node.id, [], {}
            elif isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
                name = node.func.id
                args = [ast.literal_eval(arg) for arg in node.args]
                kwargs = {kw.arg: ast.literal_eval(kw.value)
                          for kw in node.keywords}
            else:
                raise ValueError('unsupported expression')
        except (SyntaxError, ValueError) as err:
            raise ValueError('Invalid activation spec %r: %s' % (actv, err))

        actv_cls = getattr(nn, name, None)
        if not (isinstance(actv_cls, type) and issubclass(actv_cls, nn.Module)):
            raise ValueError('Unknown torch.nn activation %r' % name)
        return actv_cls(*args, **kwargs)

    def forward(self, x):
        """Flatten each sample, apply the hidden layers, return the raw outputs."""
        # flattening; reshape also copes with non-contiguous inputs
        x = x.reshape(x.shape[0], -1)

        for layer in self.layers:
            x = self.actv(layer(x))
        x = self.out(x)
        return x

# Synthetic K-class 2-D dataset: class k is a curve of radius t and angle
# 2*pi/K*(2t + k), with Gaussian noise scaled by sigma**2 added before the
# radius is applied.
K = 4        # number of classes
sigma = 0.4  # noise parameter (enters as sigma**2)
N = 1000     # samples per class
t = torch.linspace(0, 1, N)
X = torch.zeros(K*N, 2)
y = torch.zeros(K*N)
for k in range(K):
  X[k*N:(k+1)*N, 0] = t*(torch.sin(2*np.pi/K*(2*t+k)) + sigma**2*torch.randn(N))   # [TO-DO]
  X[k*N:(k+1)*N, 1] = t*(torch.cos(2*np.pi/K*(2*t+k)) + sigma**2*torch.randn(N))   # [TO-DO]
  y[k*N:(k+1)*N] = k

# Shuffling, so that the split below mixes all classes
shuffled_indices = torch.randperm(K*N)
X = X[shuffled_indices]
y = y[shuffled_indices]

# Test Train splitting: hold out 20% of ALL K*N samples. (N is the per-class
# count, so the former `int(0.2*N)` held out only 5% of the data.)
test_size = int(0.2*K*N)
X_test = X[:test_size]
y_test = y[:test_size]
X_train = X[test_size:]
y_train = y[test_size:]


batch_size = 128

# Wrap the tensors so that the loaders yield (inputs, labels) pairs.
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

# Training batches are reshuffled each epoch and the last incomplete batch is
# dropped; evaluation batches keep their order. Both load in the main process.
train_loader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
    worker_init_fn=seed_worker,
)
test_loader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    num_workers=0,
)

def train_test_classification(net, criterion, optimizer,
                              train_loader, test_loader,
                              num_epochs=1, verbose=True,
                              training_plot=False):
  """Train a classifier, then report its accuracy on both loaders.

  Args:
    net: model to train; batches are moved to the device of its parameters.
    criterion: loss called as `criterion(outputs, labels)` with integer labels.
    optimizer: optimizer already bound to `net`'s parameters.
    train_loader: iterable of (inputs, labels) batches used for training and
      for the training accuracy.
    test_loader: iterable of (inputs, labels) batches used for the test accuracy.
    num_epochs: number of passes over `train_loader`.
    verbose: show a live progress bar and print both accuracies.
    training_plot: plot the per-batch training loss after training.

  Returns:
    (train_acc, test_acc) as percentages; an accuracy is NaN when its loader
    yields no samples. The model is left in eval mode.
  """
  # Follow the model's own device rather than relying on a notebook global.
  first_param = next(net.parameters(), None)
  device = first_param.device if first_param is not None else torch.device('cpu')

  if verbose:
    progress_bar = display(progress(0, 0, num_epochs), display_id=True)

  # Losses are needed for the progress bar and for the plot; previously they
  # were only collected when `verbose` was set, leaving the plot empty otherwise.
  record_losses = verbose or training_plot

  net.train()
  training_losses = []
  for epoch in range(num_epochs):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(train_loader):
      inputs = inputs.to(device).float()
      labels = labels.to(device).long()

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = net(inputs)
      loss = criterion(outputs, labels)
      loss.backward()
      optimizer.step()

      if record_losses:
        batch_loss = loss.item()
        training_losses.append(batch_loss)
        if verbose:
          running_loss += batch_loss
          if i % 10 == 9:    # update every 10 mini-batches
            progress_bar.update(progress(epoch+1, running_loss / 10, num_epochs))
            running_loss = 0.0

  net.eval()
  def test(data_loader):
    """Return (number of samples, accuracy in percent) over `data_loader`."""
    correct = 0
    total = 0
    with torch.no_grad():  # evaluation needs no autograd graph
      for inputs, labels in data_loader:
        inputs = inputs.to(device).float()
        labels = labels.to(device).long()

        outputs = net(inputs)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    acc = 100 * correct / total if total else float('nan')
    return total, acc

  train_total, train_acc = test(train_loader)
  test_total, test_acc = test(test_loader)

  if verbose:
    print('Accuracy on the %d training samples: %0.2f %%' % (train_total, train_acc))
    print('Accuracy on the %d testing samples: %0.2f %%' % (test_total, test_acc))

  if training_plot:
    plt.plot(training_losses)
    plt.xlabel('Batch')
    plt.ylabel('Training loss')
    plt.show()

  return train_acc, test_acc

def sample_grid(M=500, x_max = 2.0):
  """Return the points of an M x M grid over [-x_max, x_max]^2.

  The result has shape (M*M, 2). Rows are ordered with the first coordinate
  varying slowest, so `view(M, M)` on per-point values restores the grid.
  """
  first_axis = torch.linspace(-x_max, x_max, M)
  second_axis = torch.linspace(-x_max, x_max, M)
  ii, jj = torch.meshgrid(first_axis, second_axis)
  # Pair up the two coordinate planes, then flatten the grid into a point list.
  return torch.stack((ii, jj), dim=-1).view(-1, 2)

def plot_decision_map(X_all, y_pred, X_test, y_test, M=500, x_max = 2.0, eps = 1e-3):
  """Plot the predicted class over a grid and overlay the test points.

  Args:
    X_all: (M*M, 2) grid points, ordered as produced by `sample_grid(M, x_max)`.
    y_pred: (M*M, num_classes) model outputs for `X_all`.
    X_test: (n, 2) test inputs to overlay.
    y_test: (n,) class labels of the test inputs.
    M: grid side length; `y_pred` must hold M*M rows.
    x_max: half-width of the plotted square, used for the axis extent.
    eps: squared radius around a test point within which grid cells are recolored.
  """
  # Do all the masking on the CPU: the model output may live on the GPU while
  # the grid and test points do not, and no autograd history is needed here.
  X_all = X_all.detach().cpu()
  X_test = X_test.detach().cpu()
  y_test = y_test.detach().cpu()
  decision_map = torch.argmax(y_pred.detach().cpu(), dim=1)    # [TO-DO]

  for i in range(len(X_test)):
    # grid cells whose squared distance to test point i is below eps
    indeces = (X_all[:, 0] - X_test[i, 0])**2 + (X_all[:, 1] - X_test[i, 1])**2 < eps    # [TO-DO]
    # offset the label by K so test points get values distinct from predictions
    decision_map[indeces] = (K + y_test[i]).long()    # [TO-DO]

  decision_map = decision_map.view(M, M)
  plt.imshow(decision_map, extent=[-x_max, x_max, -x_max, x_max], cmap='jet')
  plt.show()

In [None]:
#@title Video 10:

## Exercise 9: Implement gradient visualization

In [None]:
def plot_polytopes(grad, M=500, x_max=1):
  """Color an M x M grid by the first component of the input gradient.

  `grad` holds one gradient row per grid point (M*M rows). The first column
  is scaled by its maximum, multiplied by 1e3, truncated to an integer and
  reduced modulo 10, so points with the same gradient share a color.
  `x_max` is accepted but not used.
  """
  first_component = grad.detach().cpu()[:, 0]     # [TO-DO]
  scaled = first_component / first_component.max() * 1e3
  color_ids = scaled.int() % 10     # [TO-DO]
  color_grid = color_ids.view(M, M).cpu().numpy()
  plt.imshow(color_grid, cmap='rainbow')
  plt.show()

# Untrained one-hidden-layer ReLU network whose input gradients we visualize.
rand_net = Net('ReLU()', X_train.shape[1], [128], K).to(dev)

criterion = nn.CrossEntropyLoss()
# The grid must live on the same device as the network, otherwise the forward
# pass fails on a GPU runtime. `.to(dev)` yields a leaf tensor, so its `.grad`
# is populated by `backward()`.
X_all = sample_grid(x_max=1).to(dev)
labels = torch.zeros(len(X_all)).long()
X_all.requires_grad = True    # [TO-DO]
outputs = rand_net(X_all)

# A loss that is linear in the outputs, so its input gradient depends only on
# which ReLUs are active at each point.
loss = torch.mean(outputs)    # [TO-DO]
# loss = torch.mean(outputs**2) # try this to see how it become non-linear
loss.backward()

plot_polytopes(X_all.grad)    # [TO-DO]

## Implement gradient visualization that flows! (optional)

In [None]:
def plot_grad(X_all, grad, y_test, M=500):
  """Draw one arrow per point showing the gradient at that point.

  Args:
    X_all: (n, 2) arrow origins.
    grad: (n, 2) arrow directions.
    y_test: per-point values passed to `plt.quiver` as the arrow colors.
    M: accepted but not used.
  """
  origins = X_all.detach().cpu()
  arrows = grad.detach().cpu()

  plt.quiver(origins[:, 0], origins[:, 1],
             arrows[:, 0], arrows[:, 1], y_test)    # [TO-DO]
  plt.show()

rand_net = Net('ReLU()', X_train.shape[1], [128], K).to(dev)
# Differentiate with respect to a private copy of the test inputs placed on the
# model's device. This avoids a CPU/GPU mismatch and leaves the shared `X_test`
# (also wrapped by the test DataLoader) untouched, so there is no
# `requires_grad` flag to restore afterwards.
X_probe = X_test.detach().clone().to(dev)
X_probe.requires_grad = True    # [TO-DO]
outputs = rand_net(X_probe)

loss = nn.CrossEntropyLoss()(outputs, y_test.long().to(dev))    # [TO-DO]
loss.backward()

plot_grad(X_probe, X_probe.grad, y_test)    # [TO-DO]

# MLPs vs Linear model with Polynomial features

In [None]:
#@title Video 11:

## Exercise 10: Add polynomial features and train without any hidden layers
[Outline the goal and steps and the formulas, especially how the num_features is calculated]

In [None]:
poly_degree = 50
def make_poly_features(poly_degree, X):
  """Expand 2-D inputs into all monomials of total degree 1..poly_degree.

  Column order follows increasing power i of X[:, 0] and, within each i,
  increasing power j of X[:, 1]. The constant term (i = j = 0) is left out
  because the model has biases.

  Returns a float tensor of shape (X.shape[0], (d+1)(d+2)/2 - 1), d = poly_degree.
  """
  num_features = (poly_degree+1)*(poly_degree+2)//2-1   # [TO-DO]
  poly_X = torch.zeros((X.shape[0], num_features))
  # every exponent pair with 0 < i + j <= poly_degree, in column order
  exponents = [(i, j)
               for i in range(poly_degree+1)
               for j in range(poly_degree+1-i)
               if i+j > 0]
  for column, (i, j) in enumerate(exponents):
    poly_X[:, column] = X[:, 0]**i * X[:, 1]**j   # [TO-DO]
  return poly_X

# Expand both splits with the same polynomial degree so their columns line up.
poly_X_test = make_poly_features(poly_degree, X_test)
poly_X_train = make_poly_features(poly_degree, X_train)

# Loaders over the expanded features; the labels are unchanged.
# NOTE(review): unlike the earlier loaders these use one worker process and no
# `worker_init_fn` - confirm that is intended.
batch_size = 128
poly_test_data = TensorDataset(poly_X_test, y_test)
poly_test_loader = DataLoader(poly_test_data, batch_size=batch_size,
                        shuffle=False, num_workers=1)
poly_train_data = TensorDataset(poly_X_train, y_train)
poly_train_loader = DataLoader(poly_train_data, batch_size=batch_size,
                        shuffle=True, num_workers=1)

In [None]:
# An empty hidden-layer list makes `Net` a single linear layer, so the
# activation argument has no effect: this is a linear classifier on the
# polynomial features.
poly_net = Net('ReLU()', poly_X_train.shape[1], [], K).to(dev)    # [TO-DO] 
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(poly_net.parameters(), lr=1e-3)
num_epochs = 100
# Only the printed accuracies matter here; the returned values are discarded.
_, _ = train_test_classification(poly_net, criterion, optimizer, 
                                 poly_train_loader, poly_test_loader,
                                 num_epochs=num_epochs)

In [None]:
X_all = sample_grid()
poly_X_all = make_poly_features(poly_degree, X_all)
# Evaluate on the model's device and without autograd: the grid has 250,000
# rows, so tracking gradients would waste memory, and feeding a CPU tensor to
# a model on the GPU would fail.
with torch.no_grad():
  y_pred = poly_net(poly_X_all.to(dev)).cpu()
plot_decision_map(X_all, y_pred, X_test, y_test)

# Wider vs deeper networks

In [None]:
#@title Video 12:

## Exercise 11: Wider vs. Deeper while keeping number of parameters same
Let's find the optimal number of hidden layers under a fixed budget on the total number of parameters!

But first, let's implement a model parameter counter

In [None]:
def count_parameters(model):
  """Return the number of trainable scalars in `model` (fancy implementation)."""
  trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
  return sum(trainable_sizes)

def count_parameters(model):
  """Return the number of trainable scalars in `model` (more didactic implementation)."""
  total = 0
  for param in model.parameters():
    # frozen parameters do not count
    if not param.requires_grad:
      continue
    total += param.numel()     # [TO-DO]
  return total

In [None]:
# For each depth, widen all hidden layers together until the network reaches
# the parameter budget, then train it and record the test accuracy.
max_par_count = 100
max_hidden_layer = 5
hidden_layers = range(1, max_hidden_layer+1)    # [TO-DO]
test_scores = []
for hidden_layer in hidden_layers:
  # `np.int` was removed from NumPy (it was an alias of the builtin `int`).
  hidden_units = np.ones(hidden_layer, dtype=int)    # [TO-DO]
  wide_net = Net('ReLU()', X_train.shape[1], hidden_units, K).to(dev)
  par_count = count_parameters(wide_net)
  # Grow every layer by one unit at a time until the budget is met or exceeded.
  while par_count < max_par_count:
    hidden_units += 1
    wide_net = Net('ReLU()', X_train.shape[1], hidden_units, K).to(dev)
    par_count = count_parameters(wide_net)

  criterion = nn.CrossEntropyLoss()
  optimizer = optim.Adam(wide_net.parameters(), lr=1e-3)
  num_epochs = 100
  _, test_acc = train_test_classification(wide_net, criterion, optimizer, train_loader,
                                          test_loader, num_epochs=num_epochs)    # [TO-DO]
  test_scores += [test_acc]

plt.xlabel('# of hidden layers')
plt.ylabel('Test accuracy')
plt.plot(hidden_layers, test_scores)
plt.show()

# Neural Tangent Kernels (NTKs)

In [None]:
#@title Video 13:

## Exercise 12: Motivation for NTKs
Lazy training of overcomplete (overparameterized) MLPs results in weights that change approximately linearly during training. Let's try to see it here:

In [None]:
# A very wide single-hidden-layer network trained with plain SGD.
net = Net('ReLU()', X_train.shape[1], [1000], K).to(dev)    # [TO-DO] 
criterion = nn.CrossEntropyLoss()    # [TO-DO]
optimizer = optim.SGD(net.parameters(), lr=1e-2)    # [TO-DO]
num_select_weights = 10   # how many first-layer weights to track
num_time_steps = 5        # number of snapshots
step_epoch = 40           # epochs trained between snapshots
weights = torch.zeros(num_time_steps, num_select_weights)
for i in range(num_time_steps):
  _, _ = train_test_classification(net, criterion, optimizer, train_loader,
                                  test_loader, num_epochs=step_epoch, verbose=False)
  # Store a detached CPU copy so the snapshot carries no autograd history.
  weights[i] = net.layers[0].weight[:num_select_weights, 0].detach().cpu()    # [TO-DO]

# Snapshot i is taken AFTER (i+1)*step_epoch epochs, so the x-axis runs
# 40, 80, ..., 200 (it was previously drawn at 1, 41, ..., 161).
epochs = range(step_epoch, step_epoch*num_time_steps+1, step_epoch)
for k in range(num_select_weights):
  weight = weights[:, k]    # [TO-DO]
  plt.plot(epochs, weight, label='weight #%d'%k)

plt.xlabel('epochs')
plt.ylabel('weight value')
plt.legend()
plt.show()
# One more (verbose) epoch, mainly to print the final accuracies.
_, _ = train_test_classification(net, criterion, optimizer, train_loader, test_loader, num_epochs=1)

# Deeper MLPs
[The ability of deeper MLP to approximate a broader set of functions]

In [None]:
#@title Video 14:

## Exercise 13: Classification on a real world dataset
[Outline the goal and steps, introducing the need for augmentation, preprocessing (bring to -1,1 range), small batch_size (due to overfitting), multithreading in data loaders]
[Ask to choose a good choice for augmentation and preprocessing]

In [None]:
# Data Loaders
batch_size = 128
# Training pipeline: light augmentation (small random rotation, random
# horizontal flip), then conversion to a tensor and per-channel normalization
# with mean 0.5 and std 0.5.
train_transform = transforms.Compose([
     transforms.RandomRotation(10), # [TO-DO]
     transforms.RandomHorizontalFlip(),    # [TO-DO]
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    # [TO-DO] example of a simple one
     ])

data_path = pathlib.Path('.')/'afhq' # using pathlib to be compatible with all OS's
# ImageFolder assigns labels from the sub-directory names under train/.
img_train_dataset = ImageFolder(data_path/'train', transform=train_transform)
# num_workers can be set to 10 if running on Colab Pro TPUs
# NOTE(review): 10 worker processes may exceed the CPU count of a standard runtime - confirm.
img_train_loader = DataLoader(img_train_dataset, batch_size=batch_size,
                              shuffle=True, num_workers=10, worker_init_fn=seed_worker)

# Evaluation pipeline: same normalization, no augmentation.
test_transform = transforms.Compose([
     transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))    # [TO-DO]
     ])
img_test_dataset = ImageFolder(data_path/'val', transform=test_transform)
img_test_loader = DataLoader(img_test_dataset, batch_size=batch_size,
                        shuffle=False, num_workers=1)
# Human-readable class names (presumably in ImageFolder's label order - confirm).
classes = ('cat', 'dog', 'wild')

In [None]:
# get some random training images
# Pull one shuffled batch from the training loader.
dataiter = iter(img_train_loader)
# DataLoader iterators no longer expose a `.next()` method; the builtin
# `next()` works on every PyTorch version.
images, labels = next(dataiter)

# show images
imshow(make_grid(images, nrow=16))

In [None]:
# Two-hidden-layer MLP on flattened 3x32x32 images with three output classes.
net = Net('ReLU()', 3*32*32, [128, 32], 3).to(dev) 
# Multi-class margin (hinge) loss instead of cross-entropy.
criterion = nn.MultiMarginLoss(margin=1.0)    # [TO-DO]
optimizer = optim.Adam(net.parameters(), lr=3e-4)
_, _ = train_test_classification(net, criterion, optimizer,
                                img_train_loader, img_test_loader,
                                num_epochs=50)

In [None]:
# View each first-layer unit's weight row as a 3x32x32 image.
fc1_weights = net.layers[0].weight.view(128, 3, 32, 32).detach().cpu()     # [TO-DO]
# Scale into [-1, 1] for display. This must be out of place: on a CPU runtime
# the detached view shares storage with the layer, so `/=` would silently
# rescale the trained weights themselves.
fc1_weights = fc1_weights / torch.max(torch.abs(fc1_weights))
imshow(make_grid(fc1_weights, nrow=16))

# The choice of transfer function matters
[introduce different properties of different transfer functions]

In [None]:
#@title Video 15:

## Exercise 14: Find the best transfer function for this model
[categorizing Pytorch transfer functions according to their properties in a table so that students could have a better educated guess on which one to pick instead of trying all!]

In [None]:
# Possible Activations
# Names of torch.nn activation classes to choose from.
Activations = ['ReLU', 'Tanh', 'Sigmoid', 'ELU', 'Hardshrink', 'Hardsigmoid',
'Hardtanh', 'Hardswish', 'LeakyReLU', 'LogSigmoid', 'PReLU',
'ReLU6', 'RReLU', 'SELU', 'CELU', 'GELU', 'SiLU', 'Softplus',
'Softshrink', 'Softsign', 'Tanhshrink']

your_picks = ['Hardswish'] # [TO-DO] other picks above 91.5% test accuracy is acceptable

# Train one fresh network per picked activation, with the same architecture,
# loss and optimizer settings each time. After the loop `net` is the last one.
for actv in your_picks:
  print(actv)
  # `Net` expects a constructor call such as 'ReLU()', so append the parentheses.
  actv = actv+'()'
  net = Net(actv, 3*32*32, [128, 32], 3).to(dev) 
  criterion = nn.MultiMarginLoss(margin=1.0)
  optimizer = optim.Adam(net.parameters(), lr=3e-4)
  _, _ = train_test_classification(net, criterion, optimizer,
                                  img_train_loader, img_test_loader,
                                  num_epochs=50)

In [None]:
# View each first-layer unit's weight row as a 3x32x32 image.
fc1_weights = net.layers[0].weight.view(128, 3, 32, 32).detach().cpu()
# Scale into [-1, 1] for display. This must be out of place: on a CPU runtime
# the detached view shares storage with the layer, so `/=` would silently
# rescale the trained weights themselves.
fc1_weights = fc1_weights / torch.max(torch.abs(fc1_weights))
imshow(make_grid(fc1_weights, nrow=16))

# The need for good initialization
[The discussion about why the optimal gain depends on the transfer function and what the theoretically optimal gain is for Leaky ReLU]

In [None]:
#@title Video 16:

## Xavier initialization
Let us look at the scale distribution of an output (e.g., a hidden variable) $o_i$ for some fully-connected layer without nonlinearities. For this layer, with $n_{in}$ inputs $x_j$ and their associated weights $w_{ij}$, an output is given by
$$
o_{i} = \sum_{j=1}^{n_\mathrm{in}} w_{ij} x_j
$$
The weights  $w_{ij}$  are all drawn independently from the same distribution. Furthermore, let us assume that this distribution has zero mean and variance  $\sigma^2$ . Note that this does not mean that the distribution has to be Gaussian, just that the mean and variance need to exist. For now, let us assume that the inputs to the layer  $x_j$ also have zero mean and variance  $\gamma^2$  and that they are independent of $w_{ij}$ and independent of each other. In this case, we can compute the mean and variance of $o_i$ as follows:
\begin{split}\begin{aligned}
    E[o_i] & = \sum_{j=1}^{n_\mathrm{in}} E[w_{ij} x_j] = \sum_{j=1}^{n_\mathrm{in}} E[w_{ij}] E[x_j] = 0, \\
    \mathrm{Var}[o_i] & = E[o_i^2] - (E[o_i])^2 = \sum_{j=1}^{n_\mathrm{in}} E[w^2_{ij} x^2_j] - 0 = \sum_{j=1}^{n_\mathrm{in}} E[w^2_{ij}] E[x^2_j] = n_\mathrm{in} \sigma^2 \gamma^2
\end{aligned}\end{split}
One way to keep the variance fixed is to set $n_{in}\sigma^2=1$ . Now consider backpropagation. There we face a similar problem, albeit with gradients being propagated from the layers closer to the output. Using the same reasoning as for forward propagation, we see that the gradients’ variance can blow up unless $n_{out}\sigma^2=1$ , where  $n_{out}$ is the number of outputs of this layer. This leaves us in a dilemma: we cannot possibly satisfy both conditions simultaneously. Instead, we simply try to satisfy:
\begin{aligned}
\frac{1}{2} (n_\mathrm{in} + n_\mathrm{out}) \sigma^2 = 1 \text{ or equivalently }
\sigma = \sqrt{\frac{2}{n_\mathrm{in} + n_\mathrm{out}}}
\end{aligned}
This is the reasoning underlying the now-standard and practically beneficial Xavier initialization, named after the first author of its creators [Glorot & Bengio, 2010]. Typically, the Xavier initialization samples weights from a Gaussian distribution with zero mean and variance  $\sigma^2=\frac{2}{(n_{in}+n_{out})}$. We can also adapt Xavier’s intuition to choose the variance when sampling weights from a uniform distribution. Note that the uniform distribution $U(−a,a)$ has variance $\frac{a^2}{3}$. Plugging this into our condition on $\sigma^2$ yields the suggestion to initialize according to
$$
U\left(-\sqrt{\frac{6}{n_\mathrm{in} + n_\mathrm{out}}}, \sqrt{\frac{6}{n_\mathrm{in} + n_\mathrm{out}}}\right)
$$
This explanation is mainly taken from [here](https://d2l.ai/chapter_multilayer-perceptrons/numerical-stability-and-init.html).


## Initialization with transfer function
Let's derive the optimal gain for LeakyReLU following similar steps:

$$
f(x)=\left\{\begin{array}{ll}
a x & \text { for } x<0 \\
x & \text { for } x \geq 0
\end{array}\right.
$$

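Considering a single layer followed by this activation gives,

$$
f(o_i) = f\left(\sum_{j=1}^{n_\mathrm{in}} w_{ij} x_j\right)
$$

The pre-activation $o_i$ still has zero mean, but the scale of the output changes. Treating the output as zero-mean as before, and assuming $o_i$ is symmetric about zero so that $P(o_i < 0) = 0.5$, we get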

\begin{split}\begin{aligned}
    \mathrm{Var}[f(o_i)] = E[f(o_i)^2] & = \frac{\mathrm{Var}[o_i] + a^2 \mathrm{Var}[o_i]}{2} = \frac{1+a^2}{2}n_\mathrm{in} \sigma^2 \gamma^2
\end{aligned}\end{split}

Therefore following the rest of derivation as before,

$$
\sigma = gain\sqrt{\frac{2}{n_\mathrm{in} + n_\mathrm{out}}}, \quad gain = \sqrt{\frac{2}{1+a^2}}
$$

## Exercise 15: Best gain for Xavier Initialization with Leaky ReLU

In [None]:
# Sweep the Xavier gain, train each network for one epoch and compare the
# empirically best gain with the theoretical one for LeakyReLU.
# (Named `num_trials` instead of `N` so the dataset's per-class sample count
# `N` is not overwritten for anyone re-running earlier cells.)
num_trials = 10 # number of trials
gains = np.linspace(1/num_trials, 3.0, num_trials)
negative_slope = 0.1
test_accs = []
train_accs = []
for gain in gains:

  def init_weights(m):
    """Xavier-initialize every linear layer with the current gain."""
    if isinstance(m, nn.Linear):
        torch.nn.init.xavier_normal_(m.weight, gain) # [TO-DO]
        # torch.nn.init.xavier_uniform_(m.weight, gain)

  actv = 'LeakyReLU(%f)'%negative_slope # [TO-DO]
  net = Net(actv, 3*32*32, [128, 64, 32], 3).to(dev) 
  net.apply(init_weights) # [TO-DO]
  criterion = nn.CrossEntropyLoss()
  # criterion = nn.MultiMarginLoss(margin=1.0)
  optimizer = optim.SGD(net.parameters(), lr=1e-2) # [TO-DO]
  train_acc, test_acc = train_test_classification(net, criterion, optimizer,
                                                  img_train_loader, img_test_loader,
                                                  num_epochs=1, verbose=False)
  test_accs += [test_acc]
  train_accs += [train_acc]

# The best gain is picked on the training accuracy.
best_gain = gains[np.argmax(train_accs)]
plt.plot(gains, test_accs, label='Test')
plt.plot(gains, train_accs, label='Train')
plt.scatter(best_gain, max(train_accs), label='argmax gain = %.1f'%best_gain, c='r')
# gain = sqrt(2 / (1 + a^2)) for a LeakyReLU with negative slope a
theoretical_gain = np.sqrt(2.0 / (1 + negative_slope ** 2)) # [TO-DO]
plt.scatter(theoretical_gain, max(train_accs), label='theoretical gain = %.2f'%theoretical_gain, c='g')
plt.xlabel('Xavier gain')
plt.ylabel('Accuracy (%)')
plt.legend()
plt.show()

# Conclusion
[Overcomplete MLPs are good (make link to neural tangent kernels), show how the infinite width limit produces beautifully smooth interpolations]

[High dimensional spaces intuition, the idea of ensemble methods, See how mixing multiple models often helps. Conceptualize ANNs as many models in parallel]

In [None]:
#@title Video 17:

# Feedback
How could this session have been better? How happy are you in your group? How do you feel right now?

In [None]:
# report to Airtable

# Homeworks
* (1) Join the Kaggle Competition to solve Animal Faces with random permutations using MLPs.
* (2) Something where you debug something: Cross-entropy optimization with poor initialization, producing NaNs.
* (3) Something related to ethics: A classification system with interest bias?
* (4) Read some cool original paper:  Kernel vs Rich regimes paper?


# Kaggle competition
https://www.kaggle.com/c/permuted-animal-faces/

preparing the Kaggle dataset by permuting the animal faces

[ To-be removed in the end]

In [None]:

# Datasets
# Augmentation-only pipelines: there is no ToTensor/Normalize here, so the
# datasets below yield the augmented images themselves (converted with
# `np.array` in later cells).
# NOTE(review): this rebinds `train_transform`, `test_transform`,
# `img_train_dataset` and `img_test_dataset` from the exercise cells above;
# re-running those exercises afterwards requires re-running their loader cell.
train_transform = transforms.Compose([
     transforms.RandomRotation(10),
     transforms.RandomHorizontalFlip(),
     ])

data_path = pathlib.Path('.')/'afhq' # using pathlib to be compatible with all OS's
img_train_dataset = ImageFolder(data_path/'train', transform=train_transform)

# NOTE(review): the held-out split is randomly augmented too - confirm this is intended.
test_transform = transforms.Compose([
     transforms.RandomRotation(10),
     transforms.RandomHorizontalFlip(),
     ])
img_test_dataset = ImageFolder(data_path/'val', transform=test_transform)

In [None]:
# One fixed random ordering of the 3*32*32 flattened pixel positions, applied
# identically to every image in the cells below.
rand_perm = np.random.permutation(3*32*32)

In [None]:
import json
# Save the permutation as a JSON list so it can be reloaded later.
with open("rand_perm.txt", "w") as fp:
  json.dump(rand_perm.tolist(), fp)

In [None]:
# One row per validation image: column 0 holds the label, the remaining
# 3*32*32 columns hold the permuted uint8 pixel values.
# NOTE(review): this rebinds `test_data`, `X` and `y`, which earlier cells use
# for the synthetic dataset; those cells must be re-run before reusing them.
test_data = np.zeros((len(img_test_dataset), 1+3*32*32),
                     dtype=np.uint8)
for i, data in enumerate(img_test_dataset):
  X, y = data
  # flatten the image, then apply the shared pixel permutation
  X = np.array(X, dtype=np.uint8).reshape(-1)
  X = X[rand_perm]
  test_data[i, 0] = y
  test_data[i, 1:] = X
# shuffle the rows (np.random.permutation shuffles along the first axis)
test_data = np.random.permutation(test_data)

In [None]:
# Column names for the pixel columns: pixel1 ... pixel3072.
header = ['pixel%d'%(i+1) for i in range(3*32*32)]

import csv
# test.csv: pixel columns only, labels withheld.
with open('test.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(test_data[:, 1:].tolist())

# solution.csv: 1-based image id together with its true label.
header = ['ImageId', 'Label']
with open('solution.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows([row+1, test_data[row, 0]]
                     for row in range(test_data.shape[0]))

# sampleSubmission.csv: same layout, with the constant label 1 in every row.
header = ['ImageId', 'Label']
with open('sampleSubmission.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows([row+1, 1] for row in range(test_data.shape[0]))

In [None]:
# One row per training image: column 0 holds the label, the remaining
# 3*32*32 columns hold the permuted uint8 pixel values.
# NOTE(review): this rebinds `train_data`, `X` and `y`, which earlier cells use
# for the synthetic dataset; those cells must be re-run before reusing them.
train_data = np.zeros((len(img_train_dataset), 1+3*32*32),
                     dtype=np.uint8)
for i, data in enumerate(img_train_dataset):
  X, y = data
  # flatten the image, then apply the shared pixel permutation
  X = np.array(X, dtype=np.uint8).reshape(-1)
  X = X[rand_perm]
  train_data[i, 0] = y
  train_data[i, 1:] = X
# shuffle the rows (np.random.permutation shuffles along the first axis)
train_data = np.random.permutation(train_data)

In [None]:
import csv

# train.csv: the label followed by the pixel columns. The pixel column names
# are spelled exactly as in test.csv ('pixel1', not 'pixel 1'), so the same
# column list selects features from both files.
header = ["Label"] + ['pixel%d'%(i+1) for i in range(3*32*32)]

with open('train.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(header)
    writer.writerows(train_data.tolist())