## Let's import the stuff we need in this notebook

In [None]:
import matplotlib.pyplot as plt
from matplotlib.tri import Triangulation
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F 

## Let's define two data distributions in 2d
These functions generate two distributions of points with 2d coordinates. 

In [None]:
def distrib1(n):
    """Draw ``n`` random 2d points for the first class.

    The first coordinate is uniform on [-0.5, 0.5). The second coordinate is
    a tilted cosine wave of the first one (offset 0.4) plus Gaussian noise
    with standard deviation 0.1. Each point is additionally moved down by 0.8
    whenever an independent standard-normal draw is negative, which splits
    the class into two bands.

    Returns:
        Array of shape ``(n, 2)``.
    """
    pts = np.random.rand(n, 2) - 0.5
    u = pts[:, 0]
    wave = 0.4 - 0.2 * u - 0.3 * np.cos(6 * np.pi * u)
    # Noise is drawn before the shift mask; under a fixed seed the draw
    # order determines which random numbers end up where.
    jitter = 0.1 * np.random.randn(n)
    drop = (np.random.randn(n) < 0) * 0.8
    pts[:, 1] = wave + jitter - drop
    return pts

def distrib2(n):
    """Draw ``n`` random 2d points for the second class.

    The first coordinate is uniform on [-0.5, 0.5). The second coordinate is
    a tilted cosine wave of the first one plus Gaussian noise with standard
    deviation 0.05.

    Returns:
        Array of shape ``(n, 2)``.
    """
    pts = np.random.rand(n, 2) - 0.5
    u = pts[:, 0]
    wave = -0.2 * u - 0.3 * np.cos(6 * np.pi * u)
    jitter = 0.05 * np.random.randn(n)
    pts[:, 1] = wave + jitter
    return pts


## Training samples and their visualization
Let's sample n examples from each of the two distributions and visualize them. The task of our classification network will be to predict, from the 2d coordinates of a point, which of the two classes the point belongs to. 

In [None]:
# Draw the same number of points from each distribution.
n = 110
x1, x2 = distrib1(n), distrib2(n)
print(x1.shape, x2.shape)

# One marker style per class.
for points, marker, name in ((x1, 'x', 'Class 0'), (x2, 'o', 'Class 1')):
    plt.plot(points[:, 0], points[:, 1], marker, label=name)
plt.legend()
plt.grid()
plt.show()

## Concatenate all points and generate class labels in order to have training data

In [None]:
# Inputs: class-0 points first, then class-1 points, as float32.
Xtrain = torch.from_numpy(np.concatenate((x1, x2), axis=0)).type(torch.FloatTensor)

# Labels in the same order: n zeros followed by n ones, as int64
# (the integer type expected by the classification losses).
Ytrain = torch.from_numpy(np.concatenate((np.zeros(n), np.ones(n)))).type(torch.LongTensor)

## Let's define a simple (fully connected) network

In [None]:
# Let's define a network architecture
class myNet(nn.Module):
    """Fully connected classifier for 2d points with two hidden ReLU layers.

    Args:
        d_hidden: Width of both hidden layers. Defaults to 40.
    """

    def __init__(self, d_hidden=40):
        super().__init__()
        self.fc1 = nn.Linear(2, d_hidden)
        self.fc2 = nn.Linear(d_hidden, d_hidden)
        self.fc3 = nn.Linear(d_hidden, 2)

    def forward(self, x):
        """Map inputs with 2 features to 2 unnormalized class scores.

        No softmax is applied here; the scores are meant to be fed to a
        loss (or softmax) that normalizes them.
        """
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)
        
        

## And train the network

In [None]:
## Create a fresh network and fit it with plain cross-entropy
model = myNet()

# Full-batch training: every step uses all training points at once.
criterion = torch.nn.CrossEntropyLoss()
learning_rate = 1e-2
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(10001):
    optimizer.zero_grad()
    y_pred = model(Xtrain)
    loss = criterion(y_pred, Ytrain)
    loss.backward()
    optimizer.step()
    # Report the loss every 1000 epochs.
    if epoch % 1000 == 0:
        print(epoch, loss.item())

## Visualization of the result
A nice way to visualize the predictions of a network in such a simple 2d example is to feed an entire meshgrid of coordinates (i.e. a dense sampling of the 2d plane) into the network and to show the predictions as a contour plot.

#### What is a meshgrid?

In [None]:
# Five evenly spaced 1d coordinates from -1 to 1
x = torch.linspace(-1,1,5)
print(x)

In [None]:
# Expand the 1d coordinates into two 2d coordinate arrays, one per axis.
# NOTE(review): recent PyTorch releases warn when `indexing` is not passed
# to torch.meshgrid -- confirm against the installed version.
grid_y, grid_x = torch.meshgrid(x, x)
print(grid_x)

#### The visualization function

In [None]:
def visualizeMyDecisionBoundary(model):
    """Plot the model's predicted class-0 probability over a dense 2d grid.

    The model is switched to eval mode and evaluated without gradient
    tracking on a 100 x 100 grid spanning [-0.7, 0.7] x [-0.9, 0.9]. The
    class-0 probability is drawn as a filled contour plot, and the sampled
    points ``x1`` and ``x2`` (module-level names) are drawn on top.

    Args:
        model: Network mapping rows of 2d coordinates to two class scores.
    """
    resolution = 100
    model.eval()
    with torch.no_grad():
        horizontal = torch.linspace(-0.7, 0.7, resolution)
        vertical = torch.linspace(-0.9, 0.9, resolution)
        # xx holds the first coordinate of every grid node, yy the second.
        xx, yy = torch.meshgrid(horizontal, vertical)
        # One (first, second) coordinate row per grid node.
        nodes = torch.cat((xx.reshape(-1, 1), yy.reshape(-1, 1)), 1)

        scores = model(nodes)
        class0_prob = F.softmax(scores, dim=1)[:, 0]
        # Back to the grid layout so it can be drawn as an image.
        prob_image = class0_prob.reshape(resolution, resolution)

    levels = np.linspace(0, 1, 20)
    plt.contourf(xx.numpy(), yy.numpy(), prob_image.numpy(), levels, cmap=plt.cm.bone)

    for points, marker, name in ((x1, 'x', 'Class 0'), (x2, 'o', 'Class 1')):
        plt.plot(points[:, 0], points[:, 1], marker, label=name)
    plt.legend()
    plt.grid()

    

In [None]:
# Decision boundary of the network trained with plain cross-entropy
visualizeMyDecisionBoundary(model)

## Now let us define the label smoothing loss ...

In [None]:
class LabelSmoothingLoss(nn.Module):
    """Cross-entropy against label-smoothed targets.

    The true class receives probability ``1 - smoothing``; the remaining
    ``smoothing`` mass is split evenly over the other classes. With
    ``smoothing=0`` the targets are one-hot.

    Args:
        smoothing: Probability mass moved away from the true class; must
            satisfy ``0 <= smoothing < 1``.

    Raises:
        ValueError: If ``smoothing`` is outside ``[0, 1)``.
    """

    def __init__(self, smoothing=0.0):
        super().__init__()
        self._check_smoothing(smoothing)
        self.smoothing = smoothing

    @staticmethod
    def _check_smoothing(smoothing):
        """Reject smoothing values outside ``[0, 1)``."""
        # An explicit raise (rather than assert) keeps the check active
        # when Python runs with -O.
        if not 0 <= smoothing < 1:
            raise ValueError(
                "smoothing must satisfy 0 <= smoothing < 1, got {!r}".format(smoothing)
            )

    def forward(self, prediction, target):
        """Return the mean smoothed cross-entropy over the batch.

        Args:
            prediction: Unnormalized class scores with classes along dim 1.
            target: Integer class indices, one per row of ``prediction``.

        Raises:
            ValueError: If ``self.smoothing`` is outside ``[0, 1)``.
        """
        # Checked again because the attribute can be reassigned after
        # construction.
        self._check_smoothing(self.smoothing)
        neg_log_probs = -prediction.log_softmax(dim=1)

        # The smoothed targets are constants; no gradient flows through them.
        with torch.no_grad():
            off_value = self.smoothing / (prediction.size(1) - 1)
            smoothed_labels = torch.full_like(prediction, off_value)
            smoothed_labels.scatter_(1, target.unsqueeze(1), 1 - self.smoothing)
        return torch.mean(torch.sum(smoothed_labels * neg_log_probs, dim=1))

## ... and train a network with it

In [None]:
## Create a fresh network and fit it with the label smoothing loss
modelLS = myNet()

# Full-batch training: every step uses all training points at once.
criterion = LabelSmoothingLoss(smoothing=0.1)   # <<<< label smoothing replaces plain cross-entropy here
learning_rate = 1e-2
optimizer = torch.optim.Adam(modelLS.parameters(), lr=learning_rate)
for epoch in range(10000):
    optimizer.zero_grad()
    y_pred = modelLS(Xtrain)
    loss = criterion(y_pred, Ytrain)
    loss.backward()
    optimizer.step()
    # Report the loss every 1000 epochs.
    if epoch % 1000 == 0:
        print(epoch, loss.item())
            

## Let's visualize the decision boundary of the network trained with label smoothing

In [None]:
# Decision boundary of the network trained with the label smoothing loss
visualizeMyDecisionBoundary(modelLS)

##### Reminder - this was the original decision boundary

In [None]:
# Plot the cross-entropy model once more for a side-by-side comparison
visualizeMyDecisionBoundary(model)

## Let's play with Maxup!

In [None]:
## Implementation inspired by https://github.com/JonasGeiping/data-poisoning/
# see forest / data / mixing_data_augmentations.py

class Maxup(torch.nn.Module):
    """A meta-augmentation that trains on the worst of several augmented copies.

    Follows the original Maxup paper, https://arxiv.org/abs/2002.09024.

    Args:
        given_data_augmentation: Callable ``(x, y) -> (x_aug, y_aug)`` that
            produces one augmented copy of a batch.
        ntrials: Number of augmented copies drawn per batch.
    """

    def __init__(self, given_data_augmentation, ntrials=4):
        """Initialize with a given data augmentation module."""
        super().__init__()
        self.augment = given_data_augmentation
        self.ntrials = ntrials
        # Per-sample losses are needed so the maximum can be taken per sample.
        self.max_criterion = torch.nn.CrossEntropyLoss(reduction='none')

    def forward(self, x, y):
        """Return ``ntrials`` augmented copies of the batch, concatenated along dim 0.

        The result is laid out trial by trial: for a batch of size ``B``,
        rows ``t * B`` to ``(t + 1) * B - 1`` hold the t-th augmentation of
        the whole batch.
        """
        augmented = [self.augment(x, y) for _ in range(self.ntrials)]
        additional_x = torch.cat([x_out for x_out, _ in augmented], dim=0)
        additional_labels = torch.cat([y_out for _, y_out in augmented], dim=0)

        return additional_x, additional_labels

    def maxup_loss(self, outputs, extra_labels):
        """Compute loss. Here the loss is computed as worst-case estimate over the trials.

        Args:
            outputs: Model scores for the concatenated batch produced by
                ``forward`` (first dimension ``ntrials * B``).
            extra_labels: Matching labels from ``forward``.

        Returns:
            Tuple ``(loss, correct_preds)``: the mean over samples of the
            largest per-trial loss, and the number of correct predictions
            averaged over the trials.
        """
        batch_size = outputs.shape[0] // self.ntrials
        hits = (torch.argmax(outputs.detach(), dim=1) == extra_labels).sum().item()
        correct_preds = hits / self.ntrials

        per_copy_loss = self.max_criterion(outputs, extra_labels)
        # forward() concatenates whole batches trial by trial, so the trial
        # index is the LEADING axis. Viewing the losses as
        # (batch_size, ntrials) would group different samples together and
        # take the maximum over the wrong entries.
        stacked_loss = per_copy_loss.view(self.ntrials, batch_size, -1)
        loss = stacked_loss.max(dim=0)[0].mean()

        return loss, correct_preds

In [None]:
def myNoiseAdditionAugmenter(x, y, sigma=0.03):
    """Perturb ``x`` with i.i.d. Gaussian noise and pass ``y`` through.

    Args:
        x: Input tensor to perturb.
        y: Labels; returned unchanged.
        sigma: Standard deviation of the added noise. Defaults to 0.03.

    Returns:
        Tuple ``(x_noisy, y)`` where ``x_noisy`` has the same shape as ``x``.
    """
    return x + sigma * torch.randn_like(x), y

### Let's try our maxup implementation out in training

In [None]:
## Create a fresh network and fit it with the Maxup loss
modelMaxup = myNet()

# Two noisy copies of the training set are drawn in every step.
maxup = Maxup(myNoiseAdditionAugmenter, ntrials=2)
learning_rate = 1e-2
optimizer = torch.optim.Adam(modelMaxup.parameters(), lr=learning_rate)
for epoch in range(10000):
    optimizer.zero_grad()

    # Augment detached copies; the training tensors are passed on unmodified.
    inputs, targets = maxup(Xtrain.detach().clone(), Ytrain.detach().clone())
    y_pred = modelMaxup(inputs)
    # maxup_loss returns (loss, correct_preds); only the loss is optimized.
    loss = maxup.maxup_loss(y_pred, targets.long())[0]

    loss.backward()
    optimizer.step()
    # Report the loss every 1000 epochs.
    if epoch % 1000 == 0:
        print(epoch, loss.item())
            

In [None]:
# Decision boundary of the network trained with Maxup
visualizeMyDecisionBoundary(modelMaxup)