# Comparison with torch.nn modules
Implements everything that is asked in mini-project _with_ the use of the torch.nn functions. Specifically:
* Generates a training and a test set of 1000 points sampled uniformly in [0,1]x[0,1] each with a label 0 if outside the disk of radius 1/sqrt(2*pi) and label 1 inside,
* builds a network with two input units, two output units, three hidden layers of 25 units
* trains it with MSE, logging the loss
* computes and prints the final train and the test errors

In [108]:
import torch
from torch import Tensor
from torch import nn  # not allowed in project 2; just for baseline comparison
from torch.autograd import Variable
from math import pi

In [135]:
def generate_disc_set(nb, one_hot=False):
    """return torch.FloatTensor of size nb x 2 and 
    torch.FloatTensor of size nb (if one_hot=False) or nb x 2 (if one_hot=True) """
    coordinates = Tensor(nb, 2).uniform_(0, 1)
    target = coordinates.pow(2).sum(dim=1).sub(1.0/(2.0*pi)).sign().add(1).div(2).long() # 0.5* [sign(x^2 + y^2 - 2/pi) + 1]
    if one_hot:
        # Useful for MSE loss: convert from scalar labels to vector labels
        target_one_hot = -torch.ones((nb, 2)).long()
        # Stupid Tensor does not seem to have an efficient way to do this; call it quits and use ugly loop
        for k in range(target.size(0)):
            target_one_hot[k, target[k]] = 1
        target = target_one_hot
    return (coordinates, target)

def check_and_normalize(train_input, train_target, test_input, test_target, one_hot=False):
    """Normalize train and test data by subtracting train mean and dividing by train standard deviation"""
    if one_hot:
        in_disk_train = (train_target[:, 1]==1).sum()
        in_disk_test = (test_target[:, 1]==1).sum()
    else:
        in_disk_train = train_target.sum()
        in_disk_test = test_target.sum()

    print("Sanity check: {:d} out of {:d} training points inside disk, i.e."
          " {:3.2f}% (expected: 50%).".format(in_disk_train, train_input.size(0), 100*in_disk_train/train_input.size(0)))
    print("Sanity check: {:d} out of {:d} test points inside disk, i.e."
          " {:3.2f}% (expected: 50%).".format(in_disk_test, test_input.size(0), 100*in_disk_test/test_input.size(0)))

    # Normalize train and test input: subtract training mean and divide by training variance
    train_mean, train_std = train_input.mean(0), train_input.std(0)

    train_input.sub_(train_mean).div_(train_std)
    test_input.sub_(train_mean).div_(train_std)

    print("Sanity check: mean", " ".join("%.4g" % (i,) for i in train_input.mean(0)), " (expected 0 0)"
          " and variance:", " ".join("%.3g" % (i,) for i in train_input.std(0)), "(expected 1 1).")

    return Variable(train_input), Variable(train_target), Variable(test_input), Variable(test_target)

def train_model(model, train_input, train_target, eta=0.1, n_epochs=250, batch_size=100, momentum=0.8, log_loss=False):
    Ndata = train_input.size(0)
    # loss_function = torch.nn.CrossEntropyLoss() # takes 2 args: network output and sample target
    loss_function = torch.nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=eta, momentum=momentum) # SOL
    for e in range(0, n_epochs):
        epoch_loss = 0
        for b_start in range(0, train_input.size(0), batch_size):
            bsize_eff = batch_size - max(0, b_start + batch_size - Ndata)   # accounts for boundary effects
            batch_output = model(train_input.narrow(0, b_start, bsize_eff))
            batch_loss = loss_function(batch_output, train_target.narrow(0, b_start, bsize_eff).float())  # instance of Variable
            epoch_loss = epoch_loss + batch_loss.data[0]
            model.zero_grad()  # seems to work well outside of batch loop, sort of inertia?
            batch_loss.backward()
            optimizer.step()
        if log_loss and e % round(n_epochs/4) == 0:  # prints 4 times
            print("Epoch {:d}/{:d}: epoch loss {:5.4g}".format(e+1, n_epochs, epoch_loss))

def compute_nb_errors(model, data_input, data_target, one_hot=False, batch_size=100):
    nb_errors = 0
    nb_good = 0
    Ndata = data_input.size(0)
    for b_start in range(0, data_input.size(0), batch_size):
        bsize_eff = batch_size - max(0, b_start + batch_size - Ndata)   # accounts for boundary effects
        batch_output = model(data_input.narrow(0, b_start, bsize_eff))  # Nbatch x 2 if one_hot=True, Nbatch otherwise
        if one_hot:
            pred_label = batch_output.max(dim=1)[1]  # size Nbatch
            data_label = data_target.narrow(0, b_start, bsize_eff).max(dim=1)[1]  # could be done outside the batch loop; size is Nbatch
            nb_err_batch = 0
            for k in range(bsize_eff): # not very efficient but safest bet given how poorly torch operations are designed
                if data_label.data[k] != pred_label.data[k]: # data extracts torch.Tensor out of Variable
                    nb_err_batch = nb_err_batch + 1
        else:
            nb_err_batch = (batch_output.max(1)[1] != data_target.narrow(0, b_start, bsize_eff)).long().sum()
        # HUGE overflow problem if conversion to Long Int not performed, treated as short 1-byte int otherwise!!
        nb_errors += nb_err_batch
    if isinstance(nb_errors, torch.autograd.Variable):
        nb_errors = nb_errors.data[0]
    return nb_errors

def create_miniproject2_model(nonlin_activ=nn.ReLU()):
    return nn.Sequential(nn.Linear(2, 25), nonlin_activ, nn.Linear(25, 25), nonlin_activ, nn.Linear(25, 25), nonlin_activ, nn.Linear(25, 2))

In [136]:
# Generate toy-example data
nb_train = 1000
nb_test = 5000

# Set one_hot=True to generate targets with C dimensions, C being the number of classes (C=2 here) s.t. 
# target[i]=1 when i=C and target[i]=-1 elsewhere, for i=1,..., nb
# one_hot=True is necessary for MSELoss; one_hot=False is necessary for torch.nn.CrossEntropyLoss()
one_hot = True  

train_input, train_target = generate_disc_set(nb_train, one_hot)
test_input, test_target = generate_disc_set(nb_test, one_hot)

(train_input,
 train_target,
 test_input,
 test_target) = check_and_normalize(train_input, train_target, test_input, test_target, one_hot)

Sanity check: 889 out of 1000 training points inside disk, i.e. 88.90% (expected: 50%).
Sanity check: 4353 out of 5000 test points inside disk, i.e. 87.06% (expected: 50%).
Sanity check: mean 3.147e-07 -7.411e-07  (expected 0 0) and variance: 1 1 (expected 1 1).


# Ideal network for comparison
- dropout can certainly help! See lecture's toy example (same as ours!), handout 6, slide 44/82

In [138]:
# Meta parameters
log_loss = True  # True for printing the loss during the training; False for no verbose at all
batch_size = 150  # does not have to divide total number of training sample but it's probably better to do so
n_epochs = 250  # number of times the training samples are visited
eta = 0.1  # learning rate
momentum = 0.8  # "inertia" (see handout 5, slide 22/83) 
non_lin_activation = nn.Tanh()  # nn.Tanh(), nn.ReLU(), ... (http://pytorch.org/docs/stable/nn.html#non-linear-activations-weighted-sum-nonlinearity)

model = create_miniproject2_model(non_lin_activation)

# Train and test 3-hidden-layer MLP with various initialization strategies
for std in ["PyTorch", 1e-3, 1e-2, 1e-1, 1, 10]:
    if type(std) is str: 
        # PyTorch default initialization (supposed to be good)
        train_model(model, train_input, train_target, n_epochs=n_epochs, eta=eta, batch_size=batch_size, log_loss=log_loss)
        n_err_train = compute_nb_errors(model, train_input, train_target, one_hot, batch_size)
        n_err_test = compute_nb_errors(model, test_input, test_target, one_hot, batch_size)
        print("PyTorch default initialization:train error {:g}%, test error {:g}%\n".format(
            n_err_train*100/nb_train, n_err_test*100/nb_test))  
    else:
        # Gaussian initialization
        for p in model.parameters():
            p.data.normal_(0, std)
        train_model(model, train_input, train_target, eta=eta, n_epochs=n_epochs, batch_size=batch_size, log_loss=log_loss)
        n_err_train = compute_nb_errors(model, train_input, train_target, one_hot, batch_size)
        n_err_test = compute_nb_errors(model, test_input, test_target, one_hot, batch_size)
        print("Gaussian initialization N(0,sig={}): train error {:g}%, test error {:g}%".format(
            std, n_err_train*100/nb_train, n_err_test*100/nb_test))
        
        # Uniform initialization (avoid repeating PyTorch default initialization)
        for p in model.parameters():
            p.data.uniform_(-std, std)
        train_model(model, train_input, train_target, eta=eta, n_epochs=n_epochs, batch_size=batch_size, log_loss=log_loss)
        n_err_train = compute_nb_errors(model, train_input, train_target, one_hot, batch_size)
        n_err_test = compute_nb_errors(model, test_input, test_target, one_hot, batch_size)
        print("Uniform initialization [{},{}]: train error {:g}%, test error {:g}%\n".format(
            -std, std, n_err_train*100/nb_train, n_err_test*100/nb_test))


PyTorch default initialization:train error 0.3%, test error 0.28%

Gaussian initialization N(0,sig=0.001): train error 11.1%, test error 12.94%
Uniform initialization [-0.001,0.001]: train error 11.1%, test error 12.94%

Gaussian initialization N(0,sig=0.01): train error 2.3%, test error 2.3%
Uniform initialization [-0.01,0.01]: train error 11.1%, test error 12.94%

Gaussian initialization N(0,sig=0.1): train error 0.3%, test error 0.32%
Uniform initialization [-0.1,0.1]: train error 0.4%, test error 0.42%

Gaussian initialization N(0,sig=1): train error 0%, test error 0.32%
Uniform initialization [-1,1]: train error 0.2%, test error 0.2%

Gaussian initialization N(0,sig=10): train error 5.2%, test error 4.48%
Uniform initialization [-10,10]: train error 3.9%, test error 3.76%

