This program trains a two-layer neural network on the famous XOR task.

In [1]:
import torch
import torch.utils.data
import torch.nn.functional as F

lr = 0.1    # learning rate handed to the SGD optimizer
mom = 0.0   # momentum handed to the SGD optimizer
init = 1.0  # initial weight size: spread of the zero-mean normal used to initialize both layers' weights

class MyModel(torch.nn.Module):
    """Two-layer network: 2 inputs -> 2 tanh hidden units -> 1 sigmoid output."""

    def __init__(self):
        super().__init__()
        # The attribute names `in_hid` and `hid_out` are accessed directly by
        # the training script, so they must not be renamed.
        self.in_hid = torch.nn.Linear(2, 2)
        self.hid_out = torch.nn.Linear(2, 1)

    def forward(self, input):
        """Apply the network to `input` and return the sigmoid output.

        `input` is fed to a Linear(2, 2) layer, so its last dimension must
        have size 2; the result has size 1 in its last dimension, with every
        value in (0, 1).
        """
        hidden = self.in_hid(input).tanh()
        return self.hid_out(hidden).sigmoid()

device = 'cpu'

# The four XOR input patterns and the expected output for each of them.
# NOTE: `input` shadows the Python builtin; the name is kept so that any other
# code relying on these module-level names keeps working.
input  = torch.Tensor([[0,0],[0,1],[1,0],[1,1]])
target = torch.Tensor([[0],[1],[1],[0]])

# batch_size=4 places all four patterns in a single batch, so every epoch
# performs exactly one full-batch gradient step.
xor_dataset  = torch.utils.data.TensorDataset(input,target)
train_loader = torch.utils.data.DataLoader(xor_dataset,batch_size=4)

# create neural network according to model specification
net = MyModel().to(device) # CPU or GPU

# initialize weight values from a zero-mean normal with spread `init`
# (only the weights are re-initialized; the biases stay as torch.nn.Linear
# created them)
torch.nn.init.normal_(net.in_hid.weight, mean=0.0, std=init)
torch.nn.init.normal_(net.hid_out.weight, mean=0.0, std=init)

# choose between SGD, Adam or other optimizer
optimizer = torch.optim.SGD(net.parameters(),lr=lr,momentum=mom)

epochs = 10000
converged = False  # becomes True once the loss drops below the threshold

# range(1, epochs + 1): epochs are numbered from 1 and exactly `epochs` of
# them are run (range(1, epochs) would stop one epoch short).
for epoch in range(1, epochs + 1):
    # `labels` (not `target`) so the module-level `target` is not rebound
    for data, labels in train_loader:
        # keep the batch on the same device as the network
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad() # zero the gradients
        output = net(data)    # apply network
        loss = F.binary_cross_entropy(output, labels)
        loss.backward()       # compute gradients
        optimizer.step()      # update weights
        loss_value = loss.item()
        if epoch % 100 == 0:
            print('ep%3d: loss = %7.4f' % (epoch, loss_value))
        if loss_value < 0.01:
            converged = True
            break
    if converged:
        break

# Report the outcome instead of calling exit(): the interactive `exit` helper
# is not meant for programs and shuts down the kernel when run in a notebook.
# The message texts (including their spelling) are unchanged.
if converged:
    print("Global Mininum")
else:
    print("Local Minimum")

ep100: loss =  0.5490
ep200: loss =  0.4588
ep300: loss =  0.4179
ep400: loss =  0.3973
ep500: loss =  0.3854
ep600: loss =  0.3778
ep700: loss =  0.3726
ep800: loss =  0.3688
ep900: loss =  0.3660
ep1000: loss =  0.3638
ep1100: loss =  0.3620
ep1200: loss =  0.3605
ep1300: loss =  0.3593
ep1400: loss =  0.3583
ep1500: loss =  0.3574
ep1600: loss =  0.3566
ep1700: loss =  0.3560
ep1800: loss =  0.3554
ep1900: loss =  0.3549
ep2000: loss =  0.3544
ep2100: loss =  0.3540
ep2200: loss =  0.3536
ep2300: loss =  0.3533
ep2400: loss =  0.3530
ep2500: loss =  0.3527
ep2600: loss =  0.3524
ep2700: loss =  0.3522
ep2800: loss =  0.3520
ep2900: loss =  0.3518
ep3000: loss =  0.3516
ep3100: loss =  0.3514
ep3200: loss =  0.3513
ep3300: loss =  0.3511
ep3400: loss =  0.3510
ep3500: loss =  0.3508
ep3600: loss =  0.3507
ep3700: loss =  0.3506
ep3800: loss =  0.3505
ep3900: loss =  0.3504
ep4000: loss =  0.3502
ep4100: loss =  0.3502
ep4200: loss =  0.3501
ep4300: loss =  0.3500
ep4400: loss =  0.34

## Question 1:
Run the code ten times. For how many runs does it reach the Global Minimum?

For how many runs does it reach a local minimum?

## Answer:
It should reach the global minimum in approximately half of the runs and get stuck in a local minimum in the other half.

## Question 2:
Keeping the learning rate fixed at `0.1`, adjust the momentum (`mom`, line 6 of the code cell) and the initial weight size (`init`, line 7 of the code cell) to see if you can find values for which the code converges relatively quickly to the Global Minimum on virtually every run.

## Answer:
With `mom=0.9` and `init=0.01`, it should successfully reach the Global Minimum in 99% of runs.