# XOR perceptron using PyTorch tensors only and a double-layer network

In [1]:
import torch

# XOR truth table; the constant third column of ones serves as the bias input
# of the hidden layer.
x = torch.tensor([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]], dtype=torch.float64)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float64)

# Trainable parameters: hidden weights (3 -> 2), output weights (2 -> 1)
# and a separate output bias.
w1 = torch.randn(3, 2, dtype=torch.float64, requires_grad=True)
w2 = torch.randn(2, 1, dtype=torch.float64, requires_grad=True)
b2 = torch.randn(1, 1, dtype=torch.float64, requires_grad=True)

lr = 5e-1
for t in range(500):
    # Forward pass: sigmoid hidden layer followed by a sigmoid output unit.
    hidden = torch.sigmoid(torch.mm(x, w1))
    y_pred = torch.sigmoid(torch.mm(hidden, w2) + b2)

    # Likelihood assigned to the true label of each sample; the loss is the
    # summed negative log-likelihood.
    ll = y * y_pred + (1 - y) * (1 - y_pred)
    loss = -torch.log(ll).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    loss.backward()

    # Manual gradient-descent step, performed outside autograd tracking.
    # Each gradient is zeroed right after it is used so that it does not
    # accumulate into the next iteration.
    with torch.no_grad():
        for param in (w1, w2, b2):
            param.sub_(lr * param.grad)
            param.grad.zero_()

99 2.7718662858337737
199 2.7632382342985204
299 2.139246428035179
399 0.2715010079183565
499 0.11792639248631942


In [2]:
# Print each trained parameter tensor together with its shape.
for label, tensor in (("weights w1: ", w1), ("weights w2: ", w2), ("biases  b2: ", b2)):
    print(label, tensor, tensor.shape)

weights w1:  tensor([[ 6.0037,  5.0552],
        [-5.8602, -5.3385],
        [ 2.9976, -2.7222]], dtype=torch.float64, requires_grad=True) torch.Size([3, 2])
weights w2:  tensor([[-8.1816],
        [ 8.8383]], dtype=torch.float64, requires_grad=True) torch.Size([2, 1])
biases  b2:  tensor([[3.7175]], dtype=torch.float64, requires_grad=True) torch.Size([1, 1])


In [3]:
# Rebuild the XOR inputs (with the constant bias column) and run the trained
# two-layer network forward; the final expression is the cell's displayed output.
x = torch.tensor([[0, 0, 1], [0, 1, 1], [1, 0, 1], [1, 1, 1]], dtype=torch.float64)

hidden = torch.sigmoid(torch.mm(x, w1))
torch.sigmoid(torch.mm(hidden, w2) + b2)

tensor([[0.0285],
        [0.9637],
        [0.9732],
        [0.0239]], dtype=torch.float64, grad_fn=<SigmoidBackward>)

# Now XOR using the PyTorch nn module and a single-layer network to see if it works

In [4]:
import torch

class SingleLayerNet(torch.nn.Module):
    """A single affine layer mapping D_in features to D_out raw outputs (no activation)."""

    def __init__(self, D_in, D_out):
        """
        Create the only layer of the network, an nn.Linear with bias, and
        store it as the member ``linear1``.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, D_out, bias=True)

    def forward(self, x):
        """
        Apply the linear layer to the input Tensor ``x`` and return the result
        unchanged, i.e. without any non-linearity applied on top.
        """
        return self.linear1(x)

# N is batch size; D_in is input dimension;
# D_out is output dimension.
N, D_in, D_out = 4, 2, 1

# Create Tensors to hold inputs and outputs for XOR
x = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)
# Construct our model by instantiating the class defined above
model = SingleLayerNet(D_in, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the single
# nn.Linear module which is member of the model.
# BCEWithLogitsLoss applies the sigmoid itself, so the model returns raw logits.
# reduction='sum' is the supported spelling of the deprecated size_average=False:
# the per-sample losses are summed instead of averaged.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=5e-1)
for t in range(500):
    # Forward pass: Compute predicted y (logits) by passing x to the model
    y_pred = model(x)

    # Compute the loss and print it every 100 iterations
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 2.7725887298583984
199 2.7725887298583984
299 2.7725887298583984
399 2.7725887298583984
499 2.7725887298583984


# The loss never decreases, which is a bad symptom: XOR is not linearly separable, so a single linear layer cannot fit it

In [5]:
# Show the learned parameters of the linear layer along with their shapes.
params = model.state_dict()
weight, bias = params['linear1.weight'], params['linear1.bias']
print("weights: ", weight, weight.shape)
print("bias:    ", bias, bias.shape)

weights:  tensor([[8.9407e-08, 8.9407e-08]]) torch.Size([1, 2])
bias:     tensor([-1.4901e-07]) torch.Size([1])


# And we'll see that the model makes no distinction between different inputs

In [6]:
# Feed the four XOR inputs through the trained model; the model returns logits,
# so a sigmoid is applied to turn them into probabilities for display.
x = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
torch.sigmoid(model(x))

tensor([[0.5000],
        [0.5000],
        [0.5000],
        [0.5000]], grad_fn=<SigmoidBackward>)

# Finally, XOR using the PyTorch nn module and a multi-layer network

In [7]:
import torch

class DoubleLayerNet(torch.nn.Module):
    """Two affine layers with a sigmoid between them; the output is a raw value (no final activation)."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two nn.Linear layers (D_in -> H and H -> D_out, both with
        bias) and store them as the members ``linear1`` and ``linear2``.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H, bias=True)
        self.linear2 = torch.nn.Linear(H, D_out, bias=True)

    def forward(self, x):
        """
        Pass the input Tensor ``x`` through the first layer, squash the hidden
        activations with a sigmoid, and return the second layer's output.
        """
        hidden = torch.sigmoid(self.linear1(x))
        return self.linear2(hidden)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 4, 2, 2, 1

# Create Tensors to hold inputs and outputs for XOR
x = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
y = torch.tensor([[0], [1], [1], [0]], dtype=torch.float32)
# Construct our model by instantiating the class defined above
model = DoubleLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# BCEWithLogitsLoss applies the sigmoid itself, so the model returns raw logits.
# reduction='sum' is the supported spelling of the deprecated size_average=False:
# the per-sample losses are summed instead of averaged.
criterion = torch.nn.BCEWithLogitsLoss(reduction='sum')

optimizer = torch.optim.SGD(model.parameters(), lr=5e-1)
for t in range(500):
    # Forward pass: Compute predicted y (logits) by passing x to the model
    y_pred = model(x)

    # Compute the loss and print it every 100 iterations
    loss = criterion(y_pred, y)
    if t % 100 == 99:
        print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 2.7451202869415283
199 1.4362913370132446
299 0.22615067660808563
399 0.10816438496112823
499 0.06997331976890564


In [8]:
# Show the learned parameters of the first (hidden) linear layer and their shapes.
params = model.state_dict()
weight, bias = params['linear1.weight'], params['linear1.bias']
print("weights: ", weight, weight.shape)
print("bias:    ", bias, bias.shape)

weights:  tensor([[-6.2539,  6.0694],
        [-5.6923,  5.9648]]) torch.Size([2, 2])
bias:     tensor([-3.2886,  2.8270]) torch.Size([2])


In [10]:
# Evaluate the trained model on the four XOR inputs, listed here in the order
# (0,0), (1,1), (1,0), (0,1); the logits are passed through a sigmoid so the
# displayed values are probabilities.
x = torch.tensor([[0, 0], [1, 1], [1, 0], [0, 1]], dtype=torch.float32)
torch.sigmoid(model(x))

tensor([[0.0169],
        [0.0142],
        [0.9770],
        [0.9851]], grad_fn=<SigmoidBackward>)