In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt

torch.manual_seed(0)

class XORproblem(nn.Module):
    def __init__(self):
        super(XORproblem, self).__init__()
        self.linear1 = nn.Linear(2, 2, bias=True)
        self.activation1 = nn.Sigmoid()
        self.linear2 = nn.Linear(2, 1, bias=True)

    def forward(self, x):  
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        return x

class MyDataset(Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.x)

    def __getitem__(self, index):
        return self.x[index].to(device), self.y[index].to(device)

def criterion(model, train_data_loader, optimizer, loss_fn, epoch):
    total_loss = 0
    for i, data in enumerate(train_data_loader):
        input, labels = data
        optimizer.zero_grad()
        
        # Forward pass
        output = model(input)

        # Print details for the first epoch and first batch
        if epoch == 0 and i == 0:
            print("First Epoch, First Batch:")
            print("Input:", input)
            print("Labels:", labels)

            # Print weights and biases before the forward pass
            print("Initial Weights (linear1):", model.linear1.weight)
            print("Initial Biases (linear1):", model.linear1.bias)
            print("Initial Weights (linear2):", model.linear2.weight)
            print("Initial Biases (linear2):", model.linear2.bias)

            # Linear 1 output
            linear1_output = torch.matmul(input, model.linear1.weight.T) + model.linear1.bias
            print("Linear 1 Output:", linear1_output)

            # Sigmoid Activation output
            activation_output = model.activation1(linear1_output)
            print("Activation Output:", activation_output)

            # Linear 2 output (final prediction)
            linear2_output = torch.matmul(activation_output, model.linear2.weight.T) + model.linear2.bias
            print("Linear 2 Output (Final Prediction):", linear2_output)

            # Calculate loss
            loss = loss_fn(output.flatten(), labels)
            print("Loss:", loss.item())

        # Backward pass
        loss.backward()

        # Print gradients for the first epoch and first batch
        if epoch == 0 and i == 0:
            print("Gradients for linear1 weights and biases:", model.linear1.weight.grad, model.linear1.bias.grad)
            print("Gradients for linear2 weights and bias:", model.linear2.weight.grad, model.linear2.bias.grad)

        optimizer.step()
        total_loss += loss.item()
    
    return total_loss / len(train_data_loader)

X = torch.tensor([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=torch.float32)
Y = torch.tensor([0, 1, 1, 0], dtype=torch.float32)

loss_list = []
full_dataset = MyDataset(X, Y)
batch_size = 1
train_data_loader = DataLoader(full_dataset, batch_size=batch_size, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model = XORproblem().to(device)
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.03)
epoch = 1000

for iter in range(epoch):
    model.train()
    avg_loss = criterion(model, train_data_loader, optimizer, loss_fn, iter)
    loss_list.append(avg_loss)

plt.plot(loss_list)
plt.show()


cuda
First Epoch, First Batch:
Input: tensor([[0., 0.]], device='cuda:0')
Labels: tensor([0.], device='cuda:0')
Initial Weights (linear1): Parameter containing:
tensor([[-0.0053,  0.3793],
        [-0.5820, -0.5204]], device='cuda:0', requires_grad=True)
Initial Biases (linear1): Parameter containing:
tensor([-0.2723,  0.1896], device='cuda:0', requires_grad=True)
Initial Weights (linear2): Parameter containing:
tensor([[-0.0140,  0.5607]], device='cuda:0', requires_grad=True)
Initial Biases (linear2): Parameter containing:
tensor([-0.0628], device='cuda:0', requires_grad=True)
Linear 1 Output: tensor([[-0.2723,  0.1896]], device='cuda:0', grad_fn=<AddBackward0>)
Activation Output: tensor([[0.4323, 0.5473]], device='cuda:0', grad_fn=<SigmoidBackward0>)
Linear 2 Output (Final Prediction): tensor([[0.2380]], device='cuda:0', grad_fn=<AddBackward0>)
Loss: 0.056652721017599106
Gradients for linear1 weights and biases: tensor([[-0., -0.],
        [0., 0.]], device='cuda:0') tensor([-0.0016,

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.