In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv

# Load the Cora dataset
dataset = Planetoid(root='data/Cora', name='Cora')

# Prepare data
data = dataset[0]

# Define a 2-layer GCN
class GCN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return torch.log_softmax(x, dim=1)

# Initialize model, optimizer, and loss function
model = GCN(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [11]:
# Training loop
for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    # Forward pass
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
        
    # Calculate train and test accuracy
    model.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  # Get predictions for training nodes
        correct_train = (pred_train == data.y[data.train_mask]).sum().item()  # Count correct predictions
        train_accuracy = correct_train / data.train_mask.sum().item()  # Compute accuracy

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  # Get predictions for test nodes
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  # Count correct predictions
        test_accuracy = correct_test / data.test_mask.sum().item()  # Compute accuracy

    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")


Epoch 0, Loss: 1.9369269609451294, Train Accuracy: 0.1071, Test Accuracy: 0.1050
Epoch 10, Loss: 0.5323324799537659, Train Accuracy: 0.9786, Test Accuracy: 0.7970
Epoch 20, Loss: 0.08824098110198975, Train Accuracy: 1.0000, Test Accuracy: 0.7960
Epoch 30, Loss: 0.02015020325779915, Train Accuracy: 1.0000, Test Accuracy: 0.8000
Epoch 40, Loss: 0.007955827750265598, Train Accuracy: 1.0000, Test Accuracy: 0.7960
Epoch 50, Loss: 0.004606123082339764, Train Accuracy: 1.0000, Test Accuracy: 0.8010
Epoch 60, Loss: 0.0033149467781186104, Train Accuracy: 1.0000, Test Accuracy: 0.7980
Epoch 70, Loss: 0.002672974020242691, Train Accuracy: 1.0000, Test Accuracy: 0.7960
Epoch 80, Loss: 0.002278730273246765, Train Accuracy: 1.0000, Test Accuracy: 0.7950
Epoch 90, Loss: 0.001998076681047678, Train Accuracy: 1.0000, Test Accuracy: 0.7940
Epoch 99, Loss: 0.0017987268511205912, Train Accuracy: 1.0000, Test Accuracy: 0.7940
Training complete!


I added in code to calculate the train and test accuracy of the model to help with comparisons in later problems. The train accuracy reaches 100% quickly and doesn't seem to hold much information, but I will calculate it anyway to see if there are any differences.

## Explanation:
GCN aggregates features from a node’s neighbors using graph convolutions. This allows the network to learn representations based on both node features and graph structure.
The Cora dataset is used to classify nodes into one of 7 research topics.

## Questions (1 point each):

1. What would happen if we added more GCN layers (e.g., 3 layers instead of 2)? How would this affect over-smoothing?
2. What would happen if we used a larger hidden dimension (e.g., 64 instead of 16)? How would this impact the model's capacity?
3. What would happen if we replaced ReLU activation with a sigmoid function? Would the performance change?

4. What would happen if we trained on only 10% of the nodes and tested on the remaining 90%? How would the performance be affected?
5. What would happen if we used a different optimizer (e.g., RMSprop) instead of Adam? Would it affect the convergence speed?

Extra credit: 
1. What would happen if we used edge weights (non-binary) in the adjacency matrix? How would it affect message passing?
2. What would happen if we removed the log-softmax function in the output layer? Would the loss function still work correctly?

## No points, just for you to think about:
1. What would happen if we applied dropout to the node features during training? How would it affect the model’s generalization?
2. What would happen if we used mean-pooling instead of summing the messages in the GCN layers?
3. What would happen if we pre-trained the node features using a different algorithm, like Node2Vec, before feeding them into the GCN?


### Question 1

What would happen if we added more GCN layers (e.g., 3 layers instead of 2)? How would this affect over-smoothing?

Over-smoothing occurs when the node-embeddings in a GNN get too similar to each other to the point they become indistinguishable. It often occurs when there are too many layers in the network, when nodes gain information from other nodes further away on the graph. Local nodes are no longer represented properly. There is too much message passing, and the embeddings lose meaning. The result is lower performance of the network. Adding a third layer could cause over-smoothing. Let's test it.

In [15]:
class GCN3Layer(nn.Module):
    def __init__(self, input_dim, hidden_dim1, hidden_dim2, output_dim):
        super(GCN3Layer, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim1)
        self.conv2 = GCNConv(hidden_dim1, hidden_dim2)
        self.conv3 = GCNConv(hidden_dim2, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        x = torch.relu(x)
        x = self.conv3(x, edge_index)
        return torch.log_softmax(x, dim=1)

model3 = GCN3Layer(input_dim=dataset.num_node_features, hidden_dim1=16, hidden_dim2=8, output_dim=dataset.num_classes)
optimizer3 = optim.Adam(model3.parameters(), lr=0.01)

In [16]:
for epoch in range(100):
    model3.train()
    optimizer3.zero_grad()
    
    # Forward pass
    out = model3(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer3.step()

    # Calculate train and test accuracy
    model3.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  
        correct_train = (pred_train == data.y[data.train_mask]).sum().item() 
        train_accuracy = correct_train / data.train_mask.sum().item() 

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  
        test_accuracy = correct_test / data.test_mask.sum().item()  
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 1.944490909576416, Train Accuracy: 0.2071, Test Accuracy: 0.1810
Epoch 10, Loss: 1.028019666671753, Train Accuracy: 0.6857, Test Accuracy: 0.4410
Epoch 20, Loss: 0.29625144600868225, Train Accuracy: 1.0000, Test Accuracy: 0.7580
Epoch 30, Loss: 0.05181198939681053, Train Accuracy: 1.0000, Test Accuracy: 0.7480
Epoch 40, Loss: 0.010977059602737427, Train Accuracy: 1.0000, Test Accuracy: 0.7450
Epoch 50, Loss: 0.004037094302475452, Train Accuracy: 1.0000, Test Accuracy: 0.7500
Epoch 60, Loss: 0.0022837382275611162, Train Accuracy: 1.0000, Test Accuracy: 0.7520
Epoch 70, Loss: 0.0016414098208770156, Train Accuracy: 1.0000, Test Accuracy: 0.7530
Epoch 80, Loss: 0.001327091595157981, Train Accuracy: 1.0000, Test Accuracy: 0.7520
Epoch 90, Loss: 0.0011335505405440927, Train Accuracy: 1.0000, Test Accuracy: 0.7520
Epoch 99, Loss: 0.0010074801975861192, Train Accuracy: 1.0000, Test Accuracy: 0.7540
Training complete!


Although the train accuracy was 100% and the loss was lower than the baseline model, the test accuracy was about 4% lower. This indicates that there likely was over-smoothing involved, causing the 3 layer model to be less accurate than the 2 layer model.

### Question 2

What would happen if we used a larger hidden dimension (e.g., 64 instead of 16)? How would this impact the model's capacity?

Capacity refers to the ability of a model to learn complex patterns in data. Increasing the size of the hidden dimension increases the model's capacity. It allows for better generalization on unseen data, but only up to a point before there becomes an increased risk for overfitting.

In [2]:
model64 = GCN(input_dim=dataset.num_node_features, hidden_dim=64, output_dim=dataset.num_classes)
optimizer64 = optim.Adam(model64.parameters(), lr=0.01)

In [3]:
for epoch in range(100):
    model64.train()
    optimizer64.zero_grad()
    
    # Forward pass
    out = model64(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer64.step()

    # Calculate train and test accuracy
    model64.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  
        correct_train = (pred_train == data.y[data.train_mask]).sum().item() 
        train_accuracy = correct_train / data.train_mask.sum().item() 

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  
        test_accuracy = correct_test / data.test_mask.sum().item()   
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 1.9403634071350098, Train Accuracy: 0.1929, Test Accuracy: 0.1420
Epoch 10, Loss: 0.06855574250221252, Train Accuracy: 1.0000, Test Accuracy: 0.8040
Epoch 20, Loss: 0.00263330340385437, Train Accuracy: 1.0000, Test Accuracy: 0.7890
Epoch 30, Loss: 0.0005256105796433985, Train Accuracy: 1.0000, Test Accuracy: 0.7780
Epoch 40, Loss: 0.00023464775586035103, Train Accuracy: 1.0000, Test Accuracy: 0.7740
Epoch 50, Loss: 0.00016375772247556597, Train Accuracy: 1.0000, Test Accuracy: 0.7740
Epoch 60, Loss: 0.00013796996790915728, Train Accuracy: 1.0000, Test Accuracy: 0.7780
Epoch 70, Loss: 0.00012520681775640696, Train Accuracy: 1.0000, Test Accuracy: 0.7790
Epoch 80, Loss: 0.00011709984391927719, Train Accuracy: 1.0000, Test Accuracy: 0.7790
Epoch 90, Loss: 0.00011094879300799221, Train Accuracy: 1.0000, Test Accuracy: 0.7790
Epoch 99, Loss: 0.00010622364789014682, Train Accuracy: 1.0000, Test Accuracy: 0.7800
Training complete!


In this case, the model's capacity surely grew. It learned more complex patterns in the data, allowing the loss function to reach an order of magnitude lower than the baseline model. However, this did come with some overfitting, as the model only reached 78% accuracy on the test set as opposed to 80% on the baseline model.

### Question 3

What would happen if we replaced ReLU activation with a sigmoid function? Would the performance change?

In [4]:
class GCNSigmoid(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNSigmoid, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.sigmoid(x)
        x = self.conv2(x, edge_index)
        return torch.log_softmax(x, dim=1)

modelsig = GCNSigmoid(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizersig = optim.Adam(modelsig.parameters(), lr=0.01)

In [5]:
for epoch in range(100):
    modelsig.train()
    optimizersig.zero_grad()
    
    # Forward pass
    out = modelsig(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizersig.step()

    # Calculate train and test accuracy
    modelsig.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  
        correct_train = (pred_train == data.y[data.train_mask]).sum().item() 
        train_accuracy = correct_train / data.train_mask.sum().item() 

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  
        test_accuracy = correct_test / data.test_mask.sum().item()   
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 2.1666195392608643, Train Accuracy: 0.1429, Test Accuracy: 0.0640
Epoch 10, Loss: 1.5325162410736084, Train Accuracy: 0.7643, Test Accuracy: 0.5180
Epoch 20, Loss: 1.082481861114502, Train Accuracy: 0.9714, Test Accuracy: 0.7440
Epoch 30, Loss: 0.7212515473365784, Train Accuracy: 0.9857, Test Accuracy: 0.7900
Epoch 40, Loss: 0.45960235595703125, Train Accuracy: 0.9857, Test Accuracy: 0.7850
Epoch 50, Loss: 0.2960023880004883, Train Accuracy: 0.9929, Test Accuracy: 0.7800
Epoch 60, Loss: 0.19991613924503326, Train Accuracy: 1.0000, Test Accuracy: 0.7780
Epoch 70, Loss: 0.14317697286605835, Train Accuracy: 1.0000, Test Accuracy: 0.7740
Epoch 80, Loss: 0.10831882804632187, Train Accuracy: 1.0000, Test Accuracy: 0.7720
Epoch 90, Loss: 0.08572407066822052, Train Accuracy: 1.0000, Test Accuracy: 0.7700
Epoch 99, Loss: 0.07155357301235199, Train Accuracy: 1.0000, Test Accuracy: 0.7700
Training complete!


The performace of the model with the sigmoid function instead of ReLU did change. The loss of the sigmoid model did not reach as low as the ReLU model, and the test accuracy was also lower by about 2%. The ReLU activation function is a better choice for this model.

### Question 4

What would happen if we trained on only 10% of the nodes and tested on the remaining 90%? How would the performance be affected?

In [24]:
modelSmallTrainSet = GCN(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizerSmallTrainSet = optim.Adam(modelSmallTrainSet.parameters(), lr=0.01)

In [25]:
import numpy as np
num_nodes = data.num_nodes
indices = np.random.permutation(num_nodes)
split_point = int(num_nodes * 0.1)

# Assign 10% to training and 90% to testing
train_indices = indices[:split_point]
test_indices = indices[split_point:]

# Create new train and test masks (boolean masks)
train_mask10 = torch.zeros(num_nodes, dtype=torch.bool)
test_mask90 = torch.zeros(num_nodes, dtype=torch.bool)

train_mask10[train_indices] = True
test_mask90[test_indices] = True

In [26]:
for epoch in range(100):
    modelSmallTrainSet.train()
    optimizerSmallTrainSet.zero_grad()
    
    # Forward pass
    out = modelSmallTrainSet(data)
    loss = criterion(out[train_mask10], data.y[train_mask10])
    loss.backward()
    optimizerSmallTrainSet.step()

    # Calculate train and test accuracy
    modelSmallTrainSet.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[train_mask10].max(dim=1)  
        correct_train = (pred_train == data.y[train_mask10]).sum().item() 
        train_accuracy = correct_train / train_mask10.sum().item() 

        # Test accuracy
        _, pred_test = out[test_mask90].max(dim=1)  
        correct_test = (pred_test == data.y[test_mask90]).sum().item()  
        test_accuracy = correct_test / test_mask90.sum().item()   
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 1.9553686380386353, Train Accuracy: 0.1148, Test Accuracy: 0.1255
Epoch 10, Loss: 0.8031471967697144, Train Accuracy: 0.9000, Test Accuracy: 0.7715
Epoch 20, Loss: 0.23066052794456482, Train Accuracy: 0.9741, Test Accuracy: 0.8298
Epoch 30, Loss: 0.07965433597564697, Train Accuracy: 0.9963, Test Accuracy: 0.8396
Epoch 40, Loss: 0.037285588681697845, Train Accuracy: 0.9963, Test Accuracy: 0.8318
Epoch 50, Loss: 0.02308228611946106, Train Accuracy: 1.0000, Test Accuracy: 0.8302
Epoch 60, Loss: 0.016371894627809525, Train Accuracy: 1.0000, Test Accuracy: 0.8290
Epoch 70, Loss: 0.012456296011805534, Train Accuracy: 1.0000, Test Accuracy: 0.8290
Epoch 80, Loss: 0.009907688945531845, Train Accuracy: 1.0000, Test Accuracy: 0.8290
Epoch 90, Loss: 0.008103243075311184, Train Accuracy: 1.0000, Test Accuracy: 0.8298
Epoch 99, Loss: 0.006879020016640425, Train Accuracy: 1.0000, Test Accuracy: 0.8302
Training complete!


This model surprisingly performed better? The loss didn't get as low as the baseline model, reaching 0.007 at the end as opposed to 0.001, but the test accuracy was about 3.5% better. I could have done something wrong with the code. It could be that GNNs perform better when the network sees less of the graph in training (perhaps due to no over-smoothing; the representations were nicely found from the small subset of nodes), or that seeing 10% of this graph was all the GCN needed to find patterns, and more nodes in the training set only led to overfitting. I would have expected the model to perform far worse after only seeing 10% of the nodes in the graph, but I was wrong.

### Question 5

What would happen if we used a different optimizer (e.g., RMSprop) instead of Adam? Would it affect the convergence speed?

Adam and RMSprop are similar algorithms. They both use adaptive learning rates to compute gradient descent on each parameter with step-size adjustments based on past gradients. They differ because Adam uses momentum to smooth out gradient updates and converge quicker. Based on this, I would expect the convergence speed of the model with RMSprop as its optimizer to be slower than the baseline with Adam. 

In [6]:
modelRMSprop = GCN(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizerRMSprop = optim.RMSprop(modelRMSprop.parameters(), lr=0.01)

In [7]:
for epoch in range(100):
    modelRMSprop.train()
    optimizerRMSprop.zero_grad()
    
    # Forward pass
    out = modelRMSprop(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizerRMSprop.step()

    # Calculate train and test accuracy
    modelRMSprop.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  
        correct_train = (pred_train == data.y[data.train_mask]).sum().item() 
        train_accuracy = correct_train / data.train_mask.sum().item() 

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  
        test_accuracy = correct_test / data.test_mask.sum().item()   
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 1.9490680694580078, Train Accuracy: 0.0786, Test Accuracy: 0.1360
Epoch 10, Loss: 0.0547318309545517, Train Accuracy: 1.0000, Test Accuracy: 0.7850
Epoch 20, Loss: 0.018357135355472565, Train Accuracy: 1.0000, Test Accuracy: 0.7900
Epoch 30, Loss: 0.010021893307566643, Train Accuracy: 1.0000, Test Accuracy: 0.7870
Epoch 40, Loss: 0.006585356779396534, Train Accuracy: 1.0000, Test Accuracy: 0.7850
Epoch 50, Loss: 0.004774170462042093, Train Accuracy: 1.0000, Test Accuracy: 0.7860
Epoch 60, Loss: 0.003671690821647644, Train Accuracy: 1.0000, Test Accuracy: 0.7880
Epoch 70, Loss: 0.0029376945458352566, Train Accuracy: 1.0000, Test Accuracy: 0.7870
Epoch 80, Loss: 0.0024171206168830395, Train Accuracy: 1.0000, Test Accuracy: 0.7860
Epoch 90, Loss: 0.002031078329309821, Train Accuracy: 1.0000, Test Accuracy: 0.7860
Epoch 99, Loss: 0.0017610318027436733, Train Accuracy: 1.0000, Test Accuracy: 0.7840
Training complete!


The convergence speed for the RMSprop optimizer wasn't much slower than the Adam optimizer. If anything, it reached a lower loss quicker than Adam did. By epoch 10, the loss was down to 0.05, while at epoch 10 in the baseline model, loss was 0.53. They both reached a loss under 0.01 at epoch 40 and finished with a loss around 0.018 after 100 epochs. Both models also performed roughly the same in terms of test accuracy (within 1%), meaning both are valid to use for this model.

### Extra Credit 1

What would happen if we used edge weights (non-binary) in the adjacency matrix? How would it affect message passing?

**Answer:** Adding edge weights to the adjacency matrix for the Cora dataset could take the form of something like how many times the connecting paper references the other. This affects message passing by adding more relevance to papers that are closely connected to each other. The message from the closely connected nodes (higher edge weights) would have a greater impact on the node's embedding than nodes more loosely connected. This could increase the strength and accuracy of the GCN, allowing it to better classify unseen nodes.

### Extra Credit 2

What would happen if we removed the log-softmax function in the output layer? Would the loss function still work correctly?

I actually came across this issue in the last homework assignment. In creating a new class to define my neural network for one of the problems, I forgot to use the log-softmax function in the output layer, causing the loss function to go negative in training. However, that was using negative log likelihood loss instead of cross entropy loss. In consulting ChatGPT about my error, it suggested using cross entropy loss to fix my solution, as the `F.cross_entropy()` function combines both the log-softmax and negative log loss functions. This fixed my error, then I realized I could just add the log-softmax function to the output layer and continue using `F.nll_loss()` to keep things consistent, which also worked. For this problem, we are using cross entropy loss already, but this time from the torch.nn package instead of the torch.F package. I could look up documentation to see if there is any difference, but it is more fun to test and break code, so that's what I'm going to do.

In [8]:
class GCNNoLogSoftmax(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(GCNNoLogSoftmax, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

modelNoLogSoftmax = GCNNoLogSoftmax(input_dim=dataset.num_node_features, hidden_dim=16, output_dim=dataset.num_classes)
optimizerNoLogSoftmax = optim.Adam(modelNoLogSoftmax.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [10]:
for epoch in range(100):
    modelNoLogSoftmax.train()
    optimizerNoLogSoftmax.zero_grad()
    
    # Forward pass
    out = modelNoLogSoftmax(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizerNoLogSoftmax.step()

    # Calculate train and test accuracy
    modelNoLogSoftmax.eval()  
    with torch.no_grad():
        # Training accuracy
        _, pred_train = out[data.train_mask].max(dim=1)  
        correct_train = (pred_train == data.y[data.train_mask]).sum().item() 
        train_accuracy = correct_train / data.train_mask.sum().item() 

        # Test accuracy
        _, pred_test = out[data.test_mask].max(dim=1)  
        correct_test = (pred_test == data.y[data.test_mask]).sum().item()  
        test_accuracy = correct_test / data.test_mask.sum().item()   
        
    if (epoch % 10 == 0) or (epoch == 99):
        print(f'Epoch {epoch}, Loss: {loss.item()}, Train Accuracy: {train_accuracy:.4f}, Test Accuracy: {test_accuracy:.4f}')

print("Training complete!")

Epoch 0, Loss: 1.9473196268081665, Train Accuracy: 0.0857, Test Accuracy: 0.1800
Epoch 10, Loss: 0.6583453416824341, Train Accuracy: 0.9714, Test Accuracy: 0.7810
Epoch 20, Loss: 0.1296776980161667, Train Accuracy: 0.9929, Test Accuracy: 0.7920
Epoch 30, Loss: 0.029255373403429985, Train Accuracy: 1.0000, Test Accuracy: 0.7880
Epoch 40, Loss: 0.010750789195299149, Train Accuracy: 1.0000, Test Accuracy: 0.7840
Epoch 50, Loss: 0.005935577675700188, Train Accuracy: 1.0000, Test Accuracy: 0.7840
Epoch 60, Loss: 0.00418056920170784, Train Accuracy: 1.0000, Test Accuracy: 0.7850
Epoch 70, Loss: 0.0033248572144657373, Train Accuracy: 1.0000, Test Accuracy: 0.7870
Epoch 80, Loss: 0.002808221150189638, Train Accuracy: 1.0000, Test Accuracy: 0.7860
Epoch 90, Loss: 0.0024441233836114407, Train Accuracy: 1.0000, Test Accuracy: 0.7850
Epoch 99, Loss: 0.0021882858127355576, Train Accuracy: 1.0000, Test Accuracy: 0.7840
Training complete!


The loss function is continuting to work properly. If we were using nll loss like the last assignment, then it wouldn't work, because nll loss relies on inputs to be probabilities or log probabilities, requiring the use of the log-softmax function in the output. Cross entropy loss doesn't have the same requirement, as it can handle raw values or probabilities, so the loss in this scenario is calculated properly