In [98]:
# Name of the analysed project. It is spliced into the "output/<project_name>/..."
# path from which the dataset file is loaded further down. Alternative values
# are kept commented out for quick switching.
#project_name = "."
#project_name = "jfreechart"
project_name = "argouml"

In [99]:
import torch
import random

def load_pyg_dataset(file_path):
    """Deserialize the object stored at ``file_path`` with ``torch.load`` and return it.

    Args:
        file_path: Location of a file previously written with ``torch.save``.

    Returns:
        Whatever object ``torch.load`` reconstructs from the file.

    NOTE(review): ``torch.load`` relies on pickle, so this should only be
    pointed at files from a trusted source.
    """
    loaded = torch.load(file_path)
    return loaded

# Location of the serialized graph for the selected project, then load it.
output_file_path = f"output/{project_name}/pyg_dataset_for_problematic_class.pt"
dataset = load_pyg_dataset(output_file_path)

def add_train_mask_to_pyg_data(data, train_ratio=0.8, seed=None):
    """Attach a randomly shuffled boolean ``train_mask`` to ``data`` (in place).

    Exactly ``int(train_ratio * data.num_nodes)`` entries of the mask are True;
    their positions are chosen by shuffling.

    Args:
        data: Object exposing ``num_nodes``; it receives a ``train_mask`` attribute.
        train_ratio: Fraction of nodes to mark as training nodes, within [0, 1].
        seed: Optional seed. When given, a private ``random.Random(seed)`` is used
            so the split is reproducible and the global ``random`` state is left
            untouched. When ``None`` the global ``random`` module is used, as before.

    Raises:
        ValueError: If ``train_ratio`` lies outside [0, 1]. Without this check a
            ratio above 1 would yield a mask longer than ``num_nodes``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio!r}")

    num_nodes = data.num_nodes
    num_train_nodes = int(train_ratio * num_nodes)

    # Start from an ordered True/False list with the requested proportions ...
    mask_values = [True] * num_train_nodes + [False] * (num_nodes - num_train_nodes)

    # ... then shuffle it so the training nodes are spread randomly.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(mask_values)

    # Store the result on the data object as a boolean tensor.
    data.train_mask = torch.tensor(mask_values, dtype=torch.bool)

def add_test_mask_to_pyg_data(data):
    """Set ``data.test_mask`` to the element-wise inverse of ``data.train_mask``.

    Args:
        data: Object that already carries a ``train_mask`` attribute.

    Raises:
        ValueError: If ``data`` has no ``train_mask`` attribute.
    """
    if hasattr(data, 'train_mask'):
        # Every node that is not a training node becomes a test node.
        data.test_mask = ~data.train_mask
        return

    raise ValueError("Please add a train_mask to the Data object first.")


# Split the nodes 70/30 into train/test; both masks are stored on `dataset` itself.
add_train_mask_to_pyg_data(dataset, train_ratio=0.7)
add_test_mask_to_pyg_data(dataset)

print()
print(f'Dataset: {dataset}:')
print('======================')
# `dataset` is one graph object, not a collection of graphs: calling len() on it
# counts its stored attributes (x, edge_index, y, ...), which previously made
# this line report the attribute count as the number of graphs.
print('Number of graphs: 1')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# The cells below refer to the graph as `data`; it is the same object as `dataset`.
data = dataset

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Data(x=[1465, 18], edge_index=[2, 13369], y=[1465], num_classes=3, train_mask=[1465], test_mask=[1465]):
Number of graphs: 6
Number of features: 18
Number of classes: 3

Data(x=[1465, 18], edge_index=[2, 13369], y=[1465], num_classes=3, train_mask=[1465], test_mask=[1465])
Number of nodes: 1465
Number of edges: 13369
Average node degree: 9.13
Number of training nodes: 1025
Training node label rate: 0.70
Has isolated nodes: True
Has self-loops: True
Is undirected: False


In [100]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron that classifies nodes from their features alone.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Number of input features per node. When ``None`` it is
            taken from the notebook-level ``dataset.num_features``.
        out_channels: Number of output classes. When ``None`` it is taken from
            the notebook-level ``dataset.num_classes``.
        dropout: Dropout probability applied to the hidden activations while
            the module is in training mode.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None, dropout=0.5):
        super().__init__()
        # Fall back to the globally loaded dataset only when sizes are not given,
        # so existing `MLP(hidden_channels=...)` calls keep working.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.dropout = dropout
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return the raw class scores (logits) computed from the feature matrix ``x``."""
        x = self.lin1(x)
        x = x.relu()
        # Dropout is only active while self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.lin2(x)
        return x

# Instantiate an MLP with 16 hidden units and print its layer layout.
model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin1): Linear(in_features=18, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=3, bias=True)
)


In [101]:
from IPython.display import Javascript  # Restrict height of output cell.
# Cap the rendered height of this cell's output (the script targets google.colab).
display(
    Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})''')
)

# Fresh, untrained MLP together with the loss and optimizer used by train().
model = MLP(hidden_channels=16)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one full-batch optimisation step of the global ``model``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and ``data``.

    Returns:
        The loss object produced by ``criterion`` for the training nodes.
    """
    model.train()
    optimizer.zero_grad()  # Drop gradients left over from the previous step.

    logits = model(data.x)  # Forward pass over every node.

    # Only nodes selected by the training mask contribute to the loss.
    train_nodes = data.train_mask
    loss = criterion(logits[train_nodes], data.y[train_nodes])

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the accuracy of the global ``model`` on the test nodes of ``data``.

    Uses the notebook-level ``model`` and ``data``.

    Returns:
        Fraction of nodes selected by ``data.test_mask`` whose predicted class
        equals the label in ``data.y``.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        out = model(data.x)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    test_mask = data.test_mask
    test_correct = pred[test_mask] == data.y[test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc

# Train for 1000 epochs (1..1000), printing the training loss after every step.
for epoch in range(1, 1001):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 6.1430
Epoch: 002, Loss: 5.5194
Epoch: 003, Loss: 4.0407
Epoch: 004, Loss: 3.4137
Epoch: 005, Loss: 2.8279
Epoch: 006, Loss: 2.5036
Epoch: 007, Loss: 2.0068
Epoch: 008, Loss: 1.6587
Epoch: 009, Loss: 1.3637
Epoch: 010, Loss: 1.1703
Epoch: 011, Loss: 1.0933
Epoch: 012, Loss: 1.0171
Epoch: 013, Loss: 1.0130
Epoch: 014, Loss: 1.0076
Epoch: 015, Loss: 1.0197
Epoch: 016, Loss: 0.9609
Epoch: 017, Loss: 0.9617
Epoch: 018, Loss: 0.9209
Epoch: 019, Loss: 0.8794
Epoch: 020, Loss: 0.8553
Epoch: 021, Loss: 0.8026
Epoch: 022, Loss: 0.7911
Epoch: 023, Loss: 0.8168
Epoch: 024, Loss: 0.7731
Epoch: 025, Loss: 0.7491
Epoch: 026, Loss: 0.7182
Epoch: 027, Loss: 0.6876
Epoch: 028, Loss: 0.7048
Epoch: 029, Loss: 0.6856
Epoch: 030, Loss: 0.6692
Epoch: 031, Loss: 0.6442
Epoch: 032, Loss: 0.6441
Epoch: 033, Loss: 0.6461
Epoch: 034, Loss: 0.6130
Epoch: 035, Loss: 0.6049
Epoch: 036, Loss: 0.5983
Epoch: 037, Loss: 0.6023
Epoch: 038, Loss: 0.6230
Epoch: 039, Loss: 0.5976
Epoch: 040, Loss: 0.5803


In [102]:
# Accuracy of the current `model` on the nodes selected by the test mask.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.9023


In [103]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    """Print the confusion matrix and the classification report for a prediction.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_test``.
    """
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

# Per-class evaluation of the current `model` on the test nodes.
model.eval()
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    out = model(data.x)
pred = out.argmax(dim=1)
evaluate_results(data.y[data.test_mask], pred[data.test_mask])

Confusion Matrix:
[[340   1   0]
 [ 25  48   0]
 [  0  17   9]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96       341
           1       0.73      0.66      0.69        73
           2       1.00      0.35      0.51        26

    accuracy                           0.90       440
   macro avg       0.89      0.67      0.72       440
weighted avg       0.90      0.90      0.89       440



In [104]:
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network built from ``GCNConv`` layers.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Number of input features per node. When ``None`` it is
            taken from the notebook-level ``dataset.num_features``.
        out_channels: Number of output classes. When ``None`` it is taken from
            the notebook-level ``dataset.num_classes``.
        dropout: Dropout probability applied to the hidden activations while
            the module is in training mode.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None, dropout=0.5):
        super().__init__()
        # Fall back to the globally loaded dataset only when sizes are not given,
        # so existing `GCN(hidden_channels=...)` calls keep working.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.dropout = dropout
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return the raw class scores (logits) for node features ``x`` and graph ``edge_index``."""
        x = self.conv1(x, edge_index)
        x = x.relu()
        # Dropout is only active while self.training is True.
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.conv2(x, edge_index)
        return x

# Instantiate a GCN with 16 hidden units and print its layer layout.
model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(18, 16)
  (conv2): GCNConv(16, 3)
)


In [105]:
# Forward pass through a freshly initialised (untrained) GCN in eval mode.
# NOTE(review): no later visible cell reads `out` before it is reassigned --
# possibly a leftover from a removed inspection/visualisation step; confirm.
model = GCN(hidden_channels=16)
model.eval()

out = model(data.x, data.edge_index)

In [106]:
from IPython.display import Javascript  # Restrict height of output cell.
# Cap the rendered height of this cell's output (the script targets google.colab).
display(
    Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})''')
)

# Fresh, untrained GCN together with the loss and optimizer used by train().
model = GCN(hidden_channels=16)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)

def train():
    """Run one full-batch optimisation step of the global graph ``model``.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion`` and ``data``.

    Returns:
        The loss object produced by ``criterion`` for the training nodes.
    """
    model.train()
    optimizer.zero_grad()  # Drop gradients left over from the previous step.

    logits = model(data.x, data.edge_index)  # Forward pass over the whole graph.

    # Only nodes selected by the training mask contribute to the loss.
    train_nodes = data.train_mask
    loss = criterion(logits[train_nodes], data.y[train_nodes])

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the accuracy of the global graph ``model`` on the test nodes of ``data``.

    Uses the notebook-level ``model`` and ``data``.

    Returns:
        Fraction of nodes selected by ``data.test_mask`` whose predicted class
        equals the label in ``data.y``.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        out = model(data.x, data.edge_index)
    pred = out.argmax(dim=1)  # Use the class with the highest score.
    test_mask = data.test_mask
    test_correct = pred[test_mask] == data.y[test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


# Train for 1000 epochs (1..1000), printing the training loss after every step.
for epoch in range(1, 1001):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 100.9517
Epoch: 002, Loss: 81.3035
Epoch: 003, Loss: 61.2495
Epoch: 004, Loss: 35.7731
Epoch: 005, Loss: 25.1956
Epoch: 006, Loss: 21.8821
Epoch: 007, Loss: 25.7564
Epoch: 008, Loss: 28.9664
Epoch: 009, Loss: 32.4319
Epoch: 010, Loss: 33.5858
Epoch: 011, Loss: 33.9124
Epoch: 012, Loss: 29.9329
Epoch: 013, Loss: 27.3731
Epoch: 014, Loss: 26.9548
Epoch: 015, Loss: 24.8132
Epoch: 016, Loss: 24.0639
Epoch: 017, Loss: 20.0937
Epoch: 018, Loss: 19.5106
Epoch: 019, Loss: 17.5797
Epoch: 020, Loss: 15.6364
Epoch: 021, Loss: 13.2026
Epoch: 022, Loss: 11.7384
Epoch: 023, Loss: 11.6939
Epoch: 024, Loss: 11.1394
Epoch: 025, Loss: 9.6400
Epoch: 026, Loss: 11.1315
Epoch: 027, Loss: 10.8479
Epoch: 028, Loss: 11.9920
Epoch: 029, Loss: 10.2413
Epoch: 030, Loss: 7.9808
Epoch: 031, Loss: 7.6894
Epoch: 032, Loss: 6.4086
Epoch: 033, Loss: 5.9398
Epoch: 034, Loss: 5.9250
Epoch: 035, Loss: 5.1007
Epoch: 036, Loss: 5.4364
Epoch: 037, Loss: 4.7758
Epoch: 038, Loss: 5.1135
Epoch: 039, Loss: 4.7

In [107]:
# Accuracy of the current `model` on the nodes selected by the test mask.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.7727


In [108]:
# Per-class evaluation of the current graph `model` on the test nodes.
model.eval()
# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    out = model(data.x, data.edge_index)
pred = out.argmax(dim=1)
evaluate_results(data.y[data.test_mask], pred[data.test_mask])

Confusion Matrix:
[[340   0   1]
 [ 72   0   1]
 [ 24   2   0]]

Classification Report:
              precision    recall  f1-score   support

           0       0.78      1.00      0.88       341
           1       0.00      0.00      0.00        73
           2       0.00      0.00      0.00        26

    accuracy                           0.77       440
   macro avg       0.26      0.33      0.29       440
weighted avg       0.60      0.77      0.68       440



Bad pipe message: %s [b'%\r;)\x8c\xee{\x90\x8bOc\xcf\xd9\x9a2\x8e\xbdO \xba6G\x9f\xbc\xbbT\xf6\x87\xec\x19\x08\x89M@d\x9a\xcc\xd4T\x84\x7f`89p"\x02\xc0\xc9\xd4\xf1\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x00\x1e\x00\x1c\x04\x03\x05\x03\x06\x03\x08\x07\x08\x08\x08\t\x08\n\x08\x0b\x08\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06\x01\x00+\x00\x03\x02\x03\x04\x00-\x00\x02\x01\x01\x003\x00&\x00$\x00\x1d\x00 \xa5~\x91\x02\xf7\x18\xe5\xdd\x00\x17\x08b\x86U/!\xfe\x88\xd2\x8c\x15\xe2e\xdf\x98']
Bad pipe message: %s [b'f\xd8\x11\xd5L}\x08\x89L;\xe6\xf1\xba\x7f\xf3^\xa1\xea R\xfd!?\xd7^\xf8\xd3\xc6\xfbaY\xf7\x857\xe4J\xd8\xcb11:q\x1c\xe9\xa5\x85:b\xb6\x9e\xa9\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x