In [1]:
# Name of the project to analyse. It selects the "output/<project_name>/"
# folder the dataset is loaded from; swap the active line to switch projects.
#project_name = "."
#project_name = "jfreechart"
#project_name = "argouml"
project_name = "weka"

In [2]:
import torch
import random

def load_pyg_dataset(file_path):
    """Deserialize and return the object stored at ``file_path``.

    NOTE(review): ``torch.load`` is pickle-based, so loading a file from an
    untrusted source can execute arbitrary code. Only use it on files you
    produced yourself.
    """
    loaded = torch.load(file_path)
    return loaded

# Serialized graph for the selected project, read from its output folder.
output_file_path = f"output/{project_name}/pyg_dataset_for_problematic_class.pt"
dataset = load_pyg_dataset(output_file_path)

def add_train_mask_to_pyg_data(data, train_ratio=0.8, seed=None):
    """Attach a random boolean ``train_mask`` to ``data`` (modified in place).

    Args:
        data: Object exposing ``num_nodes``; it receives a ``train_mask``
            attribute holding one boolean per node.
        train_ratio: Fraction of nodes to mark as training nodes, in [0, 1].
            The number of training nodes is ``int(train_ratio * num_nodes)``.
        seed: Optional seed for a private random generator, which makes the
            split reproducible. When ``None`` (the default) the global
            ``random`` state is used.

    Raises:
        ValueError: If ``train_ratio`` lies outside [0, 1]. Without this
            check such values produce a mask whose length differs from
            ``num_nodes``.
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be within [0, 1], got {train_ratio}.")

    num_nodes = data.num_nodes
    num_train_nodes = int(train_ratio * num_nodes)

    # Exactly num_train_nodes True entries, the remainder False.
    mask_values = [True] * num_train_nodes + [False] * (num_nodes - num_train_nodes)

    # A dedicated generator keeps a seeded split independent of global state.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(mask_values)

    # Convert the shuffled list to a boolean tensor stored on the data object.
    data.train_mask = torch.tensor(mask_values, dtype=torch.bool)

def add_test_mask_to_pyg_data(data):
    """Attach ``test_mask`` to ``data`` as the complement of ``train_mask``.

    Every node that is not a training node becomes a test node. ``data`` is
    modified in place.

    Raises:
        ValueError: If ``data`` has no ``train_mask`` or it is ``None``.
    """
    # getattr with a default also catches a mask that exists but is None,
    # which would otherwise surface as a TypeError from the ``~`` operator.
    train_mask = getattr(data, 'train_mask', None)
    if train_mask is None:
        raise ValueError("Please add a train_mask to the Data object first.")

    # Create the test mask as the element-wise negation of the train mask.
    data.test_mask = ~train_mask


# Split the nodes 70/30 into training and test sets.
add_train_mask_to_pyg_data(dataset, train_ratio=0.7)
add_test_mask_to_pyg_data(dataset)

print()
print(f'Dataset: {dataset}:')
print('======================')
# `dataset` is a single graph object, not a collection of graphs: len() on it
# counts its stored attributes (x, edge_index, y, ...), so it must not be
# reported as the number of graphs.
print('Number of graphs: 1')
print(f'Number of features: {dataset.num_features}')
print(f'Number of classes: {dataset.num_classes}')

# The cells below refer to the graph as `data`.
data = dataset

print()
print(data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {data.num_nodes}')
print(f'Number of edges: {data.num_edges}')
print(f'Average node degree: {data.num_edges / data.num_nodes:.2f}')
print(f'Number of training nodes: {data.train_mask.sum()}')
print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
print(f'Has isolated nodes: {data.has_isolated_nodes()}')
print(f'Has self-loops: {data.has_self_loops()}')
print(f'Is undirected: {data.is_undirected()}')


Dataset: Data(x=[1507, 18], edge_index=[2, 18424], y=[1507], num_classes=2, train_mask=[1507], test_mask=[1507]):
Number of graphs: 6
Number of features: 18
Number of classes: 2

Data(x=[1507, 18], edge_index=[2, 18424], y=[1507], num_classes=2, train_mask=[1507], test_mask=[1507])
Number of nodes: 1507
Number of edges: 18424
Average node degree: 12.23
Number of training nodes: 1054
Training node label rate: 0.70
Has isolated nodes: False
Has self-loops: True
Is undirected: False


In [3]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron that scores each node from its own features only.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Number of input features per node. Defaults to
            ``dataset.num_features`` of the notebook-level ``dataset``.
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None):
        super().__init__()
        # torch.manual_seed(12345)  # uncomment for reproducible initial weights
        # Fall back to the notebook-level dataset only when a size is not
        # given, so the model can also be built without that global.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return one row of unnormalized class scores per row of ``x``."""
        x = self.lin1(x)
        x = x.relu()
        # Dropout is applied only while the module is in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin2(x)
        return x

# Build an MLP with a 16-unit hidden layer and print its layer layout.
model = MLP(hidden_channels=16)
print(model)

MLP(
  (lin1): Linear(in_features=18, out_features=16, bias=True)
  (lin2): Linear(in_features=16, out_features=2, bias=True)
)


In [4]:
from IPython.display import Javascript  # Restrict height of output cell.
# NOTE(review): the snippet calls the `google.colab` JavaScript API, so it
# presumably only has an effect when the notebook runs in Colab - confirm.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Fresh model, loss and optimizer; train() and test() below read these globals.
model = MLP(hidden_channels=16)
criterion = torch.nn.CrossEntropyLoss()  # Define loss criterion.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)  # Define optimizer.

def train():
    """Run one full-batch optimization step on the MLP and return its loss.

    Reads the notebook globals ``model``, ``optimizer``, ``criterion`` and
    ``data``. The returned value is the loss tensor produced by ``criterion``.
    """
    model.train()
    optimizer.zero_grad()  # Start the step from clean gradients.

    logits = model(data.x)  # Forward pass over every node.
    train_nodes = data.train_mask
    # Only the training nodes contribute to the loss.
    loss = criterion(logits[train_nodes], data.y[train_nodes])

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the MLP's accuracy on the nodes selected by ``data.test_mask``.

    Reads the notebook globals ``model`` and ``data`` and switches ``model``
    to evaluation mode.
    """
    model.eval()
    # Inference needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        out = model(data.x)
    pred = out.argmax(dim=1)  # Index of the highest-scoring class per node.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc

# Full-batch training; the loss is printed after every epoch.
num_epochs = 1000
for epoch in range(1, num_epochs + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 6.8740
Epoch: 002, Loss: 4.9823
Epoch: 003, Loss: 3.2990
Epoch: 004, Loss: 2.5446
Epoch: 005, Loss: 2.2083
Epoch: 006, Loss: 1.6796
Epoch: 007, Loss: 1.4595
Epoch: 008, Loss: 1.0361
Epoch: 009, Loss: 0.8869
Epoch: 010, Loss: 0.7762
Epoch: 011, Loss: 0.6808
Epoch: 012, Loss: 0.6153
Epoch: 013, Loss: 0.6068
Epoch: 014, Loss: 0.6036
Epoch: 015, Loss: 0.5820
Epoch: 016, Loss: 0.5677
Epoch: 017, Loss: 0.5875
Epoch: 018, Loss: 0.5629
Epoch: 019, Loss: 0.5692
Epoch: 020, Loss: 0.5499
Epoch: 021, Loss: 0.5475
Epoch: 022, Loss: 0.5491
Epoch: 023, Loss: 0.5440
Epoch: 024, Loss: 0.5288
Epoch: 025, Loss: 0.5275
Epoch: 026, Loss: 0.5256
Epoch: 027, Loss: 0.4963
Epoch: 028, Loss: 0.5040
Epoch: 029, Loss: 0.4853
Epoch: 030, Loss: 0.4730
Epoch: 031, Loss: 0.4541
Epoch: 032, Loss: 0.4409
Epoch: 033, Loss: 0.4280
Epoch: 034, Loss: 0.4284
Epoch: 035, Loss: 0.4484
Epoch: 036, Loss: 0.4147
Epoch: 037, Loss: 0.4318
Epoch: 038, Loss: 0.4214
Epoch: 039, Loss: 0.4222
Epoch: 040, Loss: 0.4251


In [5]:
# Accuracy of the current model on the held-out test nodes.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.9404


In [6]:
from sklearn.metrics import classification_report, confusion_matrix

def evaluate_results(y_test, y_pred):
    """Print the confusion matrix and classification report for predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels, aligned element-wise with ``y_test``.
    """
    print("Confusion Matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    print("\nClassification Report:")
    report = classification_report(y_test, y_pred)
    print(report)

# Detailed per-class evaluation of the current model on the test nodes.
model.eval()
# Predictions only: gradients are not needed, so skip autograd bookkeeping.
with torch.no_grad():
    out = model(data.x)
pred = out.argmax(dim=1)
evaluate_results(data.y[data.test_mask], pred[data.test_mask])

Confusion Matrix:
[[292   3]
 [ 24 134]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.99      0.96       295
           1       0.98      0.85      0.91       158

    accuracy                           0.94       453
   macro avg       0.95      0.92      0.93       453
weighted avg       0.94      0.94      0.94       453



In [8]:
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network built from ``GCNConv`` layers.

    Args:
        hidden_channels: Width of the hidden representation.
        in_channels: Number of input features per node. Defaults to
            ``dataset.num_features`` of the notebook-level ``dataset``.
        out_channels: Number of output classes. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.
    """

    def __init__(self, hidden_channels, in_channels=None, out_channels=None):
        super().__init__()
        # torch.manual_seed(1234567)  # uncomment for reproducible initial weights
        # Fall back to the notebook-level dataset only when a size is not
        # given, so the model can also be built without that global.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index, edge_weight=None):
        """Return one row of unnormalized class scores per node.

        Args:
            x: Node feature matrix.
            edge_index: Graph connectivity passed to both convolutions.
            edge_weight: Optional per-edge weights passed to both
                convolutions; ``None`` leaves the edges unweighted.
        """
        x = self.conv1(x, edge_index, edge_weight)
        x = x.relu()
        # Dropout is applied only while the module is in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        return x

# Build a GCN with a 16-unit hidden layer and print its layer layout.
model = GCN(hidden_channels=16)
print(model)

GCN(
  (conv1): GCNConv(18, 16)
  (conv2): GCNConv(16, 2)
)


In [9]:
# Forward pass through a freshly initialised (untrained) GCN in eval mode.
# NOTE(review): `out` is overwritten by the later evaluation cell before it is
# read anywhere - presumably a leftover sanity check; confirm before removing.
model = GCN(hidden_channels=16)
model.eval()

out = model(data.x, data.edge_index, data.edge_weight)

In [14]:
from IPython.display import Javascript  # Restrict height of output cell.
# NOTE(review): the snippet calls the `google.colab` JavaScript API, so it
# presumably only has an effect when the notebook runs in Colab - confirm.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 300})'''))

# Fresh GCN, optimizer and loss; train() and test() below read these globals.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    """Run one full-batch optimization step on the GCN and return its loss.

    Reads the notebook globals ``model``, ``optimizer``, ``criterion`` and
    ``data``. This definition replaces the earlier MLP ``train()``.
    """
    model.train()
    optimizer.zero_grad()  # Start the step from clean gradients.

    # Forward pass over the whole graph.
    logits = model(data.x, data.edge_index, data.edge_weight)
    train_nodes = data.train_mask
    # Only the training nodes contribute to the loss.
    loss = criterion(logits[train_nodes], data.y[train_nodes])

    loss.backward()  # Back-propagate.
    optimizer.step()  # Apply the parameter update.
    return loss

def test():
    """Return the GCN's accuracy on the nodes selected by ``data.test_mask``.

    Reads the notebook globals ``model`` and ``data`` and switches ``model``
    to evaluation mode. This definition replaces the earlier MLP ``test()``.
    """
    model.eval()
    # Inference needs no gradients, so skip building the autograd graph.
    with torch.no_grad():
        out = model(data.x, data.edge_index, data.edge_weight)
    pred = out.argmax(dim=1)  # Index of the highest-scoring class per node.
    test_correct = pred[data.test_mask] == data.y[data.test_mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(data.test_mask.sum())  # Derive ratio of correct predictions.
    return test_acc


# Full-batch training; the loss is printed after every epoch.
num_epochs = 5000
for epoch in range(1, num_epochs + 1):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

<IPython.core.display.Javascript object>

Epoch: 001, Loss: 157.4280
Epoch: 002, Loss: 113.3165
Epoch: 003, Loss: 80.1986
Epoch: 004, Loss: 65.8373
Epoch: 005, Loss: 48.4575
Epoch: 006, Loss: 85.2972
Epoch: 007, Loss: 55.3950
Epoch: 008, Loss: 50.1072
Epoch: 009, Loss: 45.8883
Epoch: 010, Loss: 44.3533
Epoch: 011, Loss: 42.9279
Epoch: 012, Loss: 41.5951
Epoch: 013, Loss: 33.4528
Epoch: 014, Loss: 42.3014
Epoch: 015, Loss: 37.8689
Epoch: 016, Loss: 31.3395
Epoch: 017, Loss: 32.1398
Epoch: 018, Loss: 35.7587
Epoch: 019, Loss: 29.5770
Epoch: 020, Loss: 28.4616
Epoch: 021, Loss: 23.2398
Epoch: 022, Loss: 27.8619
Epoch: 023, Loss: 22.5578
Epoch: 024, Loss: 21.3936
Epoch: 025, Loss: 22.6190
Epoch: 026, Loss: 15.9698
Epoch: 027, Loss: 15.8091
Epoch: 028, Loss: 17.2814
Epoch: 029, Loss: 12.3467
Epoch: 030, Loss: 12.6121
Epoch: 031, Loss: 12.3314
Epoch: 032, Loss: 14.5078
Epoch: 033, Loss: 10.7255
Epoch: 034, Loss: 8.9273
Epoch: 035, Loss: 8.4339
Epoch: 036, Loss: 8.3722
Epoch: 037, Loss: 8.1580
Epoch: 038, Loss: 6.8969
Epoch: 039, Los

In [15]:
# Accuracy of the current model on the held-out test nodes.
test_acc = test()
print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.6623


In [13]:
# Detailed per-class evaluation of the current model on the test nodes.
model.eval()
# Predictions only: gradients are not needed, so skip autograd bookkeeping.
with torch.no_grad():
    out = model(data.x, data.edge_index, data.edge_weight)
pred = out.argmax(dim=1)
evaluate_results(data.y[data.test_mask], pred[data.test_mask])

Confusion Matrix:
[[289   6]
 [141  17]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.98      0.80       295
           1       0.74      0.11      0.19       158

    accuracy                           0.68       453
   macro avg       0.71      0.54      0.49       453
weighted avg       0.70      0.68      0.58       453

