In [1]:
import os
import torch
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split



In [2]:
from torch_geometric.datasets import Planetoid
from torch_geometric.transforms import NormalizeFeatures

citeseer_dataset = Planetoid(root = "Citeseer_dataset", name = "Citeseer", transform = NormalizeFeatures())

In [3]:
print(len(citeseer_dataset))
print(citeseer_dataset.num_classes)
print(citeseer_dataset.num_features)
citeseer_graph = citeseer_dataset[0]
citeseer_graph

1
6
3703


Data(x=[3327, 3703], edge_index=[2, 9104], y=[3327], train_mask=[3327], val_mask=[3327], test_mask=[3327])

In [4]:
citeseer_graph.x

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [5]:
print("Training samples: ", citeseer_graph.train_mask.sum().item())
print("Validation samples: ", citeseer_graph.val_mask.sum().item())
print("Test samples: ", citeseer_graph.test_mask.sum().item())

Training samples:  120
Validation samples:  500
Test samples:  1000


In [6]:
citeseer_graph.y

tensor([3, 1, 5,  ..., 3, 1, 5])

In [7]:
print(f'Number of nodes: {citeseer_graph.num_nodes}')
print(f'Number of edges: {citeseer_graph.num_edges}')
print(f'Average node degree: {citeseer_graph.num_edges / citeseer_graph.num_nodes:.2f}')
print(f'Has isolated nodes: {citeseer_graph.has_isolated_nodes()}')
print(f'Has self-loops: {citeseer_graph.has_self_loops()}')
print(f'Is undirected: {citeseer_graph.is_undirected()}')

Number of nodes: 3327
Number of edges: 9104
Average node degree: 2.74
Has isolated nodes: True
Has self-loops: False
Is undirected: True


In [8]:
# Extract features (X) and labels (y)
X = citeseer_graph.x.cpu().numpy()
y = citeseer_graph.y.cpu().numpy()

# Split data into train, validation, and test sets
train_mask = citeseer_graph.train_mask.cpu().numpy()
val_mask = citeseer_graph.val_mask.cpu().numpy()
test_mask = citeseer_graph.test_mask.cpu().numpy()

X_train, y_train = X[train_mask], y[train_mask]
X_val, y_val = X[val_mask], y[val_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [9]:
# Combine train and validation sets for XGBoost training
X_train_combined = torch.cat([torch.tensor(X_train), torch.tensor(X_val)]).numpy()
y_train_combined = torch.cat([torch.tensor(y_train), torch.tensor(y_val)]).numpy()

In [10]:
# Train an XGBoost classifier
model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=2,
    eval_metric="mlogloss",
    use_label_encoder=False,
    learning_rate=0.01,
    max_depth=6,
    n_estimators=500
)

In [11]:
model.fit(X_train_combined, y_train_combined)

Parameters: { "use_label_encoder" } are not used.



In [12]:
# Evaluate on the test set
y_pred = model.predict(X_test)

In [13]:
y_pred

array([3, 2, 4, 3, 4, 1, 0, 2, 3, 5, 3, 2, 2, 5, 2, 4, 3, 2, 1, 1, 4, 3,
       3, 4, 2, 0, 3, 4, 3, 3, 3, 5, 0, 4, 4, 3, 1, 0, 3, 3, 3, 3, 3, 2,
       1, 3, 3, 3, 2, 0, 2, 3, 3, 1, 5, 5, 5, 3, 2, 2, 3, 2, 3, 3, 5, 2,
       3, 1, 1, 2, 1, 2, 1, 1, 1, 1, 3, 3, 2, 4, 3, 3, 1, 1, 2, 2, 2, 2,
       1, 3, 0, 3, 3, 3, 2, 1, 3, 3, 4, 4, 3, 1, 2, 2, 3, 2, 1, 5, 1, 1,
       3, 1, 5, 2, 1, 3, 5, 4, 3, 5, 1, 2, 2, 3, 5, 3, 3, 4, 0, 3, 1, 2,
       1, 3, 4, 4, 2, 4, 4, 4, 5, 0, 3, 1, 1, 3, 1, 3, 5, 3, 4, 4, 2, 2,
       3, 2, 3, 3, 3, 5, 5, 2, 2, 4, 3, 1, 3, 3, 2, 5, 2, 3, 5, 1, 1, 2,
       4, 2, 5, 1, 3, 4, 4, 4, 1, 1, 3, 1, 3, 5, 1, 2, 1, 3, 3, 3, 3, 3,
       5, 4, 3, 4, 4, 1, 1, 4, 2, 3, 2, 2, 5, 2, 1, 1, 2, 0, 1, 4, 4, 1,
       1, 1, 3, 1, 2, 3, 3, 5, 2, 2, 5, 2, 2, 5, 5, 5, 1, 5, 3, 2, 3, 2,
       4, 1, 3, 4, 5, 4, 3, 3, 2, 1, 5, 5, 1, 1, 2, 2, 5, 2, 3, 3, 2, 4,
       1, 3, 3, 1, 3, 1, 2, 3, 1, 3, 1, 2, 1, 1, 1, 2, 5, 2, 2, 3, 2, 5,
       4, 4, 5, 2, 3, 4, 2, 4, 4, 4, 4, 4, 2, 3, 1,

In [14]:
# Define label names and colors
label_dict = {
    0: "Agents",
    1: "AI",
    2: "DB",
    3: "IR",
    4: "ML",
    5: "HCI"
}


In [15]:
test_acc = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {test_acc:.3f}")

# Display predictions for the first 10 nodes
print("Predictions for the first 10 nodes in the test set:")
for i in range(10):
    print(f"Node {i}: Predicted={label_dict[y_pred[i]]}, Actual={label_dict[y_test[i]]}")

Test Accuracy: 0.644
Predictions for the first 10 nodes in the test set:
Node 0: Predicted=IR, Actual=ML
Node 1: Predicted=DB, Actual=HCI
Node 2: Predicted=ML, Actual=ML
Node 3: Predicted=IR, Actual=ML
Node 4: Predicted=ML, Actual=ML
Node 5: Predicted=AI, Actual=AI
Node 6: Predicted=Agents, Actual=ML
Node 7: Predicted=DB, Actual=DB
Node 8: Predicted=IR, Actual=IR
Node 9: Predicted=HCI, Actual=IR
