In [3]:
import torch_geometric
from torch_geometric.datasets import Planetoid

In [22]:
# Opt-in switch for GPU execution: the device-selection cell picks 'cuda' only when
# this flag is True AND torch reports CUDA as available; otherwise it uses 'cpu'.
use_cuda_if_available = False

In [4]:
# Load the Cora citation graph via the Planetoid dataset class, using 'tutorial1'
# as the dataset root. The first run downloads and processes the raw files
# (see the log output below).
dataset = Planetoid(root = 'tutorial1', name = 'Cora')

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


# Dataset Properties

In [8]:
# Summarise the dataset: number of graphs, number of label classes, and the
# width of the per-node and per-edge feature vectors.
print(
    f'Lenght of the dataset: {len(dataset)}',
    f'Number of classes in the dataset: {dataset.num_classes}',
    f'Number of Node Features : {dataset.num_node_features}',
    f'Number of Edge Features: {dataset.num_edge_features}',
    sep='\n',
)

Lenght of the dataset: 1
Number of classes in the dataset: 7
Number of Node Features : 1433
Number of Edge Features: 0


In [11]:
# Show the single graph held by the dataset. Indexing is the public accessor
# (the same one used later for `data = dataset[0]`), so there is no need to
# reach into the private `_data` storage attribute.
print(dataset[0])

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])


The `Data` object from PyTorch Geometric encapsulates the data representation for a graph. Here's what the attributes mean in the context of the **Cora dataset** and graph data in general:

### 1. **`x` (Node Features)**  
   - **Shape:** `[2708, 1433]`
   - **Meaning:** This represents the **feature matrix** of the graph.
     - `2708`: Number of nodes in the graph (e.g., papers in the Cora dataset).
     - `1433`: Number of features per node (e.g., a binary bag-of-words vector in which each entry indicates whether a given dictionary word appears in the paper).
   - Each row in `x` corresponds to the feature vector of a node.

---

### 2. **`edge_index` (Graph Connectivity)**  
   - **Shape:** `[2, 10556]`
   - **Meaning:** This defines the edges of the graph in a **COO (coordinate) format**.
     - The first row contains the indices of source nodes.
     - The second row contains the indices of target nodes.
     - Together, each column `(source, target)` describes one directed edge. An undirected edge is stored as two columns, one per direction — this is how the citation links of the Planetoid Cora graph are stored.

---

### 3. **`y` (Node Labels)**  
   - **Shape:** `[2708]`
   - **Meaning:** This contains the **labels** (class/category) for each node.
     - In the Cora dataset, nodes represent research papers, and `y` indicates the subject area or class of each paper.

---

### 4. **`train_mask`, `val_mask`, `test_mask` (Data Splits)**  
   - **Shape:** `[2708]`
   - **Meaning:** These are boolean masks for splitting the nodes into training, validation, and test sets.
     - `train_mask[i] == True` indicates node `i` is part of the training set.
     - Similarly, `val_mask` and `test_mask` define the nodes for validation and testing.

---

### **Summary of the `Data` Object:**
In the context of the Cora dataset:
- **Nodes** represent research papers.
- **Edges** represent citation relationships between papers.
- **Node Features (`x`)** are based on the paper's content.
- **Node Labels (`y`)** categorize the papers into different subject areas.
- **Masks** split the nodes into training, validation, and testing subsets for supervised learning tasks.

This format allows you to process graph data in a way that's compatible with graph neural network (GNN) models such as Graph Convolutional Networks (GCNs) or the GraphSAGE layer (`SAGEConv`) used later in this notebook.

In [21]:
# Fetch the single graph through the public indexing API. `dataset.data` is
# deprecated in recent PyTorch Geometric releases and emits a warning on every
# access, so the graph is looked up once and reused for all prints below.
graph = dataset[0]

# For each tensor, print its shape followed by its (abbreviated) contents.
print("edge_index:\t\t", graph.edge_index.shape)
print(graph.edge_index)
print("\n")
print("train_mask:\t\t", graph.train_mask.shape)
print(graph.train_mask)
print("\n")
print("x:\t\t", graph.x.shape)
print(graph.x)
print("\n")
print("y:\t\t", graph.y.shape)
print(graph.y)

edge_index:		 torch.Size([2, 10556])
tensor([[ 633, 1862, 2582,  ...,  598, 1473, 2706],
        [   0,    0,    0,  ..., 2707, 2707, 2707]])


train_mask:		 torch.Size([2708])
tensor([ True,  True,  True,  ..., False, False, False])


x:		 torch.Size([2708, 1433])
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])


y:		 torch.Size([2708])
tensor([3, 4, 4,  ..., 3, 3, 3])


In [14]:
import os
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

In [15]:
# The dataset contains a single graph (len(dataset) == 1 above); take it as the
# working `data` object that the model and the train/test functions read.
data = dataset[0]

In [27]:
class Net(torch.nn.Module):
    """Single-layer GraphSAGE node classifier.

    One `SAGEConv` layer maps each node's input features directly to class
    scores; `forward` returns the log-softmax of those scores over dim 1.

    Args:
        in_channels: Size of each input node feature vector. Defaults to
            `dataset.num_features` from the notebook's global `dataset`.
        out_channels: Number of output classes. Defaults to
            `dataset.num_classes` from the notebook's global `dataset`.
        aggr: Neighbourhood aggregation scheme passed to `SAGEConv`
            (e.g. 'max', 'mean', 'add'). Defaults to 'max'.
    """

    def __init__(self, in_channels=None, out_channels=None, aggr='max'):
        super().__init__()

        # Fall back to the notebook-level dataset so that `Net()` keeps working
        # exactly as before when no sizes are given.
        if in_channels is None:
            in_channels = dataset.num_features
        if out_channels is None:
            out_channels = dataset.num_classes

        self.conv = SAGEConv(in_channels, out_channels, aggr=aggr)

    def forward(self, x=None, edge_index=None):
        """Compute per-node log-probabilities.

        Args:
            x: Node feature matrix. Defaults to the notebook's global `data.x`.
            edge_index: Graph connectivity. Defaults to the notebook's global
                `data.edge_index`.

        Returns:
            The output of the convolution after `log_softmax` along dim 1.
        """
        # Fall back to the notebook-level graph so that `model()` keeps working.
        if x is None:
            x = data.x
        if edge_index is None:
            edge_index = data.edge_index

        scores = self.conv(x, edge_index)
        return F.log_softmax(scores, dim=1)

In [28]:
# Use the GPU only when CUDA is available and the notebook flag opts in.
if torch.cuda.is_available() and use_cuda_if_available:
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Place the model and the graph on the chosen device.
model = Net().to(device)
data = data.to(device)

# Adam with L2 regularisation (weight decay) over all model parameters.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [29]:
def train():
    """Perform one full-batch optimisation step using only the training nodes."""
    model.train()
    optimizer.zero_grad()

    # Forward pass over the whole graph; the loss is restricted to training nodes.
    log_probs = model()
    train_nodes = data.train_mask
    loss = F.nll_loss(log_probs[train_nodes], data.y[train_nodes])

    loss.backward()
    optimizer.step()


def test():
    """Evaluate the model on the train, validation and test node splits.

    Returns:
        list[float]: One accuracy per mask, in the order the mask names are
        requested below (train, val, test).
    """
    model.eval()
    accs = []
    # Inference only: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        logits = model()
        for _, mask in data('train_mask', 'val_mask', 'test_mask'):
            # Predicted class = index of the highest log-probability per node.
            pred = logits[mask].argmax(dim=1)
            correct = pred.eq(data.y[mask]).sum().item()
            accs.append(correct / mask.sum().item())
    return accs

In [30]:
# Full-batch training. Track the best validation accuracy seen so far together
# with the test accuracy measured at that same epoch (model selection on val).
log = 'Epoch: {:03d}, Val: {:.4f}, Test: {:.4f}'
best_val_acc = 0
test_acc = 0
for epoch in range(1, 500):
    train()
    _, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc, test_acc = val_acc, tmp_test_acc

    # Report progress every tenth epoch.
    if not epoch % 10:
        print(log.format(epoch, best_val_acc, test_acc))

Epoch: 010, Val: 0.7200, Test: 0.7060
Epoch: 020, Val: 0.7200, Test: 0.7060
Epoch: 030, Val: 0.7200, Test: 0.7060
Epoch: 040, Val: 0.7200, Test: 0.7060
Epoch: 050, Val: 0.7200, Test: 0.7060
Epoch: 060, Val: 0.7220, Test: 0.7000
Epoch: 070, Val: 0.7260, Test: 0.7050
Epoch: 080, Val: 0.7260, Test: 0.7050
Epoch: 090, Val: 0.7260, Test: 0.7050
Epoch: 100, Val: 0.7260, Test: 0.7050
Epoch: 110, Val: 0.7260, Test: 0.7050
Epoch: 120, Val: 0.7260, Test: 0.7050
Epoch: 130, Val: 0.7260, Test: 0.7050
Epoch: 140, Val: 0.7260, Test: 0.7050
Epoch: 150, Val: 0.7260, Test: 0.7050
Epoch: 160, Val: 0.7260, Test: 0.7050
Epoch: 170, Val: 0.7260, Test: 0.7050
Epoch: 180, Val: 0.7260, Test: 0.7050
Epoch: 190, Val: 0.7260, Test: 0.7050
Epoch: 200, Val: 0.7260, Test: 0.7050
Epoch: 210, Val: 0.7260, Test: 0.7050
Epoch: 220, Val: 0.7260, Test: 0.7050
Epoch: 230, Val: 0.7260, Test: 0.7050
Epoch: 240, Val: 0.7260, Test: 0.7050
Epoch: 250, Val: 0.7260, Test: 0.7050
Epoch: 260, Val: 0.7260, Test: 0.7050
Epoch: 270, 