<a href="https://colab.research.google.com/github/muhammad-usama-aleem/DNA-Classification-using-GNN/blob/main/DNA_Classification_using_GNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install PyTorch Geometric and its compiled companions (Colab-style setup cell).
# The `-f` wheel index below is specific to torch 1.10.0 built for CUDA 11.3.
# NOTE(review): that index has to match the torch/CUDA build of the runtime — confirm before re-running.
# NOTE(review): the last line installs from the repository's default branch with no version pin.
!pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/rusty1s/pytorch_geometric.git

  Building wheel for torch-sparse (setup.py) ... [?25l[?25hdone


In [2]:
import os.path as osp

import torch
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold

from torch_geometric.datasets import Planetoid
from torch_geometric.nn import DNAConv

In [3]:
from torch_geometric.transforms import NormalizeFeatures

# Load the Cora dataset through Planetoid, storing it under data/Planetoid and
# passing NormalizeFeatures() as the dataset transform.
dataset = Planetoid(root='data/Planetoid', name='Cora', transform=NormalizeFeatures())

# Disabled script-style alternative, kept for reference only. It derives the
# data directory from `__file__` and applies no transform.
# NOTE(review): `__file__` is normally undefined inside a notebook kernel — confirm before re-enabling.
# dataset = 'Cora'
# path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', dataset)
# dataset = Planetoid(path, dataset)

Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.x
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.tx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.allx
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.y
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ty
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.ally
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.graph
Downloading https://github.com/kimiyoung/planetoid/raw/master/data/ind.cora.test.index
Processing...
Done!


In [4]:
# Dataset-level summary, emitted with a single print call; the leading empty
# string reproduces the blank line that opens the report.
print(
    '',
    f'Dataset: {dataset}:',
    '==================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
    f'Number of classes: {dataset.num_classes}',
    sep='\n',
)

# Take the first graph object from the dataset and work with it from here on.
data = dataset[0]

# Blank line, the graph's summary, then a horizontal rule.
print(
    '',
    data,
    '===============================================================================================================',
    sep='\n',
)

# Structural statistics of the graph, reported one per line.
print(
    f'Number of nodes: {data.num_nodes}',
    f'Number of edges: {data.num_edges}',
    f'Average node degree: {data.num_edges / data.num_nodes:.2f}',
    f'Number of training nodes: {data.train_mask.sum()}',
    f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.3f}',
    f'Has isolated nodes: {data.has_isolated_nodes()}',
    f'Has self-loops: {data.has_self_loops()}',
    f'Is undirected: {data.is_undirected()}',
    sep='\n',
)


Dataset: Cora():
Number of graphs: 1
Number of features: 1433
Number of classes: 7

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])
Number of nodes: 2708
Number of edges: 10556
Average node degree: 3.90
Number of training nodes: 140
Training node label rate: 0.052
Has isolated nodes: False
Has self-loops: False
Is undirected: True


In [5]:
# Discard the stock split masks one at a time; index-based splits are attached
# to `data` in the following cell instead.
data.train_mask = None
data.val_mask = None
data.test_mask = None

In [6]:
def gen_uniform_20_20_60_split(data, seed=55):
    """Attach a stratified 20/20/60 train/val/test node split to ``data``.

    The labels in ``data.y`` are cut into five shuffled, stratified folds with
    ``StratifiedKFold``. Fold 0 becomes the training set, fold 1 the
    validation set, and folds 2-4 are concatenated into the test set.

    Args:
        data: Graph object exposing the node labels as the tensor ``data.y``.
            It is modified in place.
        seed: ``random_state`` used for the fold shuffling. Defaults to 55,
            the value that was previously hard-coded, so existing callers get
            the same split as before.

    Returns:
        The same ``data`` object, now carrying ``train_idx``, ``val_idx`` and
        ``test_idx`` as ``torch.long`` index tensors created on the CPU.
    """
    # scikit-learn operates on host NumPy arrays, so give it an explicit CPU
    # copy of the labels; this also keeps the function usable after `data`
    # has been moved to a GPU.
    labels = data.y.detach().cpu().numpy()
    skf = StratifiedKFold(5, shuffle=True, random_state=seed)
    # Only the held-out half of each (rest, held_out) pair is kept: the five
    # held-out folds are what gets distributed over train/val/test.
    folds = [torch.from_numpy(held_out).to(torch.long)
             for _, held_out in skf.split(labels, labels)]
    data.train_idx = folds[0]
    data.val_idx = folds[1]
    data.test_idx = torch.cat(folds[2:], dim=0)
    return data


# Attach the train/val/test index tensors to the graph; the helper returns the
# object it was given, so `data` is rebound to that returned object.
data = gen_uniform_20_20_60_split(data)

In [7]:
# Show the graph's attributes again, now that the index-based splits are attached.
print(data)

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_idx=[542], val_idx=[542], test_idx=[1624])


In [8]:
class Net(torch.nn.Module):
    """Node classifier: linear encoder, a stack of ``DNAConv`` layers, linear head.

    Every convolution receives the encoder output together with the outputs
    of all earlier convolutions, stacked along dimension 1. The output of the
    last layer is fed to the classification head.
    """

    def __init__(self, in_channels, hidden_channels, out_channels, num_layers,
                 heads=1, groups=1):
        """Build the encoder, ``num_layers`` convolutions and the output head.

        ``heads`` and ``groups`` are forwarded to every ``DNAConv``, which is
        additionally created with ``dropout=0.8`` and ``cached=True``.
        """
        super().__init__()
        self.hidden_channels = hidden_channels
        self.lin1 = torch.nn.Linear(in_channels, hidden_channels)
        self.convs = torch.nn.ModuleList([
            DNAConv(hidden_channels, heads, groups, dropout=0.8, cached=True)
            for _ in range(num_layers)
        ])
        self.lin2 = torch.nn.Linear(hidden_channels, out_channels)

    def reset_parameters(self):
        """Re-initialise the encoder, every convolution and the head, in that order."""
        for module in (self.lin1, *self.convs, self.lin2):
            module.reset_parameters()

    def forward(self, x, edge_index):
        """Return per-node log-probabilities over the output classes."""
        hidden = F.dropout(F.relu(self.lin1(x)), p=0.5, training=self.training)
        # `history` stacks one representation per layer along dim 1.
        history = hidden.view(-1, 1, self.hidden_channels)
        for conv in self.convs:
            latest = F.relu(conv(history, edge_index))
            history = torch.cat(
                [history, latest.view(-1, 1, self.hidden_channels)], dim=1)
        # Only the most recent representation is classified.
        final = F.dropout(history[:, -1], p=0.5, training=self.training)
        return F.log_softmax(self.lin2(final), dim=1)

In [9]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Five DNAConv layers on 128 hidden channels, with 8 heads and 16 groups.
model = Net(
    in_channels=dataset.num_features,
    hidden_channels=128,
    out_channels=dataset.num_classes,
    num_layers=5,
    heads=8,
    groups=16,
)
model = model.to(device)
data = data.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=0.0005)
# NOTE(review): `criterion` is not referenced by the train()/test() helpers in
# this notebook, which call F.nll_loss on the model's log-probabilities.
criterion = torch.nn.CrossEntropyLoss()

cuda


In [10]:
def train():
    """Run one full-batch optimisation step on the training nodes.

    Uses the notebook-level ``model``, ``optimizer`` and ``data`` objects and
    returns nothing.
    """
    model.train()
    optimizer.zero_grad()
    train_nodes = data.train_idx
    log_probs = model(data.x, data.edge_index)
    # The loss is computed on the training nodes only, then back-propagated.
    F.nll_loss(log_probs[train_nodes], data.y[train_nodes]).backward()
    optimizer.step()


def test():
    model.eval()
    logits, accs = model(data.x, data.edge_index), []
    for _, idx in data('train_idx', 'val_idx', 'test_idx'):
        pred = logits[idx].max(1)[1]
        acc = pred.eq(data.y[idx]).sum().item() / idx.numel()
        accs.append(acc)
    return accs

In [12]:
# Train for 500 epochs. Keep the best validation accuracy seen so far together
# with the test accuracy measured at that same epoch.
best_val_acc = 0
test_acc = 0
for epoch in range(1, 501):
    train()
    train_acc, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc, test_acc = val_acc, tmp_test_acc
    # "Val" and "Test" report the running best, not this epoch's raw scores.
    print(f'Epoch: {epoch:03d}, Train: {train_acc:.4f}, Val: {best_val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 001, Train: 0.7251, Val: 0.6919, Test: 0.6706
Epoch: 002, Train: 0.7343, Val: 0.6956, Test: 0.6804
Epoch: 003, Train: 0.7472, Val: 0.7011, Test: 0.6829
Epoch: 004, Train: 0.7472, Val: 0.7048, Test: 0.6847
Epoch: 005, Train: 0.7472, Val: 0.7048, Test: 0.6847
Epoch: 006, Train: 0.7362, Val: 0.7048, Test: 0.6847
Epoch: 007, Train: 0.7269, Val: 0.7048, Test: 0.6847
Epoch: 008, Train: 0.7306, Val: 0.7048, Test: 0.6847
Epoch: 009, Train: 0.7491, Val: 0.7159, Test: 0.6903
Epoch: 010, Train: 0.7786, Val: 0.7177, Test: 0.6995
Epoch: 011, Train: 0.7786, Val: 0.7196, Test: 0.6989
Epoch: 012, Train: 0.7712, Val: 0.7196, Test: 0.6989
Epoch: 013, Train: 0.7675, Val: 0.7196, Test: 0.6989
Epoch: 014, Train: 0.7528, Val: 0.7196, Test: 0.6989
Epoch: 015, Train: 0.7509, Val: 0.7196, Test: 0.6989
Epoch: 016, Train: 0.7528, Val: 0.7196, Test: 0.6989
Epoch: 017, Train: 0.7601, Val: 0.7196, Test: 0.6989
Epoch: 018, Train: 0.7694, Val: 0.7196, Test: 0.6989
Epoch: 019, Train: 0.7731, Val: 0.7232, Test: 