In [None]:
# Core dependencies for the notebook. The torch_scatter / torch_sparse /
# torch_geometric / ogb imports double as an early check that the graph-learning
# stack is installed before any later cell runs.
import torch
import os
import pandas as pd
import torch_scatter
import torch_geometric
import torch_sparse
import ogb
# The top-level `tensorboard` package does not export SummaryWriter, so
# `from tensorboard import SummaryWriter` raises ImportError. PyTorch ships the
# writer under torch.utils.tensorboard.
from torch.utils.tensorboard import SummaryWriter

print(torch.__version__)

In [None]:
## Load the data set
# Loader / evaluator utilities used by this cell and the cells below.
from ogb.graphproppred import PygGraphPropPredDataset, Evaluator
from torch_geometric.loader import DataLoader
import torch_geometric.transforms as T
from tqdm import tqdm

# Dataset identifier handed to the OGB dataset class; no transform is applied.
dataset_name = 'ogbg-molhiv'
dataset = PygGraphPropPredDataset(name=dataset_name, transform=None)

# Every tensor and the model are placed on this device by later cells.
device = 'cpu'
print(f'Device: {device}')

# Index split provided by the dataset; later cells read its
# 'train' / 'valid' / 'test' entries.
split_idx = dataset.get_idx_split()

print(f'Dataset task type: {dataset.task_type}')

In [None]:
# Mini-batch loaders for the three dataset splits.
# Only the training loader shuffles: shuffling the validation / test loaders
# gives no benefit for evaluation and makes the row order of any saved
# prediction file differ from run to run.
BATCH_SIZE = 32

train_loader = DataLoader(dataset[split_idx['train']], batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
test_loader = DataLoader(dataset[split_idx['test']], batch_size=BATCH_SIZE, shuffle=False, num_workers=0)
valid_loader = DataLoader(dataset[split_idx['valid']], batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

In [None]:
# Model / optimisation settings read by the model-construction and training cells.
hyper_parameters = dict(
    device=device,
    num_layers=3,
    hidden_dim=1024,
    dropout=0.5,
    lr=0.001,
    epochs=10,
)
# Bare expression so the notebook displays the configuration.
hyper_parameters

In [None]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F


class GCN(torch.nn.Module):
    """Stack of ``GCNConv`` layers with BatchNorm, ReLU and dropout in between.

    Layer layout built by ``__init__``: one ``input_dim -> hidden_dim`` conv,
    ``num_layers - 2`` ``hidden_dim -> hidden_dim`` convs and one final
    ``hidden_dim -> output_dim`` conv, plus ``num_layers - 1`` batch-norm
    layers (one after every conv except the last).

    Args:
        input_dim: Feature size expected by the first conv layer.
        hidden_dim: Feature size of the intermediate layers.
        num_layers: Total number of conv layers. The layout above assumes
            ``num_layers >= 2``.
        output_dim: Feature size produced by the last conv layer.
        dropout: Dropout probability applied after every hidden layer.
        return_embeds: If True, ``forward`` returns the raw output of the last
            conv layer; otherwise a log-softmax over the last dimension is
            applied.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout, return_embeds=False):
        super().__init__()

        self.convs = torch.nn.ModuleList(
            [GCNConv(input_dim, hidden_dim)]
            + [GCNConv(hidden_dim, hidden_dim) for _ in range(num_layers - 2)]
            + [GCNConv(hidden_dim, output_dim)]
        )

        self.batchnorm = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_dim) for _ in range(num_layers - 1)]
        )

        # An explicit dim avoids the deprecated implicit-dimension behaviour
        # (and the warning it emits on every call). For 2-D inputs the last
        # dimension is the one the implicit rule picked as well.
        self.softmax = torch.nn.LogSoftmax(dim=-1)
        self.dropout = dropout
        self.return_embeds = return_embeds

    def reset_parameters(self):
        """Re-initialise every conv and batch-norm layer."""
        for conv_layer in self.convs:
            conv_layer.reset_parameters()
        for batchnorm_layer in self.batchnorm:
            batchnorm_layer.reset_parameters()

    def forward(self, x, adj_t):
        """Apply the layer stack.

        Args:
            x: Node feature tensor fed to the first conv layer.
            adj_t: Graph connectivity, passed unchanged as the second argument
                of every conv layer.

        Returns:
            The output of the last conv layer, log-softmax-normalised over the
            last dimension unless ``return_embeds`` is True.
        """
        # zip stops at the shorter list (batchnorm), so the last conv is left
        # for the un-normalised output step below.
        for conv_layer, batchnorm_layer in zip(self.convs, self.batchnorm):
            x = conv_layer(x, adj_t)
            x = batchnorm_layer(x)
            x = F.relu(x)
            # self.training is maintained by nn.Module (.train() / .eval()),
            # so dropout is only active while training.
            x = F.dropout(x, p=self.dropout, training=self.training, inplace=False)

        x = self.convs[-1](x, adj_t)

        if not self.return_embeds:
            x = self.softmax(x)

        return x

In [None]:
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch_geometric.nn import global_add_pool, global_mean_pool

class GCN_Graph(torch.nn.Module):
    """Graph-level model: atom encoder -> GCN -> mean pooling -> linear head.

    Args:
        hidden_dim: Width of the node embeddings throughout the network.
        output_dim: Size of the per-graph output produced by the linear head.
        num_layers: Number of layers passed on to the inner ``GCN``.
        dropout: Dropout probability passed on to the inner ``GCN``.
    """

    def __init__(self, hidden_dim, output_dim, num_layers, dropout):
        super().__init__()

        # Encodes the atom features of molecule graphs into hidden_dim-sized embeddings.
        self.node_encoder = AtomEncoder(hidden_dim)
        # Node-level GCN; return_embeds=True so it yields embeddings rather
        # than log-softmax scores.
        self.gnn_node = GCN(
            input_dim=hidden_dim,
            hidden_dim=hidden_dim,
            num_layers=num_layers,
            output_dim=hidden_dim,
            dropout=dropout,
            return_embeds=True,
        )
        # Collapses node embeddings into one vector per graph.
        self.pool = global_mean_pool
        # Output layer.
        self.linear = torch.nn.Linear(hidden_dim, output_dim)

    def reset_parameters(self):
        """Re-initialise the GCN and the linear head.

        NOTE(review): ``node_encoder`` is not re-initialised here.
        """
        self.gnn_node.reset_parameters()
        self.linear.reset_parameters()

    def forward(self, batched_data):
        """Return one output row per graph in ``batched_data``.

        ``batched_data`` must expose ``x``, ``edge_index`` and ``batch``.
        """
        node_feats, connectivity, graph_ids = (
            batched_data.x,
            batched_data.edge_index,
            batched_data.batch,
        )

        # Atom features -> hidden_dim embeddings -> embeddings after graph convolution.
        node_repr = self.node_encoder(node_feats)
        node_repr = self.gnn_node(node_repr, connectivity)

        # Pool per graph, then apply the linear head.
        graph_repr = self.pool(node_repr, graph_ids)
        return self.linear(graph_repr)


In [None]:
def train(model, device, data_loader, optimizer, loss_func):
    """Run one training epoch over ``data_loader``.

    Args:
        model: Module called as ``model(batch)``; put into training mode here.
        device: Device every batch is moved to before use.
        data_loader: Iterable of batches exposing ``x``, ``y``, ``batch`` and ``to()``.
        optimizer: Optimizer stepped once per trained batch.
        loss_func: Callable ``loss_func(prediction, target)`` returning a scalar tensor.

    Returns:
        float: Loss of the last batch that was actually trained on, or ``0.0``
        when no batch was trained (empty loader or every batch skipped).
    """
    model.train()
    # Kept as None until a batch is trained, so the "nothing trained" case can
    # return a plain float instead of calling .item() on an int.
    last_loss = None

    for batch in tqdm(data_loader, desc="Iteration"):
        batch = batch.to(device)

        # Skip batches with a single node, or whose last node is assigned to
        # graph index 0 (i.e. only one graph in the batch).
        # NOTE(review): presumably to keep BatchNorm away from degenerate batches - confirm.
        if batch.x.shape[0] == 1 or batch.batch[-1] == 0:
            continue

        # NaN != NaN, so this mask is False exactly where a label is NaN.
        is_labeled = batch.y == batch.y

        optimizer.zero_grad()
        output = model(batch)

        last_loss = loss_func(output[is_labeled], batch.y[is_labeled].type(torch.float32))
        last_loss.backward()
        optimizer.step()

    return 0.0 if last_loss is None else last_loss.item()




In [None]:
def eval(model, device, loader, evaluator, save_model_results=False, save_file=None):
    """Run ``model`` over ``loader`` and score the predictions with ``evaluator``.

    Args:
        model: Module called as ``model(batch)``; put into evaluation mode here.
        device: Device every batch is moved to before use.
        loader: Iterable of batches exposing ``x``, ``y`` and ``to()``.
        evaluator: Object whose ``eval(input_dict)`` receives
            ``{"y_true": ..., "y_pred": ...}`` as NumPy arrays.
        save_model_results: If True, also write the predictions to
            ``'ogbg-molhiv_graph_' + save_file + '.csv'`` with the columns
            ``y_pred`` and ``y_true``.
        save_file: File-name suffix; required when ``save_model_results`` is True.

    Returns:
        Whatever ``evaluator.eval`` returns.

    Raises:
        TypeError: If ``save_model_results`` is True but ``save_file`` is None.
    """
    # Fail before the (potentially long) evaluation pass instead of after it,
    # where the string concatenation below would raise an obscure TypeError.
    if save_model_results and save_file is None:
        raise TypeError("save_file must be provided when save_model_results=True")

    model.eval()
    y_true = []
    y_pred = []

    for batch in tqdm(loader, desc="Iteration"):
        batch = batch.to(device)

        # Batches consisting of a single node are skipped.
        if batch.x.shape[0] == 1:
            continue

        with torch.no_grad():
            pred = model(batch)

        # Labels are reshaped to the prediction's shape so both lists line up.
        y_true.append(batch.y.view(pred.shape).detach().cpu())
        y_pred.append(pred.detach().cpu())

    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()

    input_dict = {"y_true": y_true, "y_pred": y_pred}

    if save_model_results:
        print ("Saving Model Predictions")

        # Two flat columns: y_pred | y_true
        df = pd.DataFrame(data={
            'y_pred': y_pred.reshape(-1),
            'y_true': y_true.reshape(-1),
        })
        # Save to csv
        df.to_csv('ogbg-molhiv_graph_' + save_file + '.csv', sep=',', index=False)

    return evaluator.eval(input_dict)

In [None]:
# Graph-level GCN sized from the shared hyper-parameters; one output per dataset task.
model = GCN_Graph(
    hidden_dim=hyper_parameters['hidden_dim'],
    output_dim=dataset.num_tasks,
    num_layers=hyper_parameters['num_layers'],
    dropout=hyper_parameters['dropout'],
)
model = model.to(device)

# Evaluator used to score the predictions for this dataset.
evaluator = Evaluator(name='ogbg-molhiv')



In [None]:
import copy

# Re-initialise the GCN and linear head before training.
model.reset_parameters()

optimizer = torch.optim.Adam(model.parameters(), lr=hyper_parameters['lr'])
loss_func = torch.nn.BCEWithLogitsLoss()

# Snapshot of the model with the best validation score seen so far.
best_model = None
best_valid_acc = 0

for epoch in range(1, hyper_parameters["epochs"] + 1):
    print('Training...')
    loss = train(model, device, train_loader, optimizer, loss_func)

    print('Evaluating...')
    train_result = eval(model, device, train_loader, evaluator)
    val_result = eval(model, device, valid_loader, evaluator)
    test_result = eval(model, device, test_loader, evaluator)

    # The *_acc names hold whichever metric dataset.eval_metric selects.
    train_acc = train_result[dataset.eval_metric]
    valid_acc = val_result[dataset.eval_metric]
    test_acc = test_result[dataset.eval_metric]

    # Deep-copy so later epochs do not overwrite the best weights.
    if valid_acc > best_valid_acc:
        best_valid_acc, best_model = valid_acc, copy.deepcopy(model)

    print(f'Epoch: {epoch:02d}, '
          f'Loss: {loss:.4f}, '
          f'Train: {100 * train_acc:.2f}%, '
          f'Valid: {100 * valid_acc:.2f}% '
          f'Test: {100 * test_acc:.2f}%')
  