# SubGNN & Contrastive Learning Model

This notebook contains the Subgraph Neural Network model, which uses contrastive learning to learn graph embeddings for downstream graph prediction. The subgraph neural network model is inspired by the DropGNN model and has been modified for easier use via config files. The contrastive learning framework is inspired by the SimCLR contrastive learning framework.

DropGNN: 
- https://arxiv.org/pdf/2111.06283.pdf 
- https://github.com/KarolisMart/DropGNN 

SimCLR: 
- https://arxiv.org/pdf/2002.05709.pdf

In [59]:
import os.path as osp
import numpy as np
import networkx as nx
import time
import random
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
from sklearn.model_selection import StratifiedKFold


# Torch Geometric 
try: 
    from torch_geometric.data import DataLoader, Data
    from torch_geometric.data.dataloader import Collater
    from torch_geometric.datasets import TUDataset
    from torch_geometric.utils import degree
    from torch_geometric.utils.convert import from_networkx
    from torch_geometric.nn import GINConv, GINEConv, global_add_pool
except ModuleNotFoundError: 
    !pip install torch_geometric
    from torch_geometric.data import DataLoader, Data
    from torch_geometric.data.dataloader import Collater
    from torch_geometric.datasets import TUDataset
    from torch_geometric.utils import degree
    from torch_geometric.utils.convert import from_networkx
    from torch_geometric.nn import GINConv, GINEConv, global_add_pool
    
# Pytorch Metric Learning
try: 
    from pytorch_metric_learning import losses
except ModuleNotFoundError:
    !pip install pytorch-metric-learning
    from pytorch_metric_learning import losses

## Import Dataset

Download the IMDB-BINARY dataset

In [60]:
class MyFilter(object):
    """Dataset pre-filter that keeps only graphs with a bounded node count.

    Args:
        max_nodes: Largest number of nodes a graph may have and still be kept.
            Defaults to 70, the limit this notebook has always used.
    """

    def __init__(self, max_nodes=70):
        self.max_nodes = max_nodes

    def __call__(self, data):
        """Return True when ``data.num_nodes`` does not exceed ``max_nodes``."""
        return data.num_nodes <= self.max_nodes

class MyPreTransform(object):
    """Pre-transform that replaces node features with one-hot encoded degrees.

    Args:
        num_classes: Width of the one-hot encoding. Defaults to 69, the value
            this notebook has always used. Every node degree must be strictly
            smaller than this value, otherwise ``F.one_hot`` raises.
    """

    def __init__(self, num_classes=69):
        self.num_classes = num_classes

    def __call__(self, data):
        """Set ``data.x`` to the float one-hot encoding of each node's degree and return ``data``."""
        # Degree is counted over the source row of edge_index
        node_degree = degree(data.edge_index[0], data.num_nodes, dtype=torch.long)
        data.x = F.one_hot(node_degree, num_classes=self.num_classes).to(torch.float)
        return data

In [61]:
# Fetch IMDB-BINARY (or reuse the cached copy) under <parent of the working dir>/data/IMDB-BINARY
path = osp.join(
    osp.dirname(osp.realpath("./")),
    'data',
    'IMDB-BINARY',
)

# Graphs are filtered by MyFilter and featurised by MyPreTransform during processing
dataset = TUDataset(path, name="IMDB-BINARY", pre_transform=MyPreTransform(), pre_filter=MyFilter())

In [62]:
# Print the dataset's summary representation as a quick sanity check
print(dataset)

IMDB-BINARY(996)


Download the IMDB-MULTI dataset

In [63]:
# Fetch IMDB-MULTI (or reuse the cached copy) under <parent of the working dir>/data/IMDB-MULTI
path = osp.join(
    osp.dirname(osp.realpath("./")),
    'data',
    'IMDB-MULTI',
)

# Graphs are filtered by MyFilter and featurised by MyPreTransform during processing
dataset2 = TUDataset(path, name="IMDB-MULTI", pre_transform=MyPreTransform(), pre_filter=MyFilter())

In [64]:
# Print the dataset's summary representation as a quick sanity check
print(dataset2)

IMDB-MULTI(1498)


## SubGNN & Contrastive Learning Model

SubGNN Model with Contrastive Learning Techniques using DropGNN subGNN model and SimCLR contrastive learning framework. 

In [65]:
class SubGNN_Contrastive(nn.Module):
    """GIN-based subgraph network with a contrastive head and a prediction head.

    Every forward pass builds ``num_runs`` copies of the batch, zeroes the
    features of randomly chosen nodes in each copy, and runs all copies through
    a shared stack of ``GINConv -> BatchNorm1d -> ReLU`` layers. The copies are
    averaged per node. The input features and each layer output are then
    sum-pooled per graph, projected by their own linear layer, and the
    projections are added together.

    Args:
        num_features: Size of the input node features.
        num_reps: Size of the contrastive representation vector.
        num_classes: Number of target classes for the prediction head.
        hidden_units: Width of the hidden GIN layers.
        device: Stored on the instance only; the model is not moved here.
            Optional so that ``SubGNN_Contrastive(...).to(device)`` works
            without repeating the device.
        use_aux_loss: Stored on the instance only; not read by this class.
    """

    def __init__(self, num_features, num_reps, num_classes, hidden_units, device=None, use_aux_loss=True):
        super().__init__()

        self.num_features = num_features   # Number of initial features
        self.num_reps = num_reps           # Number of features in representation vector
        self.num_classes = num_classes     # Number of different classes
        self.dim = hidden_units            # Number of units for hidden layers
        self.use_aux_loss = use_aux_loss
        self.device = device

        # Number of message-passing layers in the shared backbone
        self.num_layers = 4

        self.convs = nn.ModuleList()   # num_layers GINConv blocks
        self.bns = nn.ModuleList()     # num_layers BatchNorm1d layers
        self.reps = nn.ModuleList()    # num_layers + 1 projections to num_reps (contrastive head)
        self.fcs = nn.ModuleList()     # num_layers + 1 projections to num_classes (prediction head)

        # Creation order is kept stable so seeded initialisation is reproducible.
        # First layer maps num_features -> dim; the heads also get a projection
        # for the raw input features.
        self.convs.append(self._make_conv(self.num_features))
        self.bns.append(nn.BatchNorm1d(self.dim))
        self.reps.append(nn.Linear(self.num_features, self.num_reps))
        self.reps.append(nn.Linear(self.dim, self.num_reps))
        self.fcs.append(nn.Linear(self.num_features, self.num_classes))
        self.fcs.append(nn.Linear(self.dim, self.num_classes))

        # Remaining layers map dim -> dim
        for _ in range(self.num_layers - 1):
            self.convs.append(self._make_conv(self.dim))
            self.bns.append(nn.BatchNorm1d(self.dim))
            self.reps.append(nn.Linear(self.dim, self.num_reps))
            self.fcs.append(nn.Linear(self.dim, self.num_classes))

    def _make_conv(self, in_dim):
        """Build one GINConv whose MLP is Linear -> BatchNorm1d -> ReLU -> Linear."""
        return GINConv(nn.Sequential(
            nn.Linear(in_dim, self.dim),
            nn.BatchNorm1d(self.dim),
            nn.ReLU(),
            nn.Linear(self.dim, self.dim),
        ))

    def reset_parameters(self):
        """Re-initialise every Linear, GINConv and BatchNorm1d sub-module."""
        for m in self.modules():
            if isinstance(m, (nn.Linear, GINConv, nn.BatchNorm1d)):
                m.reset_parameters()

    def forward(self, data, mode="test", p=None, dropout=None, num_runs=20):
        """Dispatch to the contrastive head or the prediction head.

        Args:
            data: Batched graph object exposing ``x``, ``edge_index`` and ``batch``.
            mode: ``'contrastive'`` returns graph embeddings; any other value
                returns class log-probabilities.
            p: Probability of zeroing a node's features in each run
                (``None`` is treated as 0.0).
            dropout: Dropout probability on the prediction head outputs
                (``None`` is treated as 0.0). Ignored in contrastive mode.
            num_runs: Number of perturbed copies of the batch.
        """
        if mode == 'contrastive':
            return self.contrastive(data, p, num_runs)
        return self.prediction(data, p, dropout, num_runs)

    def contrastive(self, data, p, num_runs):
        """Return one ``num_reps``-dimensional embedding per graph in the batch.

        In training mode the backbone (convs and batch norms) is marked
        trainable again, undoing the freeze applied by ``prediction``.
        """
        # Note (from the original notebook): DropGNN sets num_runs to the average
        # graph size; here p is chosen to create the augmented views.
        self.p = 0.0 if p is None else p
        self.num_runs = num_runs

        views = self._drop_nodes(data.x, self.p, self.num_runs)
        if self.training:
            self._set_backbone_trainable(True)

        outs = self._encode(views, data.edge_index)
        return self._readout(outs, data.batch, self.reps)

    def prediction(self, data, p, dropout, num_runs):
        """Return class log-probabilities for every graph in the batch.

        The backbone parameters get ``requires_grad = False`` so that only the
        prediction projections receive gradients.
        """
        self.p = 0.0 if p is None else p
        self.dropout = 0.0 if dropout is None else dropout
        self.num_runs = num_runs

        views = self._drop_nodes(data.x, self.p, self.num_runs)
        self._set_backbone_trainable(False)

        outs = self._encode(views, data.edge_index)
        out = self._readout(outs, data.batch, self.fcs, dropout=self.dropout)
        return F.log_softmax(out, dim=-1)

    @staticmethod
    def _drop_nodes(x, p, num_runs):
        """Stack ``num_runs`` copies of ``x`` and zero each node row with probability ``p``."""
        views = x.unsqueeze(0).expand(num_runs, -1, -1).clone()
        drop = torch.bernoulli(torch.ones([views.size(0), views.size(1)], device=views.device) * p).bool()
        views[drop] = 0
        return views

    @staticmethod
    def _replicate_edge_index(edge_index, num_nodes, num_runs):
        """Tile ``edge_index`` ``num_runs`` times, shifting copy ``r`` by ``r * num_nodes``.

        The shift must be the node count, not ``edge_index.max() + 1``: the
        flattened features place node ``j`` of run ``r`` at row
        ``r * num_nodes + j``, and the two only coincide when the
        highest-indexed node has an edge. Using the node count also keeps
        edge-less batches working.
        """
        offsets = torch.arange(num_runs, device=edge_index.device).repeat_interleave(edge_index.size(1)) * num_nodes
        return edge_index.repeat(1, num_runs) + offsets

    def _set_backbone_trainable(self, trainable):
        """Set ``requires_grad`` on every parameter of the conv and batch-norm layers."""
        for layer in list(self.convs) + list(self.bns):
            for param in layer.parameters():
                param.requires_grad = trainable

    def _encode(self, views, edge_index):
        """Run all perturbed copies through the backbone.

        Returns a list of ``num_layers + 1`` tensors, each shaped
        ``(num_runs, num_nodes, features)``: the perturbed input followed by
        the output of every layer.
        """
        num_runs, num_nodes = views.size(0), views.size(1)
        outs = [views]
        h = views.view(-1, views.size(-1))  # Concatenate all copies along the node axis
        run_edge_index = self._replicate_edge_index(edge_index, num_nodes, num_runs)
        for conv, bn in zip(self.convs, self.bns):
            h = F.relu(bn(conv(h, run_edge_index)))
            outs.append(h.view(num_runs, -1, h.size(-1)))  # Back to stacked form
        return outs

    def _readout(self, outs, batch, heads, dropout=None):
        """Average over runs, sum-pool per graph, project with ``heads`` and add up.

        When ``dropout`` is not None it is applied to each projected tensor.
        """
        total = None
        for head, h in zip(heads, outs):
            h = head(global_add_pool(h.mean(dim=0), batch))
            if dropout is not None:
                h = F.dropout(h, p=dropout, training=self.training)
            total = h if total is None else total + h
        return total

## Training Modules

Training, validation, and testing functions for the contrastive learning stage and for downstream graph prediction with the SubGNN & Contrastive Learning model.

### Contrastive Learning Training Modules

In [66]:
def train_contrastive(model, loader, optimizer, loss_fn, p1=0.1, p2=0.2, device=None):
    """Run one contrastive training epoch and return the graph-weighted mean loss.

    For every batch two embeddings are produced with node-drop probabilities
    ``p1`` and ``p2``; ``loss_fn`` is called on the pair and the optimizer
    takes one step.
    """
    model.train()

    per_batch = []  # (number of graphs, batch loss) for each processed batch
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        view_a = model(batch, mode="contrastive", p=p1)
        view_b = model(batch, mode="contrastive", p=p2)
        batch_loss = loss_fn(view_a, view_b)

        batch_loss.backward()
        optimizer.step()
        per_batch.append((batch.num_graphs, batch_loss.item()))

    weighted_total = sum(count * value for count, value in per_batch)
    graphs_seen = sum(count for count, _ in per_batch)
    return weighted_total / graphs_seen

In [67]:
def valid_contrastive(model, loader, loss_fn, p1=0.1, p2=0.2, device=None):
    """Evaluate the contrastive loss without gradients; return the graph-weighted mean."""
    model.eval()

    per_batch = []  # (number of graphs, batch loss) for each processed batch
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            view_a = model(batch, mode="contrastive", p=p1)
            view_b = model(batch, mode="contrastive", p=p2)
            per_batch.append((batch.num_graphs, loss_fn(view_a, view_b).item()))

    weighted_total = sum(count * value for count, value in per_batch)
    graphs_seen = sum(count for count, _ in per_batch)
    return weighted_total / graphs_seen

### Prediction Training Modules

In [68]:
def train_prediction(model, loader, optimizer, p=0.1, dropout=0.5, device=None):
    """Run one prediction training epoch (after contrastive learning).

    Returns the sum of ``num_graphs * batch NLL loss`` divided by the total
    number of labels seen.
    """
    model.train()

    weighted_losses = []
    label_counts = []
    for batch in loader:
        batch = batch.to(device)
        optimizer.zero_grad()

        log_probs = model(batch, mode="prediction", p=p, dropout=dropout)
        batch_loss = F.nll_loss(log_probs, batch.y)
        batch_loss.backward()
        optimizer.step()

        weighted_losses.append(batch.num_graphs * batch_loss.item())
        label_counts.append(len(batch.y))

    return sum(weighted_losses) / sum(label_counts)

In [69]:
def valid_prediction(model, loader, p=0.1, dropout=0.5, device=None):
    """Evaluate the prediction NLL loss without gradients (after contrastive learning).

    Returns the sum of ``num_graphs * batch NLL loss`` divided by the total
    number of labels seen.
    """
    model.eval()

    weighted_losses = []
    label_counts = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(device)
            log_probs = model(batch, mode="prediction", p=p, dropout=dropout)
            batch_loss = F.nll_loss(log_probs, batch.y)

            weighted_losses.append(batch.num_graphs * batch_loss.item())
            label_counts.append(len(batch.y))

    return sum(weighted_losses) / sum(label_counts)

In [70]:
# Used to test prediction model AFTER contrastive learning, returns accuracy
def test_prediction(model, loader, p=0.1, dropout=0.5, device=None):
    # Set model to eval
    model.eval() 
    
    # Run data through model and make predictions
    with torch.no_grad():
        correct = 0
        for data in loader: 
            data = data.to(device)
            results = model(data, mode = "prediction", p = p, dropout = dropout)
            pred = results.max(1)[1]
            correct += pred.eq(data.y).sum().item()
    return correct / len(loader.dataset)

## Split Dataset into K-Folds

In [71]:
def separate_data(dataset_len, seed=0, n_splits=10, labels=None):
    """Split ``range(dataset_len)`` into shuffled K-fold train/test index pairs.

    Args:
        dataset_len: Number of samples to split.
        seed: Random state passed to ``StratifiedKFold``.
        n_splits: Number of folds.
        labels: Optional per-sample class labels. When given, the folds are
            stratified on them. When omitted, every sample gets the same dummy
            label, so the result is a plain shuffled K-fold (the original
            behaviour).

    Returns:
        A list of ``(train_indices, test_indices)`` tuples, one per fold.

    Raises:
        ValueError: If ``labels`` is given but its length differs from
            ``dataset_len``.
    """
    if labels is None:
        targets = np.zeros(dataset_len)
    else:
        targets = np.asarray(labels).reshape(-1)
        if len(targets) != dataset_len:
            raise ValueError(
                f"labels has {len(targets)} entries but dataset_len is {dataset_len}"
            )

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    # The first argument is only used for its length, so zeros are sufficient
    return list(folds.split(np.zeros(dataset_len), targets))

## Training and Evaluation Loop

Automates the training process by running contrastive learning followed by graph-prediction training, then making final predictions with the trained model. Also automates the evaluation of a model by testing its performance on k-fold train-test splits.

In [73]:
def training_loop(model, dataset, train_idx, test_idx, batch_size, epochs, p1, p2, p, dropout, device, lr=0.001, seed=0, m=10, filename=None):
    """Pre-train ``model`` contrastively, then train its prediction head, on one split.

    Args:
        model: Model to train in place.
        dataset: Indexable dataset; ``train_idx`` / ``test_idx`` select from it.
        train_idx: Index array of training samples (must support ``.tolist()``).
        test_idx: Index array of held-out samples (must support ``.tolist()``).
        batch_size: Mini-batch size for both loaders.
        epochs: Number of epochs for each of the two phases.
        p1: Node-drop probability for the first contrastive view.
        p2: Node-drop probability for the second contrastive view.
        p: Node-drop probability for prediction training, validation and the
            final test.
        dropout: Dropout probability for the prediction phase.
        device: Device every batch is moved to.
        lr: Initial learning rate of the Adam optimizer in each phase.
        seed: Seed for the torch and NumPy generators.
        m: Progress is logged every ``m`` epochs.
        filename: Optional path; every logged line is also appended to it.

    Returns:
        ``(contrastive_losses, prediction_losses, test_acc)`` where the two
        lists hold the per-epoch loss on the held-out split and ``test_acc`` is
        the final accuracy on that split.
    """
    def log(message):
        # Echo to stdout and, when requested, append the same line to the log file
        print(message, flush=True)
        if filename is not None:
            with open(filename, "a") as f:
                print(message, flush=True, file=f)

    def run_phase(train_step, valid_step):
        # Each phase starts from a fresh optimizer and learning-rate schedule
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

        held_out_losses = []
        for epoch in range(epochs):
            start = time.time()
            current_lr = optimizer.param_groups[0]['lr']  # Read before the scheduler steps
            train_loss = train_step(optimizer)
            scheduler.step()
            held_out_loss = valid_step()
            held_out_losses.append(held_out_loss)

            if epoch % m == 0:
                log('Epoch: {:03d}, LR: {:7f}, Train Loss: {:.7f}, '
                    'Val Loss: {:.7f}, Time: {:7f}'.format(
                        epoch, current_lr, train_loss, held_out_loss, time.time() - start))
        return held_out_losses

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Create training and testing datasets
    train_dataset = dataset[train_idx.tolist()]
    test_dataset = dataset[test_idx.tolist()]

    # Sample with replacement so that one epoch is 50 mini-batches, whatever the split size
    train_sampler = torch.utils.data.RandomSampler(train_dataset, replacement=True, num_samples=50 * batch_size)
    train_loader = torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size, drop_last=False, collate_fn=Collater(follow_batch=[], exclude_keys=[]))
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    # CONTRASTIVE LEARNING: train the model on the contrastive representation
    loss_func = losses.SelfSupervisedLoss(losses.NTXentLoss())
    log("STARTING CONTRASTIVE LEARNING")
    contrastive_losses = run_phase(
        lambda optimizer: train_contrastive(model, train_loader, optimizer, loss_func, p1=p1, p2=p2, device=device),
        lambda: valid_contrastive(model, test_loader, loss_func, p1=p1, p2=p2, device=device),
    )

    # PREDICTION: train the prediction head on top of the contrastive backbone
    log("\nSTARTING PREDICTION LEARNING")
    prediction_losses = run_phase(
        lambda optimizer: train_prediction(model, train_loader, optimizer, p=p, dropout=dropout, device=device),
        lambda: valid_prediction(model, test_loader, p=p, dropout=dropout, device=device),
    )

    # Test the final model with the same node-drop probability used for training/validation
    test_acc = test_prediction(model, test_loader, p=p, dropout=dropout, device=device)
    log(f"\nFinal Prediction Accuracy: {test_acc}\n")

    return contrastive_losses, prediction_losses, test_acc

In [74]:
def evaluation_loop(model, dataset, splits, batch_size, epochs, p1, p2, p, dropout, device, lr=0.001, seed=0, m=10, filename=None):
    """Evaluate ``model`` by re-training it from scratch on every split.

    The model's parameters are reset before each split, so this measures the
    architecture/configuration rather than producing a model to keep.

    Args:
        model: Model exposing ``reset_parameters()``; trained in place.
        dataset: Dataset handed to ``training_loop``.
        splits: Iterable of ``(train_idx, test_idx)`` pairs.
        batch_size, epochs, p1, p2, p, dropout, device, lr, seed, m, filename:
            Forwarded unchanged to ``training_loop``; ``filename`` additionally
            receives the per-split headers and the final average.

    Returns:
        A 3-tuple ``(contrastive, prediction, test_accuracies)``. The first two
        are ``(losses, mean_over_splits, best_epoch)`` where ``losses`` is a
        ``(num_splits, epochs)`` tensor and ``best_epoch`` is the argmin of the
        mean. ``test_accuracies`` is a tensor with one accuracy per split.
    """
    def report(message):
        # Echo to stdout and, when requested, append the same line to the log file
        print(message)
        if filename is not None:
            with open(filename, "a") as f:
                print(message, file=f)

    contrastive_loss = []
    prediction_loss = []
    test_accuracies = []

    # Train a freshly initialised model on every fold
    for i, (train_idx, test_idx) in enumerate(splits):
        report(f"Running Split {i}")

        model.reset_parameters()
        c_loss, p_loss, t_acc = training_loop(model, dataset, train_idx, test_idx, batch_size, epochs, p1, p2, p, dropout, device, lr, seed, m, filename)
        contrastive_loss.append(torch.tensor(c_loss))
        prediction_loss.append(torch.tensor(p_loss))
        test_accuracies.append(t_acc)

    # Average the contrastive loss over splits and locate its best epoch
    contrastive_loss = torch.stack(contrastive_loss, dim=0)
    contrastive_loss_mean = contrastive_loss.mean(dim=0)
    best_contrastive_epoch = contrastive_loss_mean.argmin()

    # Average the prediction loss over splits and locate its best epoch
    prediction_loss = torch.stack(prediction_loss, dim=0)
    prediction_loss_mean = prediction_loss.mean(dim=0)
    best_prediction_epoch = prediction_loss_mean.argmin()

    # Report the mean final accuracy over all splits
    test_accuracies = torch.tensor(test_accuracies)
    report(f"Average Test Accuracy: {test_accuracies.mean()}\n")

    return (contrastive_loss, contrastive_loss_mean, best_contrastive_epoch), (prediction_loss, prediction_loss_mean, best_prediction_epoch), test_accuracies

## Run DropGNN + Contrastive Learning Model

Examples of how to train and use the SubGNN & Contrastive Learning model. 

In [81]:
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


### Run Experiments 

Runs different dropout probabilities and trains the model on the dataset. All results are both printed and saved to a text file. 

#### Test IMDB-Binary

In [82]:
# Hyperparameter grid swept by the IMDB-BINARY experiment cell below
num_reps = [32]  # contrastive representation sizes
p1 = [0.1, 0.4, 0.7]  # node-drop probabilities for the first contrastive view
p2 = [0.25, 0.55, 0.85]  # node-drop probabilities for the second contrastive view
p = [0.0, 0.1, 0.5]  # node-drop probabilities for the prediction phase

In [None]:
import itertools
import os

# open(..., "a") creates missing files but not missing directories
os.makedirs("results", exist_ok=True)

# One experiment per combination, numbered from 1 in the same order as the
# original nested loops (num_reps outermost, p innermost)
for i, (n, p_1, p_2, p_i) in enumerate(itertools.product(num_reps, p1, p2, p), start=1):
    # Set random seeds so every configuration starts from the same state
    torch.manual_seed(0)
    np.random.seed(0)

    filename = f"results/test_{i}.txt"

    # Record the configuration at the top of this run's log
    with open(filename, "a") as f:
        print(f"num_reps: {n}\np1: {p_1}\np2: {p_2}\np: {p_i}\n", file=f)

    model_cfg = {
        "num_features": dataset.num_features,
        "num_classes": dataset.num_classes,
        "num_reps": n,
        "hidden_units": 64,
        "device": device,  # constructor argument that was previously omitted
    }

    train_cfg = {
        "model": SubGNN_Contrastive(**model_cfg).to(device),
        "dataset": dataset,
        "splits": separate_data(len(dataset), seed=0, n_splits=3),
        "batch_size": 32,
        "epochs": 101,
        "p1": p_1,
        "p2": p_2,
        "p": p_i,
        "dropout": 0.1,
        "device": device,
        "lr": 0.001,
        "seed": 0,
        "m": 10,
        "filename": filename,
    }

    eval_results = evaluation_loop(**train_cfg)

    # Persist the raw metrics tuple next to the training log
    result_filename = f"results/test_{i}_metrics.txt"
    with open(result_filename, "a") as f:
        print(eval_results, file=f)

#### Test IMDB-Multi

In [84]:
# Hyperparameter grid swept by the IMDB-MULTI experiment cell below
num_reps = [32]  # contrastive representation sizes
p1 = [0.1, 0.4]  # node-drop probabilities for the first contrastive view
p2 = [0.25, 0.55]  # node-drop probabilities for the second contrastive view
p = [0.0, 0.5]  # node-drop probabilities for the prediction phase

In [None]:
import itertools
import os

# open(..., "a") creates missing files but not missing directories
os.makedirs("results2", exist_ok=True)

# One experiment per combination, numbered from 1 in the same order as the
# original nested loops (num_reps outermost, p innermost)
for i, (n, p_1, p_2, p_i) in enumerate(itertools.product(num_reps, p1, p2, p), start=1):
    # Set random seeds so every configuration starts from the same state
    torch.manual_seed(0)
    np.random.seed(0)

    filename = f"results2/test_{i}.txt"

    # Record the configuration at the top of this run's log
    with open(filename, "a") as f:
        print(f"num_reps: {n}\np1: {p_1}\np2: {p_2}\np: {p_i}\n", file=f)

    model_cfg = {
        "num_features": dataset2.num_features,
        "num_classes": dataset2.num_classes,
        "num_reps": n,
        "hidden_units": 64,
        "device": device,  # constructor argument that was previously omitted
    }

    train_cfg = {
        "model": SubGNN_Contrastive(**model_cfg).to(device),
        "dataset": dataset2,
        "splits": separate_data(len(dataset2), seed=0, n_splits=3),
        "batch_size": 32,
        "epochs": 101,
        "p1": p_1,
        "p2": p_2,
        "p": p_i,
        "dropout": 0.1,
        "device": device,
        "lr": 0.001,
        "seed": 0,
        "m": 10,
        "filename": filename,
    }

    eval_results = evaluation_loop(**train_cfg)

    # Persist the raw metrics tuple next to the training log
    result_filename = f"results2/test_{i}_metrics.txt"
    with open(result_filename, "a") as f:
        print(eval_results, file=f)

### Run Evaluation Loop w/ Configs

Demonstration of how to run the evaluation loop to evaluate the model using configurations. This can be done with a dictionary or with a JSON file that is read in as a dictionary.

In [None]:
# Configurations
# Keyword arguments for the SubGNN_Contrastive constructor
model_cfg = {
    "num_features": dataset.num_features,
    "num_classes": dataset.num_classes,
    "num_reps": 32,
    "hidden_units": 64,
    "device": device,  # constructor argument that was previously omitted
}

# Keyword arguments for evaluation_loop
train_cfg = {
    "model": SubGNN_Contrastive(**model_cfg).to(device),
    "dataset": dataset,
    "splits": separate_data(len(dataset), seed=0, n_splits=5),
    "batch_size": 32,
    "epochs": 101,
    "p1": 0.1,
    "p2": 0.2,
    "p": 0.0,
    "dropout": 0.1,
    "device": device,
    "lr": 0.001,
    "seed": 0,
    "m": 10,
    "filename": "test_1_results.txt",
}

In [None]:
# Run the k-fold evaluation with the training configuration dictionary unpacked as keyword arguments
eval_results = evaluation_loop(**train_cfg)

### Manually set each parameter

#### Train Model on IMDB-Binary

In [None]:
"""
SET UP PARAMETERS FOR TRAINING
"""

# Reproducibility: seed the NumPy and PyTorch generators
np.random.seed(0)
torch.manual_seed(0)

# Batch size (default batch size in DropGNN), number of epochs,
# and size of the contrastive representation
BATCH, NUM_EPOCHS, NUM_REPS = 32, 100, 32

# Node dropout probabilities for the two contrastive views
p1, p2 = 0.1, 0.2

# Embedding dropout probability
dropout = 0.5

# Use the GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Device: {device}")

In [None]:
"""
CREATE MODEL AND CREATE DATA SPLITS
"""

# Create model (device is passed explicitly because the constructor expects it)
model = SubGNN_Contrastive(num_features=dataset.num_features, num_reps=NUM_REPS, num_classes=dataset.num_classes, hidden_units=64, device=device).to(device)

# Split dataset into (train_idx, test_idx) folds
n = len(dataset)
splits = separate_data(n, seed=2)

In [None]:
# Run one test of training a single split.
# splits[0] is (train_idx, test_idx): evaluate on the held-out indices, not on the training ones.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
con_loss, pred_loss, test_acc = training_loop(model, dataset, train_idx=splits[0][0], test_idx=splits[0][1], batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.001, seed=0, m=10)

In [None]:
# Evaluate model performance over several different splits.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
eval_results = evaluation_loop(model, dataset, splits[:2], batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.001, seed=0, m=10)

#### Train Model on IMDB-Multi

#### Setup 1

In [None]:
"""
SET UP PARAMETERS FOR TRAINING
"""

# Reproducibility: seed the NumPy and PyTorch generators
np.random.seed(0)
torch.manual_seed(0)

# Batch size, number of epochs, and size of the contrastive representation
BATCH, NUM_EPOCHS, NUM_REPS = 32, 100, 16

# Node dropout probabilities for the two contrastive views
p1, p2 = 0.1, 0.2

# Embedding dropout probability
dropout = 0.1

# Use the GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Device: {device}")

In [None]:
"""
CREATE MODEL AND CREATE DATA SPLITS
"""

# Create model (device is passed explicitly because the constructor expects it)
model2 = SubGNN_Contrastive(num_features=dataset2.num_features, num_reps=NUM_REPS, num_classes=dataset2.num_classes, hidden_units=64, device=device).to(device)

# Split dataset into (train_idx, test_idx) folds
n = len(dataset2)
splits = separate_data(n, seed=2)

In [None]:
# Run one test of training a single split.
# Uses model2, the model created for this setup (model3 is only defined in Setup 2).
# splits[0] is (train_idx, test_idx): evaluate on the held-out indices, not on the training ones.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
con_loss2, pred_loss2, test_acc2 = training_loop(model2, dataset2, train_idx=splits[0][0], test_idx=splits[0][1], batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.0001, seed=0, m=10)

In [None]:
# Evaluate model performance over several different splits.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
eval_results2 = evaluation_loop(model2, dataset2, splits, batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.001, seed=0, m=10)

#### Setup 2

In [None]:
"""
SET UP PARAMETERS FOR TRAINING
"""

# Reproducibility: seed the NumPy and PyTorch generators
np.random.seed(0)
torch.manual_seed(0)

# Batch size, number of epochs, and size of the contrastive representation
BATCH, NUM_EPOCHS, NUM_REPS = 32, 100, 64

# Node dropout probabilities for the two contrastive views
p1, p2 = 0.1, 0.2

# Embedding dropout probability
dropout = 0.5

# Use the GPU when PyTorch can see one, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f"Device: {device}")

In [None]:
"""
CREATE MODEL AND CREATE DATA SPLITS
"""

# Create model (device is passed explicitly because the constructor expects it)
model3 = SubGNN_Contrastive(num_features=dataset2.num_features, num_reps=NUM_REPS, num_classes=dataset2.num_classes, hidden_units=64, device=device).to(device)

# Split dataset into (train_idx, test_idx) folds
n = len(dataset2)
splits = separate_data(n, seed=2)

In [None]:
# Run one test of training a single split.
# splits[0] is (train_idx, test_idx): evaluate on the held-out indices, not on the training ones.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
con_loss3, pred_loss3, test_acc3 = training_loop(model3, dataset2, train_idx=splits[0][0], test_idx=splits[0][1], batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.001, seed=0, m=10)

In [None]:
# Evaluate model performance over several different splits.
# p is the prediction-phase node-drop probability; 0.1 is the default used by the prediction helpers.
eval_results3 = evaluation_loop(model3, dataset2, splits, batch_size=BATCH, epochs=NUM_EPOCHS, p1=p1, p2=p2, p=0.1, dropout=dropout, device=device, lr=0.001, seed=0, m=10)