In [1]:
# from torch_geometric.datasets import TUDataset
import torch_geometric

from torch_geometric.data import DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd 

from model import GNN, GNN_graphpred
from splitters import random_split




seed = 0
torch.manual_seed(seed)
np.random.seed(seed)
device = torch.device("cuda" if torch.cuda.is_available() else torch.device("cpu"))
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

In [2]:
torch_geometric.__version__

'1.0.3'

In [3]:
## Code copied from: https://github.com/pyg-team/pytorch_geometric/blob/90a06d02b79414fc48a3367c79cfb28f919d5769/torch_geometric/datasets/tu_dataset.py ## 

import os
import os.path as osp
import shutil

import torch
from torch_geometric.data import InMemoryDataset, download_url, extract_zip
from torch_geometric.read import read_tu_data


class TUDataset(InMemoryDataset):
    r"""A variety of graph kernel benchmark datasets, *.e.g.* "IMDB-BINARY",
    "REDDIT-BINARY" or "PROTEINS", collected from the `TU Dortmund University
    <http://graphkernels.cs.tu-dortmund.de>`_.
    Args:
        root (string): Root directory where the dataset should be saved.
        name (string): The `name <http://graphkernels.cs.tu-dortmund.de>`_ of
            the dataset.
        transform (callable, optional): A function/transform that takes in an
            :obj:`torch_geometric.data.Data` object and returns a transformed
            version. The data object will be transformed before every access.
            (default: :obj:`None`)
        pre_transform (callable, optional): A function/transform that takes in
            an :obj:`torch_geometric.data.Data` object and returns a
            transformed version. The data object will be transformed before
            being saved to disk. (default: :obj:`None`)
        pre_filter (callable, optional): A function that takes in an
            :obj:`torch_geometric.data.Data` object and returns a boolean
            value, indicating whether the data object should be included in the
            final dataset. (default: :obj:`None`)
        use_node_attr (bool, optional): If :obj:`True`, the dataset will
            contain additional continuous node features (if present).
            (default: :obj:`False`)
    """

    url = 'https://ls11-www.cs.tu-dortmund.de/people/morris/' \
          'graphkerneldatasets'

    def __init__(self,
                 root,
                 name,
                 transform=None,
                 pre_transform=None,
                 pre_filter=None,
                 use_node_attr=False):
        self.name = name
        super(TUDataset, self).__init__(root, transform, pre_transform,
                                        pre_filter)
        self.data, self.slices = torch.load(self.processed_paths[0])
        if self.data.x is not None and not use_node_attr:
            self.data.x = self.data.x[:, self.num_node_attributes:]

    @property
    def num_node_labels(self):
        if self.data.x is None:
            return 0

        for i in range(self.data.x.size(1)):
            if self.data.x[:, i:].sum().item() == self.data.x.size(0):
                return self.data.x.size(1) - i

        return 0

    @property
    def num_node_attributes(self):
        if self.data.x is None:
            return 0

        return self.data.x.size(1) - self.num_node_labels

    @property
    def raw_file_names(self):
        names = ['A', 'graph_indicator']
        return ['{}_{}.txt'.format(self.name, name) for name in names]

    @property
    def processed_file_names(self):
        return 'data.pt'

    def download(self):
        path = download_url('{}/{}.zip'.format(self.url, self.name), self.root)
        extract_zip(path, self.root)
        os.unlink(path)
        shutil.rmtree(self.raw_dir)
        os.rename(osp.join(self.root, self.name), self.raw_dir)

    def process(self):
        self.data, self.slices = read_tu_data(self.raw_dir, self.name)

        if self.pre_filter is not None:
            data_list = [self.get(idx) for idx in range(len(self))]
            data_list = [data for data in data_list if self.pre_filter(data)]
            self.data, self.slices = self.collate(data_list)

        if self.pre_transform is not None:
            data_list = [self.get(idx) for idx in range(len(self))]
            data_list = [self.pre_transform(data) for data in data_list]
            self.data, self.slices = self.collate(data_list)

        torch.save((self.data, self.slices), self.processed_paths[0])

    def __repr__(self):
        return '{}({})'.format(self.name, len(self))

In [12]:
# Code from https://github.com/thuml/LogME/blob/main/LogME.py
## NOTE: commented out njit because that package takes forever to load in jupyter notebook

import warnings

import numpy as np
# from numba import njit

# @njit
def each_evidence(y_, f, fh, v, s, vh, N, D):
    """
    compute the maximum evidence for each class
    """
    epsilon = 1e-5
    alpha = 1.0
    beta = 1.0
    lam = alpha / beta
    tmp = (vh @ (f @ np.ascontiguousarray(y_)))
    for _ in range(11):
        # should converge after at most 10 steps
        # typically converge after two or three steps
        gamma = (s / (s + lam)).sum()
        # A = v @ np.diag(alpha + beta * s) @ v.transpose() # no need to compute A
        # A_inv = v @ np.diag(1.0 / (alpha + beta * s)) @ v.transpose() # no need to compute A_inv
        m = v @ (tmp * beta / (alpha + beta * s))
        alpha_de = (m * m).sum()
        alpha = gamma / (alpha_de + epsilon)
        beta_de = ((y_ - fh @ m) ** 2).sum()
        beta = (N - gamma) / (beta_de + epsilon)
        new_lam = alpha / beta
        if np.abs(new_lam - lam) / lam < 0.01:
            break
        lam = new_lam
    evidence = D / 2.0 * np.log(alpha) \
               + N / 2.0 * np.log(beta) \
               - 0.5 * np.sum(np.log(alpha + beta * s)) \
               - beta / 2.0 * (beta_de + epsilon) \
               - alpha / 2.0 * (alpha_de + epsilon) \
               - N / 2.0 * np.log(2 * np.pi)
    return evidence / N, alpha, beta, m


# use pseudo data to compile the function
# D = 20, N = 50
f_tmp = np.random.randn(20, 50).astype(np.float64)
each_evidence(np.random.randint(0, 2, 50).astype(np.float64), f_tmp, f_tmp.transpose(), np.eye(20, dtype=np.float64), np.ones(20, dtype=np.float64), np.eye(20, dtype=np.float64), 50, 20)


# @njit
def truncated_svd(x):
    u, s, vh = np.linalg.svd(x.transpose() @ x)
    s = np.sqrt(s)
    u_times_sigma = x @ vh.transpose()
    k = np.sum((s > 1e-10) * 1)  # rank of f
    s = s.reshape(-1, 1)
    s = s[:k]
    vh = vh[:k]
    u = u_times_sigma[:, :k] / s.reshape(1, -1)
    return u, s, vh
truncated_svd(np.random.randn(20, 10).astype(np.float64))


class LogME(object):
    def __init__(self, regression=False):
        """
            :param regression: whether regression
        """
        self.regression = regression
        self.fitted = False
        self.reset()

    def reset(self):
        self.num_dim = 0
        self.alphas = []  # alpha for each class / dimension
        self.betas = []  # beta for each class / dimension
        # self.ms.shape --> [C, D]
        self.ms = []  # m for each class / dimension

    def _fit_icml(self, f: np.ndarray, y: np.ndarray):
        """
        LogME calculation proposed in the ICML 2021 paper
        "LogME: Practical Assessment of Pre-trained Models for Transfer Learning"
        at http://proceedings.mlr.press/v139/you21b.html
        """
        fh = f
        f = f.transpose()
        D, N = f.shape
        v, s, vh = np.linalg.svd(f @ fh, full_matrices=True)

        evidences = []
        self.num_dim = y.shape[1] if self.regression else int(y.max() + 1)
        for i in range(self.num_dim):
            y_ = y[:, i] if self.regression else (y == i).astype(np.float64)
            evidence, alpha, beta, m = each_evidence(y_, f, fh, v, s, vh, N, D)
            evidences.append(evidence)
            self.alphas.append(alpha)
            self.betas.append(beta)
            self.ms.append(m)
        self.ms = np.stack(self.ms)
        return np.mean(evidences)

    def _fit_fixed_point(self, f: np.ndarray, y: np.ndarray):
        """
        LogME calculation proposed in the arxiv 2021 paper
        "Ranking and Tuning Pre-trained Models: A New Paradigm of Exploiting Model Hubs"
        at https://arxiv.org/abs/2110.10545
        """
        N, D = f.shape  # k = min(N, D)
        if N > D: # direct SVD may be expensive
            u, s, vh = truncated_svd(f)
        else:
            u, s, vh = np.linalg.svd(f, full_matrices=False)
        # u.shape = N x k
        # s.shape = k
        # vh.shape = k x D
        s = s.reshape(-1, 1)
        sigma = (s ** 2)

        evidences = []
        self.num_dim = y.shape[1] if self.regression else int(y.max() + 1)
        for i in range(self.num_dim):
            y_ = y[:, i] if self.regression else (y == i).astype(np.float64)
            y_ = y_.reshape(-1, 1)
            x = u.T @ y_  # x has shape [k, 1], but actually x should have shape [N, 1]
            x2 = x ** 2
            res_x2 = (y_ ** 2).sum() - x2.sum()  # if k < N, we compute sum of xi for 0 singular values directly

            alpha, beta = 1.0, 1.0
            for _ in range(11):
                t = alpha / beta
                gamma = (sigma / (sigma + t)).sum()
                m2 = (sigma * x2 / ((t + sigma) ** 2)).sum()
                res2 = (x2 / ((1 + sigma / t) ** 2)).sum() + res_x2
                alpha = gamma / (m2 + 1e-5)
                beta = (N - gamma) / (res2 + 1e-5)
                t_ = alpha / beta
                evidence = D / 2.0 * np.log(alpha) \
                           + N / 2.0 * np.log(beta) \
                           - 0.5 * np.sum(np.log(alpha + beta * sigma)) \
                           - beta / 2.0 * res2 \
                           - alpha / 2.0 * m2 \
                           - N / 2.0 * np.log(2 * np.pi)
                evidence /= N
                if abs(t_ - t) / t <= 1e-3:  # abs(t_ - t) <= 1e-5 or abs(1 / t_ - 1 / t) <= 1e-5:
                    break
            evidence = D / 2.0 * np.log(alpha) \
                       + N / 2.0 * np.log(beta) \
                       - 0.5 * np.sum(np.log(alpha + beta * sigma)) \
                       - beta / 2.0 * res2 \
                       - alpha / 2.0 * m2 \
                       - N / 2.0 * np.log(2 * np.pi)
            evidence /= N
            m = 1.0 / (t + sigma) * s * x
            m = (vh.T @ m).reshape(-1)
            evidences.append(evidence)
            self.alphas.append(alpha)
            self.betas.append(beta)
            self.ms.append(m)
        self.ms = np.stack(self.ms)
        return np.mean(evidences)

    _fit = _fit_fixed_point

    def fit(self, f: np.ndarray, y: np.ndarray):
        """
        :param f: [N, F], feature matrix from pre-trained model
        :param y: target labels.
            For classification, y has shape [N] with element in [0, C_t).
            For regression, y has shape [N, C] with C regression-labels
        :return: LogME score (how well f can fit y directly)
        """
        if self.fitted:
            warnings.warn('re-fitting for new data. old parameters cleared.')
            self.reset()
        else:
            self.fitted = True
        f = f.astype(np.float64)
        if self.regression:
            y = y.astype(np.float64)
            if len(y.shape) == 1:
                y = y.reshape(-1, 1)
        return self._fit(f, y)

    def predict(self, f: np.ndarray):
        """
        :param f: [N, F], feature matrix
        :return: prediction, return shape [N, X]
        """
        if not self.fitted:
            raise RuntimeError("not fitted, please call fit first")
        f = f.astype(np.float64)
        logits = f @ self.ms.T
        if self.regression:
            return logits
        return np.argmax(logits, axis=-1)

# Functions

In [5]:
def get_graph_features_labels_imdb(loader, model, seed):
    """Extract graph features for IMDB graph data.
    Note: 
        Fake node features ([0, 0]) and fake edge attributes ([0, 0])
        are created for each node and edge so we can use the pre-trained 
        Strategies (Hu et al., ICLR 2020) GNNs.
        
    Args:
        loader : IMDB dataloader
        model : pre-trained GNN model
        seed : integer value random seed value
    
    Returns:
        all_graph_features : list of all graph features from the dataloader
        all_graph_labels : list of all graph labels from the dataloader 
    """
    
    torch.manual_seed(seed)
    np.random.seed(seed)
    
    all_graph_features = []
    all_graph_labels = []
    
    for step, batch in enumerate(loader):
#         print("step: {}".format(step))
#         print("batch: {}".format(batch))
#         print("batch.batch.shape: {}".format(batch.batch.shape))
        num_nodes = batch.batch.shape[0]
        num_edges = batch.edge_index.shape[1]
#         print("num_nodes: {}".format(num_nodes))
#         print("num_edges: {}".format(num_edges))
        batch = batch.to(device)
    
        # create fake node features [0, 0] for each node of the IMDB dataset
        x = torch.zeros(size=[num_nodes, 2], dtype=torch.long).to(device)

        # create fake edge attribute [0, 0] for each edge of the IMDB dataset
        edge_attr = torch.zeros(size=[num_edges, 2], dtype=torch.long).to(device)

        y = batch.y
        edge_index = batch.edge_index
        batch = batch.batch

        node_representation = model.gnn(x, edge_index, edge_attr)

        graph_features = model.pool(node_representation, batch)
#         print("graph_features: {}".format(graph_features))
#         print("graph_features.shape: {}".format(graph_features.shape)) # batch_size x outdim (300)
        all_graph_features.extend(graph_features.cpu().detach().numpy())
        all_graph_labels.extend(y.cpu().detach().numpy())
        
    return all_graph_features, all_graph_labels


def create_dataframe_save_to_csv(embeddings, labels, dataset_name, model_name, save_path):
#     ## create pandas dataframe to store: example id, embeddings, labels ## 
#     d = {'example_id': [i for i in range(len(embeddings))],
#             'embeddings': embeddings,
#             'labels': labels
#            }
#     df = pd.DataFrame(data=d)
#     # df.head(15)

#     if not os.path.exists(save_path):
#         os.makedirs(save_path)

    filename = '{}_{}.csv'.format(dataset_name, model_name)
#     print("dataset_name: {}".format(dataset_name))
#     df.to_csv(os.path.join(save_path, filename), index=False)
    emb_df = pd.DataFrame(np.array(embeddings))
    emb_df.columns = ['emb' + str(e+1) for e in range(emb_df.shape[1])]
    emb_df['label'] = labels
#     print("emb_df: \n{}\n".format(emb_df))
    emb_df.to_csv(os.path.join(save_path, filename), sep='\t', index=False)

# Default values

In [6]:
## GNN default values ## 
num_layer = 5 # default
emb_dim = 300 # default
JK = 'last' # default (how the node features across laysers are combined)
dropout_ratio = 0.5 # default
graph_pooling = 'mean' # default
gnn_type = 'gin' # default

## DataLoader default values ## 
batch_size = 32 # strategies default
num_workers = 4 # strategies default
train_shuffle = False 
num_tasks = 1


## others ## 
save_results_to = '/mnt/sdc/course-projects/GRL-course-project/results'

# Set up model

In [13]:
## Set up model ## 
model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK, dropout_ratio, graph_pooling, gnn_type)

##########################
input_model_file = './model_gin/supervised.pth'

gin_supervised_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_supervised_model.from_pretrained(input_model_file)

gin_supervised_model.to(device)
gin_supervised_model.eval()

###########################

input_model_file = './model_gin/supervised_infomax.pth'

gin_supervised_infomax_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_supervised_infomax_model.from_pretrained(input_model_file)

gin_supervised_infomax_model.to(device)
gin_supervised_infomax_model.eval()

###########################

input_model_file = './model_gin/supervised_edgepred.pth'

gin_supervised_edgepred_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_supervised_edgepred_model.from_pretrained(input_model_file)

gin_supervised_edgepred_model.to(device)
gin_supervised_edgepred_model.eval()

###########################

input_model_file = './model_gin/supervised_masking.pth'

gin_supervised_masking_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_supervised_masking_model.from_pretrained(input_model_file)

gin_supervised_masking_model.to(device)
gin_supervised_masking_model.eval()

###########################

input_model_file = './model_gin/supervised_contextpred.pth'

gin_supervised_contextpred_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_supervised_contextpred_model.from_pretrained(input_model_file)

gin_supervised_contextpred_model.to(device)
gin_supervised_contextpred_model.eval()

###########################

input_model_file = './model_gin/infomax.pth'

gin_infomax_model = GNN_graphpred(num_layer, emb_dim, num_tasks, JK = JK, drop_ratio = dropout_ratio, graph_pooling = graph_pooling, gnn_type = gnn_type)
gin_infomax_model.from_pretrained(input_model_file)

gin_infomax_model.to(device)
gin_infomax_model.eval()

GNN_graphpred(
  (gnn): GNN(
    (x_embedding1): Embedding(120, 300)
    (x_embedding2): Embedding(3, 300)
    (gnns): ModuleList(
      (0): GINConv(
        (mlp): Sequential(
          (0): Linear(in_features=300, out_features=600, bias=True)
          (1): ReLU()
          (2): Linear(in_features=600, out_features=300, bias=True)
        )
        (edge_embedding1): Embedding(6, 300)
        (edge_embedding2): Embedding(3, 300)
      )
      (1): GINConv(
        (mlp): Sequential(
          (0): Linear(in_features=300, out_features=600, bias=True)
          (1): ReLU()
          (2): Linear(in_features=600, out_features=300, bias=True)
        )
        (edge_embedding1): Embedding(6, 300)
        (edge_embedding2): Embedding(3, 300)
      )
      (2): GINConv(
        (mlp): Sequential(
          (0): Linear(in_features=300, out_features=600, bias=True)
          (1): ReLU()
          (2): Linear(in_features=600, out_features=300, bias=True)
        )
        (edge_embedding1): E

# Load IMDB Dataset + test using Strategies pre-trained GNNs

## IMDB Binary

In [8]:
## dataset downloaded to '/pretrain-gnns-master/chem/data/imdb/binary' ## 
dataset_name = 'imdbb'
imdb_dataset = TUDataset(root='data/imdb/binary', name='IMDB-BINARY')

In [9]:
imdb_dataset

IMDB-BINARY(1000)

In [10]:
imdbb_train_dataset, imdbb_valid_dataset, imdbb_test_dataset = random_split(imdb_dataset, null_value=0, 
                                                                         frac_train=0.8,
                                                                         frac_valid=0.1,
                                                                         frac_test=0.1,
                                                                         seed=seed)

imdbb_train_loader = DataLoader(imdbb_train_dataset, batch_size=batch_size, 
                               shuffle=train_shuffle, num_workers=num_workers)

In [11]:

imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_supervised_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_supervised',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN supervised.pth): {}".format(score))


logme score (GIN supervised.pth): -0.6136261241956099


In [18]:
imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_supervised_infomax_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_supervised_infomax',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN supervised_infomax.pth): {}".format(score))


logme score (GIN supervised_infomax.pth): -0.6249411817184796


In [19]:
imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_supervised_edgepred_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_supervised_edgepred',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN supervised_edgepred.pth): {}".format(score))


logme score (GIN supervised_edgepred.pth): -0.6277998190368825


In [20]:
imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_supervised_masking_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_supervised_masking',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN supervised_masking.pth): {}".format(score))


logme score (GIN supervised_masking.pth): -0.6408094598830782


In [21]:
imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_supervised_contextpred_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_supervised_contextpred',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN supervised_contextpred.pth): {}".format(score))


logme score (GIN supervised_contextpred.pth): -0.635537474117997


In [22]:
imdbb_graph_features, imdbb_graph_labels = get_graph_features_labels_imdb(imdbb_train_loader, 
                                                                          gin_infomax_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbb_graph_features, imdbb_graph_labels, dataset_name,
                             model_name='gin_infomax',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbb_graph_features), np.array(imdbb_graph_labels))
print("\n=============")
print("logme score (GIN infomax.pth): {}".format(score))


logme score (GIN infomax.pth): -0.6333544027212512


## IMDB Multi

In [23]:
## dataset downloaded to '/pretrain-gnns-master/chem/data/imdb/binary' ## 
dataset_name = 'imdbm'
imdb_dataset = TUDataset(root='data/imdb/binary', name='IMDB-MULTI')

Downloading https://ls11-www.cs.tu-dortmund.de/people/morris/graphkerneldatasets/IMDB-MULTI.zip
Extracting data/imdb/binary/IMDB-MULTI.zip


In [24]:
imdb_dataset

IMDB-MULTI(1000)

In [25]:
imdbm_train_dataset, imdbm_valid_dataset, imdbm_test_dataset = random_split(imdb_dataset, null_value=0, 
                                                                         frac_train=0.8,
                                                                         frac_valid=0.1,
                                                                         frac_test=0.1,
                                                                         seed=seed)

imdbm_train_loader = DataLoader(imdbm_train_dataset, batch_size=batch_size, 
                               shuffle=train_shuffle, num_workers=num_workers)

In [26]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_supervised_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_supervised',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN supervised.pth): {}".format(score))


logme score (GIN supervised.pth): -0.6136266211400907


In [27]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_supervised_infomax_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_supervised_infomax',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN supervised_infomax.pth): {}".format(score))


logme score (GIN supervised_infomax.pth): -0.6249413416397545


In [30]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_supervised_edgepred_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_supervised_edgepred',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN supervised_edgepred.pth): {}".format(score))


logme score (GIN supervised_edgepred.pth): -0.6278001157464586


In [32]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_supervised_masking_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_supervised_masking',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN supervised_masking.pth): {}".format(score))


logme score (GIN supervised_masking.pth): -0.6408094293912805


In [34]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_supervised_contextpred_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_supervised_contextpred',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN supervised_contextpred.pth): {}".format(score))


logme score (GIN supervised_contextpred.pth): -0.6355368522450561


In [35]:
imdbm_graph_features, imdbm_graph_labels = get_graph_features_labels_imdb(imdbm_train_loader, 
                                                                          gin_infomax_model,
                                                                          seed)
create_dataframe_save_to_csv(imdbm_graph_features, imdbm_graph_labels, dataset_name,
                             model_name='gin_infomax',
                             save_path=save_results_to)

logme = LogME(regression=False)
score = logme.fit(np.array(imdbm_graph_features), np.array(imdbm_graph_labels))
print("\n=============")
print("logme score (GIN infomax.pth): {}".format(score))


logme score (GIN infomax.pth): -0.6333549741453175


In [36]:
for i, feature in enumerate(imdbm_graph_features):
    print('{}-th feature | mean: {} +- {}'.format(i, np.mean(feature), np.std(feature)))
    if i > 10:
        break

0-th feature | mean: -125.73597717285156 +- 447.0168762207031
1-th feature | mean: -15571.361328125 +- 58989.79296875
2-th feature | mean: -189.79635620117188 +- 675.7182006835938
3-th feature | mean: -12.429853439331055 +- 51.592445373535156
4-th feature | mean: -1676.006103515625 +- 6203.193359375
5-th feature | mean: -26.136045455932617 +- 102.60440826416016
6-th feature | mean: -47.533721923828125 +- 178.07022094726562
7-th feature | mean: -36.1640510559082 +- 136.61399841308594
8-th feature | mean: -23.610666275024414 +- 92.90879821777344
9-th feature | mean: -23.610660552978516 +- 92.9087905883789
10-th feature | mean: -40.48372268676758 +- 152.52830505371094
11-th feature | mean: -59.103515625 +- 214.29075622558594
