In [1]:
import pytest
import re, os

import torch
from torch.utils.data import DataLoader
import torchvision

import modlee
from modlee import data_metafeatures as dmf
from modlee.utils import text_loaders, image_loaders

import numpy as np
import pandas as pd
import spacy

import torch_geometric
import matplotlib.pyplot as plt
import networkx as nx 
import onnx
from onnx.tools import net_drawer
import matplotlib as mpl
import copy
# import torch_geometric as pyg
import numpy as np
import torch_geometric as pyg
# from torch_geometric.utils import convert
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from transformers import GraphormerForGraphClassification
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid
import argparse
import os.path as osp
import torch
from torch_geometric.datasets import QM9
from torch_geometric.loader import DataLoader
from torch_geometric.nn import DimeNet, DimeNetPlusPlus

# Root directory handed to every dataset loader as `root=` (see get_df_from_loaders).
DATA_ROOT = os.path.expanduser("~/efs/.data")
# NOTE(review): built eagerly when this cell runs, but not referenced by any
# other cell visible in this notebook — confirm it is still needed.
IMAGE_DATALOADER = modlee.utils.get_imagenette_dataloader()
# TEXT_DATALOADER = modlee.utils.get_wnli_dataloader() 


# Registries of every `get_*_dataloader` callable exposed by the loader modules.
# Text loaders are kept as {function name: function}; image loaders as a plain
# list (which is what allows positional slicing such as IMAGE_LOADERS[17:]).
TEXT_LOADERS = {loader_fn:getattr(text_loaders, loader_fn) for loader_fn in dir(text_loaders) if re.match('get_(.*)_dataloader', loader_fn)}
IMAGE_LOADERS = [getattr(image_loaders, loader_fn) for loader_fn in dir(image_loaders) if re.match('get_(.*)_dataloader', loader_fn)]

# things to install:
# torch_geometric, pydot, transformers

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import karateclub

In [1]:
# Only need to run if the environment does not have torch-{scatter,sparse,cluster}
!which pip3
!pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster --y
!pip install --no-cache torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install  --no-cache torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install  --no-cache torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git
!pip3 install transformers pydot torch_geometric

/home/ubuntu/.conda/envs/modlee311/bin/pip3
[0mLooking in links: https://data.pyg.org/whl/torch-{torch.__version__}.html
Collecting torch-scatter
  Using cached torch_scatter-2.1.2.tar.gz (108 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: torch-scatter
  Building wheel for torch-scatter (setup.py) ... [?25l\^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0mLooking in links: https://data.pyg.org/whl/torch-{torch.__version__}.html
Collecting torch-sparse
  Using cached torch_sparse-0.6.18.tar.gz (209 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py) ... [?25l\^C
[?25canceled
[31mERROR: Operation cancelled by user[0m[31m
[0mLooking in links: https://data.pyg.org/whl/torch-{torch.__version__}.html
Collecting torch-cluster
  Using cached torch_cluster-1.6.3.tar.gz (54 kB)
  Preparing metadata (setup.py) ... [?25ldo

In [None]:
def get_imports():
    """Return the import statements found in ./metafeatures.ipynb.

    Every raw line of the notebook file that contains the substring 'import'
    is stripped of surrounding whitespace and of the JSON decoration
    (escaped newline marker, double quotes, commas). The cleaned lines are
    returned as a single newline-joined string.
    """
    with open('./metafeatures.ipynb','r') as notebook_file:
        raw_lines = notebook_file.readlines()

    import_lines = []
    for raw_line in raw_lines:
        if 'import' not in raw_line:
            continue
        cleaned = raw_line.strip()
        # Remove, in order: the literal backslash-n, double quotes, commas.
        for decoration in ('\\n', '"', ','):
            cleaned = cleaned.replace(decoration, '')
        import_lines.append(cleaned)
    return '\n'.join(import_lines)

In [None]:
# Last metafeatures object built by get_df_from_loaders, kept for interactive inspection.
mf_global = None
def get_df_from_loaders(loaders, modality, n_samples=1):
    """Compute data metafeatures for each loader and collect them in a DataFrame.

    Args:
        loaders: iterable of loader callables, or a dict whose values are such
            callables. Each is called as ``loader_fn(root=DATA_ROOT)``.
        modality: modality name; ``f"{modality.capitalize()}DataMetafeatures"``
            is looked up on ``dmf`` to obtain the metafeature class.
        n_samples: number of times metafeatures are computed per loader.

    Returns:
        A DataFrame with one row per (loader, sample), holding
        ``dataset_name`` plus the merged ``embedding``, ``mfe`` and
        ``properties`` mappings of the metafeatures object.

    Side effects:
        * Rebinds the module-level ``mf_global`` to the most recent
          metafeatures object.
        * Appends every record to ``./{modality}_features_cache.csv``. The
          header is written only when the file does not exist yet, so repeated
          appends no longer interleave header rows with the data.
          NOTE(review): appended rows are assumed to share the columns of the
          rows already in the file — confirm for heterogeneous loaders.
    """
    global mf_global
    if isinstance(loaders, dict):
        loaders = list(loaders.values())
    print(loaders)
    features = []
    MFClass = getattr(dmf, f"{modality.capitalize()}DataMetafeatures")
    cache_path = f'./{modality}_features_cache.csv'
    for loader_fn in loaders:
        # Loaders exposing `.args` (presumably functools.partial objects —
        # verify) are named after their first bound argument; plain functions
        # are named after themselves.
        if hasattr(loader_fn, 'args'):
            dataset_name = loader_fn.args[0]
        else:
            dataset_name = loader_fn.__name__
        for _ in range(n_samples):
            metafeatures = MFClass(
                loader_fn(root=DATA_ROOT), testing=True
            )
            mf_global = metafeatures
            features.append({
                    'dataset_name':dataset_name,
                    **metafeatures.embedding,
                    **metafeatures.mfe,
                    **metafeatures.properties,
            })
            # Persist incrementally so a crash mid-run keeps finished records.
            pd.DataFrame(features[-1]).to_csv(
                cache_path,
                mode='a',
                header=not os.path.exists(cache_path))
    return pd.DataFrame(features)



In [None]:
text_df = get_df_from_loaders(TEXT_LOADERS, 'text')

In [None]:
image_df = get_df_from_loaders(IMAGE_LOADERS[17:], 'image', n_samples=4)

In [None]:
image_df

In [None]:
# cached_df = pd.read_csv('./image_features_cache_0.csv')

class DFTransforms:
    """Namespace of DataFrame -> DataFrame transforms for metafeature tables.

    Every transform takes a DataFrame and returns a DataFrame, so they can be
    chained with :meth:`compose`.
    """

    @staticmethod
    def list_cols2item(df):
        """Replace list-like cells in object columns with their first item.

        Works cell by cell on a copy, so the caller's frame is left untouched.
        Cells that are not a non-empty list/tuple/ndarray (e.g. strings, NaN)
        are kept as they are. Columns that end up holding only numbers are
        converted to a numeric dtype so that later numeric filtering keeps
        them.
        """
        def first_item(cell):
            is_sequence = isinstance(cell, (list, tuple)) or (
                isinstance(cell, np.ndarray) and cell.ndim > 0
            )
            if is_sequence and len(cell) > 0:
                return cell[0]
            return cell

        out = df.copy()
        for col in out.select_dtypes(include=['object']).columns:
            out[col] = out[col].map(first_item).infer_objects()
        return out
    
    @staticmethod
    def drop_nonnum(df):
        """Keep only numeric columns (any integer/float width)."""
        return df.select_dtypes(include='number')
        
    @staticmethod
    def fillna(df, val=0):
        """Replace missing values with `val`."""
        return df.fillna(val)
    
    @staticmethod
    def dropna(df):
        """Drop every column that contains at least one missing value."""
        return df.dropna(axis=1, how='any')

    @staticmethod
    def normalize(df):
        """Min-max scale each column to [0, 1].

        A constant column has max == min, so the division is 0/0 and the
        whole column becomes NaN.
        """
        def min_max_normalize(column):
            return (column - column.min()) / (column.max() - column.min())
        return df.apply(min_max_normalize)

    @staticmethod
    def compose(transforms):
        """Return a function applying `transforms` left to right."""
        def apply_transforms(df):
            for transform in transforms:
                df = transform(df)
            return df
        return apply_transforms
# Default preprocessing pipeline, applied in this order: unwrap list cells,
# keep numeric columns, min-max normalize, then drop columns containing NaN.
# NOTE(review): dropna runs after normalize, presumably so that columns which
# normalize to NaN are removed as well — confirm.
df_transforms = DFTransforms.compose([
    DFTransforms.list_cols2item,
    DFTransforms.drop_nonnum,
    DFTransforms.normalize,
    DFTransforms.dropna,
])
def save_labels(df, fn):
    """Write the `dataset_name` column of `df` to `fn`, one label per line.

    Args:
        df: DataFrame with a `dataset_name` column.
        fn: path of the text file to (over)write.

    Labels are converted with `str()` first, so non-string entries (numbers,
    NaN) are written instead of making `str.join` raise a TypeError. No
    trailing newline is written.
    """
    labels = [str(label) for label in df['dataset_name']]
    with open(fn,'w') as _file:
        _file.write('\n'.join(labels))

def save_tsv(df, fn):
    """Write `df` to `fn` as tab-separated values without index or header row.

    Returns whatever `DataFrame.to_csv` returns for the given `fn`.
    """
    tsv_options = dict(sep='\t', index=False, header=False)
    return df.to_csv(fn, **tsv_options)


In [None]:
# Export a cached metafeature table as a labels file plus a TSV of features.
modality = 'image'
# NOTE(review): this reads '{modality}_features_cache_0.csv', while
# get_df_from_loaders writes '{modality}_features_cache.csv' — confirm the
# '_0' file is the intended input.
cached_df = pd.read_csv(f'./{modality}_features_cache_0.csv')
# Labels are saved from the raw frame, before df_transforms removes the
# non-numeric columns (which include dataset_name).
save_labels(cached_df, f'./{modality}_labels.txt')
save_tsv(df_transforms(cached_df), f'cached_{modality}_metafeatures.tsv')
# cached_df = DFTransforms.list_cols2item(cached_df)

In [None]:
class MFDF(pd.DataFrame):
    """DataFrame of metafeatures with a display name and export helpers.

    `name` used to be derived from ``f'{self=}'``, which can only ever yield
    the literal string ``'self'``. It is now an assignable attribute that
    falls back to ``'self'`` when nothing was assigned, so existing reads keep
    their value.
    """

    # Custom attributes must be listed here so pandas stores them as plain
    # attributes and carries them over to derived frames.
    _metadata = ['_mf_name']

    @property
    def _constructor(self):
        # Makes pandas operations return MFDF instead of a bare DataFrame.
        return MFDF

    @property
    def name(self):
        """Display name of this frame; ``'self'`` if none has been assigned."""
        assigned = getattr(self, '_mf_name', None)
        return 'self' if assigned is None else assigned

    @name.setter
    def name(self, value):
        self._mf_name = value

    def save_labels(self, *args, **kwargs):
        """Forward to the module-level save_labels(df, fn) with this frame as `df`."""
        save_labels(self, *args, **kwargs)
        
    def save_tsv(self, *args, **kwargs):
        """Forward to the module-level save_tsv(df, fn) with this frame as `df`."""
        save_tsv(self, *args, **kwargs)
    

In [None]:
# Load the cached per-modality tables and stack them into a single frame.
text_df = MFDF(pd.read_csv('./text_features_cache.csv'))
image_df = MFDF(pd.read_csv('./image_features_cache.csv'))
concat_df = MFDF(pd.concat([text_df, image_df], ignore_index=True))
# text_df.save_labels('./labels_test.txt')
print(concat_df)
print(concat_df.name)
# save_labels forwards to save_labels(df, fn); calling it without a
# destination path raises TypeError, so pass one explicitly.
concat_df.save_labels('./concat_labels.txt')

In [None]:
dir(concat_df)
f'{concat_df=}'.partition('=')[0]
# print(str(concat_df))

In [None]:

text_df = MFDF(df_transforms(text_df))
text_df.save_tsv('./test_text.tsv')
print(text_df)

In [None]:
print(text_df)

In [None]:
# NOTE(review): `features` is not assigned at notebook level in any visible
# cell (it is only a local inside get_df_from_loaders), so this cell fails on
# a fresh kernel — confirm where it is expected to come from.
df = pd.DataFrame(features)
# print(len(TEXT_LOADERS))
# Missing values are replaced with 0 before export.
df = df.fillna(0)

In [None]:
# print(df.dtypes)
import numpy as np
object_columns = df.select_dtypes(include=['object']).columns
# DataFrame.apply calls the lambda once per column, so `x` is a whole column
# and `x[0]` is that column's entry at index label 0.
# NOTE(review): if the intent is "first item of each list cell", this needs an
# element-wise map instead — confirm.
df[object_columns] = df[object_columns].apply(
    lambda x : x[0]
)
# Headerless, index-free TSV.
df.to_csv('text_metafeatures.tsv', sep='\t', index=False, header=False)

In [None]:
def min_max_normalize(column):
    """Rescale `column` linearly so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)

# Normalize DataFrame by columns
normalized_df = df.apply(min_max_normalize)
# Headerless, index-free TSV of the normalized table.
normalized_df.to_csv(
    'text_metafeatures_normalized.tsv', 
    sep='\t', 
    index=False,
    header=False
    )
# NOTE(review): `labels` is not defined in any visible cell, so this write
# fails on a fresh kernel — confirm which label list is meant (the
# commented-out line below uses the TEXT_LOADERS keys).
with open("data_labels.txt",'w') as _file:
    _file.write('\n'.join(labels))
    # _file.write('\n'.join(list(TEXT_LOADERS.keys())))

In [None]:
# Split the normalized table into embedding columns (name contains 'embd')
# and the remaining columns, and export each part as a headerless TSV.
embd_cols = sorted(col for col in normalized_df.columns if 'embd' in col)
print(embd_cols)
tsv_options = dict(sep='\t', index=False, header=False)
embd_part = normalized_df[embd_cols]
embd_part.to_csv('text_metafeatures_normalized_embd.tsv', **tsv_options)
mfe_part = normalized_df.drop(columns=embd_cols)
mfe_part.to_csv('text_metafeatures_normalized_mfe.tsv', **tsv_options)

In [None]:
!code ./text_metafeatures.tsv

In [None]:
print(list(TEXT_LOADERS.keys()), sep='\n')

In [None]:
import torchvision
from pymfe.mfe import MFE
rn18 = torchvision.models.resnet18()
extractor = MFE()
rn18_features = extractor.extract_from_model(
    rn18
)

In [None]:
# import test_model_metafeatures
%load_ext autoreload
%autoreload 2
import os, sys
os.getcwd()
sys.path.insert(0, '..')
# from tests import test_model_metafeatures
# import tests
# print(dir(tests))
# print(tests)
import test_model_metafeatures
# tests.test
# from test_model_metafeatures import *
tmm = test_model_metafeatures.TestModelMetafeatures()
image_mf = tmm.test_image_model_metafeatures(test_model_metafeatures.IMAGE_MODELS[0])


In [None]:
# image_mf.
image_mf.properties["conv_count"]
image_mf.properties["output_shape"]
# image_mf.torch_model

In [None]:
image_mf.get_parameter_statistics(image_mf.dataframe['conv_dilations_0'])
image_mf.get_parameter_statistics(image_mf.dataframe)

In [None]:
# help(image_mf)
# TODO - consider making the metafeatures a subclass of dataframes themselves
# image_mf.dataframe
image_mf.get_parameter_statistics(image_mf.dataframe['conv_dilations_0'])
print(image_mf.dataframe.select_dtypes(include='float'))
print(image_mf.dataframe.shape)
image_mf.get_propertie

In [None]:
%load_ext autoreload
%autoreload 2
from conftest import IMAGE_MODELS, IMAGE_SEGMENTATION_MODELS


models = IMAGE_MODELS + IMAGE_SEGMENTATION_MODELS
from modlee import model_metafeatures as mmf
# One properties dict per model, each tagged with a 'model_name' label:
# 'clf' for the IMAGE_MODELS, 'segment' for the IMAGE_SEGMENTATION_MODELS.
model_mfs = []
for model_group, mf_class, model_label in [
    (IMAGE_MODELS, mmf.ImageModelMetafeatures, 'clf'),
    (IMAGE_SEGMENTATION_MODELS, mmf.ImageSegmentationModelMetafeatures, 'segment'),
]:
    for model in model_group:
        model_mf = mf_class(model)
        model_prop = model_mf.properties
        model_prop.update({'model_name':model_label})
        model_mfs.append(model_prop)

In [None]:
# Print the conv layer count recorded for each model.
for model_mf in model_mfs:
    print(model_mf['conv_count'])
# One row per model properties dict.
df_model = pd.DataFrame([model_mf for model_mf in model_mfs])
print(df_model['model_name'])
# NOTE(review): 'dataset_name' is only assigned to df_model in the following
# cell; unless the properties dicts already carry that key, this lookup raises
# KeyError when the notebook is run top to bottom — confirm.
df_model['dataset_name']

In [None]:
# df_model
# Build a label per model: its 'model_name' followed by the sum of all
# '*count*' columns, stored in 'dataset_name' (the column save_labels reads).
count_cols = [col for col in df_model.columns if 'count' in col]
print(count_cols)
name_cols = ['dataset_name','model_name']
total_counts = df_model[count_cols].sum(axis=1).apply(str)
df_model['dataset_name'] = df_model['model_name'] + total_counts
print(df_model[name_cols])
# Labels are written before df_transforms drops the non-numeric columns.
save_labels(df_model, './model_labels.txt')
norm_model = df_transforms(df_model)
norm_model

save_tsv(norm_model, 'model_metafeatures.tsv')

In [None]:
!cat model_labels.txt

In [None]:
# Load the OGB molhiv graph dataset from the Hugging Face hub and draw one
# of its graphs with networkx.
from datasets import load_dataset

# There is only one split on the hub
dataset = load_dataset("OGB/ogbg-molhiv")

# Fixed seed so the "first" graph below is reproducible.
dataset = dataset.shuffle(seed=0)

import networkx as nx
import matplotlib.pyplot as plt

# We want to plot the first train graph
graph = dataset["train"][0]

# edge_index is indexed as edges[0][i] -> edges[1][i], i.e. two parallel
# lists of endpoints.
edges = graph["edge_index"]
num_edges = len(edges[0])
num_nodes = graph["num_nodes"]

# Conversion to networkx format
G = nx.Graph()
G.add_nodes_from(range(num_nodes))
G.add_edges_from([(edges[0][i], edges[1][i]) for i in range(num_edges)])

# Plot
nx.draw(G)


In [None]:
from transformers.models.graphormer.collating_graphormer import preprocess_item, GraphormerDataCollator

dataset_processed = dataset.map(preprocess_item, batched=False)


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("graph-ml", model="clefourrier/graphormer-base-pcqm4mv1")

In [None]:
# Load model directly
from transformers import AutoTokenizer, GraphormerForGraphClassification

model = GraphormerForGraphClassification.from_pretrained("clefourrier/graphormer-base-pcqm4mv2")

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("graph-ml", model="clefourrier/graphormer-base-pcqm4mv2")

# ONNX -> PyG starts below

In [4]:
from conftest import IMAGE_MODELS, IMAGE_SEGMENTATION_MODELS
model = IMAGE_SEGMENTATION_MODELS[2]
model = IMAGE_MODELS[0]
import modlee
converter = modlee.converter.Converter()

onnx_graph = converter.torch_model2onnx_graph(model)



In [5]:
models = IMAGE_MODELS + IMAGE_SEGMENTATION_MODELS

In [6]:
%load_ext autoreload
%autoreload 2
# onnx_graph
# dir(onnx_graph)
# print(dir(onnx_graph))
import torch_geometric
import matplotlib.pyplot as plt
# ax = plt.axes([0,0,5,10])
import networkx as nx 
import onnx
from onnx.tools import net_drawer
# print(onnx_graph.node)

def filter_node(x):
    """Return True when node name `x` contains 'onnx::', 'Identity' or 'fc.'.

    Used to select the nodes that are removed when pruning a graph.
    """
    pruned_markers = ('onnx::', 'Identity', 'fc.')
    return any(marker in x for marker in pruned_markers)
        
def prune_onnx_nx(onnx_nx):
    """Return a deep copy of `onnx_nx` without the nodes matched by filter_node.

    The input graph itself is not modified.
    """
    pruned_graph = copy.deepcopy(onnx_nx)
    # Iterate over a snapshot of the original's node names; removal happens
    # on the copy only.
    for node_name in list(onnx_nx.nodes.keys()):
        if filter_node(node_name):
            pruned_graph.remove_node(node_name)
    return pruned_graph
        
def onnx_graph2onnx_nx(onnx_graph, prune=True):
    """Convert an ONNX graph into a networkx graph via pydot.

    Args:
        onnx_graph: graph passed to `converter.onnx_parameterless2onnx`
            (`converter` is the notebook-level Converter instance).
        prune: when True, the result is passed through prune_onnx_nx.

    Returns:
        The networkx graph built from the pydot rendering (named
        "onnx_graph"), pruned if requested.
    """
    onnx_model = converter.onnx_parameterless2onnx(onnx_graph)
    pydot_graph = onnx.tools.net_drawer.GetPydotGraph(onnx_model.graph)
    pydot_graph.set_name("onnx_graph")
    nx_graph = nx.nx_pydot.from_pydot(pydot_graph)
    if not prune:
        return nx_graph
    return prune_onnx_nx(nx_graph)
    

In [7]:
# Convert every model to a (pruned, by default) networkx graph:
# torch model -> ONNX graph -> pydot -> networkx.
onnx_nxs = []
for model in models:
    onnx_nxs.append(onnx_graph2onnx_nx(
        converter.torch_model2onnx_graph(model)
    ))

In [14]:
type(onnx_nxs[-1])
isinstance(onnx_nxs[-1],nx.graph.Graph)
len(onnx_nxs[-1].nodes())

networkx.classes.multidigraph.MultiDiGraph

True

586

In [12]:
nx.to_dict_of_dicts(onnx_nxs[0])

{'/conv1/Conv/Conv (op#0)\\n input0 input_1\\n input1 onnx': {'/conv1/Conv_output_00': {0: {}}},
 'input_10': {'/conv1/Conv/Conv (op#0)\\n input0 input_1\\n input1 onnx': {0: {}}},
 '/conv1/Conv_output_00': {'/relu/Relu/Relu (op#1)\\n input0 /conv1/Conv_output_0\\n output0 /relu/Relu_output_0': {0: {}}},
 '/relu/Relu/Relu (op#1)\\n input0 /conv1/Conv_output_0\\n output0 /relu/Relu_output_0': {'/relu/Relu_output_00': {0: {}}},
 '/relu/Relu_output_00': {'/maxpool/MaxPool/MaxPool (op#2)\\n input0 /relu/Relu_output_0\\n output0 /maxpool/MaxPool_output_0': {0: {}}},
 '/maxpool/MaxPool/MaxPool (op#2)\\n input0 /relu/Relu_output_0\\n output0 /maxpool/MaxPool_output_0': {'/maxpool/MaxPool_output_00': {0: {}}},
 '/maxpool/MaxPool_output_00': {'/layer1/layer1.0/conv1/Conv/Conv (op#3)\\n input0 /maxpool/MaxPool_output_0\\n input1 onnx': {0: {}},
  '/layer1/layer1.0/Add/Add (op#6)\\n input0 /layer1/layer1.0/conv2/Conv_output_0\\n input1 /maxpool/MaxPool_output_0\\n output0 /layer1/layer1.0/Add_out

In [1]:
for node in onnx_nxs[0].nodes(data=True):
# for node in onnx_nxs[0].nodes():
    print(node)

NameError: name 'onnx_nxs' is not defined

In [56]:
# Work on a deep copy so the string-labelled graphs in onnx_nxs stay intact.
onnx_nxs_bak = copy.deepcopy(onnx_nxs)
for onnx_nx in onnx_nxs_bak:
    # Map each node name to its position in the node iteration order.
    relabel_dict ={}
    for n,node in enumerate(onnx_nx.nodes(data=True)):
        # print(node[0])
        relabel_dict.update({node[0]:n})
        # print(type(node))
        # node[1]['index'] = n
        # print(node)
    # copy=False relabels the graph in place.
    nx.relabel_nodes(onnx_nx, relabel_dict, copy=False)
    # onnx_nx.adjacency_matrix
    
# After the loop, relabel_dict and onnx_nx refer to the LAST graph only, and
# onnx_nx now has integer node labels. Later cells that read `onnx_nx` see
# this relabelled graph.
print(relabel_dict)
    
print(onnx_nx.nodes)

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b1ac360d0>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b17e6d810>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b20bef550>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b1a7b6750>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b1b21a010>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b202b39d0>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b17e5f4d0>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b204a07d0>

<networkx.classes.multidigraph.MultiDiGraph at 0x7f3b1b1c3a50>

{'/Shape/Shape (op#6)\\n input0 input_1\\n output0 /Shape_output_0': 0, 'input_10': 1, '/Shape_output_00': 2, '/Constant/Constant (op#7)\\n output0 /Constant_output_0': 3, '/Constant_output_00': 4, '/Gather/Gather (op#8)\\n input0 /Shape_output_0\\n input1 /Constant_output_0\\n output0 /Gather_output_0': 5, '/Gather_output_00': 6, '/Shape_1/Shape (op#9)\\n input0 input_1\\n output0 /Shape_1_output_0': 7, '/Shape_1_output_00': 8, '/Constant_1/Constant (op#10)\\n output0 /Constant_1_output_0': 9, '/Constant_1_output_00': 10, '/Gather_1/Gather (op#11)\\n input0 /Shape_1_output_0\\n input1 /Constant_1_output_0\\n output0 /Gather_1_output_0': 11, '/Gather_1_output_00': 12, '/backbone/conv1/Conv/Conv (op#12)\\n input0 input_1\\n input1 onnx': 13, '/backbone/conv1/Conv_output_00': 14, '/backbone/relu/Relu/Relu (op#13)\\n input0 /backbone/conv1/Conv_output_0\\n output0 /backbone/relu/Relu_output_0': 15, '/backbone/relu/Relu_output_00': 16, '/backbone/maxpool/MaxPool/MaxPool (op#14)\\n input0 /

In [41]:
# dir(onnx_nx)
# onnx_nx.adjacency
# onnx_nx.adj
nx.adjacency_matrix(onnx_nx).todense()

matrix([[0, 0, 1, ..., 0, 0, 0],
        [1, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [47]:

g = nx.newman_watts_strogatz_graph(100, 20, 0.05)
# print(g.nodes)
print(onnx_nx.nodes)

['/conv1/Conv/Conv (op#0)\\n input0 input_1\\n input1 onnx', 'input_10', '/conv1/Conv_output_00', '/relu/Relu/Relu (op#1)\\n input0 /conv1/Conv_output_0\\n output0 /relu/Relu_output_0', '/relu/Relu_output_00', '/maxpool/MaxPool/MaxPool (op#2)\\n input0 /relu/Relu_output_0\\n output0 /maxpool/MaxPool_output_0', '/maxpool/MaxPool_output_00', '/layer1/layer1.0/conv1/Conv/Conv (op#3)\\n input0 /maxpool/MaxPool_output_0\\n input1 onnx', '/layer1/layer1.0/conv1/Conv_output_00', '/layer1/layer1.0/relu/Relu/Relu (op#4)\\n input0 /layer1/layer1.0/conv1/Conv_output_0\\n output0 /layer1/layer1.0/relu/Relu_output_0', '/layer1/layer1.0/relu/Relu_output_00', '/layer1/layer1.0/conv2/Conv/Conv (op#5)\\n input0 /layer1/layer1.0/relu/Relu_output_0\\n input1 onnx', '/layer1/layer1.0/conv2/Conv_output_00', '/layer1/layer1.0/Add/Add (op#6)\\n input0 /layer1/layer1.0/conv2/Conv_output_0\\n input1 /maxpool/MaxPool_output_0\\n output0 /layer1/layer1.0/Add_output_0', '/layer1/layer1.0/Add_output_00', '/layer1/

In [57]:
import karateclub
g2v = karateclub.graph2vec.Graph2Vec()
g2v.fit(onnx_nxs_bak)
# g2v.fit(g)

INFO:collecting all words and their counts
INFO:PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags
INFO:collected 83 word types and 9 unique tags from a corpus of 9 examples and 9897 words
INFO:Creating a fresh vocabulary
INFO:Doc2Vec lifecycle event {'msg': 'effective_min_count=5 retains 52 unique words (62.65% of original 83, drops 31)', 'datetime': '2024-05-20T16:17:34.051301', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0]', 'platform': 'Linux-5.15.0-1045-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO:Doc2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 9820 word corpus (99.22% of original 9897, drops 77)', 'datetime': '2024-05-20T16:17:34.052283', 'gensim': '4.3.2', 'python': '3.11.9 | packaged by conda-forge | (main, Apr 19 2024, 18:36:13) [GCC 12.3.0]', 'platform': 'Linux-5.15.0-1045-aws-x86_64-with-glibc2.31', 'event': 'prepare_vocab'}
INFO:deleting the raw counts diction

In [70]:
# for onnx_nx in onnx_nxs_bak:
#     print(onnx_nx)
#     g2v.infer([onnx_nx])
graph_embds = g2v.infer(onnx_nxs_bak)
graph_embds = np.array(graph_embds)
print(graph_embds.shape)
print(graph_embds[...,:2])
graph_embds.std(axis=0)

(9, 128)
[[-0.00105674  0.00207241]
 [ 0.00077086  0.00120153]
 [ 0.00212177  0.00226101]
 [-0.00295678 -0.00129032]
 [ 0.00299434 -0.00307984]
 [ 0.0028862  -0.00365495]
 [ 0.00366727 -0.00284002]
 [ 0.00161454 -0.00362909]
 [ 0.00329536  0.00059563]]


array([0.0020924 , 0.00233757, 0.00243714, 0.00171691, 0.00210903,
       0.00205651, 0.0019765 , 0.00207881, 0.00183127, 0.00234839,
       0.00150817, 0.00188816, 0.00205166, 0.00180361, 0.00203067,
       0.00204342, 0.0014617 , 0.00201847, 0.00248629, 0.00265794,
       0.00245506, 0.00221734, 0.00243728, 0.00224482, 0.00204721,
       0.00193488, 0.00168973, 0.00131468, 0.00150243, 0.00254576,
       0.00188076, 0.00229063, 0.00222308, 0.00196838, 0.0016687 ,
       0.00215539, 0.00212319, 0.00206592, 0.0020287 , 0.00228454,
       0.00200407, 0.00228356, 0.00180469, 0.00162153, 0.00184982,
       0.00238861, 0.00240034, 0.00169113, 0.00210505, 0.00264343,
       0.00135907, 0.00178642, 0.00231827, 0.00177731, 0.00199053,
       0.00246064, 0.00167716, 0.00167195, 0.00232026, 0.00191732,
       0.00250698, 0.00181417, 0.00221422, 0.00192077, 0.00231814,
       0.0016966 , 0.00252831, 0.0019067 , 0.00212087, 0.00192985,
       0.00212536, 0.00205462, 0.00250567, 0.00261688, 0.00203

In [None]:

# dir(onnx_nx)
import matplotlib as mpl
import copy
# Build the graph this cell works on explicitly. It used to pick up whatever
# `onnx_nx` was left over from the relabelling loop, whose node labels are
# integers, so filter_node's substring checks raised TypeError. Converting
# `onnx_graph` with prune=False gives the unpruned, string-labelled graph.
onnx_nx = onnx_graph2onnx_nx(onnx_graph, prune=False)
nodes_to_prune = [k for k in onnx_nx.nodes.keys() if filter_node(k)]
# Same pruning as before, via the shared helper instead of an inline copy.
onnx_nx_layers_only = prune_onnx_nx(onnx_nx)
print(onnx_nx_layers_only.nodes)
# pc = mpl.collections.PatchCollection(edges, cmap=cmap)
# pc.set_array(edge_colors)
# ax.show()
plt.show()
dir(onnx_nx_layers_only.edges)
# print(onnx_nx_layers_only.edges.keys())
e0 = onnx_nx_layers_only.edges.items()
print(e0)
onnx_nx.is_directed()
# print(onnx_nx.selfloop_edges)
n0 = list(onnx_nx.nodes.items())[0]
# print(n0[1])


In [None]:
onnx_nx_layers_only.edges

In [None]:
# Dump every node's name and attribute dict for inspection.
# All candidate removal conditions below are commented out, so
# nodes_to_remove stays empty and the removal loop at the end does nothing.
nodes_to_remove = []
for node_name, node_data in onnx_nx_layers_only.nodes.items():
    print(node_name)
    print(node_data)
    # if 'fontcolor' not in node_data:
    # if 'label' not in node_data:
    # if 'style' in node_data:
    #     nodes_to_remove.append(node_name)
        # onnx_nx_layers_only.remove_node(node_name)
    print('\n')

print(nodes_to_remove)
for node_to_remove in nodes_to_remove:
    onnx_nx_layers_only.remove_node(node_to_remove)
# n0 = onnx_nx.nodes[onnx_nx.nodes.keys()]
# edge_colors = range(2,onnx_nx_layers_only.number_of_edges()+2)


In [None]:

nx_drawing = nx.draw_networkx(onnx_nx_layers_only,
    pos=nx.spring_layout(onnx_nx_layers_only, seed=64),
    # pos=nx.bipartite_layout(onnx_nx),
    with_labels=False,
    node_size=10,
    # edge_color=edge_colors,
    # edge_cmap=plt.cm.plasma
    # ax=ax
)
print(onnx_nx_layers_only.edges)


In [None]:
dir(onnx_nx)
# dir(onnx_nx.nodes)

In [None]:
for node_name, node_data in onnx_nx.nodes.items():
    # type(node_data)
    print(onnx_nx.nodes[node_name])

In [None]:
# import torch_geometric as pyg
import numpy as np
import torch_geometric as pyg
# print(pyg.__version__)
# dir(pyg)
# dir(torch_geometric.torch_geometric)
# from torch_geometric.utils import convert
# Work on a copy so the pruned graph keeps its original pydot attributes.
onnx_test = copy.deepcopy(onnx_nx_layers_only)
# nx.set_node_attributes(onnx_test, None, "label")
# Attribute names to populate; only the keys are used below.
edge_dict = {'shape': 'box', 'color': '#0F9D58', 'style': 'filled', 'fontcolor': '#FFFFFF', 'label': None}
for k in edge_dict.keys():
    # Placeholder features: one random scalar per attribute name, written to
    # every node so all nodes expose the same numeric attributes.
    nx.set_node_attributes(onnx_test, np.random.rand()*100, k)
    # Edge features go on the edges themselves. They were previously stored
    # as node attributes named f"edge_{k}", which made the grouped edge
    # feature tensor have one row per node instead of one row per edge.
    nx.set_edge_attributes(onnx_test, np.random.rand()*100, k)
# dir(onnx_test.edges.items()[0])
onnx_test.edges
# for edge_name,edge_data in onnx_test.edges.items():
#     print(edge_data)
# nx.get_edge_attributes(onnx_test)
nx.draw_networkx(onnx_test,
    with_labels=False,
        node_size=10,
    )
# for node,node_data in onnx_test.nodes.items():
#     print(node_data)
# 'label' becomes the node feature matrix, 'shape' the edge feature matrix.
# NOTE(review): relies on from_networkx disambiguating an edge attribute that
# shares its name with a node attribute — confirm against the installed PyG.
onnx_pyg = pyg.utils.convert.from_networkx(
    onnx_test,
    group_node_attrs=['label'],
    group_edge_attrs=['shape']
    )
# !ls
# onnx_pyg.label
# dir(onnx_pyg)
# onnx_pyg.fully_specify()
# onnx_pyg.x
# onnx_pyg.get_tensor()
# onnx_pyg.num_node_features

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# dir(onnx_pyg)
onnx_pyg.edge_attrs()
onnx_pyg.is_directed()
# onnx_pyg.num_nodes  
# onnx_pyg.num_node_features

In [None]:
from transformers import GraphormerForGraphClassification

model = GraphormerForGraphClassification.from_pretrained(
    "clefourrier/pcqm4mv2_graphormer_base",
    num_classes=1, # num_classes for the downstream task 
    ignore_mismatched_sizes=True,
)



In [None]:
model(onnx_pyg)

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch_geometric.nn import GCNConv

from torch_geometric.datasets import Planetoid

dataset = Planetoid(root='/tmp/Cora', name='Cora')
# dataset = DataLoader([onnx_pyg])

class GCN(torch.nn.Module):
    """Two-layer graph convolutional network.

    Args:
        in_channels: number of input node features (default 1).
        hidden_channels: width of the hidden layer (default 16).
        out_channels: number of outputs per node (default 1).

    The defaults reproduce the previously hard-coded 1 -> 16 -> 1 layout.
    """

    def __init__(self, in_channels=1, hidden_channels=16, out_channels=1):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)
        self.out_channels = out_channels

    def forward(self, data):
        """Run the network on `data` (reads `data.x` and `data.edge_index`).

        Returns per-node log-probabilities over `out_channels` classes. With
        a single output channel the raw conv2 output is returned instead,
        because log_softmax over a dimension of size 1 is always exactly 0.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        if self.out_channels == 1:
            return x
        return F.log_softmax(x, dim=1)

gcn = GCN()


In [None]:
type(dataset)
type(onnx_pyg)

In [None]:
# dataset[0].__dict__
# dir(dataset[0])
# dataset=dataset[0]
len(dataset)
d0 = dataset[0]
# dir(d0)
d0.keys()
# d0.num_nodes
# d0.edge_index = d0.edge_index[...,:-4]
d0.num_node_features
d0.get_all_tensor_attrs()
d0.node_attrs()
d0.edge_attrs()
d0.edge_attr
len(d0.edge_index[0])
# d0.pos.shape
d0.x.shape
d0.y.shape
d0_nx = pyg.utils.convert.to_networkx(
    d0
)
d0_sub = d0_nx.subgraph(list(range(1000)))
layout_type = 'random'
layout_type = 'spectral'
layout_type = 'spring'
nx.draw_networkx(
    d0_sub,
    pos=getattr(nx, f"{layout_type}_layout")(d0_sub),
    font_color="white",
    # font_size=0,
    arrows=False,
    with_labels=False,
    node_size=2,
    edge_color=(0,0,0,0.23)
    )
# d0.x
# d0.edge_index
len(d0.edge_index[0])

# CORA Citation network
Nodes are papers, edges are citations

In [None]:
# dir(pyg.nn.models)
# pyg_unet = pyg.nn.models.GraphUNet()
pyg_models = pyg.nn.models
for pyg_model in dir(pyg_models):
    # print(pyg_model)
    # dir(pyg_model)
    if 'pretrained' in ' '.join((dir(getattr(pyg_models,pyg_model)))):
        print(pyg_model)

In [None]:
import argparse
import os.path as osp

import torch

from torch_geometric.datasets import QM9
from torch_geometric.loader import DataLoader
from torch_geometric.nn import DimeNet, DimeNetPlusPlus

# Model class whose pretrained QM9 weights are evaluated below.
Model = DimeNetPlusPlus

# path = osp.join(osp.dirname(osp.realpath(__file__)), '..', 'data', 'QM9')
# NOTE(review): osp.abspath('') is the current working directory, so
# osp.dirname(...) is already its parent and the extra '..' points one level
# further up than the commented-out __file__ variant would — confirm this is
# the intended dataset location.
path = osp.join(osp.dirname(osp.abspath('')), '..', 'data', 'QM9')
qm9_dataset = QM9(path)


In [None]:
len(qm9_dataset)
q0 = pyg.utils.convert.to_networkx(qm9_dataset[10])
# # print(q0.node_attrs())
# for q in qm9_dataset[:10]:
    # print(q.x)
nx.draw_networkx(q0,
    font_color="white",
    )

# QM9 Quantum chemical properties

In [None]:

# DimeNet uses the atomization energy for targets U0, U, H, and G, i.e.:
# 7 -> 12, 8 -> 13, 9 -> 14, 10 -> 15
idx = torch.tensor([0, 1, 2, 3, 4, 5, 6, 12, 13, 14, 15, 11])
# Re-index only while y still has more columns than idx selects. After the
# first run y has len(idx) == 12 columns, so running this cell again would
# index columns 12-15 out of range and raise IndexError.
if qm9_dataset.data.y.size(1) > idx.numel():
    qm9_dataset.data.y = qm9_dataset.data.y[:, idx]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:

# Evaluate the pretrained model on the test split, one target at a time.
# for target in range(12):
for target in range(1):
    # Skip target \delta\epsilon, since it can be computed via
    # \epsilon_{LUMO} - \epsilon_{HOMO}:
    # (Never triggers while the loop is restricted to range(1).)
    if target == 4:
        continue

    # model, datasets = pyg.nn.SchNet.from_qm9_pretrained(path, qm9_dataset, target)
    model, datasets = Model.from_qm9_pretrained(path, qm9_dataset, target)
    train_dataset, val_dataset, test_dataset = datasets

    model = model.to(device)
    loader = DataLoader(test_dataset, batch_size=2)

    # Per-sample absolute errors, collected batch by batch.
    maes = []
    for data in loader:
        data = data.to(device)
        with torch.no_grad():
            pred = model(data.z, data.pos, data.batch)
        mae = (pred.view(-1) - data.y[:, target]).abs()
        maes.append(mae)

    mae = torch.cat(maes, dim=0)

    # Report meV instead of eV:
    mae = 1000 * mae if target in [2, 3, 4, 6, 7, 8, 9, 10] else mae

    print(f'Target: {target:02d}, MAE: {mae.mean():.5f} ± {mae.std():.5f}')

In [None]:
x_tr = next(iter(DataLoader(train_dataset,batch_size=3)))
x_tr[0]
x_tr[1]
nx.draw_networkx(pyg.utils.convert.to_networkx(x_tr))

In [None]:
# x_tr0 = train_dataset[0]
import random
from random import shuffle
len(train_dataset)
x_tr0 = random.choice(train_dataset)
# dir(x_tr0)
x_tr0
x_tr0.num_features
nx.draw_networkx(pyg.utils.convert.to_networkx(x_tr0), font_color="white")
#_tr0.z
# x_tr0.pos
print(x_tr0.z.shape)
print(x_tr0.pos.shape)
print(x_tr0.batch)
bs = 10
z_dummy = torch.randint(low=0,high=7,size=(bs,bs,))
model(
    z_dummy.to(device),
    # torch.randint((bs),dtype=torch.int64).to(device),
    torch.randn((bs,bs,3)).to(device),
    None
)
# The {Dime,Sch}Nets take as input the QM9 dataset:
# http://quantum-machine.org/datasets/
# - z (num_nodes) - the classes of the nodes?
# - pos (num_nodes, 3) - 3D positions. Why is this not the features (num_features=11)?

In [None]:
model(qm9_dataset[0])

In [None]:
# !pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
!pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html
!pip install git+https://github.com/pyg-team/pytorch_geometric.git



In [None]:
torch.cuda.is_available()

In [None]:
torch.__version__

In [None]:
# dataset[0].num_node_features
dataset[0].edge_index
# dataset[0].num_features
# dataset[0].x
onnx_pyg.edge_index
# onnx_pyg.x
type(dataset[0]); type(onnx_pyg)
onnx_pyg.x.max()
onnx_pyg.num_nodes
# onnx_pyg.num_node_features = onnx_pyg.num_nodes
onnx_pyg.num_node_features
# [onnx_pyg].num_node_features
# type(dataset)
# help(torch_geometric.datasets)
# gcn(dataset[0])
# pyg_data = onnx_pyg.unsqueeze(-1)
BATCH_SIZE = 32
class PyGDataset(torch.utils.data.Dataset):
    """Dataset that returns the same graph object for every index.

    Args:
        data: object returned by every lookup. When omitted, the notebook
            global `onnx_pyg` is used (resolved at lookup time).
        length: reported dataset size. When omitted, the notebook global
            `BATCH_SIZE` is used (resolved when `len()` is called).

    Any other positional or keyword arguments are accepted and ignored, so
    `PyGDataset()` keeps working as before.
    """

    def __init__(self, *args, data=None, length=None, **kwargs):
        super().__init__()
        self._data = data
        self._length = length
    
    def __len__(self):
        return BATCH_SIZE if self._length is None else self._length
    
    def __getitem__(self, idx):
        # Without an IndexError for out-of-range integers, plain iteration
        # (`for item in dataset`, `list(dataset)`) would never terminate.
        size = len(self)
        if isinstance(idx, int) and not -size <= idx < size:
            raise IndexError(f'index {idx} is out of range for dataset of size {size}')
        return onnx_pyg if self._data is None else self._data


pyg_dataset = DataLoader(PyGDataset(), batch_size=BATCH_SIZE)
# pyg_batch = next(iter(pyg_dataset))
# [torch.Tensor(onnx_pyg)]*10
# onnx_pyg*10
pyg_batch_list = pyg.data.Batch.from_data_list([onnx_pyg]*32)
gcn_out = gcn(pyg_batch_list)
# gcn_out = gcn(torch.Tensor(onnx_pyg))
gcn_out.shape
gcn_out

In [None]:
gcn.state_dict()

# ONNX -> PyDot -> NetworkX -> PyG

In [None]:
onnx_pyg_nx = pyg.utils.convert.to_networkx(
    onnx_pyg,
    
)

nx.draw_networkx(onnx_pyg_nx,
    pos=nx.spring_layout(onnx_pyg_nx, seed=64),
    with_labels=False,
    node_size=10)

# for node_name, node_data in onnx_pyg_nx.nodes.items():
#     print(node_data)

## Graph from ONNX -> PyDot -> NetworkX

In [None]:

nx.draw_networkx(onnx_pyg_nx,
    pos=nx.spring_layout(onnx_pyg_nx, seed=64),
    with_labels=False,
    node_size=10)

## Graph from ONNX -> PyDot -> NetworkX -> PyG -> NetworkX