In [None]:
import os
import glob
import numpy as np
import tqdm
import pandas as pd

import stellargraph as sg
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import DeepGraphCNN, GCNSupervisedGraphClassification
from stellargraph import StellarDiGraph
from sklearn import model_selection

import tensorflow as tf 
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.losses import categorical_crossentropy

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.layers import *
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow import keras

In [None]:
# Experiment configuration.
# `feature` selects the node-feature variant; it is part of the file-name
# suffix (`*_<feature>.npz`) used when locating graph archives.
feature = "i2v"
# Sub-directory (under each optset directory) holding the graph archives.
representation = "cfgcompact"
# Optset whose graphs are used as model input.
base_optset = "O0"
# Candidate optsets that are compared against each other.
# '500-bSum', '5624-bLim' and '6310-bGeo' are currently left out.
optsets = ["4211-bCap", "4372-bMed"]
# Root directory containing one sub-directory per optset.
data_dir = "/home/nonroot/experiment/results/notebook/"

In [None]:
# Every graph archive of the base optset whose name ends in `_<feature>.npz`.
# glob yields paths in filesystem-dependent order, so sort them to keep the
# order of the loaded graphs stable between runs and machines.
benchmark_files = sorted(
    glob.glob(os.path.join(data_dir, base_optset, representation, f"*_{feature}.npz"))
)

def create_graph(filename):
    """Build a ``StellarDiGraph`` from one ``.npz`` graph archive.

    Two entries of the archive are read: ``nodes``, an iterable of
    ``(index, features)`` pairs, and ``edges``, rows that are interpreted as
    ``(source, target, type)``.

    Args:
        filename: Path of the ``.npz`` archive to load.

    Returns:
        A ``StellarDiGraph`` whose node-feature table is indexed by the node
        indexes and whose edge types come from the ``type`` column.
    """
    # NOTE(review): allow_pickle=True unpickles object arrays, which can run
    # arbitrary code; only load archives produced by a trusted pipeline.
    # The context manager closes the underlying zip file once both entries
    # have been materialised (the original left the handle open).
    with np.load(filename, allow_pickle=True) as data:
        indexes, features = zip(*data['nodes'])
        edge_rows = data['edges']
    node_data = pd.DataFrame(features, index=indexes)
    edges = pd.DataFrame(edge_rows, columns=['source', 'target', 'type'])
    return StellarDiGraph(node_data, edges=edges, edge_type_column="type")
    
def get_graph_labels(filename):
    """Return the object stored under ``labels`` in an ``.npz`` graph archive.

    Args:
        filename: Path of the ``.npz`` archive to load.

    Returns:
        The ``labels`` entry with its array wrapper removed via ``[()]``
        (for a 0-d object array this is the stored Python object itself).
    """
    # NOTE(review): allow_pickle=True unpickles object arrays; only load
    # archives produced by a trusted pipeline.
    # Read inside the `with` block: entries are loaded lazily, and the
    # context manager closes the zip file afterwards instead of leaking it.
    with np.load(filename, allow_pickle=True) as data:
        return data['labels'][()]
    
# Parallel lists: labels[i] is the multi-hot label row of graphs[i].
graphs = []
labels = []
# Paths of benchmark files whose loading raised an exception.
faileds = []

for benchmark in tqdm.tqdm(benchmark_files):
    try:
        name = os.path.basename(benchmark)
        # Do the base optset and every candidate optset have this graph?
        ok = all(os.path.isfile(os.path.join(data_dir, opt, representation, name)) for opt in optsets + [base_optset])
        if not ok:
            continue
        # `benchmark` is already the full path produced by glob; joining the
        # directory prefix onto it again only worked for an absolute data_dir.
        graph = create_graph(benchmark)
        bin_sizes = []
        for opt in optsets:
            fname = os.path.join(data_dir, opt, representation, name)
            bin_sizes.append(get_graph_labels(fname)['binary_size'])
        best_size = min(bin_sizes)
        # Make categorical. Several optsets may tie for the smallest size, so
        # a row can contain more than one 1.
        bin_sizes = [1 if b == best_size else 0 for b in bin_sizes]
        graphs.append(graph)
        labels.append(bin_sizes)
    except Exception as e:
        # Best effort: remember the failing file and keep going.
        faileds.append(benchmark)
        print(f"Error obtaining representation for benchmark `{benchmark}`: {e}")
        continue

print(f"There are {len(graphs)} graphs")
if faileds:
    print(f"{len(faileds)} benchmark file(s) failed to load (see `faileds`)")

In [None]:
# Tabulate the label rows: one row per entry of `labels`, one column per
# optset name. No explicit index is passed, so pandas assigns its default
# 0-based index.
graph_labels = pd.DataFrame(data=labels, columns=optsets)
# Summary statistics of each optset column, shown as the cell output.
graph_labels.describe()

In [None]:
# Hyper-parameters of the graph classifier.
batch_size = 16
classes = len(optsets)  # one output unit per candidate optset
layer_sizes = [32, 32]
activations = ['relu', 'relu']

generator = PaddedGraphGenerator(graphs=graphs)

# Graph-convolution stack; `layer_sizes` and `activations` must have the
# same length (one entry per layer).
graph_model = GCNSupervisedGraphClassification(
    layer_sizes=layer_sizes,
    activations=activations,
    generator=generator,
    dropout=0.0,
    pool_all_layers=True
)
x_inp, x_out = graph_model.in_out_tensors()

# Dense classification head on top of the pooled graph embedding.
# (A Conv1D/MaxPool1D/Flatten head was tried here previously and removed.)
predictions = Dense(units=16, activation="relu")(x_out)
outputs = Dense(units=classes, activation="sigmoid")(predictions)

# Create the model and prepare it for training by specifying
# the loss and optimisation algorithm.
model = Model(inputs=x_inp, outputs=outputs)

# The targets are multi-hot (several optsets can tie for the best size) and
# the output layer uses independent sigmoids, so each unit is trained as its
# own binary decision. The previous categorical cross-entropy assumes a
# single, mutually exclusive class per sample.
# NOTE(review): "acc" is kept so the history keys stay unchanged; consider
# "binary_accuracy" as a better fit for multi-hot targets.
model.compile(
    optimizer=Adam(learning_rate=0.005),
    loss="binary_crossentropy",
    metrics=["acc"]
)

model.summary()

In [None]:
# Split fractions. The test fraction is taken from the full data set; the
# validation fraction is then taken from the remaining training portion.
train_size = 0.9
validation_size = 0.1
test_size = 0.1
# Fixed seed so that the train/validation/test partition is reproducible.
split_seed = 42

train_graphs, test_graphs = model_selection.train_test_split(
    graph_labels,
    train_size=train_size,
    test_size=test_size,
    stratify=graph_labels,
    random_state=split_seed,
)

train_graphs, val_graphs = model_selection.train_test_split(
    train_graphs,
    train_size=train_size,
    test_size=validation_size,
    stratify=train_graphs,
    random_state=split_seed,
)

gen = PaddedGraphGenerator(graphs=graphs)


def _flow_for(split):
    """Return a batch sequence over the graphs selected by ``split``.

    Args:
        split: A slice of ``graph_labels``; its index values select the
            graphs and its values are used as the targets.

    Returns:
        The object produced by ``gen.flow`` for those graphs and targets.
    """
    # `graph_labels` carries the default 0-based index, so row i already
    # refers to graphs[i]. Subtracting 1 (only needed for 1-based label
    # indexes) paired every label with the previous graph and wrapped row 0
    # around to the last graph.
    return gen.flow(
        list(split.index),
        targets=split.values,
        batch_size=batch_size,
        symmetric_normalization=False,
    )


train_gen = _flow_for(train_graphs)
val_gen = _flow_for(val_graphs)
test_gen = _flow_for(test_graphs)

In [None]:
# Train for 20 epochs on the training flow, passing the validation flow as
# validation data; the returned object is kept in `history`.
history = model.fit(
    x=train_gen,
    validation_data=val_gen,
    epochs=20,
    shuffle=True,
    verbose=True,
)