In [1]:
import os
import glob
import numpy as np
import tqdm
import pandas as pd

import stellargraph as sg
from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import DeepGraphCNN, GCNSupervisedGraphClassification
from stellargraph import StellarDiGraph
from sklearn import model_selection

import tensorflow as tf 
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.losses import categorical_crossentropy

from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.layers import *
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow import keras

2021-07-09 01:18:42.404329: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-09 01:18:43.390613: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2021-07-09 01:18:43.434081: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-07-09 01:18:43.434618: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce GTX 1070 computeCapability: 6.1
coreClock: 1.7845GHz coreCount: 15 deviceMemorySize: 7.93GiB deviceMemoryBandwidth: 238.66GiB/s
2021-07-09 01:18:43.434640: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2021-07-09 01:18:43.437099: I tensorflow/stream_executor/platfor

In [48]:
# Root directory holding one sub-directory per optimisation set.
# Set the NOTEBOOK_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used.
data_dir = os.environ.get('NOTEBOOK_DATA_DIR', '/home/nonroot/experiment/results/notebook/')
# Optimisation sets compared against each other; they become the label columns.
optsets = ['4211-bCap', '4372-bMed', '500-bSum'] #, '5624-bLim', '6310-bGeo']
# Optimisation set whose graphs are fed to the model.
base_optset = 'O0'
# Sub-directory (graph representation) and file-name suffix (node feature) to load.
representation = 'cfgcompact'
feature = 'boo'

In [49]:
# All base-optset archives for the selected representation and feature.
# glob returns matches in arbitrary (filesystem) order; sorting makes the graph
# order, and therefore every positional index derived from it, reproducible.
benchmark_files = sorted(
    glob.glob(os.path.join(data_dir, base_optset, representation, f"*_{feature}.npz"))
)

def create_graph(filename):
    """Load one ``.npz`` archive and build a directed StellarGraph from it.

    The archive is read for two entries: ``nodes``, unpacked as
    ``(index, features)`` pairs, and ``edges``, whose rows are interpreted as
    ``(source, target, type)``.

    Args:
        filename: Path of the ``.npz`` file to read.

    Returns:
        A ``StellarDiGraph`` built from the node-feature frame and the edge
        frame, with ``type`` as the edge-type column.
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # unpickling; only load archives produced by a trusted pipeline.
    # The context manager closes the archive's file handle as soon as the
    # arrays have been read, instead of leaving it to garbage collection.
    with np.load(filename, allow_pickle=True) as data:
        nodes = data['nodes']
        edge_rows = data['edges']
    indexes, features = zip(*nodes)
    node_data = pd.DataFrame(features, index=indexes)
    edges = pd.DataFrame(edge_rows, columns=['source', 'target', 'type'])
    return StellarDiGraph(node_data, edges=edges, edge_type_column="type")
    
def get_graph_labels(filename):
    """Return the object stored under ``labels`` in an ``.npz`` archive.

    Args:
        filename: Path of the ``.npz`` file to read.

    Returns:
        The Python object held by the zero-dimensional ``labels`` array
        (callers index it with ``'binary_size'``).
    """
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # unpickling; only load archives produced by a trusted pipeline.
    # The context manager closes the archive's file handle before returning.
    with np.load(filename, allow_pickle=True) as data:
        # Indexing with () unwraps the 0-d object array into the stored object.
        return data['labels'][()]
    
graphs = []   # one StellarDiGraph per usable benchmark (base optset)
labels = []   # per benchmark: 1 for every optset reaching the smallest binary size
faileds = []  # paths of benchmarks that raised while being loaded

for benchmark in tqdm.tqdm(benchmark_files):
    try:
        name = os.path.basename(benchmark)
        # Skip benchmarks that are missing for the base optset or any compared optset.
        has_all_optsets = all(
            os.path.isfile(os.path.join(data_dir, opt, representation, name))
            for opt in optsets + [base_optset]
        )
        if not has_all_optsets:
            continue
        # Join on the bare file name: `benchmark` is already a full path, so
        # joining it again only worked while `data_dir` was absolute.
        graph = create_graph(os.path.join(data_dir, base_optset, representation, name))
        bin_sizes = [
            get_graph_labels(os.path.join(data_dir, opt, representation, name))['binary_size']
            for opt in optsets
        ]
        best_size = min(bin_sizes)
        # Make categorical. Several optsets may tie for the smallest size, so
        # a row can contain more than one 1.
        best_flags = [1 if size == best_size else 0 for size in bin_sizes]
    except Exception as e:
        print(f"Error obtaining representation for benchmark `{benchmark}`: {e}")
        # Remember the failure so it can be inspected after the loop.
        faileds.append(benchmark)
    else:
        graphs.append(graph)
        labels.append(best_flags)

print(f"There are {len(graphs)} graphs")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5034/5034 [00:32<00:00, 156.12it/s]

There are 4930 graphs





In [50]:
# Node and edge counts of every loaded graph, collected column by column.
node_counts = [graph.number_of_nodes() for graph in graphs]
edge_counts = [graph.number_of_edges() for graph in graphs]

summary = pd.DataFrame(
    list(zip(node_counts, edge_counts)),
    columns=["nodes", "edges"],
)
# Cell output: descriptive statistics rounded to one decimal place.
summary.describe().round(1)

Unnamed: 0,nodes,edges
count,4930.0,4930.0
mean,64.9,95.1
std,39.1,60.5
min,1.0,0.0
25%,39.0,55.0
50%,57.0,83.0
75%,82.0,120.0
max,570.0,813.0


In [51]:
# One row per graph (same order as `graphs`), one 0/1 column per optimisation set.
graph_labels = pd.DataFrame(data=labels, columns=optsets)
# Cell output: per-optset statistics; the mean is the fraction of graphs
# for which that optset reaches the smallest binary size.
graph_labels.describe()

Unnamed: 0,4211-bCap,4372-bMed,500-bSum
count,4930.0,4930.0,4930.0
mean,0.328803,0.575254,0.531034
std,0.469826,0.494355,0.499087
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [56]:
# Training and architecture hyper-parameters.
batch_size = 32
classes = len(optsets)  # one output unit per optimisation set
layer_sizes = [256, 256, 256, 128]
activations = ['relu'] * len(layer_sizes)  # same activation for every graph-convolution layer

generator = PaddedGraphGenerator(graphs=graphs)

# Graph-convolution stack provided by StellarGraph.
graph_model = GCNSupervisedGraphClassification(
    generator=generator,
    layer_sizes=layer_sizes,
    activations=activations,
    dropout=0.5,
    pool_all_layers=True,
)
x_inp, x_out = graph_model.in_out_tensors()

# Dense classification head stacked on the graph model's output tensor.
hidden = Dense(units=800, activation="relu")(x_out)
predictions = Dense(units=32, activation="relu")(hidden)
outputs = Dense(units=classes, activation="softmax")(predictions)

# Create the model and prepare it for training by specifying
# the loss and optimisation algorithm.
# NOTE(review): the label rows may contain several 1s (ties for the smallest
# binary size), while softmax + categorical cross-entropy is normally paired
# with one-hot targets; confirm this combination is intended.
model = Model(inputs=x_inp, outputs=outputs)
model.compile(
    loss=categorical_crossentropy,
    optimizer=Adam(learning_rate=0.005),
    metrics=["acc"],
)

model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, None, 65)]   0                                            
__________________________________________________________________________________________________
dropout_21 (Dropout)            (None, None, 65)     0           input_25[0][0]                   
__________________________________________________________________________________________________
input_27 (InputLayer)           [(None, None, None)] 0                                            
__________________________________________________________________________________________________
graph_convolution_21 (GraphConv (None, None, 256)    16896       dropout_21[0][0]                 
                                                                 input_27[0][0]             

In [53]:
# Hold out 10% of the graphs for testing, then 10% of the remainder for validation.
train_size = 0.9
validation_size = 0.1
test_size = 0.1
split_seed = 42  # fixed so the split is identical after Restart & Run All

# Labels are matched to graphs by position, so both collections must line up.
assert len(graph_labels) == len(graphs), "graph_labels and graphs are out of sync"

# NOTE: despite their names, the *_graphs frames hold label rows; their index
# values are the positions of the corresponding graphs in `graphs`.
train_graphs, test_graphs = model_selection.train_test_split(
    graph_labels,
    train_size=train_size,
    test_size=test_size,
    stratify=graph_labels,
    random_state=split_seed,
)

train_graphs, val_graphs = model_selection.train_test_split(
    train_graphs,
    train_size=train_size,
    test_size=validation_size,
    stratify=train_graphs,
    random_state=split_seed,
)

gen = PaddedGraphGenerator(graphs=graphs)


def _flow_for(label_frame):
    """Build a batched flow over the graphs referenced by ``label_frame``.

    Args:
        label_frame: Slice of ``graph_labels``; its index selects the graphs
            and its values are used as targets.

    Returns:
        The sequence returned by ``gen.flow`` for those graphs and targets.
    """
    return gen.flow(
        # `graph_labels` has a 0-based positional index, so it maps directly
        # onto `graphs`; subtracting 1 paired every target with the previous
        # graph (and index 0 with the last one).
        list(label_frame.index),
        targets=label_frame.values,
        batch_size=batch_size,
        symmetric_normalization=False,
    )


train_gen = _flow_for(train_graphs)
val_gen = _flow_for(val_graphs)
test_gen = _flow_for(test_graphs)

In [57]:
# Fit on the training flow for ten epochs, evaluating on the validation flow
# after each epoch; the returned object is kept in `history`.
history = model.fit(train_gen, validation_data=val_gen, epochs=10, shuffle=True, verbose=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [55]:
# Run the model on the test flow; one row of per-optset scores per test graph.
pred = model.predict(x=test_gen, verbose=True)
print(pred)

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
