# Generate CFG Input Representations (Optional)

In [1]:
import gc
import os
import sys
import yaml
import glob
import numpy as np
import pickle
import tensorflow as tf
import random
import tqdm
import numpy as np
import shutil

from collections import defaultdict

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.datasets import delaunay
from spektral.layers import *
from spektral.utils.convolution import localpooling_filter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tqdm.contrib.concurrent import process_map, thread_map

from utils import yaml_load, get_section

%matplotlib inline

# Generating CFG Representations

First, let's keep only the graphs whose number of nodes is below a given size (set by `network_input_graph_shape`). Larger graphs are discarded.

In [2]:
# Configuration: location of the per-application metadata and the fixed input shapes used below.
metadata_file = './data/ccpe-applications-information.yaml'
network_input_graph_shape = (150, 150)    # shape of the adjacency matrix built for each CFG
network_input_features_shape = (150, 67)  # shape of the feature matrix built for each CFG (one row per node)

metadata_info = yaml_load(metadata_file)

# Keep only the CFG entries whose number of nodes is strictly below the side of the
# adjacency matrix (e.g. fewer than 150 nodes). Note: the graphs are represented as an
# adjacency matrix (a square matrix), so only the first dimension needs to be checked.
max_cfg_nodes = network_input_graph_shape[0]
filtered_graph_files = {
    benchmark_name: {
        opt_seq: opt_values
        for opt_seq, opt_values in values.items()
        if opt_values['number_cfg_nodes'] < max_cfg_nodes
    }
    for benchmark_name, values in metadata_info.items()
}

# Remove applications that are left with fewer than 2 graphs: a sample is a pair of
# distinct graphs from the same application, so 2 graphs are enough.
# (The previous `> 2` test also discarded applications with exactly 2 usable graphs.)
filtered_graph_files = {
    benchmark_name: values
    for benchmark_name, values in filtered_graph_files.items()
    if len(values) >= 2
}
print(f"Number of graphs with adjacency matrix below shape {network_input_graph_shape}: {sum(len(values) for values in filtered_graph_files.values())}")

Number of graphs with adjacency matrix below shape (150, 150): 17092


Next, we sample pairs of graphs (the number of pairs is set by `no_samples`) from the filtered graphs (i.e. those below the chosen shape).

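To do this, we randomly draw `no_samples` applications (with replacement, so the same application may be drawn more than once) and, for each draw, we select 2 different graphs of that application (i.e. with different optimization sequences). At the end, we have a list of tuples with the following format: \[(CFGFILE1, CFGFILE2, SPEEDUP), ...\], where SPEEDUP is the execution time of CFGFILE1 divided by the execution time of CFGFILE2.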

*Note*: The graphs are not loaded yet, only the file names are selected.

In [3]:
no_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs
sampling_seed = 42  # Fixed seed so a fresh kernel re-creates the same samples; set to None for a different draw on every run
samples = []

# A dedicated generator keeps the draw reproducible without touching the global `random` state.
sampling_rng = random.Random(sampling_seed)

# Choose 'no_samples' random applications (with replacement) and then pick 2 different
# optimization sequences from each chosen application.
# Note: only the file names are selected here (the graphs are not loaded yet).
# The result is a list of tuples: [(CFGFILE1, CFGFILE2, SPEEDUP), ...]
benchmark_names = list(filtered_graph_files.keys())
for benchmark_name in sampling_rng.choices(benchmark_names, k=no_samples):
    benchmark_graphs = filtered_graph_files[benchmark_name]
    opt_seq1, opt_seq2 = sampling_rng.sample(list(benchmark_graphs.keys()), 2)
    first_graph = benchmark_graphs[opt_seq1]
    second_graph = benchmark_graphs[opt_seq2]
    samples.append((
        first_graph['cfg_file'],
        second_graph['cfg_file'],
        # Speedup is the ratio of execution times: first graph over second graph.
        first_graph['exectime'] / second_graph['exectime'],
    ))

print(f"Sampled {len(samples)} CFGs")
print(f"First sample of {len(samples)} samples (example of output): {samples[0]}")

Sampled 20000 CFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cfg.llvm/extracted/Misc.flops.0.78.ll.cfg.yaml', './data/ccpe-dados/cfg.llvm/extracted/Misc.flops.0.63.ll.cfg.yaml', 0.844077694995504)


Now we load the selected graphs and build the NumPy arrays (adjacency matrices, node features and speedups) that will be used as input to our GNN.

In [4]:
# Load the graph and features from file
def load_cfg_from_file(filename: str, network_shape: tuple, features_shape: tuple) -> tuple:
    """Read one CFG file and convert it to dense arrays.

    Args:
        filename: path of the CFG file, read through `yaml_load`.
        network_shape: shape of the boolean adjacency matrix to allocate.
        features_shape: shape of the feature matrix to allocate.

    Returns:
        A tuple `(adjacency, node_features)`. `adjacency[node, neighbour]` is True for
        every `neighbour` listed under `node` in the file's 'nodes' mapping, and
        `node_features[node]` holds the values stored under `node` in 'nodes_features'.
        Every entry not mentioned in the file stays False / 0.
    """
    cfg = yaml_load(filename)

    # The file stores adjacency lists; expand them into a dense boolean matrix.
    adjacency = np.zeros(network_shape, dtype=bool)
    for source_node, neighbours in cfg['nodes'].items():
        for neighbour in neighbours:
            adjacency[source_node, neighbour] = True

    # Copy each node's feature values into its row of the feature matrix.
    node_features = np.zeros(features_shape)
    for node_id, feature_values in cfg['nodes_features'].items():
        node_features[node_id] = feature_values

    return adjacency, node_features

def load_sample(sample_tuple: tuple) -> tuple:
    """Load the two CFGs referenced by a sample and stack them into arrays.

    Args:
        sample_tuple: `(cfg_file_1, cfg_file_2, speedup)` as produced by the sampling step.

    Returns:
        A tuple `(graphs, features, speedup)` where
        * `graphs` has shape (2, network_input_graph_shape[0], network_input_graph_shape[1]),
        * `features` has shape (2, network_input_features_shape[0], network_input_features_shape[1]),
        * `speedup` is the third element of `sample_tuple`, passed through unchanged.
    """
    # Load both files, first then second, with the module-level input shapes.
    loaded_pair = [
        load_cfg_from_file(sample_tuple[position], network_input_graph_shape, network_input_features_shape)
        for position in (0, 1)
    ]

    # Stack along a new leading axis; the boolean adjacency matrices are stored as floats.
    graphs = np.array([adjacency for adjacency, _ in loaded_pair], dtype=np.float64)
    features = np.array([node_features for _, node_features in loaded_pair], dtype=np.float64)

    return graphs, features, sample_tuple[2]

In [5]:
# Allocate the output arrays from the number of samples actually selected. `np.empty`
# does not initialise memory, so sizing the arrays from `len(samples)` (rather than from
# `no_samples`) guarantees that every row is written by the loop below, even if `samples`
# and `no_samples` ever get out of sync (e.g. after re-running cells out of order).
n_selected_samples = len(samples)
input_graphs = np.empty((n_selected_samples, 2, network_input_graph_shape[0], network_input_graph_shape[1]))
input_features = np.empty((n_selected_samples, 2, network_input_features_shape[0], network_input_features_shape[1]))
speedups = np.empty((n_selected_samples, 1))

# Load each pair of graphs from disk and store it in its row of the output arrays.
for sample_index, sample in enumerate(tqdm.tqdm(samples)):
    graphs, features, speedup = load_sample(sample)
    input_graphs[sample_index] = graphs
    input_features[sample_index] = features
    speedups[sample_index] = speedup

i = n_selected_samples  # number of samples loaded (kept under its original name)

print(f"Generated {i} samples")
print(f"Graphs shape: {input_graphs.shape}")
print(f"Features shape: {input_features.shape}")
print(f"Speedup shape: {speedups.shape}")

100%|██████████| 20000/20000 [18:58<00:00, 17.56it/s]

Generated 20000 samples
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 67)
Speedup shape: (20000, 1)





In [6]:
# Persist the generated arrays in a single compressed archive; the printed message
# adds the '.npz' extension to the name given here.
output_data_file = f'./data/cfgs_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}'
arrays_to_save = {'graphs': input_graphs, 'features': input_features, 'speedups': speedups}
np.savez_compressed(output_data_file, **arrays_to_save)
print(f"Data saved to {output_data_file}.npz")

# Also record which (CFGFILE1, CFGFILE2, SPEEDUP) tuples were selected.
# NOTE(review): `samples` holds Python tuples; confirm that whatever reads this YAML file
# back can load the representation PyYAML writes for them.
output_samples_file = f'./data/selected_cfgs_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}.yaml'
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Data saved to ./data/cfgs_20000samples_150x150.npz
Samples information saved to ./data/selected_cfgs_20000samples_150x150.yaml
