# Generate CFG Input Representation (Optional)

In [1]:
import gc
import os
import sys
import yaml
import glob
import numpy as np
import pickle
import tensorflow as tf
import random
import tqdm
import numpy as np

from collections import defaultdict

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.datasets import delaunay
from spektral.layers import *
from spektral.utils.convolution import localpooling_filter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tqdm.contrib.concurrent import process_map, thread_map

from utils import yaml_load, get_section

%matplotlib inline

In [2]:
# Notebook-wide configuration.
metadata_file = './data/ccpe-applications-information.yaml'
# Shape of the (square) adjacency matrix handed to the network.
network_input_graph_shape = (150, 150)
# Shape of the node-feature matrix: one row per node
# (presumably 200 is the inst2vec embedding width -- TODO confirm).
network_input_features_shape = (150, 200)

metadata_info = yaml_load(metadata_file)

# Step 1: per benchmark, keep only the optimization sequences whose CDFG has
# strictly fewer nodes than the adjacency matrix has rows, so the graph fits
# into the fixed-size network input.
max_graph_nodes = network_input_graph_shape[0]
filtered_graph_files = {
    benchmark_name: {
        opt_seq: opt_values
        for opt_seq, opt_values in values.items()
        if opt_values['number_cdfg_nodes'] < max_graph_nodes
    }
    for benchmark_name, values in metadata_info.items()
}

# Step 2: keep only benchmarks that still have MORE than 2 graphs after step 1.
# NOTE(review): the sampling step only needs 2 graphs per benchmark, and the
# original comment spoke of dropping benchmarks with "less than 2 graphs";
# the strict `> 2` also drops benchmarks with exactly 2 -- confirm whether
# `>= 2` was intended.
filtered_graph_files = {
    benchmark_name: values
    for benchmark_name, values in filtered_graph_files.items()
    if len(values) > 2
}

total_graphs = sum(len(values) for values in filtered_graph_files.values())
print(f"Number of graphs with adjacency matrix below shape {network_input_graph_shape}: {total_graphs}")

Number of graphs with adjacency matrix below shape (150, 150): 887


Next, we sample pairs of graphs from the filtered graphs (i.e. those whose adjacency matrix fits below the chosen shape). The number of samples to generate is given by 'no_samples'.

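To do this, we randomly draw 'no_samples' applications (with replacement, so the same application can be drawn more than once) and then, for each draw, we select 2 different graphs of that application (i.e. graphs obtained with different optimization sequences). At the end, we will have a list of tuples with the following format: \[(CFGFILE1, CFGFILE2, SPEEDUP), ...\], where SPEEDUP is the execution time of CFGFILE1 divided by the execution time of CFGFILE2.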

*Note*: The graphs are not loaded yet, only the file names are selected.

In [3]:
no_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs
sampling_seed = 42  # Fixed seed so that "Restart & Run All" regenerates the same sample list
samples = []

# A dedicated, seeded generator keeps the sampling reproducible without
# touching the global state of the `random` module.
sampler = random.Random(sampling_seed)

# Draw 'no_samples' applications (with replacement) and, for each draw, pick
# 2 distinct optimization sequences of that application.
# Note: only file names are collected here; the graphs are not loaded yet.
# Result: a list of tuples [(CFGFILE1, CFGFILE2, SPEEDUP), ...] where
# SPEEDUP = exectime(CFGFILE1) / exectime(CFGFILE2).
benchmark_names = list(filtered_graph_files.keys())
for benchmark_name in sampler.choices(benchmark_names, k=no_samples):
    benchmark_graphs = filtered_graph_files[benchmark_name]
    opt_seq1, opt_seq2 = sampler.sample(list(benchmark_graphs.keys()), 2)
    first_graph = benchmark_graphs[opt_seq1]
    second_graph = benchmark_graphs[opt_seq2]
    samples.append((
        first_graph['cdfg_file'],
        second_graph['cdfg_file'],
        first_graph['exectime'] / second_graph['exectime'],
    ))

print(f"Sampled {len(samples)} CDFGs")
print(f"First sample of {len(samples)} samples (example of output): {samples[0]}")

Sampled 20000 CDFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cdfg.programl/programl.5000.05/extracted/Shootout.nestedloop.0.51.progaml.yaml', './data/ccpe-dados/cdfg.programl/programl.5000.05/extracted/Shootout.nestedloop.0.19.progaml.yaml', 0.02822142069263094)


In [4]:
# Embeddings
# Table indexed by the first entry of each node's feature list; the selected
# row becomes that node's row in the feature matrix (see generate_graph_matrix).
# Presumably the pre-trained inst2vec embedding matrix -- TODO confirm.
embeddings_file = './data/inst2vec/emb.p'
# SECURITY NOTE: allow_pickle=True lets np.load unpickle the file, which can
# execute arbitrary code. Only load embedding files from a trusted source.
embeddings = np.load(embeddings_file, allow_pickle=True)

# Load the graph and features from file
def load_graph_from_file(filename: str) -> tuple:
    """Read one graph description file and return its three components.

    Args:
        filename: Path of the graph file, handed as-is to ``yaml_load``
            (imported from ``utils``; presumably parses YAML -- TODO confirm).

    Returns:
        A tuple ``(edges, edges_features, nodes_features)`` holding the values
        stored under the keys ``'edges'``, ``'edges_features'`` and
        ``'nodes_features'`` of the loaded object.

    Raises:
        KeyError: If one of the three keys is missing (assuming the loaded
            object is a dict).
    """
    graph_data = yaml_load(filename)
    edges = graph_data['edges']
    edge_features = graph_data['edges_features']
    node_features = graph_data['nodes_features']
    return edges, edge_features, node_features

def generate_graph_matrix(network_shape, features_shape, edges, edge_features, node_features, representation, embedding_table=None) -> tuple:
    """Build the fixed-size adjacency and node-feature matrices of one graph.

    Args:
        network_shape: Shape of the boolean adjacency matrix to allocate.
        features_shape: Shape of the float32 node-feature matrix to allocate.
        edges: Mapping ``edge id -> edge``; ``edge[0]`` and ``edge[1]`` are
            used as the row and column index of the adjacency entry to set.
        edge_features: Mapping ``edge id -> sequence`` whose first element is
            the edge type code (0 = control, 1 = data, 2 = call).
        node_features: Mapping ``node index -> sequence`` whose first element
            is the row of the embedding table to use for that node.
        representation: Collection of edge kinds to keep; any of
            ``'control'``, ``'data'``, ``'call'``.
        embedding_table: Optional table indexed by the node feature id.
            Defaults to the module-level ``embeddings``.

    Returns:
        A tuple ``(adj_matrix, feature_matrix)``. Entries and rows that are
        not written by the graph are ``False`` / ``0.0`` respectively.

    Raises:
        KeyError: If an edge has a type code other than 0, 1 or 2.
        IndexError: If an edge or node index does not fit in the matrices.
    """
    if embedding_table is None:
        embedding_table = embeddings

    # Edge type code -> whether that kind of edge was requested.
    valid_edges = {
        0: 'control' in representation,
        1: 'data' in representation,
        2: 'call' in representation
    }

    adj_matrix = np.zeros(network_shape, dtype='bool')
    # Zero-initialised on purpose: graphs usually have fewer nodes than
    # features_shape[0], and np.empty would leave the unused (padding) rows
    # filled with arbitrary memory contents.
    feature_matrix = np.zeros(features_shape, dtype='float32')

    for edge_no, edge in edges.items():
        edge_type = edge_features[edge_no][0]
        if valid_edges[edge_type]:
            adj_matrix[edge[0], edge[1]] = True

    for node_no, node_feature in node_features.items():
        feature_matrix[node_no] = embedding_table[node_feature[0]]

    return adj_matrix, feature_matrix
    

def load_sample_file(sample_tuple: tuple, representation_to_generate: tuple) -> tuple:
    """Load the two graphs of one sample and pack them into fixed-size arrays.

    The matrix shapes come from the module-level ``network_input_graph_shape``
    and ``network_input_features_shape``.

    Args:
        sample_tuple: ``(graph_file_1, graph_file_2, speedup)``.
        representation_to_generate: Edge kinds to keep, forwarded to
            ``generate_graph_matrix``.

    Returns:
        A tuple ``(graphs, features, speedup)`` where

        * ``graphs`` is a bool array of shape
          ``(2, network_input_graph_shape[0], network_input_graph_shape[1])``,
        * ``features`` is a float32 array of shape
          ``(2, network_input_features_shape[0], network_input_features_shape[1])``,
        * ``speedup`` is a float32 array of shape ``(1,)`` holding
          ``sample_tuple[2]``.
    """
    # Read both graph files first, then convert them one after the other.
    loaded_graphs = [
        load_graph_from_file(sample_tuple[0]),
        load_graph_from_file(sample_tuple[1]),
    ]
    converted = [
        generate_graph_matrix(
            network_input_graph_shape, network_input_features_shape,
            edges, edge_feat, node_feat, representation_to_generate)
        for edges, edge_feat, node_feat in loaded_graphs
    ]

    graphs = np.empty((2,) + (network_input_graph_shape[0], network_input_graph_shape[1]), dtype='bool')
    features = np.empty((2,) + (network_input_features_shape[0], network_input_features_shape[1]), dtype='float32')
    for slot, (adjacency, node_matrix) in enumerate(converted):
        graphs[slot] = adjacency
        features[slot] = node_matrix

    speedup = np.array([sample_tuple[2]], dtype='float32')
    return graphs, features, speedup


def generate_sample(samples, representation, desc='Samples generated'):
    """Load every sample and stack the results into three dataset arrays.

    The matrix shapes come from the module-level ``network_input_graph_shape``
    and ``network_input_features_shape``.

    Args:
        samples: Sequence of ``(graph_file_1, graph_file_2, speedup)`` tuples.
        representation: Edge kinds to keep; each entry must be one of
            ``'control'``, ``'data'`` or ``'call'``.
        desc: Label shown on the tqdm progress bar.

    Returns:
        A tuple ``(input_graphs, input_features, speedups)``:

        * ``input_graphs``: bool array, shape ``(len(samples), 2, rows, cols)``
          with ``rows, cols`` taken from ``network_input_graph_shape``;
        * ``input_features``: float32 array, shape
          ``(len(samples), 2, rows, cols)`` with ``rows, cols`` taken from
          ``network_input_features_shape``;
        * ``speedups``: float32 array, shape ``(len(samples), 1)``.

    Raises:
        AssertionError: If ``representation`` contains an unknown edge kind.
    """
    known_edge_kinds = ['control', 'data', 'call']
    for rep in representation:
        assert rep in known_edge_kinds, f"Invalid representation {rep}"

    total = len(samples)
    input_graphs = np.empty(
        (total, 2, network_input_graph_shape[0], network_input_graph_shape[1]),
        dtype='bool')
    input_features = np.empty(
        (total, 2, network_input_features_shape[0], network_input_features_shape[1]),
        dtype='float32')
    speedups = np.empty((total, 1), dtype='float32')

    # Each call yields (array with 2 graphs, array with 2 feature matrices,
    # speedup); store them at the sample's position in the output arrays.
    for position, sample in enumerate(tqdm.tqdm(samples, desc=desc)):
        graphs, features, speedup = load_sample_file(sample, representation)
        input_graphs[position] = graphs
        input_features[position] = features
        speedups[position] = speedup

    print(f"Number of samples loaded: {total}")
    print(f"Input graphs shape: {input_graphs.shape}")
    print(f"Input features shape: {input_features.shape}")
    print(f"Input speedups (target) shape: {speedups.shape}")

    return input_graphs, input_features, speedups

In [5]:
# Dataset variant keeping control, data and call edges.
representation = ('control', 'data', 'call')

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.70it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_control-data-call_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_control-data-call_20000samples_150x150.yaml


In [6]:
# Dataset variant keeping control and data edges.
representation = ('control', 'data')

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.78it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_control-data_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_control-data_20000samples_150x150.yaml


In [7]:
# Dataset variant keeping control and call edges.
representation = ('control', 'call')

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.70it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_control-call_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_control-call_20000samples_150x150.yaml


In [8]:
# Dataset variant keeping data and call edges.
representation = ('data', 'call')

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.73it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_data-call_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_data-call_20000samples_150x150.yaml


In [10]:
# Dataset variant keeping only control edges.
representation = ('control',)

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.71it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_control_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_control_20000samples_150x150.yaml


In [11]:
# Dataset variant keeping only data edges.
representation = ('data',)

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:51<00:00, 48.58it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_data_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_data_20000samples_150x150.yaml


In [12]:
# Dataset variant keeping only call edges.
representation = ('call',)

# Both output files share the same "<edge kinds>_<N>samples_<rows>x<cols>" suffix.
dataset_suffix = f"{'-'.join(representation)}_{no_samples}samples_{network_input_graph_shape[0]}x{network_input_graph_shape[1]}"
output_data_file = f"./data/cdfg_{dataset_suffix}"
output_samples_file = f"./data/selected_cdfg_{dataset_suffix}.yaml"

# Build the arrays and store them compressed (numpy appends the ".npz" extension).
input_graphs, input_features, speedups = generate_sample(samples, representation)
np.savez_compressed(
    output_data_file,
    graphs=input_graphs,
    features=input_features,
    speedups=speedups,
)
print(f"Data saved to {output_data_file}.npz")

# Store the list of sampled files next to the arrays so the dataset can be traced back.
with open(output_samples_file, 'wt') as samples_stream:
    yaml.dump(samples, samples_stream)
print(f"Samples information saved to {output_samples_file}")

Samples generated: 100%|██████████| 20000/20000 [06:50<00:00, 48.76it/s]


Number of samples loaded: 20000
Input graphs shape: (20000, 2, 150, 150)
Input features shape: (20000, 2, 150, 200)
Input speedups (target) shape: (20000, 1)
Data saved to ./data/cdfg_call_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfg_call_20000samples_150x150.yaml
