# Generate CFG Input Representation

In [1]:
import gc
import os
import sys
import yaml
import glob
import numpy as np
import pickle
import tensorflow as tf
import random
import tqdm
import numpy as np

from collections import defaultdict

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.datasets import delaunay
from spektral.layers import *
from spektral.utils.convolution import localpooling_filter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tqdm.contrib.concurrent import process_map, thread_map

from utils import yaml_load, get_section

%matplotlib inline

In [2]:
def filter_cdfgs_files_below_shape(metadata_dict, max_shape):
    # Lets filter CFG files whose number of nodes is less than max_shape
    filtered_graph_files = {
        benchmark_name: {
            opt_seq: opt_values 
            for opt_seq, opt_values in values.items() if opt_values['number_cdfg_nodes'] < max_shape[0]
        } for benchmark_name, values in metadata_dict.items()
    }

    # Remove applications that has less than 2 graphs with less than network_input_graph_shape
    filtered_graph_files = {benchmark_name: values for benchmark_name, values in filtered_graph_files.items() if len(values) > 2}
    print(f"Number of graphs with adjacency matrix below shape {max_shape}: {sum([len(values) for _, values in filtered_graph_files.items()])}")
    return filtered_graph_files

def sample_cdfgs(metadata_dict, num_samples):
    samples = []
    for s in random.choices(list(metadata_dict.keys()), k=num_samples):
        opt_seq1, opt_seq2 = random.sample(list(metadata_dict[s].keys()), 2)
        t = (
            metadata_dict[s][opt_seq1]['cdfg_file'], 
            metadata_dict[s][opt_seq2]['cdfg_file'],
            metadata_dict[s][opt_seq1]['exectime']/metadata_dict[s][opt_seq2]['exectime']
        )
        samples.append(t)

    print(f"Sampled {len(samples)} CDFGs")
    print(f"First sample of {len(samples)} samples (example of output): {samples[0]}")
    return samples

In [3]:
# Load the graph and features from file
no_total_features = 0
no_total_invalid_features = 0

def load_graph_from_file(filename: str) -> tuple:
    x = yaml_load(filename)
    return x['edges'], x['edges_features'], x['nodes_features']

def generate_graph_matrix(network_shape, features_shape, edges, edge_features, node_features, representation, embeddings_dict: dict) -> tuple:
    global no_total_features, no_total_invalid_features
    valid_edges = {
        0: 'control' in representation,
        1: 'data' in representation,
        2: 'call' in representation
    }
    
    # print(f'Valid edges: {valid_edges}')
    
    adj_matrix = np.full((network_shape), False, dtype='bool')
    feature_matrix = np.zeros((features_shape), dtype='float32')
    
    for edge_no, edge in edges.items():
        edge_type = edge_features[edge_no][0]
        if valid_edges[edge_type] == True:
            adj_matrix[edge[0]][edge[1]] = True
    
    non_zero_nodes = {node_no: is_not_zero_row for node_no, is_not_zero_row in enumerate(adj_matrix.any(axis=1))}
    
    for node_no, feature_no in node_features.items():
        if non_zero_nodes[node_no] == True:
            feature_no = feature_no[0]
            feature_matrix[node_no] = embeddings_dict[feature_no]
            if feature_no == 8564:
                no_total_invalid_features += 1
        no_total_features += 1
        
    # print(adj_matrix)
    # print(feature_matrix)
    return adj_matrix, feature_matrix  
    

def load_sample_file(sample_tuple: tuple, network_shape: tuple, features_shape: tuple, representation_to_generate: tuple, embeddings_dict: dict) -> tuple:
    # print(f'Loading first file: {sample_tuple[0]}')
    edges_1, edge_feat_1, feat_1 = load_graph_from_file(sample_tuple[0])
    # print(f'Loading second file: {sample_tuple[1]}')
    edges_2, edge_feat_2, feat_2 = load_graph_from_file(sample_tuple[1])
    
    graph1, features1 = generate_graph_matrix(
        network_shape, features_shape, edges_1, edge_feat_1, 
        feat_1, representation_to_generate, embeddings_dict)
    graph2, features2 = generate_graph_matrix(
        network_shape, features_shape, edges_2, edge_feat_2, 
        feat_2, representation_to_generate, embeddings_dict)
    
    graphs = np.zeros((2, network_shape[0], network_shape[1]), dtype='bool')
    graphs[0] = graph1
    graphs[1] = graph2

    features = np.zeros((2, features_shape[0], features_shape[1]), dtype='float32')
    features[0] = features1
    features[1] = features2
    
    speedup_array = np.array([sample_tuple[2]], dtype='float32')

    # Returns:
    # * A matrix with 2 graphs with shape: (2, network_input_graph_shape[0], network_input_graph_shape[1])
    # * A matrix with 2 features with shape: (2, network_input_features_shape[0], network_input_features_shape[1])
    # * A float with speedup of CFG1/CFG2
    return graphs, features, speedup_array


def generate_samples(samples: list, network_shape: tuple, feature_shape: tuple, representation: tuple, embeddings_dict: dict, desc: str = 'Samples generated'):
    global no_total_features, no_total_invalid_features
    no_total_features = 0
    no_total_invalid_features = 0
    for rep in representation:
        assert rep in ['control', 'data', 'call'], f"Invalid representation {rep}"
    
    input_graphs = np.empty((len(samples), 2, network_shape[0], network_shape[1]), dtype='bool')
    input_features = np.empty((len(samples), 2, feature_shape[0], feature_shape[1]), dtype='float32')
    speedups = np.empty((len(samples), 1), dtype='float32')

    # Some counters
    i = 0
    equal_graphs = 0
    
    # Load each sample and store in input_graphs, input_features and speedups
    for sample in tqdm.tqdm(samples, desc=desc):
        # This function return a list of tuples, where each tuple is composed by:
        # np.array with 2 graphs, np.array with 2 features, speedup
        graphs, features, speedup = load_sample_file(sample, network_shape, feature_shape, representation, embeddings_dict)           
        input_graphs[i] = graphs
        input_features[i] = features
        speedups[i] = speedup
        i += 1
        if np.array_equal(graphs[0], graphs[1]):
            equal_graphs += 1

    print(f"Number of samples loaded: {len(samples)}")
    print(f"Graphs shape: {input_graphs.shape}")
    print(f"Features shape: {input_features.shape}")
    print(f"Speedups (target) shape: {speedups.shape}")
    print(f"Number of samples with equal graphs: {equal_graphs}")
    
    print(f"Number of features assigned (FA): {no_total_features}")
    print(f"Invalid number of representations assigned: {no_total_invalid_features} (in relation to FA: {no_total_invalid_features/no_total_features}%)")
    
    #np.savez_compressed(output_file, graphs=input_graphs, features=input_features, speedups=speedups)
    #print(f"Representation {representation} saved to {output_file}")
    return input_graphs, input_features, speedups

# Common variables for all representations

In [4]:
data_dir = './data'
metadata_file = './data/ccpe-applications-information.yaml'
metadata_info = yaml_load(metadata_file)
print('Metadata loaded')

# Embeddings
embeddings_file = './data/inst2vec/emb.p'
embeddings_info = np.load(embeddings_file, allow_pickle=True)
print('Embbedings loaded')

Metadata loaded
Embbedings loaded


In [5]:
def get_output_filenames(data_dir: str, representation: tuple, num_samples, network_shape) -> tuple:
    output_data_file = f"cdfgs_{'-'.join(representation)}_{num_samples}samples_{network_shape[0]}x{network_shape[1]}"
    output_data_file = os.path.join(data_dir, output_data_file)
    
    selected_data_file = f"selected_cdfgs_{'-'.join(representation)}_{num_samples}samples_{network_shape[0]}x{network_shape[1]}.yaml"
    selected_data_file = os.path.join(data_dir, selected_data_file)
    return output_data_file, selected_data_file

In [6]:
def save_data_file(output_data_file: str, output_samples_file: str, samples: list, graphs, features, speedups):
    np.savez_compressed(output_data_file, graphs=graphs, features=features, speedups=speedups)
    print(f"Data saved to {output_data_file}.npz")
    
    with open(output_samples_file, 'wt') as f:
        yaml.dump(samples, f)
    print(f"Samples information saved to {output_samples_file}")

# Generate representations for (150x150) shape

### Common variables for all 150x150 representations

In [7]:
# Defining some useful variables
network_graph_shape = (150, 150)
network_features_shape = (150, 200)
n_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs

cdfg_files = filter_cdfgs_files_below_shape(metadata_info, network_graph_shape)
samples = sample_cdfgs(cdfg_files, n_samples)

Number of graphs with adjacency matrix below shape (150, 150): 887
Sampled 20000 CDFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cdfg.programl/programl.5000.01/extracted/BenchmarkGame.nsieve-bits.0.11.progaml.yaml', './data/ccpe-dados/cdfg.programl/programl.5000.01/extracted/BenchmarkGame.nsieve-bits.0.2.progaml.yaml', 1.0059670138388164)


### Data generation

In [8]:
representations_to_generate = [
    ('control', 'data', 'call'),
    ('control', 'data'),
    ('control', 'call'),
    ('data', 'call'),
    ('control',),
    ('data',),
    ('call',)
]

for representation in representations_to_generate:
    print(f'Generating representation for: {representation}')
    output_data_file, output_samples_file = get_output_filenames(data_dir, representation, n_samples, network_graph_shape)
    graphs, features, speedups = generate_samples(samples, network_graph_shape, network_features_shape, representation, embeddings_info)
    save_data_file(output_data_file, output_samples_file, samples, graphs, features, speedups)
    print('Finished generation\n')

Samples generated:   0%|          | 7/20000 [00:00<05:07, 64.95it/s]

Generating representation for: ('control', 'data', 'call')


Samples generated: 100%|██████████| 20000/20000 [07:04<00:00, 47.12it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 3086242 (in relation to FA: 0.6552852167484222%)
Data saved to ./data/cdfgs_control-data-call_20000samples_150x150.npz


Samples generated:   0%|          | 4/20000 [00:00<09:24, 35.43it/s]

Samples information saved to ./data/selected_cdfgs_control-data-call_20000samples_150x150.yaml
Finished generation

Generating representation for: ('control', 'data')


Samples generated: 100%|██████████| 20000/20000 [07:04<00:00, 47.09it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 2831817 (in relation to FA: 0.6012645206166163%)
Data saved to ./data/cdfgs_control-data_20000samples_150x150.npz


Samples generated:   0%|          | 4/20000 [00:00<09:22, 35.54it/s]

Samples information saved to ./data/selected_cdfgs_control-data_20000samples_150x150.yaml
Finished generation

Generating representation for: ('control', 'call')


Samples generated: 100%|██████████| 20000/20000 [07:02<00:00, 47.37it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 1241
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 1816641 (in relation to FA: 0.385717643476782%)
Data saved to ./data/cdfgs_control-call_20000samples_150x150.npz


Samples generated:   0%|          | 4/20000 [00:00<09:26, 35.27it/s]

Samples information saved to ./data/selected_cdfgs_control-call_20000samples_150x150.yaml
Finished generation

Generating representation for: ('data', 'call')


Samples generated: 100%|██████████| 20000/20000 [07:04<00:00, 47.16it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 2702237 (in relation to FA: 0.5737514939692371%)
Data saved to ./data/cdfgs_data-call_20000samples_150x150.npz


Samples generated:   0%|          | 4/20000 [00:00<09:26, 35.29it/s]

Samples information saved to ./data/selected_cdfgs_data-call_20000samples_150x150.yaml
Finished generation

Generating representation for: ('control',)


Samples generated: 100%|██████████| 20000/20000 [07:02<00:00, 47.32it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 1363
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 1562216 (in relation to FA: 0.33169694734497596%)
Data saved to ./data/cdfgs_control_20000samples_150x150.npz


Samples generated:   0%|          | 7/20000 [00:00<04:56, 67.47it/s]

Samples information saved to ./data/selected_cdfgs_control_20000samples_150x150.yaml
Finished generation

Generating representation for: ('data',)


Samples generated: 100%|██████████| 20000/20000 [07:02<00:00, 47.31it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 2292509 (in relation to FA: 0.4867561445157926%)
Data saved to ./data/cdfgs_data_20000samples_150x150.npz


Samples generated:   0%|          | 4/20000 [00:00<09:21, 35.63it/s]

Samples information saved to ./data/selected_cdfgs_data_20000samples_150x150.yaml
Finished generation

Generating representation for: ('call',)


Samples generated: 100%|██████████| 20000/20000 [07:00<00:00, 47.55it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 1650
Number of features assigned (FA): 4709769
Invalid number of representations assigned: 534624 (in relation to FA: 0.11351384749443126%)
Data saved to ./data/cdfgs_call_20000samples_150x150.npz
Samples information saved to ./data/selected_cdfgs_call_20000samples_150x150.yaml
Finished generation



# Generate representations for (300x300) shape

### Common variables for all 300x300 representations

In [9]:
# Defining some useful variables
network_graph_shape = (300, 300)
network_features_shape = (300, 200)
n_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs

cdfg_files = filter_cdfgs_files_below_shape(metadata_info, network_graph_shape)
samples = sample_cdfgs(cdfg_files, n_samples)

Number of graphs with adjacency matrix below shape (300, 300): 2910
Sampled 20000 CDFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cdfg.programl/programl.5000.02/extracted/Misc.mandel-2.0.21.progaml.yaml', './data/ccpe-dados/cdfg.programl/programl.5000.02/extracted/Misc.mandel-2.0.28.progaml.yaml', 1.011798022347439)


### Data Generation

In [10]:
representations_to_generate = [
    ('control', 'data', 'call'),
    ('control', 'data'),
    ('control', 'call'),
    ('data', 'call'),
    ('control',),
    ('data',),
    ('call',)
]

for representation in representations_to_generate:
    print(f'Generating representation for: {representation}')
    output_data_file, output_samples_file = get_output_filenames(data_dir, representation, n_samples, network_graph_shape)
    graphs, features, speedups = generate_samples(samples, network_graph_shape, network_features_shape, representation, embeddings_info)
    save_data_file(output_data_file, output_samples_file, samples, graphs, features, speedups)
    print('Finished generation\n')

Samples generated:   0%|          | 0/20000 [00:00<?, ?it/s]

Generating representation for: ('control', 'data', 'call')


Samples generated: 100%|██████████| 20000/20000 [14:18<00:00, 23.31it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 5528569 (in relation to FA: 0.646045772446553%)
Data saved to ./data/cdfgs_control-data-call_20000samples_300x300.npz


Samples generated:   0%|          | 3/20000 [00:00<11:24, 29.21it/s]

Samples information saved to ./data/selected_cdfgs_control-data-call_20000samples_300x300.yaml
Finished generation

Generating representation for: ('control', 'data')


Samples generated: 100%|██████████| 20000/20000 [13:56<00:00, 23.91it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 5251072 (in relation to FA: 0.6136186174781334%)
Data saved to ./data/cdfgs_control-data_20000samples_300x300.npz


Samples generated:   0%|          | 3/20000 [00:00<11:21, 29.34it/s]

Samples information saved to ./data/selected_cdfgs_control-data_20000samples_300x300.yaml
Finished generation

Generating representation for: ('control', 'call')


Samples generated: 100%|██████████| 20000/20000 [13:52<00:00, 24.03it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 385
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 3238993 (in relation to FA: 0.3784953637431274%)
Data saved to ./data/cdfgs_control-call_20000samples_300x300.npz


Samples generated:   0%|          | 5/20000 [00:00<09:06, 36.62it/s]

Samples information saved to ./data/selected_cdfgs_control-call_20000samples_300x300.yaml
Finished generation

Generating representation for: ('data', 'call')


Samples generated: 100%|██████████| 20000/20000 [13:55<00:00, 23.95it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 4767927 (in relation to FA: 0.5571602853620488%)
Data saved to ./data/cdfgs_data-call_20000samples_300x300.npz


Samples generated:   0%|          | 3/20000 [00:00<11:35, 28.75it/s]

Samples information saved to ./data/selected_cdfgs_data-call_20000samples_300x300.yaml
Finished generation

Generating representation for: ('control',)


Samples generated: 100%|██████████| 20000/20000 [13:51<00:00, 24.06it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 482
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 2961496 (in relation to FA: 0.34606820877470773%)
Data saved to ./data/cdfgs_control_20000samples_300x300.npz


Samples generated:   0%|          | 3/20000 [00:00<11:30, 28.95it/s]

Samples information saved to ./data/selected_cdfgs_control_20000samples_300x300.yaml
Finished generation

Generating representation for: ('data',)


Samples generated: 100%|██████████| 20000/20000 [13:55<00:00, 23.94it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 0
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 4291526 (in relation to FA: 0.5014900292723969%)
Data saved to ./data/cdfgs_data_20000samples_300x300.npz


Samples generated:   0%|          | 3/20000 [00:00<11:24, 29.21it/s]

Samples information saved to ./data/selected_cdfgs_data_20000samples_300x300.yaml
Finished generation

Generating representation for: ('call',)


Samples generated: 100%|██████████| 20000/20000 [13:48<00:00, 24.14it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 200)
Speedups (target) shape: (20000, 1)
Number of samples with equal graphs: 531
Number of features assigned (FA): 8557550
Invalid number of representations assigned: 607177 (in relation to FA: 0.07095220010400173%)
Data saved to ./data/cdfgs_call_20000samples_300x300.npz
Samples information saved to ./data/selected_cdfgs_call_20000samples_300x300.yaml
Finished generation

