# Generate CFG Input Representations (Optional)

In [1]:
import gc
import os
import sys
import yaml
import glob
import numpy as np
import pickle
import tensorflow as tf
import random
import tqdm
import numpy as np
import shutil

from collections import defaultdict

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from spektral.datasets import delaunay
from spektral.layers import *
from spektral.utils.convolution import localpooling_filter
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Input, Dense, Concatenate, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tqdm.contrib.concurrent import process_map, thread_map

from utils import yaml_load, get_section

%matplotlib inline

In [8]:
def filter_cfgs_files_below_shape(metadata_dict, max_shape):
    """Keep the CFG entries that fit in ``max_shape`` and can still be paired.

    Args:
        metadata_dict: Mapping ``benchmark_name -> {opt_seq: opt_values}``.
            Every ``opt_values`` must provide a ``'number_cfg_nodes'`` entry.
        max_shape: Shape of the target adjacency matrix. Only ``max_shape[0]``
            is consulted.

    Returns:
        A new dict with the same nesting that contains only the entries whose
        node count is strictly below ``max_shape[0]``, restricted to the
        benchmarks that keep at least two such entries.
    """
    # A sample is a pair of graphs from the same benchmark, so two graphs are
    # enough. The previous `> 2` test also discarded benchmarks with exactly two.
    min_graphs_per_benchmark = 2

    filtered_graph_files = {}
    for benchmark_name, values in metadata_dict.items():
        # Keep only the CFGs whose number of nodes is below the matrix size
        small_enough = {
            opt_seq: opt_values
            for opt_seq, opt_values in values.items()
            if opt_values['number_cfg_nodes'] < max_shape[0]
        }
        # Drop benchmarks that cannot provide a pair of graphs
        if len(small_enough) >= min_graphs_per_benchmark:
            filtered_graph_files[benchmark_name] = small_enough

    print(f"Number of graphs with adjacency matrix below shape {max_shape}: {sum([len(values) for _, values in filtered_graph_files.items()])}")
    return filtered_graph_files

def sample_cfgs(metadata_dict, num_samples):
    """Draw random pairs of CFG files, each pair taken from a single benchmark.

    Benchmarks are drawn with replacement; within the chosen benchmark two
    distinct optimization sequences are drawn.

    Args:
        metadata_dict: Mapping ``benchmark_name -> {opt_seq: opt_values}``.
            Every ``opt_values`` must provide ``'cfg_file'`` and ``'exectime'``,
            and every benchmark needs at least two optimization sequences.
        num_samples: Number of pairs to draw.

    Returns:
        A list of ``(cfg_file_1, cfg_file_2, exectime_1 / exectime_2)`` tuples.

    Raises:
        ValueError: If a chosen benchmark has fewer than two entries
            (raised by ``random.sample``).
        ZeroDivisionError: If the second entry's ``'exectime'`` is zero.
    """
    benchmark_names = list(metadata_dict.keys())

    samples = []
    for s in random.choices(benchmark_names, k=num_samples):
        opt_seq1, opt_seq2 = random.sample(list(metadata_dict[s].keys()), 2)
        first, second = metadata_dict[s][opt_seq1], metadata_dict[s][opt_seq2]
        samples.append((
            first['cfg_file'],
            second['cfg_file'],
            first['exectime'] / second['exectime'],
        ))

    print(f"Sampled {len(samples)} CFGs")
    # Only preview a sample when one exists; indexing an empty list used to
    # raise IndexError when num_samples was 0.
    if samples:
        print(f"First sample of {len(samples)} samples (example of output): {samples[0]}")
    return samples

def load_graph_from_file(filename: str) -> tuple:
    """Load a CFG description with ``yaml_load`` and return its two sections.

    Returns:
        The ``'nodes'`` entry followed by the ``'nodes_features'`` entry of the
        loaded document.
    """
    cfg = yaml_load(filename)
    adjacency, node_features = cfg['nodes'], cfg['nodes_features']
    return adjacency, node_features

def generate_graph_matrix(network_shape, features_shape, nodes, nodes_features) -> tuple:
    """Turn an adjacency-list graph into dense adjacency and feature matrices.

    Args:
        network_shape: Shape of the boolean adjacency matrix to allocate.
        features_shape: Shape of the float32 feature matrix to allocate.
        nodes: Mapping ``node -> iterable of successor nodes``.
        nodes_features: Mapping ``node -> feature row`` for that node.

    Returns:
        ``(adjacency, feature_matrix)``. Cells and rows not mentioned in the
        inputs stay ``False`` / ``0``.
    """
    # Start from an all-False matrix and flag every listed edge
    adjacency = np.zeros(network_shape, dtype='bool')
    for source, targets in nodes.items():
        for target in targets:
            adjacency[source][target] = True

    # Rows of nodes without features stay at zero
    feature_matrix = np.zeros(features_shape, dtype='float32')
    for node_id, feature_row in nodes_features.items():
        feature_matrix[node_id] = feature_row

    return adjacency, feature_matrix

def load_sample_file(sample_tuple: tuple, network_shape: tuple, features_shape: tuple) -> tuple:
    """Load both graphs of one sample and pack them into fixed-size arrays.

    Args:
        sample_tuple: ``(cfg_file_1, cfg_file_2, speedup)``.
        network_shape: Shape of each adjacency matrix.
        features_shape: Shape of each feature matrix.

    Returns:
        ``(graphs, features, speedup_array)`` where ``graphs`` is a bool array
        of shape ``(2, network_shape[0], network_shape[1])``, ``features`` is a
        float32 array of shape ``(2, features_shape[0], features_shape[1])``
        and ``speedup_array`` is a one-element float32 array.
    """
    # Load and densify the two graphs, first file then second file
    matrices = []
    for cfg_path in (sample_tuple[0], sample_tuple[1]):
        adjacency_list, raw_features = load_graph_from_file(cfg_path)
        matrices.append(
            generate_graph_matrix(network_shape, features_shape, adjacency_list, raw_features)
        )

    graphs = np.zeros((2, network_shape[0], network_shape[1]), dtype='bool')
    features = np.zeros((2, features_shape[0], features_shape[1]), dtype='float32')
    for slot, (adjacency, feature_matrix) in enumerate(matrices):
        graphs[slot] = adjacency
        features[slot] = feature_matrix

    speedup_array = np.array([sample_tuple[2]], dtype='float32')

    return graphs, features, speedup_array

def generate_samples(samples: list, network_shape: tuple, feature_shape: tuple, desc: str = 'Samples generated'):
    """Load every sample from disk into three stacked arrays.

    Args:
        samples: List of ``(cfg_file_1, cfg_file_2, speedup)`` tuples.
        network_shape: Shape of each adjacency matrix.
        feature_shape: Shape of each feature matrix.
        desc: Label shown on the progress bar.

    Returns:
        ``(input_graphs, input_features, speedups)`` with shapes
        ``(len(samples), 2, *network_shape[:2])``,
        ``(len(samples), 2, *feature_shape[:2])`` and ``(len(samples), 1)``.
    """
    total = len(samples)
    input_graphs = np.empty((total, 2, network_shape[0], network_shape[1]), dtype='bool')
    input_features = np.empty((total, 2, feature_shape[0], feature_shape[1]), dtype='float32')
    speedups = np.empty((total, 1), dtype='float32')

    # Count the pairs whose two adjacency matrices are identical
    equal_graphs = 0

    for position, sample in enumerate(tqdm.tqdm(samples, desc=desc)):
        # load_sample_file returns one tuple per sample:
        # array with the 2 graphs, array with the 2 feature matrices, speedup
        pair_graphs, pair_features, pair_speedup = load_sample_file(sample, network_shape, feature_shape)
        input_graphs[position] = pair_graphs
        input_features[position] = pair_features
        speedups[position] = pair_speedup
        equal_graphs += int(np.array_equal(pair_graphs[0], pair_graphs[1]))

    print(f"Number of samples loaded: {len(samples)}")
    print(f"Graphs shape: {input_graphs.shape}")
    print(f"Features shape: {input_features.shape}")
    print(f"Speedups (target) shape: {speedups.shape}")
    print(f"Number of sampes with equal graphs: {equal_graphs}")

    return input_graphs, input_features, speedups

## Common variables for all representations

In [3]:
# Root directory for the input metadata and for every generated output file
data_dir = './data'
# Derive the metadata path from data_dir so relocating the data needs a single edit
metadata_file = os.path.join(data_dir, 'ccpe-applications-information.yaml')
metadata_info = yaml_load(metadata_file)
print('Metadata loaded')

Metadata loaded


In [4]:
def get_output_filenames(data_dir: str, num_samples: int, network_shape: tuple) -> tuple:
    """Build the two output paths for a given sample count and graph shape.

    Args:
        data_dir: Directory the paths are placed in.
        num_samples: Number of samples, embedded in both file names.
        network_shape: Graph shape; its first two entries are embedded as
            ``<rows>x<cols>``.

    Returns:
        ``(output_data_file, selected_data_file)``: the data file path without
        extension and the ``.yaml`` path for the selected samples.
    """
    rows, cols = network_shape[0], network_shape[1]
    # Both names share the same "<n>samples_<rows>x<cols>" stem
    stem = f"{num_samples}samples_{rows}x{cols}"

    output_data_file = os.path.join(data_dir, f"cfgs_{stem}")
    selected_data_file = os.path.join(data_dir, f"selected_cfgs_{stem}.yaml")
    return output_data_file, selected_data_file

In [5]:
def save_data_file(output_data_file: str, output_samples_file: str, samples: list, graphs, features, speedups):
    """Persist the generated arrays and the list of samples they came from.

    Args:
        output_data_file: Destination of the compressed archive. NumPy appends
            ``.npz`` when the name does not already end with it.
        output_samples_file: Destination of the YAML dump of ``samples``.
        samples: Sample descriptions to record next to the arrays.
        graphs: Array stored under the ``graphs`` key.
        features: Array stored under the ``features`` key.
        speedups: Array stored under the ``speedups`` key.
    """
    np.savez_compressed(output_data_file, graphs=graphs, features=features, speedups=speedups)
    # Report the path NumPy actually wrote: the extension is only appended when
    # missing, so unconditionally adding ".npz" could print "x.npz.npz".
    written_file = str(output_data_file)
    if not written_file.endswith('.npz'):
        written_file = f"{written_file}.npz"
    print(f"Data saved to {written_file}")

    # NOTE(review): if `samples` holds tuples (as sample_cfgs produces), the
    # default PyYAML dumper tags them as python/tuple, which safe loaders
    # reject. Confirm that the reader of this file uses a compatible loader.
    with open(output_samples_file, 'wt') as f:
        yaml.dump(samples, f)
    print(f"Samples information saved to {output_samples_file}")

# Generate representations for (150x150) shape

In [10]:
# Defining some useful variables
# Defining some useful variables
network_graph_shape = (150, 150)
# One feature row per node slot, so the row count follows the graph shape
network_features_shape = (network_graph_shape[0], 67)
n_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs
sampling_seed = 42  # Fixed seed so re-running this cell selects the same pairs

cfg_files = filter_cfgs_files_below_shape(metadata_info, network_graph_shape)
# Seed right before sampling: the saved dataset is then reproducible no matter
# which cells were executed earlier in the session
random.seed(sampling_seed)
samples = sample_cfgs(cfg_files, n_samples)

output_data_file, output_samples_file = get_output_filenames(data_dir, n_samples, network_graph_shape)
graphs, features, speedups = generate_samples(samples, network_graph_shape, network_features_shape)
save_data_file(output_data_file, output_samples_file, samples, graphs, features, speedups)

Samples generated:   0%|          | 1/20000 [00:00<36:03,  9.24it/s]

Number of graphs with adjacency matrix below shape (150, 150): 17092
Sampled 20000 CFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cfg.llvm/extracted/PolyBench.linear-algebra-blas-syr2k.0.96.ll.cfg.yaml', './data/ccpe-dados/cfg.llvm/extracted/PolyBench.linear-algebra-blas-syr2k.0.40.ll.cfg.yaml', 1.1450459163531148)


Samples generated: 100%|██████████| 20000/20000 [19:17<00:00, 17.28it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 150, 150)
Features shape: (20000, 2, 150, 67)
Speedups (target) shape: (20000, 1)
Number of sampes with equal graphs: 983
Data saved to ./data/cfgs_20000samples_150x150.npz
Samples information saved to ./data/selected_cfgs_20000samples_150x150.yaml


# Generate representations for (300x300) shape

In [13]:
# Defining some useful variables
# Defining some useful variables
network_graph_shape = (300, 300)
# One feature row per node slot, so the row count follows the graph shape
network_features_shape = (network_graph_shape[0], 67)
n_samples = 20000  # Number of samples to generate. Each sample is composed by 2 graphs
sampling_seed = 42  # Fixed seed so re-running this cell selects the same pairs

cfg_files = filter_cfgs_files_below_shape(metadata_info, network_graph_shape)
# Seed right before sampling: the saved dataset is then reproducible no matter
# which cells were executed earlier in the session
random.seed(sampling_seed)
samples = sample_cfgs(cfg_files, n_samples)

output_data_file, output_samples_file = get_output_filenames(data_dir, n_samples, network_graph_shape)
graphs, features, speedups = generate_samples(samples, network_graph_shape, network_features_shape)
save_data_file(output_data_file, output_samples_file, samples, graphs, features, speedups)

Samples generated:   0%|          | 0/20000 [00:00<?, ?it/s]

Number of graphs with adjacency matrix below shape (300, 300): 19837
Sampled 20000 CFGs
First sample of 20000 samples (example of output): ('./data/ccpe-dados/cfg.llvm/extracted/McGill.chomp.0.89.ll.cfg.yaml', './data/ccpe-dados/cfg.llvm/extracted/McGill.chomp.0.0.ll.cfg.yaml', 1.0102431352808965)


Samples generated: 100%|██████████| 20000/20000 [22:44<00:00, 14.66it/s]


Number of samples loaded: 20000
Graphs shape: (20000, 2, 300, 300)
Features shape: (20000, 2, 300, 67)
Speedups (target) shape: (20000, 1)
Number of sampes with equal graphs: 861
Data saved to ./data/cfgs_20000samples_300x300.npz
Samples information saved to ./data/selected_cfgs_20000samples_300x300.yaml
