**Imports**

In [None]:
import os
import sys
import glob
import time
import yaml as yl
import pickle as pk
import numpy as np
import pandas as pd
import subprocess
import datetime
import matplotlib.pyplot as plt

from google.colab import drive
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, classification_report

**Measure the initial time**

In [None]:
# Wall-clock reference used at the end of the notebook to compute FLAGS_times['elapsed'].
initial_time = time.time()

**StellarGraph**

In [None]:
%pip install -q stellargraph==1.2.1

import stellargraph as sg

from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import DeepGraphCNN
from stellargraph import StellarDiGraph

**Tensorflow**

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv2D, Conv1D, MaxPool2D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping

**Profiler**

In [None]:
#%pip install line_profiler
%pip install memory_profiler
%pip install wandb

#import line_profiler
%load_ext memory_profiler
import wandb

The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


**Functions**

In [None]:
def execute(cmdline):
  """Run a shell command and capture its output.

  Args:
    cmdline: Command line handed to the shell (``shell=True``); it must come
      from a trusted source.

  Returns:
    A ``(stdout, stderr)`` tuple of decoded strings. When the command exits
    with a non-zero status the command line is printed and ``('', '')`` is
    returned, so callers can always unpack two values.
  """
  try:
    ret = subprocess.run(cmdline,
                         shell=True,
                         check=True,
                         capture_output=True)
  except subprocess.CalledProcessError:
    print('Execute: {}'.format(cmdline))
    # Same arity as the success path: a bare '' cannot be unpacked by
    # callers written as `out, err = execute(...)`.
    return '', ''

  return ret.stdout.decode(), ret.stderr.decode()

def save(filename, data):
  """Write the text `data` to `filename`, replacing any previous content.

  Args:
    filename: Path of the file to (over)write.
    data: String to store.
  """
  # The context manager closes the file even if write() raises.
  with open(filename, 'w') as fout:
    fout.write(data)

def random_features(sequence):
  """Return random values shaped like `sequence`.

  Only ``sequence.shape`` is used; the values of `sequence` are ignored and
  the result is drawn with ``np.random.random``.
  """
  return np.random.random(sequence.shape)

def zero_to_random_features(sequence):
  """Replace the zeros of `sequence` by a random number.

  One value is drawn with ``np.random.random()`` and written to every
  position that holds 0. The input is not modified; a copy is returned.
  """
  result = np.copy(sequence)
  fill_value = np.random.random()
  zero_positions = result == 0
  result[zero_positions] = fill_value
  return result

def one_to_random_features(sequence):
  """Replace the ones of `sequence` by a random number.

  One value is drawn with ``np.random.random()`` and written to every
  position that holds 1. The input is not modified; a copy is returned.
  """
  result = np.copy(sequence)
  fill_value = np.random.random()
  one_positions = result == 1
  result[one_positions] = fill_value
  return result


def get_edges_dataFrame(edges, edge_type='original'):
  """Convert an iterable of edges into a DataFrame.

  Args:
    edges: Iterable of ``(source, target, data)`` triples.
    edge_type: How the 'type' column is filled:
      'original' keeps the edge data, 'default' stores the string 'default',
      'random' stores a fresh ``np.random.random()`` value per edge.
      Any other value prints an error and exits when the first edge is
      processed.

  Returns:
    A DataFrame with the columns 'source', 'target' and 'type'.
  """
  sources, targets, types = [], [], []

  for src, dst, payload in edges:
    sources.append(src)
    targets.append(dst)

    if edge_type == 'random':
      types.append(np.random.random())
    elif edge_type == 'default':
      types.append('default')
    elif edge_type == 'original':
      types.append(payload)
    else:
      print('Error: edge_type', edge_type)
      sys.exit(1)

  return pd.DataFrame({'source': sources, 'target': targets, 'type': types})

def stellar_graph_no_edge_features(graph):
  """Rebuild `graph` with every edge typed as 'default'.

  The node features of the input graph are reused as they are.
  """
  # edges(1) must yield (source, target, type) triples, which is what
  # get_edges_dataFrame unpacks.
  default_edges = get_edges_dataFrame(graph.edges(1), 'default')

  return StellarDiGraph(graph.node_features(),
                        edges=default_edges,
                        edge_type_column="type")

def stellar_graph_no_node_features(graph):
  """Rebuild `graph` with all-zero node features, keeping the edge types.

  The zero matrix has one row per node and as many columns as the 'default'
  node features of the input graph.
  """
  typed_edges = get_edges_dataFrame(graph.edges(1), 'original')

  feature_shape = (graph.number_of_nodes(),
                   graph.node_feature_shapes()['default'][0])

  return StellarDiGraph(np.zeros(feature_shape),
                        edges=typed_edges,
                        edge_type_column="type")

def stellar_graph_random_edge_features(graph):
  """Rebuild `graph` giving every edge a random value as its type.

  The node features of the input graph are reused as they are.
  """
  random_edges = get_edges_dataFrame(graph.edges(1), 'random')

  return StellarDiGraph(graph.node_features(),
                        edges=random_edges,
                        edge_type_column="type")

def stellar_graph_random_node_features(graph):
  """Rebuild `graph` with random node features, keeping the edge types.

  The random matrix has one row per node and as many columns as the 'default'
  node features of the input graph.
  """
  typed_edges = get_edges_dataFrame(graph.edges(1), 'original')

  feature_shape = (graph.number_of_nodes(),
                   graph.node_feature_shapes()['default'][0])

  return StellarDiGraph(np.random.random(feature_shape),
                        edges=typed_edges,
                        edge_type_column="type")

def stellar_graph_no_features(graph):
  """Rebuild `graph` without edge or node information.

  Every edge is typed as 'default' and the node features are replaced by
  zeros (same shape as the 'default' node features of the input graph).
  """
  default_edges = get_edges_dataFrame(graph.edges(1), 'default')

  feature_shape = (graph.number_of_nodes(),
                   graph.node_feature_shapes()['default'][0])

  return StellarDiGraph(np.zeros(feature_shape),
                        edges=default_edges,
                        edge_type_column="type")

def create_model_cnn(labels, input_shape, model_type='1d'):
  """Build the (uncompiled) CNN classifier.

  Args:
    labels: Number of units of the softmax output layer.
    input_shape: Shape of one sample, passed to the first convolution.
    model_type: '1d' for Conv1D/MaxPool1D layers, '2d' for Conv2D/MaxPool2D.
      Any other value prints an error and exits.

  Returns:
    A Keras `Sequential` model: conv -> max-pool -> conv -> flatten ->
    dense(128) -> dropout(0.5) -> dense(labels, softmax).
  """
  layer_sizes = [3]
  window = sum(layer_sizes)  # kernel size and stride of the first convolution
  model = Sequential()

  # Convolutional front-end, standardized with the GNN model
  if model_type == '1d':
    model.add(Conv1D(filters=16, kernel_size=window, strides=window, activation="relu", padding='same', input_shape=input_shape))
    model.add(MaxPool1D(pool_size=2, strides=2))
    model.add(Conv1D(filters=32, kernel_size=5, strides=1, activation="relu"))
  elif model_type == '2d':
    # Model 1
    model.add(Conv2D(filters=16, kernel_size=(window, window), strides=(window, window), activation="relu", padding='same', input_shape=input_shape))
    model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))
    model.add(Conv2D(filters=32, kernel_size=(5, 5), strides=(1, 1), activation="relu"))
  else:
    print('Model type error')
    sys.exit(1)

  # Classification head shared by both variants
  model.add(Flatten())
  model.add(Dense(units=128, activation="relu"))
  model.add(Dropout(rate=0.5))
  model.add(Dense(units=labels, activation = 'softmax'))

  return model

def create_model_lstm(labels, maxlen: int, embedding_dim: int, num_classes: int, dense_layer_size: int):
  """Build the (uncompiled) two-layer LSTM classifier.

  Args:
    labels: Number of units of the output layer.
    maxlen: Length of the input sequences.
    embedding_dim: Size of each sequence element; also the number of units
      of both LSTM layers.
    num_classes: Not used by the model; kept so the signature stays stable.
    dense_layer_size: Number of units of the hidden dense layer.

  Returns:
    A Keras `Model`. It is returned uncompiled; the caller must compile it.
  """
  # These layers are not imported by the notebook's Tensorflow cell, so they
  # are imported here to keep the function callable.
  from tensorflow.keras.layers import Input, LSTM, BatchNormalization

  # Keras model
  inp = Input(shape=(maxlen, embedding_dim,), dtype="float32", name="code_in")
  x = LSTM(embedding_dim, implementation=1, return_sequences=True, name="lstm_1")(inp)
  x = LSTM(embedding_dim, implementation=1, name="lstm_2")(x)

  # Heuristic model: outputs 1-of-num_classes prediction
  x = BatchNormalization()(x)
  x = Dense(dense_layer_size, activation="relu")(x)
  # NOTE(review): sigmoid output, whereas the CNN and GNN models of this
  # notebook end in a softmax layer - confirm this is intended.
  outputs = Dense(labels, activation="sigmoid")(x)

  return Model(inputs=inp, outputs=outputs)


def create_model_gnn(labels, graphs):
  """Build the (uncompiled) DGCNN graph classifier.

  Args:
    labels: Number of units of the softmax output layer.
    graphs: Graphs used to create the `PaddedGraphGenerator` the DGCNN
      layers are built from.

  Returns:
    A Keras `Model` whose inputs are the DGCNN input tensors and whose output
    is the softmax layer.
  """
  generator = PaddedGraphGenerator(graphs=graphs)

  k = 35  # the number of rows for the output tensor
  layer_sizes = [32, 32, 32, 1]
  total_size = sum(layer_sizes)  # kernel size and stride of the first convolution

  dgcnn_model = DeepGraphCNN(layer_sizes=layer_sizes,
                             activations=["tanh", "tanh", "tanh", "tanh"],
                             k=k,
                             bias=False,
                             generator=generator)
  x_inp, hidden = dgcnn_model.in_out_tensors()

  # Convolutional, max pooling, and dense layers on top of the DGCNN output
  hidden = Conv1D(filters=16, kernel_size=total_size, strides=total_size, activation="relu")(hidden)
  hidden = MaxPool1D(pool_size=2, strides=2)(hidden)
  hidden = Conv1D(filters=32, kernel_size=5, strides=1, activation="relu")(hidden)
  hidden = Flatten()(hidden)
  hidden = Dense(units=128, activation="relu")(hidden)
  hidden = Dropout(rate=0.5)(hidden)
  outputs = Dense(units=labels, activation="softmax")(hidden)

  # The loss and the optimizer are chosen by the caller when compiling.
  return Model(inputs=x_inp, outputs=outputs)

def load_desc_train_val_test_dataset_sequence(dataset_type,
                                              compiler_level,
                                              description_dataset_filename,
                                              features):
  
  filename = '{}.yaml'.format(description_dataset_filename)
  
  wget = 'wget www.csl.uem.br/repository/data/datasets/{}'.format(filename)
  !$wget 
  
  fin = open(filename, 'r')
  description_dataset = yl.load(fin)
  fin.close()

  for dataset, _ in description_dataset.items():
    !mkdir $dataset
    wget = 'wget www.csl.uem.br/repository/data/{}/{}/{}.tar.xz'.format(dataset, compiler_level, dataset_type)
    tar = 'tar xfJ {}.tar.xz -C {}'.format(dataset_type, dataset)
    rm = 'rm -rf {}*'.format(dataset_type)
    !$wget
    !$tar
    !$rm
  
  X = {'training': [], 'validation': [], 'test': []}
  Y = {'training': [], 'validation': [], 'test': []} 

  labels = []

  for phase in ['training', 'validation', 'test']:

    for dataset, dataset_data in description_dataset.items():

      if phase not in dataset_data:
        continue

      for label, samples in dataset_data[phase].items():

        int_label = int(label)
        
        # if dataset_type == 'prog2image' and int_label > 40:
        if int_label > FLAGS_max_labels:
          continue
        
        labels.append(int_label-1)

        dataset_directory = os.path.join(dataset, dataset_type, label)

        for sample in samples:
          
          try:
            representation = np.load('{}/{}.npz'.format(dataset_directory, sample))
          except:
            print('Erro load', dataset_directory, sample, flush=True)
            continue

          if FLAGS_dataset_type in ['prog2image']:
            representation = np.array(representation['values'], np.int8)
          else:
            representation = representation['values']
          if features == 'random_features':
            representation = random_features(representation)
          elif features == 'zero_to_random_features':
            representation = zero_to_random_features(representation)
          elif features == 'one_to_random_features':
            representation = one_to_random_features(representation)

          Y[phase].append(int_label-1)
          X[phase].append(representation)

  labels = list(dict.fromkeys(labels))
  
  datasets = '* '.join(list(description_dataset.keys()))
  rm = 'rm -rf {}* {}*'.format(datasets, description_dataset_filename)
  !$rm

  return X['training'], Y['training'], X['validation'], Y['validation'], X['test'], Y['test'], len(labels)

def load_desc_train_val_test_dataset_graph(dataset_type,
                                           embeddings,
                                           compiler_level,
                                           description_dataset_filename,
                                           features):
  
  filename = '{}.yaml'.format(description_dataset_filename)
  
  wget = 'wget www.csl.uem.br/repository/data/datasets/{}'.format(filename)
  !$wget 
  
  fin = open(filename, 'r')
  description_dataset = yl.load(fin)
  fin.close()

  for dataset, _ in description_dataset.items():
    !mkdir $dataset
    wget = 'wget www.csl.uem.br/repository/data/{}/{}/{}.tar.xz'.format(dataset, compiler_level, dataset_type)
    tar = 'tar xfJ {}.tar.xz -C {}'.format(dataset_type, dataset)
    rm = 'rm -rf {}*'.format(dataset_type)
    !$wget
    !$tar
    !$rm

  X = []
  Y = {'training': [], 'validation': [], 'test': []} 
  Y_index = {'training': [], 'validation': [], 'test': []} 

  labels = []

  for phase in ['training', 'validation', 'test']:

    for dataset, dataset_data in description_dataset.items():

      if phase not in dataset_data:
        continue

      for label, samples in dataset_data[phase].items():

        int_label = int(label)

        labels.append(int_label-1)

        dataset_directory = os.path.join(dataset, dataset_type, embeddings, label)

        for sample in samples:

          try:
            filename = '{}/{}.pk'.format(dataset_directory, sample)
            fin = open(filename, 'rb')
            representation = pk.load(fin)
            fin.close()                  
          except:
            print('Erro load', dataset_directory, sample, flush=True)
            continue

          if features == 'no_edge_features':
            representation = stellar_graph_no_edge_features(representation)
          elif features == 'no_node_features':
            representation = stellar_graph_no_node_features(representation)
          elif features == 'random_edge_features':
            representation = stellar_graph_random_edge_features(representation)        
          elif features == 'random_node_features':
            representation = stellar_graph_random_node_features(representation)       
          elif features == 'no_features':
            representation = stellar_graph_no_features(representation)

          Y[phase].append(int_label-1)
          Y_index[phase].append(len(X))
          X.append(representation)

  labels = list(dict.fromkeys(labels))
  
  datasets = '* '.join(list(description_dataset.keys()))
  rm = 'rm -rf {}* {}*'.format(datasets, description_dataset_filename)
  !$rm
  
  Y_train = pd.Series(Y['training'], index=Y_index['training'], name='label', dtype="category")  
  Y_val = pd.Series(Y['validation'], index=Y_index['validation'], name='label', dtype="category")  
  Y_test = pd.Series(Y['test'], index=Y_index['test'], name='label', dtype="category")
  
  Y_train = pd.get_dummies(Y_train)
  Y_val = pd.get_dummies(Y_val)
  Y_test = pd.get_dummies(Y_test)

  return X, Y_train, Y_val, Y_test, len(labels)

**Arguments**

In [None]:
#
# TRAIN, VALIDATION AND TEST RATIOS
#

# Train data ratio
FLAGS_train_ratio = 0.75

# Validation data ratio
FLAGS_val_ratio = 0.25

# Test data ratio
FLAGS_test_ratio = 0.20

#
# MODEL
#

# Number of epochs
#
# Zhang = 100
# Brauckmann = 1000
# Cummins = 300
#
FLAGS_epochs = 200

# Patience (passed to the EarlyStopping callback of the training cell)
FLAGS_patience = 20

# Verbose (passed to model.fit)
FLAGS_training_verbose = 1

#
# GOOGLE DRIVE
#
# Root of the results path (see FLAGS_results_directory)
FLAGS_driver = '/content/drive'

#
# CONTROLS
#

# Store the results?
FLAGS_store_results = False

# Store the model? (only evaluated when FLAGS_store_results is enabled)
FLAGS_store_model = False

# Store Wandb data?
FLAGS_store_wandb = False

#
# DATASET
#

# Largest label loaded by the sequence loader; samples with a greater label
# are skipped
FLAGS_max_labels = 50

# Compiler level
#
# - O0 (the compiler level enabled during data generation)
# - O0_50C -> POJ 50 classes
# - Oz
FLAGS_compiler_level = 'O0_50C'

# Type
#
# Sequences (vector, matrix)
# - inst2vec.lower
# - inst2vec.2d (2d)
# - ir2vec.program
# - milepost
# - llvm_histogram
# - lbp
# - rbp
# - prog2image (2d)
#
# Graph
# - cfgcallnoroot
# - cfgcallcompactnoroot
# - cfgcallcompact1enoroot
# - cdfgcallnoroot
# - cdfgcallcompactnoroot
# - cdfgcallcompact1enoroot
# - cdfgplusnorrot
# - programlnoroot
#
FLAGS_dataset_type = 'prog2image'

# CNN or GNN? Dataset types listed here are handled by the CNN path; every
# other type is handled by the GNN path.
FLAGS_CNN_dataset_types = ['inst2vec.lower',
                           'inst2vec.2d',
                           'ir2vec.program',
                           'milepost',
                           'llvm_histogram',
                           'lbp',
                           'rbp',
                           'prog2image']

# CNN 1d or CNN 2d? Dataset types listed here use the 2d model.
FLAGS_CNN_2d_dataset_types = ['inst2vec.2d', 'prog2image']

# Embeddings (used by graph datasets only)
#
# bag_of_words
# inst2vec
# ir2vec
# opcode
# word2vec
FLAGS_embeddings = 'ir2vec' 

# Features type: random, zero to random, one to random
# (any value not recognized by the loaders keeps the original features)
FLAGS_features = 'all_features'

# Round
FLAGS_round = 'round3'

# Description dataset filename
#
# AnghaBestSeqsSBLP2021_500_inst2vec_TVT.yaml // there are not 1000 examples for inst2vec
# AnghaBestSeqsSBLP2021_1000.TVT.yaml // 1000 per class
# CodeNetBestSeqsSBLP2021_TVT.yaml // 500 per class
# AnghaLoops_TVT.yaml // 300 per class
# POJ_TVT.yaml // 500 per class
# POJ50C_TVT.yaml // 500 per class, 50 classes
#
FLAGS_description_dataset_filename = 'POJ50C_TVT'

#
# RESULTS
#

# Top directory
FLAGS_top_directory = 'My Drive/CC22'

# Result directory suffix
FLAGS_suffix = ''

# Results directory
#
# The last path component summarizes the configuration; graph datasets
# additionally include the embeddings name.
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  last = '{}_{}'.format(FLAGS_features, FLAGS_compiler_level)
else:
  last = '{}_{}_{}'.format(FLAGS_features, FLAGS_embeddings, FLAGS_compiler_level)

# The optional suffix is appended only when it is not empty
if FLAGS_suffix:
  last = '{}_{}'.format(last, FLAGS_suffix)

FLAGS_results_directory = os.path.join(
    FLAGS_driver,
    FLAGS_top_directory,
    FLAGS_description_dataset_filename,
    FLAGS_round,
    FLAGS_dataset_type,
    last)

# Wandb project name
FLAGS_project_name = 'CC22'

# Breakdown the runtime
FLAGS_times = {}

**Open Google Drive**

In [None]:
# Google Drive is only needed when the results are going to be stored.
if FLAGS_store_results:
  # Mount at FLAGS_driver, the same root FLAGS_results_directory is built
  # from, so both stay in sync if the flag is changed.
  drive.mount(FLAGS_driver)

**Initiate Wandb**

In [None]:
# A Weights & Biases run is only started when FLAGS_store_wandb is enabled.
if FLAGS_store_wandb:
  # The project name encodes the experiment: project, description file,
  # round, dataset type and the configuration summary stored in `last`.
  PROJECT_NAME = '{}_{}_{}_{}_{}'.format(FLAGS_project_name,
                                         FLAGS_description_dataset_filename,
                                         FLAGS_round,
                                         FLAGS_dataset_type,
                                         last)
  wandb.init(project=PROJECT_NAME, entity='andersonfaustino')

**Load the dataset**

In [None]:
start = time.time()

# For both kinds of dataset the data are stored in separate directories:
#
#  training:
#    1:
#    2:
#    ...
#  validation:
#     ...
#  test:
#     ...
#
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  # Sequence datasets: one list of samples and one list of labels per phase
  (X_train, Y_train,
   X_val, Y_val,
   X_test, Y_test,
   FLAGS_labels) = load_desc_train_val_test_dataset_sequence(
       FLAGS_dataset_type,
       FLAGS_compiler_level,
       FLAGS_description_dataset_filename,
       FLAGS_features)
else:
  # Graph datasets: a single list of graphs plus the labels of each phase
  (X,
   Y_train, Y_val, Y_test,
   FLAGS_labels) = load_desc_train_val_test_dataset_graph(
       FLAGS_dataset_type,
       FLAGS_embeddings,
       FLAGS_compiler_level,
       FLAGS_description_dataset_filename,
       FLAGS_features)

end = time.time()

FLAGS_times['loading'] = end - start

--2021-10-06 15:04:55--  http://www.csl.uem.br/repository/data/datasets/POJ50C_TVT.yaml
Resolving www.csl.uem.br (www.csl.uem.br)... 186.233.153.209
Connecting to www.csl.uem.br (www.csl.uem.br)|186.233.153.209|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.csl.uem.br/repository/data/datasets/POJ50C_TVT.yaml [following]
--2021-10-06 15:04:56--  https://www.csl.uem.br/repository/data/datasets/POJ50C_TVT.yaml
Connecting to www.csl.uem.br (www.csl.uem.br)|186.233.153.209|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312831 (305K) [application/octet-stream]
Saving to: ‘POJ50C_TVT.yaml’


2021-10-06 15:04:58 (345 KB/s) - ‘POJ50C_TVT.yaml’ saved [312831/312831]

--2021-10-06 15:05:00--  http://www.csl.uem.br/repository/data/POJ/O0_50C/prog2image.tar.xz
Resolving www.csl.uem.br (www.csl.uem.br)... 186.233.153.209
Connecting to www.csl.uem.br (www.csl.uem.br)|186.233.153.209|:80... connected.
HTTP request sent, 

**Prepare the dataset (CNN)**

In [None]:
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  # Lists -> arrays; prog2image is stored as int8, the other types keep the
  # dtype inferred by numpy.
  if FLAGS_dataset_type == 'prog2image':
    X_train, Y_train = np.array(X_train, dtype=np.int8), np.array(Y_train, dtype=np.int8)
    X_val, Y_val = np.array(X_val, dtype=np.int8), np.array(Y_val, dtype=np.int8)
    X_test, Y_test = np.array(X_test, dtype=np.int8), np.array(Y_test, dtype=np.int8)
  else:
    X_train, Y_train = np.array(X_train), np.array(Y_train)
    X_val, Y_val = np.array(X_val), np.array(Y_val)
    X_test, Y_test = np.array(X_test), np.array(Y_test)

  # Append the channel axis expected by the convolutional layers
  if FLAGS_dataset_type in FLAGS_CNN_2d_dataset_types:
    # 2D Model: the sample size of the training set is used for every split
    rows = X_train[0].shape[0]
    cols = X_train[0].shape[1]
    X_train = X_train.reshape((X_train.shape[0], rows, cols, 1))
    X_val = X_val.reshape((X_val.shape[0], rows, cols, 1))
    X_test = X_test.reshape((X_test.shape[0], rows, cols, 1))
  else:
    # 1D Model
    X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
    X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))
    X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

  print('Training:', X_train.shape[0], flush=True)
  print('Validation:', X_val.shape[0], flush=True)
  print('Test:', X_test.shape[0], flush=True)


Training: 15000
Validation: 5000
Test: 5000


**Prepare the dataset (GNN)**

In [None]:
# Graph datasets only. Each statement runs under %memit (memory_profiler) so
# its memory usage is reported.
if FLAGS_dataset_type not in FLAGS_CNN_dataset_types:
  # One generator over all loaded graphs
  %memit gen = PaddedGraphGenerator(graphs=X)

  # The index of each Y_* frame holds the positions in X of the graphs of
  # that phase; its values are the one-hot targets.
  %memit train_gen = gen.flow(list(Y_train.index), targets=Y_train.values, batch_size=50, symmetric_normalization=False)
  %memit val_gen = gen.flow(list(Y_val.index), targets=Y_val.values, batch_size=1, symmetric_normalization=False)
  %memit test_gen = gen.flow(list(Y_test.index), targets=Y_test.values, batch_size=1, symmetric_normalization=False) 

  print('Training:', len(Y_train), flush=True)
  print('Validation:', len(Y_val), flush=True)
  print('Test:', len(Y_test), flush=True)

**Create the model**

In [None]:
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  # CNN: integer labels, hence the sparse categorical cross-entropy
  model_type = '2d' if FLAGS_dataset_type in FLAGS_CNN_2d_dataset_types else '1d'
  input_shape = X_train[0].shape

  model = create_model_cnn(FLAGS_labels, input_shape, model_type)
  model.compile(optimizer=Adam(learning_rate=0.0001),
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])
else:
  # GNN: one-hot labels, hence the categorical cross-entropy
  model = create_model_gnn(FLAGS_labels, X)
  model.compile(optimizer=Adam(learning_rate=0.0001),
                loss=categorical_crossentropy,
                metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 84, 86, 16)        160       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 42, 43, 16)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 38, 39, 32)        12832     
_________________________________________________________________
flatten (Flatten)            (None, 47424)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               6070400   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                6

**Training**

In [None]:
# Early stopping monitors the validation accuracy with a patience of
# FLAGS_patience epochs and restores the best weights.
es_callback = EarlyStopping(monitor="val_accuracy", patience=FLAGS_patience, restore_best_weights=True)

start = time.time()

# %memit (memory_profiler) reports the memory used by the training call;
# `history` is assigned inside the profiled statement.
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  %memit history = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=FLAGS_epochs, verbose=FLAGS_training_verbose, shuffle=True, callbacks=[es_callback])
else:  
  %memit history = model.fit(train_gen, validation_data=val_gen, epochs=FLAGS_epochs, verbose=FLAGS_training_verbose, shuffle=True, callbacks=[es_callback])

end = time.time()

# Note: the measured time includes the overhead of %memit
FLAGS_times['training'] = end - start

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

**Testing**

In [None]:
start = time.time()

# Evaluate on the test split (arrays for the CNN, generator for the GNN);
# %memit reports the memory used by the call.
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  %memit test_metrics = model.evaluate(X_test, Y_test)
else:
  %memit test_metrics = model.evaluate(test_gen)

end = time.time()

FLAGS_times['evaluating'] = end - start

# Pair every metric name of the model with its value on the test split
test_metrics_dict = {}
for name, val in zip(model.metrics_names, test_metrics):
  print('{}: {:0.4f}'.format(name, val), flush=True)
  test_metrics_dict[name] = val

**Predicting**

In [None]:
start = time.time()

# Predict the test split (arrays for the CNN, generator for the GNN);
# %memit reports the memory used by the call.
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  %memit predicted = model.predict(X_test)
else:
  %memit predicted = model.predict(test_gen)

end = time.time()

FLAGS_times['predicting'] = end - start

**Statistics**

In [None]:
# Predicted class = position of the largest output
pred_y = predicted.argmax(axis=-1)

# CNN labels are already class ids; GNN labels are one-hot rows, so the
# column holding the maximum is used as the true class.
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  cm = confusion_matrix(Y_test, pred_y)
else:
  cm = confusion_matrix(Y_test.idxmax(axis=1), pred_y)
print('Confusion matrix')
print(cm)

if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  cr = classification_report(Y_test, pred_y)
else:
  cr = classification_report(Y_test.idxmax(axis=1), pred_y)
print('\n\nClassification report')
print(cr)

**Measure the final time**

In [None]:
# Total wall-clock time since the "Measure the initial time" cell
final_time = time.time()
FLAGS_times['elapsed'] = final_time - initial_time

**Store the results**

In [None]:
if FLAGS_store_results:
  os.makedirs(FLAGS_results_directory, exist_ok=True)

  #
  # Store the results
  #
  # Every file is written inside a `with` block so it is closed (and flushed
  # to the mounted drive) even if the dump fails.
  #

  # History
  with open('{}/history.yaml'.format(FLAGS_results_directory), 'w') as fout:
    yl.dump(history.history, fout)

  # Test metrics
  with open('{}/test_metrics.yaml'.format(FLAGS_results_directory), 'w') as fout:
    yl.dump(test_metrics_dict, fout)

  # Dataset
  if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
    np.savez_compressed('{}/y_train.npz'.format(FLAGS_results_directory), values=Y_train)
    np.savez_compressed('{}/y_val.npz'.format(FLAGS_results_directory), values=Y_val)
    np.savez_compressed('{}/y_test.npz'.format(FLAGS_results_directory), values=Y_test)
  else:
    Y_train.to_pickle('{}/train_graphs.pkl'.format(FLAGS_results_directory))
    Y_val.to_pickle('{}/val_graphs.pkl'.format(FLAGS_results_directory))
    Y_test.to_pickle('{}/test_graphs.pkl'.format(FLAGS_results_directory))

  # Dataset Summary - GNN
  if FLAGS_dataset_type not in FLAGS_CNN_dataset_types:
    summary = pd.DataFrame(
      [(g.number_of_nodes(), g.number_of_edges()) for g in X],
      columns=['nodes', 'edges'],
    )
    with open('{}/summary.yaml'.format(FLAGS_results_directory), 'w') as fout:
      yl.dump(summary.describe().to_dict(), fout)

  # Predicted
  np.savez_compressed('{}/predicted'.format(FLAGS_results_directory), values=predicted)

  # Confusion matrix
  np.savez_compressed('{}/confusion_matrix'.format(FLAGS_results_directory), values=cm)

  # Classification report
  with open('{}/classification_report.pk'.format(FLAGS_results_directory), 'wb') as fout:
    pk.dump(cr, fout)

  # Time
  with open('{}/elapsed_time.yaml'.format(FLAGS_results_directory), 'w') as fout:
    yl.dump(FLAGS_times, fout)

  #
  # Store the hardware specifications
  #
  for command, target in (("nvidia-smi", 'nvidia-smi.txt'),
                          ("cat /proc/cpuinfo", 'cpuinfo.txt'),
                          ("cat /proc/meminfo", 'meminfo.txt')):
    # A failing command (e.g. nvidia-smi on a runtime without GPU) must not
    # abort the cell: an empty file is stored instead. The result is not
    # unpacked directly because execute() may return an empty value on
    # failure.
    output = execute(command)
    spec = output[0] if output else ''
    filename = '{}/{}'.format(FLAGS_results_directory, target)
    save(filename, spec)

  #
  # Store the model
  #
  if FLAGS_store_model:
    directory = '{}/model'.format(FLAGS_results_directory)
    os.makedirs(directory, exist_ok=True)
    model.save(directory)

**Free the memory**

In [None]:
# Drop the dataset of the path that was executed (CNN or GNN) ...
if FLAGS_dataset_type in FLAGS_CNN_dataset_types:
  del X_train, X_val, X_test
  del Y_train, Y_val, Y_test
else:
  del X
  del Y_train, Y_val, Y_test
  del gen, train_gen, val_gen, test_gen

# ... and the objects shared by both paths
del model, history
del test_metrics, test_metrics_dict
del predicted, cm, cr

**Flush Google Drive**

In [None]:
# The drive is only mounted when the results are stored, so it is only
# flushed and unmounted in that case.
if FLAGS_store_results:
  #
  # Flush the Drive
  #
  drive.flush_and_unmount()

**Finish**

In [None]:
# Report the runtime breakdown collected in FLAGS_times (values come from
# differences of time.time(), i.e. seconds).
for key, value in FLAGS_times.items():
    print(key, value)