**IMPORTS**

In [None]:
import os
import sys
import time
import glob as gl
import yaml as yl
import pickle as pk
import numpy as np
import pandas as pd
import subprocess
import datetime
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

from math import sqrt
from absl import app, flags, logging
from dataclasses import dataclass
from google.colab import drive

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

!pip install stellargraph

from stellargraph.mapper import PaddedGraphGenerator
from stellargraph.layer import DeepGraphCNN
from stellargraph.layer import GCNSupervisedGraphClassification
from stellargraph import StellarDiGraph

from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Conv1D, MaxPool1D, Dropout, Flatten

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy

**FLAGS**

In [None]:
@dataclass
class Flags:
    """Run configuration for the notebook.

    Attributes:
        dataset: Dataset name; the main cell uses it to build the download URL.
        representation: Name of the downloaded archive (``<name>.tar.xz``).
        train_dataset_directory: Directory handed to ``load_dataset``; it is
            expected to hold one sub-directory per class label.
        train_p: Percentage handed to ``load_dataset``.
        classes: Number of classes (labels are the directories ``1..classes``).
        results_directory: Directory where ``execute`` writes its results.
        model: Model selector; ``execute`` accepts ``'gcn'`` and ``'dgcnn'``.
        patience: Patience given to the early-stopping callback.
        epochs: Maximum number of training epochs.
        rounds: Number of rounds -- presumably the number of cross-validation
            folds; verify against ``execute``.
        verbose: Whether Keras prints per-epoch progress during training.
        print_model: Whether the model summary is printed.
        print_cm: Whether the confusion matrix is printed.
        print_cr: Whether the classification report is printed.
        scaler: Accepted because the main cell passes ``scaler=...``; nothing
            visible in this notebook reads it.
    """
    dataset: str
    representation: str
    train_dataset_directory: str
    train_p: int
    classes: int
    results_directory: str
    model: str
    patience: int
    epochs: int
    rounds: int
    verbose: bool
    print_model: bool
    print_cm: bool
    print_cr: bool
    # Declared last and with a default so existing positional/keyword callers
    # that do not pass it keep working.
    scaler: bool = False

**FUNCTIONS**

In [None]:
def build_model_gcn(X, classes):
    """Build and compile the GCN graph-classification model.

    Two 64-unit ReLU graph-convolution layers (dropout 0.5) feed a dense head
    of 32 and 16 ReLU units followed by a softmax over ``classes`` outputs.

    Args:
        X: Graphs handed to ``PaddedGraphGenerator`` -- presumably StellarGraph
            objects; verify against the loader.
        classes: Number of output classes.

    Returns:
        A compiled Keras ``Model`` (Adam, learning rate 1e-4, categorical
        cross-entropy loss, accuracy metric).
    """
    graph_generator = PaddedGraphGenerator(graphs=X)

    backbone = GCNSupervisedGraphClassification(
        layer_sizes=[64, 64],
        activations=["relu", "relu"],
        generator=graph_generator,
        dropout=0.5,
    )
    inputs, head = backbone.in_out_tensors()

    # Dense classification head, applied in order.
    for units, activation in ((32, "relu"), (16, "relu"), (classes, "softmax")):
        head = Dense(units=units, activation=activation)(head)

    network = Model(inputs=inputs, outputs=head)
    network.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss=categorical_crossentropy,
        metrics=['accuracy'],
    )

    return network

In [None]:
def build_model_dgcnn(X, classes):
    """Build and compile the DGCNN graph-classification model.

    A ``DeepGraphCNN`` backbone (tanh layers of sizes 32, 32, 32, 1, no bias,
    ``k=35``) is followed by Conv1D -> MaxPool1D -> Conv1D -> Flatten ->
    Dense(128) -> Dropout(0.5) -> softmax over ``classes`` outputs.

    Args:
        X: Graphs handed to ``PaddedGraphGenerator`` -- presumably StellarGraph
            objects; verify against the loader.
        classes: Number of output classes.

    Returns:
        A compiled Keras ``Model`` (Adam, learning rate 1e-4, categorical
        cross-entropy loss, accuracy metric).
    """
    rows_kept = 35  # the number of rows for the output tensor
    gcn_sizes = [32, 32, 32, 1]
    # The first convolution's kernel and stride both equal the summed layer sizes.
    row_width = sum(gcn_sizes)

    backbone = DeepGraphCNN(
        layer_sizes=gcn_sizes,
        activations=["tanh"] * len(gcn_sizes),
        k=rows_kept,
        bias=False,
        generator=PaddedGraphGenerator(graphs=X),
    )
    inputs, features = backbone.in_out_tensors()

    head_layers = (
        Conv1D(filters=16, kernel_size=row_width, strides=row_width, activation="relu"),
        MaxPool1D(pool_size=2, strides=2),
        Conv1D(filters=32, kernel_size=5, strides=1, activation="relu"),
        Flatten(),
        Dense(units=128, activation="relu"),
        Dropout(rate=0.5),
        Dense(units=classes, activation="softmax"),
    )
    for layer in head_layers:
        features = layer(features)

    network = Model(inputs=inputs, outputs=features)
    network.compile(
        optimizer=Adam(learning_rate=0.0001),
        loss=categorical_crossentropy,
        metrics=['accuracy'],
    )

    return network

In [None]:
def load_dataset(dataset_directory, classes, percentage, seed=None):
    """Load a random per-class sample of pickled objects with their labels.

    For every label ``1..classes`` the ``*.pk`` files in
    ``<dataset_directory>/<label>`` are shuffled and read until
    ``percentage`` percent of that class (rounded down) has been loaded.
    Files that cannot be read are reported and skipped; they do not count
    towards the quota.

    Args:
        dataset_directory: Directory holding one sub-directory per label.
        classes: Number of classes; labels are the directory names 1..classes.
        percentage: Share (0-100) of each class to load.
        seed: Optional seed. When given, sample selection is reproducible and
            the global ``random`` state is left untouched.

    Returns:
        ``(X, y)``: ``X`` is the list of unpickled objects and ``y`` an integer
        NumPy array of zero-based labels (``label - 1``), aligned with ``X``.
    """
    # A private generator keeps a seeded run from disturbing the global RNG.
    rng = random.Random(seed) if seed is not None else random

    X = []
    y = []
    for label in range(1, classes + 1):
        ddir = os.path.join(dataset_directory, str(label))
        # glob order depends on the filesystem; sorting first makes the
        # shuffle fully determined by the seed.
        samples = sorted(gl.glob('{}/*.pk'.format(ddir)))
        rng.shuffle(samples)

        # Integer quota: comparing a counter against a float total could
        # never match for fractional values.
        limit = int(len(samples) * percentage // 100)

        loaded = 0
        for sample in samples:
            if loaded >= limit:
                break

            try:
                with open(sample, 'rb') as fin:
                    # NOTE(security): pickle.load can execute arbitrary code;
                    # only use it on dataset archives from a trusted source.
                    x_val = pk.load(fin)
            except Exception as err:
                # Best effort: report the unreadable file and move on.
                print('Erro: ', sample, err)
                continue

            X.append(x_val)
            y.append(label - 1)
            loaded += 1

    # An explicit integer dtype keeps an empty result from becoming float64.
    return X, np.array(y, dtype=int)

In [None]:
def execute(FLAGS):
    """Run stratified k-fold training and evaluation, storing per-round results.

    The dataset is loaded once and split into ``FLAGS.rounds`` stratified
    folds. For every round a fresh model is built and trained on the training
    folds, then evaluated on the held-out fold. Building the model per round
    matters: reusing one model would let weights fitted in earlier rounds
    (which saw this round's test graphs) leak into the evaluation.

    For each round ``i`` the following files are written to
    ``FLAGS.results_directory``: ``history_i.npz``, ``statistics_i.npz``,
    ``y_pred_i.npz``, ``y_test_i.npz`` and ``elapsed_time_i.yaml``. The
    ``.npz`` files holding Python objects (``history_i``, and ``cr`` in
    ``statistics_i``) need ``allow_pickle=True`` when loaded.

    Args:
        FLAGS: Configuration object. Attributes read here: ``model``,
            ``train_dataset_directory``, ``classes``, ``train_p``,
            ``results_directory``, ``rounds``, ``patience``, ``epochs``,
            ``verbose``, ``print_model``, ``print_cm`` and ``print_cr``.

    Raises:
        SystemExit: If ``FLAGS.model`` is neither ``'gcn'`` nor ``'dgcnn'``.
    """
    # Breakdown of the runtime (seconds), accumulated over the whole run
    flags_times = {}

    #
    # Select the model builder (fail before the expensive dataset load)
    #
    builders = {'gcn': build_model_gcn, 'dgcnn': build_model_dgcnn}
    build_model = builders.get(FLAGS.model)
    if build_model is None:
        logging.error('Model error.')
        sys.exit(1)

    #
    # Load the dataset
    #
    print('\nLoading the dataset ...')
    start = time.time()

    X, y = load_dataset(FLAGS.train_dataset_directory, FLAGS.classes, FLAGS.train_p)

    end = time.time()

    # Store load time
    flags_times['loading'] = end - start

    #
    # Create the output directory
    #
    os.makedirs(FLAGS.results_directory, exist_ok=True)

    #
    # Training and test
    #

    # The generator only depends on X, so one instance serves every round.
    gen = PaddedGraphGenerator(graphs=X)

    # The Flags dataclass names the number of folds `rounds`.
    kf = StratifiedKFold(n_splits=FLAGS.rounds, shuffle=True, random_state=42)
    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):
        print('\n====>>> ROUND: {}'.format(i))

        #
        # Prepare the data
        #
        y_test = y[test_idx]

        X_train = gen.flow(list(train_idx),
                           targets=pd.get_dummies(y[train_idx]),
                           batch_size=50,
                           symmetric_normalization=False)
        X_test = gen.flow(list(test_idx),
                          targets=pd.get_dummies(y_test),
                          batch_size=1,
                          symmetric_normalization=False)

        #
        # Build a fresh model for this round
        #
        print('\nBuilding the model ...')
        model = build_model(X, FLAGS.classes)

        # The architecture is the same in every round; print it once.
        if FLAGS.print_model and i == 0:
            print()
            model.summary()

        # Monitors the training accuracy: no validation data is given to fit().
        es_callback = EarlyStopping(monitor="accuracy",
                                    patience=FLAGS.patience,
                                    restore_best_weights=True)

        #
        # Training
        #
        print('\nTraining ...')
        start = time.time()

        history = model.fit(X_train,
                            epochs=FLAGS.epochs,
                            verbose=1 if FLAGS.verbose else 0,
                            shuffle=True,
                            callbacks=[es_callback])

        end = time.time()

        # Store the training time
        flags_times['training_{}'.format(i)] = end - start

        if not FLAGS.verbose:
            hist = pd.DataFrame(history.history)
            print(hist.tail())

        #
        # Predicting
        #
        print('\nPredicting ...')
        start = time.time()

        y_pred = model.predict(X_test)
        y_pred = y_pred.argmax(axis=-1)
        end = time.time()

        # Store the predicting time
        flags_times['predicting_{}'.format(i)] = end - start

        #
        # Statistics
        #
        print('\nCalculating statistics ...')
        acc = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        cr = classification_report(y_test, y_pred)

        print('\nAccuracy:', acc)

        if FLAGS.print_cm:
            print('\nConfusion matrix')
            print(cm)

        if FLAGS.print_cr:
            print('\nClassification report')
            print(cr)

        #
        # Store the results of this round
        #
        print('\nStoring the results ...')

        # Store the history: the plain metrics dict, not the History callback
        # object (which also holds a reference to the model).
        np.savez_compressed('{}/history_{}'.format(FLAGS.results_directory, i), values=history.history)

        # Store the statistics
        np.savez_compressed('{}/statistics_{}'.format(FLAGS.results_directory, i), cm=cm, cr=cr, acc=acc)

        # Store the prediction
        np.savez_compressed('{}/y_pred_{}'.format(FLAGS.results_directory, i), values=y_pred)

        # Store y_test
        np.savez_compressed('{}/y_test_{}'.format(FLAGS.results_directory, i), values=y_test)

        # Store the elapsed time (cumulative: includes the previous rounds)
        with open('{}/elapsed_time_{}.yaml'.format(FLAGS.results_directory, i), 'w') as fout:
            yl.dump(flags_times, fout)

**MAIN**

In [None]:
# 
# Flags
#
flags = Flags(
    dataset="ojclone32",
    representation="histogram",
    train_dataset_directory="",
    train_p=100,
    classes=32,
    results_directory="/content/drive/My Drive/CGO2022",
    scaler=False,
    model="model1",
    patience=50,
    epochs=500,
    rounds=5,
    verbose=False,
    print_model=False,
    print_cm=False,
    print_cr=False
)
flags.train_dataset_directory = flags.representation
flags.results_directory = os.path.join(flags.results_directory, flags.model, flags.representation)

#
# Get the dataset
#
wget = 'wget www.csl.uem.br/repository/data/{}/{}.tar.xz'.format(flags.dataset, flags.representation)
tar = 'tar xfJ {}.tar.xz'.format(flags.representation)
!$wget
!$tar

#
# Open the Drive
#
drive.mount('/content/drive')

#
# Execute
#
execute(flags)

#
# Flush the Driver
#
drive.flush_and_unmount()

#
# Remove the dataset
#
rm = 'rm -rf {}*'.format(flags.dataset_directory)
!$rm