**IMPORTS**

In [None]:
import os
import sys
import time
import glob as gl
import yaml as yl
import pickle as pk
import numpy as np
import pandas as pd
import subprocess
import datetime
import random

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import tensorflow as tf

from math import sqrt
from absl import app, flags, logging
from dataclasses import dataclass
from google.colab import drive

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

from tensorflow.keras import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Input, LSTM, Dense, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPool1D, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPool2D

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import categorical_crossentropy

**FLAGS**

In [None]:
@dataclass
class Flags:
    """Run configuration consumed by ``execute`` and by the main cell.

    Fields are declared in the order the positional constructor expects,
    so do not reorder them.
    """
    # Dataset name; the main cell uses it to build the download URL.
    dataset: str
    # Representation name; used for the archive name and the results sub-directory.
    representation: str
    # Root directory scanned by ``load_dataset`` (one sub-directory per class label).
    train_dataset_directory: str
    # Percentage of each class' samples to load (passed to ``load_dataset``).
    train_p: int
    # Number of classes; also the size of the softmax output layer.
    classes: int
    # Directory where per-round results are written.
    results_directory: str
    # When true, the data is rescaled with ``MinMaxScaler`` before training.
    scaler: bool
    # Model selector: ``execute`` recognizes 'lstm', 'cnn', 'rf', 'svm', 'knn', 'lr', 'mlp'.
    model: str
    # ``patience`` argument of the Keras ``EarlyStopping`` callback.
    patience: int
    # Maximum number of training epochs for the Keras models.
    epochs: int
    # Number of stratified K-fold splits (one train/test round per split).
    rounds: int
    # Verbose Keras training output; when false only the history tail is printed.
    verbose: bool
    # Print the Keras model summary before training.
    print_model: bool
    # Print the confusion matrix of each round.
    print_cm: bool
    # Print the classification report of each round.
    print_cr: bool

**FUNCTIONS**

In [None]:
def build_model_lstm(input_shape, embedding_dim, classes):
    """Build and compile a stacked two-layer LSTM classifier.

    Args:
        input_shape: Shape of a single sample (without the batch axis).
        embedding_dim: Number of units of each LSTM layer.
        classes: Number of output classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Model`` trained with categorical cross-entropy
        and reporting accuracy.
    """
    hidden_units = 32

    code_in = Input(shape=input_shape, dtype="float32", name="code_in")

    # The first LSTM returns the whole sequence so that it can feed the
    # second one, which only returns its last output.
    sequence = LSTM(embedding_dim,
                    implementation=1,
                    return_sequences=True,
                    name="lstm_1")(code_in)
    encoded = LSTM(embedding_dim, implementation=1, name="lstm_2")(sequence)

    # Classification head: outputs a 1-of-`classes` prediction.
    normalized = BatchNormalization()(encoded)
    features = Dense(hidden_units, activation="relu")(normalized)
    probabilities = Dense(classes, activation="softmax")(features)

    network = Model(inputs=code_in, outputs=probabilities)
    network.compile(loss=categorical_crossentropy,
                    optimizer=Adam(learning_rate=0.0001),
                    metrics=['accuracy'])

    return network

In [None]:
def build_model_cnn_1D(input_shape, classes):
    """Build and compile the 1-D convolutional classifier.

    Args:
        input_shape: Shape of a single sample (without the batch axis).
        classes: Number of output classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Sequential`` model trained with categorical
        cross-entropy and reporting accuracy.
    """
    layer_sizes = [3]
    # The first convolution uses the same value as kernel width and stride.
    window = sum(layer_sizes)

    network = Sequential([
        Conv1D(filters=16,
               kernel_size=window,
               strides=window,
               activation="relu",
               padding='same',
               input_shape=input_shape),
        MaxPool1D(pool_size=2, strides=2),
        Conv1D(filters=32, kernel_size=5, strides=1, activation="relu"),
        Flatten(),
        Dense(units=128, activation="relu"),
        Dropout(rate=0.5),
        Dense(units=classes, activation='softmax'),
    ])

    network.compile(loss=categorical_crossentropy,
                    optimizer=Adam(learning_rate=0.0001),
                    metrics=['accuracy'])

    return network

In [None]:
def build_model_cnn_2D(input_shape, classes):
    """Build and compile the 2-D convolutional classifier.

    Args:
        input_shape: Shape of a single sample (without the batch axis).
        classes: Number of output classes (size of the softmax layer).

    Returns:
        A compiled Keras ``Sequential`` model trained with categorical
        cross-entropy and reporting accuracy.
    """
    layer_sizes = [3]
    # The first convolution uses the same value, on both axes, as kernel
    # size and stride.
    side = sum(layer_sizes)
    window = (side, side)

    network = Sequential([
        Conv2D(filters=16,
               kernel_size=window,
               strides=window,
               activation="relu",
               padding='same',
               input_shape=input_shape),
        MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        Conv2D(filters=32,
               kernel_size=(5, 5),
               strides=(1, 1),
               activation="relu"),
        Flatten(),
        Dense(units=128, activation="relu"),
        Dropout(rate=0.5),
        Dense(units=classes, activation='softmax'),
    ])

    network.compile(loss=categorical_crossentropy,
                    optimizer=Adam(learning_rate=0.0001),
                    metrics=['accuracy'])

    return network

In [None]:
def build_model_rf():
    """Return an unfitted random forest classifier with a fixed seed."""
    return RandomForestClassifier(random_state=0)

In [None]:
def build_model_svm():
    """Return an unfitted linear SVM classifier with a fixed seed."""
    return LinearSVC(tol=1e-5, random_state=0)

In [None]:
def build_model_knn():
    """Return an unfitted 15-nearest-neighbors classifier (uniform weights)."""
    return KNeighborsClassifier(weights='uniform', n_neighbors=15)

In [None]:
def build_model_lr():
    """Return an unfitted logistic regression classifier with ``C=1e5``."""
    return LogisticRegression(C=1e5)

In [None]:
def build_model_mlp():
    """Return an unfitted MLP classifier (fixed seed, at most 300 iterations)."""
    return MLPClassifier(max_iter=300, random_state=1)

In [None]:
def load_dataset(dataset_directory, classes, percentage):
    """Load a random subset of the ``.npz`` samples of every class.

    The samples of class ``label`` (1-based) are read from
    ``<dataset_directory>/<label>/*.npz``; each file must contain an array
    stored under the key ``values``.

    Args:
        dataset_directory: Root directory holding one sub-directory per label.
        classes: Number of classes; labels ``1..classes`` are scanned.
        percentage: Percentage (0-100) of each class' files to load. The
            per-class quota is rounded down to a whole number of samples.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays: the loaded samples and their
        zero-based class labels (``label - 1``).
    """
    X = []
    y = []
    for label in range(1, classes + 1):
        ddir = os.path.join(dataset_directory, str(label))
        samples = gl.glob('{}/*.npz'.format(ddir))
        random.shuffle(samples)

        # Whole number of samples wanted from this class. Using an integer
        # quota (and ">=" below) makes the limit effective even when the
        # percentage does not divide the sample count evenly, or is zero.
        total = int(len(samples) * percentage / 100)

        counter = 0
        for sample in samples:
            if counter >= total:
                break

            try:
                # The context manager closes the underlying archive.
                with np.load(sample) as data:
                    x_val = data['values']
            except Exception:
                # Best effort: report the unreadable file and try the next
                # one, so the quota can still be met with other samples.
                print('Erro: ', sample)
                continue

            X.append(x_val)
            y.append(label - 1)
            counter += 1

    X = np.array(X)
    y = np.array(y)

    return X, y

In [None]:
# Models backed by scikit-learn estimators; every other model is a Keras network.
_SKLEARN_MODELS = ('lr', 'mlp', 'svm', 'rf', 'knn')


def _make_model_builder(FLAGS, X):
    """Prepare the input for ``FLAGS.model`` and return a model factory.

    Returns:
        A tuple ``(X, builder)``. ``X`` is the input, reshaped with a
        trailing axis of size 1 when the selected Keras model needs it.
        ``builder`` is a zero-argument callable that returns a new, untrained
        model on every call, or ``None`` when the model name (or the input
        rank, for 'cnn') is not supported.
    """
    name = FLAGS.model
    classes = FLAGS.classes

    if name == 'lstm':
        if X.ndim == 2:
            X = X.reshape(X.shape[0], X.shape[1], 1)
            embedding_dim = X[0].shape[0]
        else:
            embedding_dim = X[0].shape[1]
        input_shape = X[0].shape
        return X, lambda: build_model_lstm(input_shape, embedding_dim, classes)

    if name == 'cnn':
        if X.ndim == 2:
            X = X.reshape(X.shape[0], X.shape[1], 1)
            input_shape = X[0].shape
            return X, lambda: build_model_cnn_1D(input_shape, classes)
        if X.ndim == 3:
            X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
            input_shape = X[0].shape
            return X, lambda: build_model_cnn_2D(input_shape, classes)
        # No CNN variant exists for this input rank.
        return X, None

    sklearn_builders = {
        'rf': build_model_rf,
        'svm': build_model_svm,
        'knn': build_model_knn,
        'lr': build_model_lr,
        'mlp': build_model_mlp,
    }
    return X, sklearn_builders.get(name)


def execute(FLAGS):
    """Run a stratified K-fold train/test experiment and store the results.

    Loads the dataset, optionally rescales it, and then, for each of the
    ``FLAGS.rounds`` folds, trains a fresh model, predicts the test fold and
    writes the statistics, predictions, ground truth and elapsed times into
    ``FLAGS.results_directory`` (files suffixed with the round index).

    Args:
        FLAGS: Run configuration (see ``Flags``).

    Raises:
        SystemExit: If the dataset is empty or the model is not supported.
    """

    # Breakdown the runtime
    flags_times = {}

    is_sklearn = FLAGS.model in _SKLEARN_MODELS

    #
    # Load the dataset
    #
    print('\nLoading the dataset ...')
    start = time.time()

    X, y = load_dataset(FLAGS.train_dataset_directory, FLAGS.classes, FLAGS.train_p)

    if len(X) == 0:
        logging.error('Dataset error.')
        sys.exit(1)

    #
    # Scaling the data
    #
    # NOTE(review): the scaler is fitted on the whole dataset, before the
    # train/test split - confirm this is intended.
    if FLAGS.scaler:
        scaler = MinMaxScaler()
        X = scaler.fit_transform(X)

    end = time.time()

    # Store load time
    flags_times['loading'] = end - start

    #
    # Build the model
    #
    print('\nBuilding the model ...')

    X, build_model = _make_model_builder(FLAGS, X)
    if build_model is None:
        logging.error('Model error.')
        sys.exit(1)

    model = build_model()

    if FLAGS.print_model and not is_sklearn:
        print()
        model.summary()

    #
    # Create the output directory
    #
    os.makedirs(FLAGS.results_directory, exist_ok=True)

    #
    # Training and Test
    #

    es_callback = EarlyStopping(monitor="accuracy",
                                patience=FLAGS.patience,
                                restore_best_weights=True)

    kf = StratifiedKFold(n_splits=FLAGS.rounds, shuffle=True, random_state=42)
    for i, (train_idx, test_idx) in enumerate(kf.split(X, y)):

        print('\n====>>> ROUND: {}'.format(i))

        # Every round starts from an untrained model. Reusing the Keras
        # model of the previous round would keep weights that were already
        # fitted on samples of the current test fold.
        if i > 0:
            model = build_model()

        #
        # Training
        #
        print('\nTraining ...')
        start = time.time()

        if is_sklearn:
            history = model.fit(X[train_idx],
                                y[train_idx])
        else:
            # One-hot targets with exactly `classes` columns, matching the
            # softmax output layer even if a class is absent from the fold.
            y_train = np.eye(FLAGS.classes, dtype='float32')[y[train_idx]]
            history = model.fit(X[train_idx],
                                y_train,
                                epochs=FLAGS.epochs,
                                verbose=1 if FLAGS.verbose else 0,
                                shuffle=True,
                                callbacks=[es_callback])

        end = time.time()

        # Store the training time
        flags_times['training_{}'.format(i)] = end - start

        if not FLAGS.verbose and not is_sklearn:
            hist = pd.DataFrame(history.history)
            print(hist.tail())

        #
        # Predicting
        #
        print('\nPredicting ...')
        start = time.time()

        if is_sklearn:
            # `predict` returns the class labels directly and, unlike
            # `predict_proba`, is available on every estimator used here
            # (LinearSVC has no `predict_proba`).
            y_pred = model.predict(X[test_idx])
        else:
            y_pred = model.predict(X[test_idx]).argmax(axis=-1)

        end = time.time()

        # Store the predicting time
        flags_times['predicting_{}'.format(i)] = end - start

        #
        # Statistic
        #
        print('\nCalculating statistics ...')
        acc = accuracy_score(y[test_idx], y_pred)
        cm = confusion_matrix(y[test_idx], y_pred)
        cr = classification_report(y[test_idx], y_pred)

        print('\nAccuracy:', acc)

        if FLAGS.print_cm:
            print('\nConfusion matrix')
            print(cm)

        if FLAGS.print_cr:
            print('\nClassification report')
            print(cr)

        #
        # Finalize the execution
        #
        print('\nStoring the results ...')

        # Store the history
        # NOTE(review): this stores the Keras History object itself (as a
        # pickled object array), not `history.history` - confirm readers of
        # this file expect that.
        if not is_sklearn:
            np.savez_compressed('{}/history_{}'.format(FLAGS.results_directory, i), values=history)

        # Store the statistics
        np.savez_compressed('{}/statistics_{}'.format(FLAGS.results_directory, i), cm=cm, cr=cr, acc=acc)

        # Store the prediction
        np.savez_compressed('{}/y_pred_{}'.format(FLAGS.results_directory, i), values=y_pred)

        # Store y_test
        np.savez_compressed('{}/y_test_{}'.format(FLAGS.results_directory, i), values=y[test_idx])

        # Store the elapsed time (accumulated over all rounds so far)
        with open('{}/elapsed_time_{}.yaml'.format(FLAGS.results_directory, i), 'w') as fout:
            yl.dump(flags_times, fout)


**MAIN**

In [None]:
# 
# Flags
#
flags = Flags(
    dataset="ojclone32",
    representation="inst2vec",
    train_dataset_directory="",
    train_p=100,
    classes=32,
    results_directory="/content/drive/My Drive/CGO2022",
    scaler=False,
    model="model1",
    patience=50,
    epochs=500,
    rounds=5,
    verbose=True,
    print_model=False,
    print_cm=False,
    print_cr=False
)
flags.train_dataset_directory = flags.representation
flags.results_directory = os.path.join(flags.results_directory, flags.model, flags.representation)

#
# Get the dataset
#
wget = 'wget www.csl.uem.br/repository/data/{}/{}.tar.xz'.format(flags.dataset, flags.representation)
tar = 'tar xfJ {}.tar.xz'.format(flags.representation)
!$wget
!$tar

#
# Open the Drive
#
drive.mount('/content/drive')

#
# Execute
#
execute(flags)

#
# Flush the Driver
#
drive.flush_and_unmount()

#
# Remove the dataset
#
rm = 'rm -rf {}*'.format(flags.dataset_directory)
!$rm