In [11]:
import gc
import os
import os.path
import random
import re
from time import time
from typing import Tuple, Union, Dict

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import ShuffleSplit
from torch import Tensor, optim, nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [12]:
DEVICE_TYPE = "cuda:0"
MODEL_TYPE = "mrm8488/codebert-base-finetuned-detect-insecure-code"
PATH_TO_DATASET = "dataset"
BATCH_SIZE = 1
NUM_LABELS = 159
LR = 0.0001
EPOCHS = 5
LOG_FREQUENCY = 1

# Utils and Device Settings

In [13]:
SEPARATOR = {"stars": "".join(["*"] * 100), "dashes": "".join(["-"] * 100), "dots": "".join(["."] * 100)}


def get_device(device_type: str) -> torch.device:
    if device_type == "cpu":
        print("\n Running on device 'cpu' \n")
        return torch.device("cpu")

    if re.match(r"\bcuda:\b\d+", device_type):
        if not torch.cuda.is_available():
            print("\n WARNING: running on cpu since device {} is not available \n".format(device_type))
            return torch.device("cpu")

        gc.collect()
        torch.cuda.empty_cache()
        print("\n Running on device '{}' \n".format(device_type))
        return torch.device(device_type)

    raise ValueError("ERROR: {} is not a valid device! Supported device are 'cpu' and 'cuda:n'".format(device_type))


def make_deterministic(seed: int):
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False

In [14]:
DEVICE = get_device(DEVICE_TYPE)





In [15]:
class LossTracker:

    def __init__(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def reset(self):
        self.val, self.avg, self.sum, self.count = 0, 0, 0, 0

    def update(self, val: float, n: int = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


class MetricsTracker:

    def __init__(self):
        super().__init__()
        self.__predictions, self.__true_labels, self.__metrics = [], [], {}
        self.__monitored_metrics = ["accuracy", "precision", "recall", "f1"]

    def compute_metrics(self) -> Dict:
        predictions_flat = np.argmax(np.concatenate(self.__predictions, axis=0), axis=1)
        true_labels_flat = np.concatenate(self.__true_labels, axis=0)
        print(predictions_flat, true_labels_flat)
        self.__metrics = {
            "accuracy": accuracy_score(true_labels_flat, predictions_flat),
            "precision": precision_score(true_labels_flat, predictions_flat, average='macro'),
            "recall": recall_score(true_labels_flat, predictions_flat, average='macro'),
            "f1": f1_score(true_labels_flat, predictions_flat, average='macro')
        }
        return self.__metrics

    def update_best_metrics(self) -> Dict:
        return {m: self.__metrics[m] for m in self.__monitored_metrics}

    def update(self, predictions: np.ndarray, true_labels: np.ndarray):
        self.__predictions.append(predictions)
        self.__true_labels.append(true_labels)

    def reset(self):
        self.__predictions, self.__true_labels = [], []

    def init_best_metrics(self) -> Dict:
        return {m: 0 for m in self.__monitored_metrics}

# Data Handling

In [16]:
class CGTDataset(Dataset):

    def __init__(self, train: bool = True, fold_num: int = 0, data_type: str = "all"):
        self.__data_type = data_type
        self.__items = self.__build_dataset()
        self.__tokenizer = AutoTokenizer.from_pretrained(MODEL_TYPE, padding='max_length')

        ss = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
        train_idx, test_idx = [(i_train, i_test) for i_train, i_test in ss.split(range(len(self.__items)))][fold_num]
        self.__fold_idx = train_idx if train else test_idx

    def __encode(self, text: str):
        return self.__tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    def __make_item(self, dataset: pd.DataFrame):
        item = {}
        path2bytecode = os.path.join(PATH_TO_DATASET, "bytecode", dataset['fp_bytecode'].values[0] + ".hex")
        if (self.__data_type == "all" or self.__data_type == "bytecode") and os.path.exists(path2bytecode):
            item['path2bytecode'] = path2bytecode
        path2runtime = os.path.join(PATH_TO_DATASET, "runtime", dataset['fp_runtime'].values[0] + ".rt.hex")
        if (self.__data_type == "all" or self.__data_type == "runtime") and os.path.exists(path2runtime):
            item['path2runtime'] = path2runtime
        path2source = os.path.join(PATH_TO_DATASET, "source", dataset['fp_sol'].values[0] + ".sol")
        if (self.__data_type == "all" or self.__data_type == "source") and os.path.exists(path2source):
            item['path2source'] = path2source
        return item

    def __build_dataset(self):
        dataset = pd.read_csv(os.path.join(PATH_TO_DATASET, "consolidated.csv"), sep=";")
        gt, items = {"none": 0}, []
        for _, row in dataset.iterrows():
            prop = row["property"].lower() if row['property_holds'] == 't' else 'none'
            if prop not in gt.keys():
                gt[prop] = len(gt.values())
            item = self.__make_item(dataset)
            item["gt"] = gt[prop]
            items.append(item)
        return items

    def __load_f(self, path_to_item: str) -> Tensor:
        with open(path_to_item, 'r') as fp:
            return self.__encode(fp.read())

    def __load_input(self, index: int) -> Dict:
        item = self.__items[index]
        return {k.split("path2")[-1]: self.__load_f(v) for k, v in item.items() if "path2" in k}

    def __load_label(self, index: int) -> Tensor:
        return self.__items[index]['gt']

    def __getitem__(self, index: int) -> Tuple[Dict, Tensor]:
        index = self.__fold_idx[index]
        x, y = self.__load_input(index), self.__load_label(index)
        return x, y

    def __len__(self) -> int:
        return len(self.__fold_idx)


class DataHandler:
    def __init__(self):
        self._dataset = CGTDataset

    def train_test_loaders(self, fold_num: int) -> Tuple:
        return self.get_loader(train=True, fold_num=fold_num), self.get_loader(train=False, fold_num=fold_num)

    def get_loader(self, train: bool, fold_num: int) -> DataLoader:
        dataset = self._dataset(train, fold_num)
        return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=train, pin_memory=True, drop_last=True)

# Model

In [17]:
class BERT(nn.Module):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.__bert = AutoModelForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=NUM_LABELS,
                                                                         ignore_mismatched_sizes=True)

    def forward(self, input_id: Tensor, attention_mask: Tensor, label: Tensor) -> Tensor:
        input_id = input_id.squeeze(1)
        attention_mask = attention_mask.squeeze(1)
        return self.__bert(input_ids=input_id, attention_mask=attention_mask, labels=label)


class ModelBERT:

    def __init__(self):
        self.__optimizer = None
        self.__strategy = BERT().to(DEVICE)

    def predict(self, x: Dict, y: Tensor) -> Union[Tensor, any]:
        x = x["source"]
        input_id, attention_mask = x["input_ids"], x["attention_mask"]
        return self.__strategy(input_id, attention_mask, y)

    def optimize(self, x: Dict, y: Tensor) -> float:
        self.__optimizer.zero_grad()
        pred = self.predict(x, y)
        loss = self.get_loss(pred)
        loss.backward()
        self.__optimizer.step()
        return loss.item()

    @staticmethod
    def get_loss(pred: Dict) -> Tensor:
        return pred["loss"]

    def log_strategy(self, path_to_log: str):
        open(os.path.join(path_to_log, "strategy.txt"), 'a+').write(str(self.__strategy))

    def train_mode(self):
        self.__strategy = self.__strategy.train()

    def eval_mode(self):
        self.__strategy = self.__strategy.eval()

    def save(self, path_to_save: str):
        path_to_pth = os.path.join(path_to_save, "model.pth")
        print("\nSaving model at {}...".format(path_to_pth))
        torch.save(self.__strategy.state_dict(), path_to_pth)
        print("... Model saved successfully!\n")

    def set_optimizer(self, learning_rate: float, optimizer_type: str = "adamw"):
        optimizers_map = {"adam": optim.Adam, "adamw": optim.AdamW, "rmsprop": optim.RMSprop, "sgd": optim.SGD}
        self.__optimizer = optimizers_map[optimizer_type](self.__strategy.parameters(), lr=learning_rate)

# Training

In [18]:
class TrainerBERT:
    def __init__(self, path_to_log: str):
        self.__path_to_log = path_to_log
        self.__metrics_tracker, self.__train_loss, self.__val_loss = MetricsTracker(), LossTracker(), LossTracker()
        self.__best_val_loss, self.__best_metrics = 100.0, self.__metrics_tracker.init_best_metrics()
        self.__path_to_metrics = str(os.path.join(path_to_log, "metrics.csv"))

    @staticmethod
    def __to_device(x: Dict, y: Tensor):
        return {k1: {k2: v2.to(DEVICE) for k2, v2 in v1.items()} for k1, v1 in x.items()}, y.to(DEVICE)

    def __train_epoch(self, model: ModelBERT, data: DataLoader, epoch: int):
        for i, (x, y) in enumerate(data):
            if not "source" in x.keys():
                continue
            x, y = self.__to_device(x, y)
            tl = model.optimize(x, y)
            self.__train_loss.update(tl)
            if i % 5 == 0:
                print("[ Epoch: {} - Batch: {} ] | Loss: {:.4f} ".format(epoch + 1, i, tl))

    def __eval_epoch(self, model: ModelBERT, data: DataLoader):
        for i, (x, y) in enumerate(data):
            x, y = self.__to_device(x, y)
            pred = model.predict(x, y)
            self.__metrics_tracker.update(pred.logits.detach().cpu().numpy(), y.detach().cpu().numpy())
            vl = model.get_loss(pred).item()
            if i % 5 == 0:
                print("[ Batch: {} ] | Loss: {:.4f} ]".format(i, vl))

    def __check_metrics(self):
        """ Computes, prints and logs the current metrics using the metrics tracker """
        epoch_metrics = self.__metrics_tracker.compute_metrics()
        self.__print_metrics(epoch_metrics)
        self.__log_metrics(epoch_metrics)

    def __log_metrics(self, metrics: Dict):
        log_data = pd.DataFrame({"train_loss": [self.__train_loss.avg], "val_loss": [self.__val_loss.avg],
                                 **{"best_" + k: [v] for k, v in self.__best_metrics.items()},
                                 **{k: [v] for k, v in metrics.items()}})
        header = log_data.keys() if not os.path.exists(self.__path_to_metrics) else False
        log_data.to_csv(self.__path_to_metrics, mode='a', header=header, index=False)

    def __print_metrics(self, metrics: Dict):
        for mn, mv in metrics.items():
            print((" {} " + "".join(["."] * (15 - len(mn))) + " : {:.4f} (Best: {:.4f})")
                  .format(mn.capitalize(), mv, self.__best_metrics[mn]))

    def train(self, model: ModelBERT, training_set: DataLoader, test_set: DataLoader):
        """
        Trains the given model (a PyTorch nn.Module) for "epochs" epochs
        :param model: the model to be trained (a PyTorch nn.Module)
        :param training_set: the data loader containing the training data
        :param test_set: the data loader containing the validation/test data
        """
        model.log_strategy(self.__path_to_log)
        model.set_optimizer(LR)

        for epoch in range(EPOCHS):

            model.train_mode()
            self.__train_loss.reset()
            self.print_heading("training", epoch, EPOCHS)

            start = time()
            self.__train_epoch(model, training_set, epoch)
            self.print_train_performance(train_time=time() - start)

            if epoch % LOG_FREQUENCY == 0:
                model.eval_mode()
                self.__val_loss.reset()
                self.__metrics_tracker.reset()
                self.print_heading("validating", epoch, EPOCHS)

                start = time()
                with torch.no_grad():
                    self.__eval_epoch(model, test_set)
                self.print_val_performance(val_time=time() - start)

                self.__check_metrics()
                self.__check_if_best_model(model)

    def __check_if_best_model(self, model: ModelBERT):
        """
        Checks whether the provides model is the new best model based on the values of the validation loss.
        If yes, updates the best metrics and validation loss (as side effect) and saves the model to file
        :param model: the model to be possibly saved as new best model
        """
        if 0 < self.__val_loss.avg < self.__best_val_loss:
            self.__best_val_loss = self.__val_loss.avg
            self.__best_metrics = self.__metrics_tracker.update_best_metrics()
            print("\n -> Saving new best model...")
            model.save(self.__path_to_log)

    def print_train_performance(self, train_time: float):
        """
        Prints the training time/loss for the most recent epoch
        :param train_time: the training time for the most recent epoch
        """
        print("\n" + SEPARATOR["stars"])
        print(" Train Time ... : {:.4f}".format(train_time))
        print(" Train Loss ... : {:.4f}".format(self.__train_loss.avg))
        print(SEPARATOR["stars"])

    def print_val_performance(self, val_time: float):
        """
        Prints the validation time/loss for the most recent epoch
        :param val_time: the validation time for the most recent epoch
        """
        print("\n" + SEPARATOR["stars"])
        print(" Val Time ... : {:.4f}".format(val_time))
        print(" Val Loss ... : {:.4f}".format(self.__val_loss.avg))
        print(SEPARATOR["stars"] + "\n")

    @staticmethod
    def print_heading(mode: str, epoch: int, epochs: int):
        print("\n" + SEPARATOR["dashes"])
        print("\t\t {} epoch {}/{}".format(mode.upper(), epoch + 1, epochs))
        print(SEPARATOR["dashes"] + "\n")

# Perform CV

In [19]:
make_deterministic(0)

log_dir = "{}".format(time())
path_to_log = os.path.join("logs", log_dir)
os.makedirs(path_to_log)

for fold_num in range(5):
    print("\n Loading data for fold '{}'".format(fold_num + 1))
    train_loader, test_loader = DataHandler().train_test_loaders(fold_num)
    model = ModelBERT()

    print("\n" + SEPARATOR["dashes"])
    print("\t\t Training fold {}".format(fold_num + 1))
    print(SEPARATOR["dashes"] + "\n")

    trainer = TrainerBERT(path_to_log)
    trainer.train(model, train_loader, test_loader)


 Loading data for fold '1'


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mrm8488/codebert-base-finetuned-detect-insecure-code and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([159, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([159]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



----------------------------------------------------------------------------------------------------
		 Training fold 1
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
		 TRAINING epoch 1/5
----------------------------------------------------------------------------------------------------

[ Epoch: 1 - Batch: 0 ] | Loss: 5.4192 

****************************************************************************************************
 Train Time ... : 1.9228
 Train Loss ... : 5.4192
****************************************************************************************************

----------------------------------------------------------------------------------------------------
		 VALIDATING epoch 1/5
----------------------------------------------------------------------------------------------------

[ Batch: 0 ] | Loss: 4.2425 ]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at mrm8488/codebert-base-finetuned-detect-insecure-code and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([159, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([159]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



----------------------------------------------------------------------------------------------------
		 Training fold 2
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
		 TRAINING epoch 1/5
----------------------------------------------------------------------------------------------------

[ Epoch: 1 - Batch: 0 ] | Loss: 4.8724 

****************************************************************************************************
 Train Time ... : 1.5889
 Train Loss ... : 4.8724
****************************************************************************************************

----------------------------------------------------------------------------------------------------
		 VALIDATING epoch 1/5
----------------------------------------------------------------------------------------------------

[ Batch: 0 ] | Loss: 5.3857 ]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


[ Epoch: 2 - Batch: 0 ] | Loss: 5.1909 

****************************************************************************************************
 Train Time ... : 2.2849
 Train Loss ... : 5.1909
****************************************************************************************************

----------------------------------------------------------------------------------------------------
		 VALIDATING epoch 2/5
----------------------------------------------------------------------------------------------------

[ Batch: 0 ] | Loss: 4.5204 ]

****************************************************************************************************
 Val Time ... : 0.2874
 Val Loss ... : 0.0000
****************************************************************************************************

[2] [0]
 Accuracy ....... : 0.0000 (Best: 0.0000)
 Precision ...... : 0.0000 (Best: 0.0000)
 Recall ......... : 0.0000 (Best: 0.0000)
 F1 ............. : 0.0000 (Best: 0.0000)

----------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 