In [None]:
!pip install transformers
!pip install tensorflow[and-cuda]==2.17.0
!pip install tf-keras==2.17.0
!pip install pandas
!pip install tensorflow_datasets
!pip install tensorrt==10.0.1

In [None]:
!wget -O MADE-WIC.zip https://zenodo.org/records/13370805/files/MADE-WIC.zip?download=1
!unzip -j MADE-WIC.zip "MADE-WIC/Dataset/devign/*" -d devign
!rm MADE-WIC.zip

In [1]:
import tensorflow as tf
from transformers import TFRobertaModel, RobertaTokenizer
from tf_keras import Model
from tf_keras.layers import Dense, Input, Dropout
from tf_keras.regularizers import L2
from tf_keras.metrics import Precision, Recall
from tf_keras.losses import BinaryCrossentropy
from tf_keras.optimizers import AdamW
import tensorflow_datasets as tfds
import pandas as pd

2025-01-16 15:24:41.045417: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-16 15:24:41.060249: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-16 15:24:41.075167: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-16 15:24:41.079721: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 15:24:41.093424: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Show the GPUs TensorFlow can see; an empty list means none were detected.
tf.config.list_physical_devices(device_type='GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
# --- Hyperparameters ---------------------------------------------------------
dropout_prob = 0.1      # dropout rate applied to the pooled embedding
l2_reg_lambda = 0.2     # L2 penalty on the classification head's kernel and bias
learning_rate = 2e-5
num_epochs = 10
batch_size = 16
max_length = 512        # token sequence length; shared by the tokenizer and the model inputs

# --- Pretrained CodeBERT tokenizer and encoder -------------------------------
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
encoder = TFRobertaModel.from_pretrained("microsoft/codebert-base")

# The input shapes are derived from `max_length` so that they always match the
# padded/truncated sequences produced when the examples are tokenized.
input_ids = Input(shape=(max_length,), dtype='int32', name='input_ids')
attention_mask = Input(shape=(max_length,), dtype='int32', name='attention_mask')

# Keep the encoder, its outputs and the final classifier under distinct names;
# only the finished classifier is called `model`.
encoder_outputs = encoder([input_ids, attention_mask])

# Use the hidden state at sequence position 0 as the representation of the whole input.
pooled_embedding = encoder_outputs.last_hidden_state[:, 0, :]
pooled_embedding = Dropout(dropout_prob)(pooled_embedding)

# --- Binary classification head ----------------------------------------------
output = Dense(1,
               kernel_initializer='glorot_normal',
               kernel_regularizer=L2(l2_reg_lambda),
               bias_regularizer=L2(l2_reg_lambda),
               activation='sigmoid',
               name='satd')(pooled_embedding)

model = Model(inputs=[input_ids, attention_mask], outputs=output)

model.compile(loss=BinaryCrossentropy(),
              optimizer=AdamW(learning_rate),
              metrics=['accuracy', Precision(), Recall()])

2025-01-16 15:25:31.907195: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2021] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 62651 MB memory:  -> device: 0, name: NVIDIA A100-SXM-64GB, pci bus id: 0000:c8:00.0, compute capability: 8.0
All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at microsoft/codebert-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [4]:
# Load the devign CSV and replace every missing value with an empty string.
# `fillna` already covers both NaN and None, so no separate `replace` pass is
# needed, and chaining avoids mutating the frame in place.
df = pd.read_csv('devign/complete.csv').fillna('')

# Pair each function's source text with its label.
dataset = tf.data.Dataset.from_tensor_slices((df['Function'], df['Devign']))
num_samples = len(dataset)

# 80% / 10% / remainder split, taken in file order.
train_size = int(num_samples * 0.8)
validation_size = int(num_samples * 0.1)

train_ds = dataset.take(train_size)
validation_ds = dataset.skip(train_size).take(validation_size)
# Start the test split exactly where validation ends. Skipping
# int(num_samples * 0.9) instead can leave a gap, because the two truncated
# sizes above do not always add up to that value.
test_ds = dataset.skip(train_size + validation_size)

# Every sample must land in exactly one split.
assert len(train_ds) + len(validation_ds) + len(test_ds) == num_samples

print('Samples in train dataset:', len(train_ds))
print('Samples in validation dataset:', len(validation_ds))
print('Samples in test dataset:', len(test_ds))

Samples in train dataset: 21825
Samples in validation dataset: 2728
Samples in test dataset: 2729


In [12]:
def encode_examples(tokenizer, ds):
    """Tokenize every (code, label) pair of ``ds`` into model-ready tensors.

    Args:
        tokenizer: tokenizer exposing ``encode_plus``.
        ds: dataset yielding ``(code, label)`` pairs, where ``code`` is a bytes
            object once converted with ``tfds.as_numpy``.

    Returns:
        A tuple ``(features, labels)``: ``features`` is a dict holding the
        ``'input_ids'`` and ``'attention_mask'`` tensors, ``labels`` is a tensor
        of the labels in iteration order.

    Sequences are padded/truncated to the module-level ``max_length``.
    """
    token_id_rows, mask_rows, labels = [], [], []

    for raw_code, target in tfds.as_numpy(ds):
        # Samples arrive as bytes; decode them before tokenizing.
        encoded = tokenizer.encode_plus(
            raw_code.decode(),
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )
        token_id_rows.append(encoded['input_ids'])
        mask_rows.append(encoded['attention_mask'])
        labels.append(target)

    features = {
        'input_ids': tf.convert_to_tensor(token_id_rows),
        'attention_mask': tf.convert_to_tensor(mask_rows),
    }
    return features, tf.convert_to_tensor(labels)

# Tokenize each split once up front; the resulting tensors are reused for
# training, validation and final evaluation.
train_ds_encoded, train_labels = encode_examples(tokenizer=tokenizer, ds=train_ds)
validation_ds_encoded, validation_labels = encode_examples(tokenizer=tokenizer, ds=validation_ds)
test_ds_encoded, test_labels = encode_examples(tokenizer=tokenizer, ds=test_ds)

2025-01-16 16:36:28.739139: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [6]:
# Fine-tune the classifier on the training split, evaluating on the
# validation split after every epoch.
model.fit(
    x=train_ds_encoded,
    y=train_labels,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(validation_ds_encoded, validation_labels),
)

Epoch 1/10


I0000 00:00:1737037811.768813 3652824 service.cc:146] XLA service 0x14815326e9f0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1737037811.768844 3652824 service.cc:154]   StreamExecutor device (0): NVIDIA A100-SXM-64GB, Compute Capability 8.0
2025-01-16 15:30:11.774833: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-01-16 15:30:12.104659: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907
I0000 00:00:1737037812.177746 3652824 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tf_keras.src.callbacks.History at 0x1492a01f7040>

In [7]:
# Score the held-out test split; each prediction is the sigmoid output of the 'satd' layer.
predictions = model.predict(x=test_ds_encoded)



In [10]:
def calculate_scores(predictions, label):
    """Print the confusion matrix and precision, recall, accuracy and F1.

    Args:
        predictions: one entry per sample. An entry that is a boolean (Python
            ``bool`` or ``numpy.bool_``) is used directly as the predicted
            class; any other entry is treated as a score (a scalar or a
            sequence whose first element is the score) and counted as positive
            when it is ``> 0.5``.
        label: ground-truth labels, one per prediction. Anything
            ``numpy.asarray`` can convert is accepted and extra dimensions are
            flattened. A label is positive when it compares equal to ``True``.

    Raises:
        ValueError: if the number of labels differs from the number of
            predictions.

    Nothing is returned; results are printed. A metric whose denominator is
    zero is reported as -1.
    """
    # Imported locally so the function does not rely on a notebook-level numpy import.
    import numpy as np

    # Converting through numpy gives every supported label container the same
    # flat, indexable form.
    labels = np.asarray(label).reshape(-1)

    if len(labels) != len(predictions):
        raise ValueError(
            "predictions and label must have the same length, got "
            f"{len(predictions)} and {len(labels)}"
        )

    def _is_positive(entry):
        """Return the predicted class of a single prediction entry."""
        # numpy booleans are not instances of the built-in bool, so check both.
        if isinstance(entry, (bool, np.bool_)):
            return bool(entry)
        # Accept a bare score as well as a one-element score vector.
        return bool(np.asarray(entry).reshape(-1)[0] > 0.5)

    tp = tn = fp = fn = 0

    for entry, truth in zip(predictions, labels):
        predicted = _is_positive(entry)
        actual = bool(truth == True)

        if actual and predicted:
            tp += 1
        elif actual:
            fn += 1
        elif predicted:
            fp += 1
        else:
            tn += 1

    print("TP:", tp)
    print("TN:", tn)
    print("FP:", fp)
    print("FN:", fn)

    total = tp + tn + fp + fn
    precision = tp / (tp + fp) if tp + fp > 0 else -1
    recall = tp / (tp + fn) if tp + fn > 0 else -1
    # Guard the empty case the same way as the other metrics.
    accuracy = (tp + tn) / total if total > 0 else -1
    f1 = 2 * ((precision * recall) / (precision + recall)) if precision + recall > 0 else -1

    print("\nPrecision:", precision)
    print("Recall:", recall)
    print("Accuracy:", accuracy)
    print("F1:", f1)

In [11]:
# Report confusion-matrix counts and derived metrics on the test split.
calculate_scores(predictions=predictions, label=test_labels)

TP: 598
TN: 1110
FP: 366
FN: 655

Precision: 0.6203319502074689
Recall: 0.4772545889864326
Accuracy: 0.6258702821546354
F1: 0.539467749210645
