# CS5814: Assignment 2

## Problem 3

Add description

### Imports and config

In [1]:
import json
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AdamW, AutoTokenizer, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Setting Random Seed
RANDOM_SEED = 3819969

torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
torch.cuda.empty_cache()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


### Preprocessing Data

Add description

In [4]:
def preprocess_data(file_loc='code_dataset.jsonl', generate_histogram=False):
    """
    Loads and processing the jsonl file,

    :param file_loc: location of target jsonl file
    :param generate_histogram: Flag to display histogram of function lengths
    :return: dataframe of preprocessed jsons
    """

    with open(file_loc, 'r') as json_file:
        json_list = list(json_file)

    code_list = []
    for json_str in json_list:
        result = json.loads(json_str)
        code_list.append(result)

    code_df = pd.DataFrame(code_list)

    total = code_df['target'].sum()
    proportion = total / code_df.shape[0]

    print("Insecure code counts: {}, Total code counts: {}, Proportion {}".format(total, code_df.shape[0], proportion))

    if generate_histogram:
        plt.hist(code_df['func'].str.len(), bins=100)
        plt.show()

    return code_df

In [5]:
def split_data(input_data, attention_data, label_data, train_ratio=0.8, val_ratio=0.10, max_len=512):
    """
    Splits data in accordance with provdied ratios, additionally discards functions with > max_len tokens
        as these will not be processed by the model will (can truncate, yet may truncate the error in the code)

    :param input_data: input functions
    :param attention_data: attention map
    :param label_data: target labels
    :param train_ratio: ratio of data to train on
    :param val_ratio: ratio of data to validate with (test is inferred from this and train)
    :param max_len: max number of tokens allowed for training date

    :return: 3 tuples for train val and test containing (input, attention, target)
    """
    # Removing excessively long elements from dataset
    valid_token_index = [i for i in range(len(input_data)) if len(input_data[i]) <= max_len]
    X_data = np.array(input_data)[valid_token_index]
    A_data = np.array(attention_data)[valid_token_index]
    Y_data = np.array(label_data)[valid_token_index]

    dataset_size = len(X_data)

    # Determining index to split dataset
    random_id = random.sample(range(dataset_size), dataset_size)
    train_split_id = int(train_ratio * dataset_size)
    val_split_id = int((train_ratio + val_ratio) * dataset_size)

    train_ids = random_id[:train_split_id]
    val_ids = random_id[train_split_id:val_split_id]
    test_ids = random_id[val_split_id:]

    X_train = torch.tensor(list(X_data[train_ids]))
    A_train = torch.tensor(list(A_data[train_ids]))
    Y_train = torch.tensor(list(Y_data[train_ids]))

    X_val = torch.tensor(list(X_data[val_ids]))
    A_val = torch.tensor(list(A_data[val_ids]))
    Y_val = torch.tensor(list(Y_data[val_ids]))

    X_test = torch.tensor(list(X_data[test_ids]))
    A_test = torch.tensor(list(A_data[test_ids]))
    Y_test = torch.tensor(list(Y_data[test_ids]))

    return (X_train, A_train, Y_train), (X_val, A_val, Y_val), (X_test, A_test, Y_test)

In [6]:
def tokenize(code_df, model_name='codebert-base'):
    """
    Apply the tokenizer from the huggingface pretrained model

    :param code_df: dataframe of preprocess code (from jsonl)
    :param model_name: model name (targeting local install)
    :return: 3 tuples for train val and test containing (input, attention, target)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    inputs = tokenizer(code_df['func'].tolist(), truncation=False, padding='max_length')

    input_data = inputs['input_ids']
    attention_data = inputs['attention_mask']
    label_data = torch.tensor(code_df['target'].tolist())  # TODO - this can be directly converted to a np array

    return split_data(input_data, attention_data, label_data, max_len=512)

## Fine Tuning

Add Description

In [7]:
def train(model, train_data, val_data, epochs=5, batch_size=16, learning_rate=2e-5, validate_per=500,
          run_name="temp", run_descrption=None):
    """
    Main fine-tuning training loop for the provided model

    :param model: model loaded with predefined weights
    :param train_data: tuple of X_train, A_train, Y_train (X = inputs, A = attention, Y = target)
    :param val_data: tuple X_val, A_val, Y_val
    :param epochs: number of epochs for training
    :param batch_size: batch size (see note below about batch_hack)
    :param learning_rate: optimizer learning rate
    :param validate_per: number of weight updates before validation occurs
                            (notes: - if batch_size = 32, and validate_per = 32, validation will occur every batch
                                    - this is wrt the start of each epoch
                                    - validation will always occour at the start of each epoch (step 0))
    :param run_name: name used to saving checkpoints and log files within codebert_finetune_runs
    :param run_descrption: string that is saved to info.txt describing the run


    :return: None (models are saved in checkpoints along with log data)
    """

    # Creating dir to save logs and checkpoints, re
    dir_name = "codebert_finetune_runs/{}".format(run_name)
    if os.path.exists(dir_name):
        print("run name already exists, exiting to prevent overwriting")
        return 0
    else:
        os.makedirs(dir_name)

    # Saving run description.txt
    if run_descrption is not None:
        with open("{}/info.txt".format(dir_name), "a+") as f:
            f.write(run_descrption)

    # Unpacking data
    X_train, A_train, Y_train = train_data
    X_val, A_val, Y_val = val_data

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    batch_hack = batch_size  # See note below regarding limited GPU memory

    # Initializing arrays for tracking loss
    train_loss_hist = []
    val_loss_hist = []

    # Counter to track batches (see note below related to GPU memory)
    batch_count = 0
    # validate_per_batch = int(validate_per/batch_hack)

    # Moving model to GPU if configured
    model = model.to(device)
    for epoch in range(epochs):

        # Generating random index for manual shuffling of data each epoch as note using DataLoaders
        permutation = torch.randperm(X_train.shape[0])

        # Note here that only a single element is loaded at each iteration (batch size = 1) due to GPU memory constraint
        for i in range(0, X_train.shape[0], 1):

            # Loading batch and moving to device
            indices = permutation[i:i + 1]
            batch_X, batch_Y = X_train[indices].to(device), Y_train[indices].to(device), \

            model.train()

            # Forward pass
            outputs = model(batch_X, labels=batch_Y)

            # Backward pass
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Tracking loss
            train_loss_hist.append(loss.item())

            # Training output
            train_output = "Epoch:{} Step:{} Training_loss:{:.6f}".format(epoch, i, loss.item())
            print(train_output+" Training_loss_avg:{:.6f}".format(np.average(train_loss_hist[-50:])))
            with open("{}/train_loss.txt".format(dir_name), "a+") as f:
                f.write(train_output+"\n")

            # Validation
            if i % validate_per == 0:
                val_loss_total = 0
                model.eval()
                print("Validating:")
                for j in tqdm(range(0, X_val.shape[0], 1)):
                    # Loading singular validation data (overwrites train data as can only load 1 intp GPU)
                    batch_X, batch_Y = X_val[j].to(device).reshape(1, -1), Y_val[j].to(device).reshape(1, -1)
                    with torch.no_grad():
                        val_outputs = model(batch_X, labels=batch_Y)
                    val_loss_total += val_outputs['loss'].item()

                # Adding average loss to tracker
                val_average = val_loss_total / (X_val.shape[0])
                val_loss_hist.append(val_average)

                # Validation output and logging
                val_output = "Epoch:{} Step:{} Val_loss:{:.6f}".format(epoch, i, val_average)
                print(val_output)
                with open("{}/val_los.txt".format(dir_name), "a+") as f:
                    f.write(val_output+"\n")

        # End of epoch checkpoint
        model.save_pretrained("{}/epoch_{}".format(dir_name, epoch + 1))

In [8]:
def main():
    """
    Main configuration function for a given finetune run
    :return: None
    """

    run_name = "test3"
    model_name = 'codebert-base'
    checkpoint_location = None

    code_df = preprocess_data(file_loc='code_dataset.jsonl')
    train_data, val_data, test_data = tokenize(code_df, model_name=model_name)

    # Loading model from checkpoint if location provided
    if checkpoint_location is None:
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(checkpoint_location)

    train(model=model,
          train_data=train_data,
          val_data=val_data,
          epochs=5,
          batch_size=1,
          learning_rate=1e-4,
          validate_per=500,
          run_name=run_name,
          run_descrption="Fixed validation bug, lr=1e-4, validate per 500, batch 1")

In [None]:
main()

Insecure code counts: 3729, Total code counts: 8000, Proportion 0.466125


  X_data = np.array(input_data)[valid_token_index]
  A_data = np.array(attention_data)[valid_token_index]
Some weights of the model checkpoint at codebert-base were not used when initializing RobertaForSequenceClassification: ['pooler.dense.bias', 'pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifi

Epoch:0 Step:0 Training_loss:0.432128 Training_loss_avg:0.432128
Validating:


100%|████████████████████████████████████████████████████████████████████████████████| 416/416 [00:35<00:00, 11.59it/s]


Epoch:0 Step:0 Val_loss:0.891982
Epoch:0 Step:1 Training_loss:0.210745 Training_loss_avg:0.321437
Epoch:0 Step:2 Training_loss:2.511854 Training_loss_avg:1.051576
Epoch:0 Step:3 Training_loss:0.122689 Training_loss_avg:0.819354
Epoch:0 Step:4 Training_loss:0.130322 Training_loss_avg:0.681548
Epoch:0 Step:5 Training_loss:1.907819 Training_loss_avg:0.885926
Epoch:0 Step:6 Training_loss:1.492967 Training_loss_avg:0.972646
Epoch:0 Step:7 Training_loss:0.482240 Training_loss_avg:0.911345
Epoch:0 Step:8 Training_loss:0.586573 Training_loss_avg:0.875260
Epoch:0 Step:9 Training_loss:0.628762 Training_loss_avg:0.850610
Epoch:0 Step:10 Training_loss:0.569177 Training_loss_avg:0.825025
Epoch:0 Step:11 Training_loss:0.559368 Training_loss_avg:0.802887
Epoch:0 Step:12 Training_loss:0.386136 Training_loss_avg:0.770829
Epoch:0 Step:13 Training_loss:0.275752 Training_loss_avg:0.735467
Epoch:0 Step:14 Training_loss:1.955227 Training_loss_avg:0.816784
Epoch:0 Step:15 Training_loss:0.216072 Training_loss

Epoch:0 Step:125 Training_loss:0.659448 Training_loss_avg:0.766927
Epoch:0 Step:126 Training_loss:0.848518 Training_loss_avg:0.770740
Epoch:0 Step:127 Training_loss:0.922362 Training_loss_avg:0.772488
Epoch:0 Step:128 Training_loss:0.510374 Training_loss_avg:0.766284
Epoch:0 Step:129 Training_loss:1.050969 Training_loss_avg:0.770146
Epoch:0 Step:130 Training_loss:0.457420 Training_loss_avg:0.767798
Epoch:0 Step:131 Training_loss:1.046237 Training_loss_avg:0.777140
Epoch:0 Step:132 Training_loss:1.067309 Training_loss_avg:0.787320
Epoch:0 Step:133 Training_loss:0.402990 Training_loss_avg:0.784544
Epoch:0 Step:134 Training_loss:0.447087 Training_loss_avg:0.783404
Epoch:0 Step:135 Training_loss:1.121124 Training_loss_avg:0.796044
Epoch:0 Step:136 Training_loss:0.396352 Training_loss_avg:0.795061
Epoch:0 Step:137 Training_loss:1.045407 Training_loss_avg:0.808197
Epoch:0 Step:138 Training_loss:1.087401 Training_loss_avg:0.792831
Epoch:0 Step:139 Training_loss:1.352401 Training_loss_avg:0.81