In [None]:
!pip install transformers datasets

In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

import transformers
from transformers import AutoTokenizer, RobertaModel, DebertaV2Model, get_scheduler

from datasets import Dataset

from tqdm.notebook import tqdm

In [None]:
# Project folder on the mounted Google Drive; data and model files live below it.
# NOTE: the value ends with "/" and later cells append both "/data/..." and
# "model/...", so some of the resulting paths contain a double slash.
DIR = "/content/drive/MyDrive/code_cyber_sec/"

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed PyTorch's global RNG so dropout masks and any freshly initialised
# weights repeat from run to run.
SEED = 42
torch.manual_seed(SEED)

# Pretrained checkpoint used for both the tokenizer and the encoder.
# MODEL = "deepset/roberta-base-squad2-distilled"
MODEL = "deepset/deberta-v3-large-squad2"

# Tokenizer settings: chunk length in tokens and the overlap between
# consecutive chunks of an over-long text.
MAX_LENGTH = 256
STRIDE = 30

# Optimisation settings: learning rate, weight decay (DR) and epoch count.
# NOTE(review): EPOCHS is reassigned right before the training loop, after the
# scheduler has already been built from the value set here.
LR = 2e-5
DR = 0.01
EPOCHS = 30

## Data Processing

In [None]:
# Read the labelled training set from the project folder.
# (DIR already ends with "/", so this path contains "//".)
train_df = pd.read_csv(DIR+"/data/train.csv")
# Show (rows, columns) as the cell output.
train_df.shape

(1377, 4)

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1377 entries, 0 to 1376
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   ID            1377 non-null   int64 
 1   Text          1377 non-null   object
 2   ContainsCode  1377 non-null   bool  
 3   CodeList      532 non-null    object
dtypes: bool(1), int64(1), object(2)
memory usage: 33.7+ KB


In [None]:
train_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,2,Software development is an exciting field that...,False,
1,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
2,5,"In the world of software development, language...",False,
3,8,Software development is an intricate process t...,False,
4,9,"In this modern era, software development has i...",True,public class HelloWorld { public static void m...


In [None]:
# CodeList is missing for many rows; replace the missing values with "" so the
# span lookup below can recognise them by comparing against the empty string.
train_df['CodeList']= train_df['CodeList'].fillna("")

In [None]:
train_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,2,Software development is an exciting field that...,False,
1,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
2,5,"In the world of software development, language...",False,
3,8,Software development is an intricate process t...,False,
4,9,"In this modern era, software development has i...",True,public class HelloWorld { public static void m...


### Stripping leading and trailing whitespace

In [None]:
# Strip leading/trailing whitespace from every text. This runs before the
# character offsets of the code spans are computed from the same column.
train_df['Text'] = train_df['Text'].str.strip()

### Calculating span start and span end

In [None]:
#function to get the start and end idx of the span
def get_start_end(context, span):
    """
    Locate the first occurrence of `span` inside `context`.

    Args:
        context: text to search in.
        span: substring to look for.

    Returns:
        (start, end) character offsets with `context[start:end] == span`.
        (0, 0) is returned when the span is empty, when it does not occur in
        the context, or when either argument is not a string (e.g. a missing
        value read from the CSV).
    """
    # missing values (None / NaN) and the empty string all mean "no span"
    if not isinstance(context, str) or not isinstance(span, str) or span == "":
        return 0, 0

    # str.find returns the first match, or -1 when there is none
    start = context.find(span)
    if start == -1:
        return 0, 0

    return start, start + len(span)

In [None]:
#function to map the start_end to the dataframe
def map_start_end(df, context_col_name, span_col_name):
    """
    Return a copy of `df` with the character offsets of each row's span added.

    Args:
        df: input frame; it is not modified.
        context_col_name: column holding the text to search in.
        span_col_name: column holding the span to locate in that text.

    Returns:
        A copy of `df` with two extra columns, 'span_start' and 'span_end',
        filled with the values returned by `get_start_end` for each row.
    """
    df = df.copy()

    # Walk the two columns positionally so the function does not depend on the
    # frame having a default 0..n-1 index.
    spans = [
        get_start_end(context, span)
        for context, span in tqdm(
            zip(df[context_col_name], df[span_col_name]), total=df.shape[0]
        )
    ]

    # Assign whole columns at once: this keeps the offsets as integers, whereas
    # creating the columns cell by cell turns them into floats.
    df['span_start'] = [start for start, _ in spans]
    df['span_end'] = [end for _, end in spans]

    return df

In [None]:
# Add the 'span_start' / 'span_end' character offsets of CodeList within Text
# to every row of the training frame.
train_df = map_start_end(train_df, "Text", "CodeList")

  0%|          | 0/1377 [00:00<?, ?it/s]

In [None]:
train_df.sample(5)

Unnamed: 0,ID,Text,ContainsCode,CodeList,span_start,span_end
515,727,"In software development, TypeScript is often u...",True,TypeScript,25.0,35.0
1175,1625,Software development is a journey. It starts w...,False,,0.0,0.0
337,478,"Ruby on Rails, or Rails, is a server-side web ...",False,,0.0,0.0
755,1043,"In software development, choosing the right pr...",False,,0.0,0.0
351,501,Software development is an iterative process. ...,False,,0.0,0.0


### Preparing data for a Hugging Face dataset

In [None]:
# Wrap the pandas frame in a `datasets.Dataset` so it can be tokenized with `.map`.
train_ds = Dataset.from_pandas(train_df)
# Display the dataset summary (features and row count).
train_ds

Dataset({
    features: ['ID', 'Text', 'ContainsCode', 'CodeList', 'span_start', 'span_end'],
    num_rows: 1377
})

### Defining the tokenizer

In [None]:
# Load the tokenizer that belongs to the MODEL checkpoint.
# NOTE(review): the preprocessing function below requests offset mappings,
# which presumably needs a fast tokenizer — confirm that is what is loaded here.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

Downloading (…)okenizer_config.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/8.65M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/18.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

### Function to convert data to model inputs

In [None]:
def process_train_data(examples):
    """
    Tokenize a batch of texts into fixed-length chunks and label each chunk
    with the token positions of its answer span.

    Every text is tokenized and padded to MAX_LENGTH tokens; text that does not
    fit overflows into further chunks that overlap by STRIDE tokens. The
    character span (span_start, span_end) of the source row is then translated
    into token indices through the tokenizer's offset mapping.

    Args:
        examples: batch dict with the columns 'Text', 'span_start' and
            'span_end'.

    Returns:
        The tokenizer output, without 'offset_mapping' and
        'overflow_to_sample_mapping', extended with the lists 'ans_start' and
        'ans_end' (one token index per chunk). Both are 0 when the row has no
        span or when the span is not fully contained in the chunk.
    """
    context = examples['Text']
    answer_start = examples['span_start']
    answer_end = examples['span_end']

    token_data = tokenizer(
        context,
        padding='max_length',
        truncation="only_first",
        max_length=MAX_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        )

    offset_mappings = token_data.pop("offset_mapping")
    # maps every produced chunk back to the row of the batch it was cut from
    sample_map = token_data.pop("overflow_to_sample_mapping")

    token_data['ans_start'] = []
    token_data['ans_end'] = []

    for idx, offset in enumerate(offset_mappings):
        sample_idx = sample_map[idx]
        start, end = answer_start[sample_idx], answer_end[sample_idx]
        seq_ids = token_data.sequence_ids(idx)

        ## Calculating start and end of tokenized context
        # Position 0 is skipped; the context runs from position 1 up to the
        # token just before the next position whose sequence id is None.
        context_start = 1
        context_end = 1
        while seq_ids[context_end] is not None:
            context_end += 1

        context_end -= 1

        ## Calculating indices of answer start token and end token
        # An empty span (end <= start) means the row has no answer. Without this
        # guard such rows pass the containment test in the first chunk and are
        # labelled with the first context token instead of 0, which makes them
        # indistinguishable from a real span sitting on that token.
        no_answer = end <= start
        if no_answer or start<offset[context_start][0] or end>offset[context_end][1]:
            token_data['ans_start'].append(0)
            token_data['ans_end'].append(0)
        else:
            # last token whose first character is at or before the span start
            ans_idx = context_start
            while ans_idx <= context_end and offset[ans_idx][0]<=start:
                ans_idx += 1
            token_data['ans_start'].append(ans_idx-1)

            # first token (scanning backwards) whose last character is at or
            # after the span end
            ans_idx = context_end
            while ans_idx >= context_start and offset[ans_idx][1]>=end:
                ans_idx -=1
            token_data['ans_end'].append(ans_idx+1)

    return token_data

In [None]:
# Tokenize the whole dataset in batches. The original columns are dropped
# because one row may produce several chunks, so only the tokenizer output and
# the answer labels are kept.
train_ds_tok = train_ds.map(process_train_data, batched=True, remove_columns=train_ds.column_names)

Map:   0%|          | 0/1377 [00:00<?, ? examples/s]

In [None]:
len(train_ds), len(train_ds_tok)

(1377, 1377)

## Train-Val split

In [None]:
# Hold out 15% of the tokenized chunks for validation; the seed fixes the split.
# NOTE(review): the split is made on chunks, not on source rows — if a text
# ever overflows into several chunks they could land on both sides.
split_ds = train_ds_tok.train_test_split(test_size=0.15, seed=42)

In [None]:
split_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'ans_start', 'ans_end'],
        num_rows: 1170
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'ans_start', 'ans_end'],
        num_rows: 207
    })
})

## DataLoaders

In [None]:
# Number of chunks per batch for the training and validation loaders.
TRAIN_BATCH_SIZE = 4
VAL_BATCH_SIZE = 4

In [None]:
# Training batches are reshuffled every epoch so the model does not see the
# examples in the same order each time; validation order stays fixed.
train_loader = torch.utils.data.DataLoader(split_ds['train'].with_format("torch"), batch_size=TRAIN_BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(split_ds['test'].with_format("torch"), batch_size=VAL_BATCH_SIZE)

## Model

In [None]:
class RobertaQA(nn.Module):
    """
    Span-extraction model: a RoBERTa encoder loaded from MODEL, followed by
    dropout and a linear layer that scores every token as span start / span end.
    """

    def __init__(self):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(MODEL, add_pooling_layer=False)
        self.dropout = nn.Dropout(0.2)
        # Size the head from the encoder's own config so it matches whichever
        # checkpoint MODEL points to, instead of assuming a width of 768.
        self.linear = nn.Linear(self.roberta.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask):
        """
        Args:
            input_ids: token ids of the batch.
            attention_mask: mask passed through to the encoder.

        Returns:
            (start_logits, end_logits): one score per token for each of the
            two heads; the trailing size-1 dimension is squeezed away.
        """
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)

        #(batch_size, num_tokens, embedding_size)
        hn = outputs['last_hidden_state']
        drop_hn = self.dropout(hn)

        #(batch_size, num_tokens, 2)
        logits = self.linear(drop_hn)

        # separate the two output channels into start and end scores
        start_logits, end_logits = logits.split(1, dim = -1)
        start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
class DebertaQA(nn.Module):
    """
    Span-extraction model: a DeBERTa-v2/v3 encoder loaded from MODEL, followed
    by dropout and a linear layer that scores every token as span start / span end.
    """

    def __init__(self):
        super().__init__()
        self.deberta = DebertaV2Model.from_pretrained(MODEL)
        self.dropout = nn.Dropout(0.2)
        # Size the head from the encoder's own config so it matches whichever
        # checkpoint MODEL points to, instead of assuming a width of 1024.
        self.linear = nn.Linear(self.deberta.config.hidden_size, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Args:
            input_ids: token ids of the batch.
            attention_mask: mask passed through to the encoder.
            token_type_ids: segment ids passed through to the encoder.

        Returns:
            (start_logits, end_logits): one score per token for each of the
            two heads; the trailing size-1 dimension is squeezed away.
        """
        outputs = self.deberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        #(batch_size, num_tokens, embedding_size)
        hn = outputs['last_hidden_state']
        drop_hn = self.dropout(hn)

        #(batch_size, num_tokens, 2)
        logits = self.linear(drop_hn)

        # separate the two output channels into start and end scores
        start_logits, end_logits = logits.split(1, dim = -1)
        start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)

        return start_logits, end_logits

In [None]:
# Build the model; this loads the pretrained encoder weights for MODEL.
model = DebertaQA()
# Replace all weights with a previously saved checkpoint of this model, mapped
# onto the current device.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load(DIR+"/model/deberta_qa.bin", map_location=DEVICE))
# Move the model to DEVICE; as the last expression this also prints the module tree.
model.to(DEVICE)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

DebertaQA(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 1024, padding_idx=0)
      (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-23): 24 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (key_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (value_proj): Linear(in_features=1024, out_features=1024, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-07, elementwise_affine=True)
       

## Loss Function and Optimizer

In [None]:
def loss_fn(start_logits, end_logits, start_y, end_y):
    """
    Average of the cross-entropy losses of the start head and the end head.

    Args:
        start_logits, end_logits: per-token scores of the two heads.
        start_y, end_y: target token indices for the two heads.

    Returns:
        A scalar tensor: (start loss + end loss) / 2.
    """
    criterion = nn.CrossEntropyLoss()

    # score each head against its own targets with the same criterion
    head_losses = [
        criterion(logits, target)
        for logits, target in ((start_logits, start_y), (end_logits, end_y))
    ]

    return sum(head_losses) / 2

In [None]:
# Parameters whose name contains one of these fragments get no weight decay.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
param_optimizer = list(model.named_parameters())

# Partition the parameters, preserving their original order inside each group.
decay_params, no_decay_params = [], []
for param_name, param in param_optimizer:
    if any(fragment in param_name for fragment in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimize_params = [
    {"params": decay_params, "weight_decay": DR},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(optimize_params, lr=LR)

In [None]:
# Total number of optimizer steps: batches per epoch times EPOCHS as defined
# when this cell runs.
num_train_steps = int(len(train_loader) * EPOCHS)

# Cosine learning-rate schedule with the first tenth of the steps used as warm-up.
scheduler = get_scheduler(
    name='cosine',
    optimizer=optimizer,
    num_warmup_steps=num_train_steps // 10,
    num_training_steps=num_train_steps,
)

## Training Loops

In [None]:
def train_one_step(model, data, loss_fn, optimizer):
    """
    Run one optimisation step on a single batch.

    Args:
        model: model called with input ids, attention mask and token type ids.
        data: batch dict; its tensors are moved to DEVICE in place.
        loss_fn: callable taking (start_logits, end_logits, start_y, end_y).
        optimizer: optimizer whose gradients are reset and then stepped.

    Returns:
        The loss tensor of this batch.
    """
    optimizer.zero_grad()

    # move the whole batch onto the target device, updating the dict itself
    for key in list(data.keys()):
        data[key] = data[key].to(DEVICE)

    start_logits, end_logits = model(
        data['input_ids'],
        data['attention_mask'],
        data['token_type_ids'],
    )

    batch_loss = loss_fn(start_logits, end_logits, data['ans_start'], data['ans_end'])
    batch_loss.backward()
    optimizer.step()

    return batch_loss

In [None]:
def train(model, dataloader, loss_fn, optimizer, scheduler=None):
    """
    Train the model for one pass over `dataloader`.

    Args:
        model: model to train; switched to training mode.
        dataloader: iterable of batches.
        loss_fn: loss callable forwarded to `train_one_step`.
        optimizer: optimizer forwarded to `train_one_step`.
        scheduler: optional learning-rate scheduler, stepped once per batch.

    Returns:
        The mean batch loss of the pass (also printed).
    """
    model.train()
    num_batches = len(dataloader)
    running_loss = 0
    progress = tqdm(enumerate(dataloader), total=num_batches)

    for _, batch in progress:
        step_loss = train_one_step(model, batch, loss_fn, optimizer).item()

        if scheduler is not None:
            scheduler.step()

        running_loss += step_loss
        progress.set_postfix({'loss': step_loss})

    loss = running_loss/num_batches
    print(f"Train loss: {loss}")

    return loss

In [None]:
def evaluate(model, dataloader, loss_fn):
    """
    Compute the mean loss of the model over `dataloader` without updating it.

    Args:
        model: model to evaluate; switched to evaluation mode.
        dataloader: iterable of batches; each batch dict is moved to DEVICE in place.
        loss_fn: callable taking (start_logits, end_logits, start_y, end_y).

    Returns:
        The mean batch loss (also printed).
    """
    model.eval()
    num_batches = len(dataloader)
    running_loss = 0
    progress = tqdm(enumerate(dataloader), total=num_batches)

    # gradients are not needed for evaluation
    with torch.no_grad():
        for _, batch in progress:
            for key in list(batch.keys()):
                batch[key] = batch[key].to(DEVICE)

            start_logits, end_logits = model(
                batch['input_ids'],
                batch['attention_mask'],
                batch['token_type_ids'],
            )
            step_loss = loss_fn(
                start_logits, end_logits, batch['ans_start'], batch['ans_end']
            ).item()

            running_loss += step_loss
            progress.set_postfix({'loss': step_loss})

    loss = running_loss/num_batches
    print(f"Val loss: {loss}")

    return loss

## Training

In [None]:
# Number of epochs for the training loop below.
# NOTE(review): the scheduler cell earlier in the notebook computed its step
# budget from the previous EPOCHS value; this reassignment does not change it.
# Confirm that is intended, or re-create the scheduler after changing EPOCHS.
EPOCHS = 10

In [None]:
# Per-epoch history of the mean training and validation losses.
train_loss_lst, val_loss_lst = [], []
# Lowest validation loss seen so far in this run.
best_val_loss = float("inf")

for epoch in range(EPOCHS):
    print(f"Epoch {epoch+1}\n-------------------------------")
    train_loss = train(model, train_loader, loss_fn, optimizer, scheduler)
    val_loss = evaluate(model, val_loader, loss_fn)
    train_loss_lst.append(train_loss)
    val_loss_lst.append(val_loss)

    # save the latest weights after every epoch (overwrites the previous epoch's file)
    torch.save(model.state_dict(), DIR+"model/deberta_qa1.bin")

    # Also keep the weights with the lowest validation loss: the file above is
    # overwritten every epoch, so the best epoch would otherwise be lost when
    # validation loss rises again later in the run.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), DIR+"model/deberta_qa1_best.bin")

Epoch 1
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.0031394382404700853


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.5986802044911417
Epoch 2
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.027805055092376713


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.230112417187347
Epoch 3
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.032336772354584285


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 0.7686983380751301
Epoch 4
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.005339644908395812


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.3065184502558596
Epoch 5
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.000614281501974314


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.4320488575749857
Epoch 6
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.00011729673832269948


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.4562499923523404
Epoch 7
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.07616567984023774


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 0.802512009318574
Epoch 8
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.04115568547465136


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.202567766537623
Epoch 9
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.021972183360543903


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.103103436218832
Epoch 10
-------------------------------


  0%|          | 0/293 [00:00<?, ?it/s]

Train loss: 0.006171216698921636


  0%|          | 0/52 [00:00<?, ?it/s]

Val loss: 1.2349230687820216
