In [None]:
!pip install transformers datasets

In [2]:
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from transformers import AutoTokenizer, RobertaModel, DebertaV2Model

from datasets import Dataset

from tqdm.notebook import tqdm

In [3]:
# Project root holding the data/ and model/ sub-folders used below.
# NOTE(review): the path ends with "/" and every use appends another "/" (e.g. DIR+"/data/train.csv"),
# so the resulting paths contain "//". Presumably Google Drive must be mounted first — confirm.
DIR = "/content/drive/MyDrive/code_cyber_sec/"
# Run on the GPU when one is available, otherwise fall back to the CPU
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face hub identifiers of the two pretrained checkpoints
ROBERTA_MODEL = "deepset/roberta-base-squad2-distilled"
DEBERTA_MODEL = "deepset/deberta-v3-large-squad2"

# Tokenizer settings used at inference: passed as `max_length` and `stride` in predict()
MAX_LENGTH = 256
STRIDE = 15

## Loading Data

In [4]:
# Load the training split and report its (rows, columns) shape
train_csv_path = DIR+"/data/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.shape

(1377, 4)

In [5]:
# Load the test split and report its (rows, columns) shape
test_csv_path = DIR+"/data/test.csv"
test_df = pd.read_csv(test_csv_path)
test_df.shape

(531, 4)

In [6]:
# Preview the first training rows (columns: ID, Text, ContainsCode, CodeList)
train_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,2,Software development is an exciting field that...,False,
1,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
2,5,"In the world of software development, language...",False,
3,8,Software development is an intricate process t...,False,
4,9,"In this modern era, software development has i...",True,public class HelloWorld { public static void m...


In [7]:
# Preview the first test rows; ContainsCode and CodeList are empty here and get filled in later
test_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,1,Test-driven development (TDD) is a software de...,,
1,3,The development process often starts with a co...,,
2,6,There are many tools and frameworks available ...,,
3,7,"In the world of software development, the most...",,
4,14,"In the world of software development, testing ...",,


### Removing leading and trailing whitespace

In [8]:
# Trim leading/trailing whitespace from the Text column of both splits
for frame in (train_df, test_df):
    frame['Text'] = frame['Text'].str.strip()

## Model

In [None]:
# One tokenizer per checkpoint: each model must be fed ids produced by its own vocabulary
roberta_tokenizer = AutoTokenizer.from_pretrained(ROBERTA_MODEL)
deberta_tokenizer = AutoTokenizer.from_pretrained(DEBERTA_MODEL)

In [10]:
class RobertaQA(nn.Module):
    """Span-extraction head on top of a pretrained RoBERTa encoder.

    Each token's final hidden state goes through dropout and a linear layer
    that emits two numbers: a start logit and an end logit.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; do not rename them.
        self.roberta = RobertaModel.from_pretrained(ROBERTA_MODEL, add_pooling_layer=False)
        self.dropout = nn.Dropout(0.2)
        # 768 input features (must match the encoder's hidden size) -> 2 outputs per token
        self.linear = nn.Linear(768, 2)

    def forward(self, input_ids, attention_mask):
        """Return `(start_logits, end_logits)`, one value per token of every sequence."""
        encoded = self.roberta(input_ids, attention_mask)

        # (batch_size, num_tokens, embedding_size)
        token_states = self.dropout(encoded['last_hidden_state'])

        # (batch_size, num_tokens, 2)
        span_logits = self.linear(token_states)

        # Peel the last axis apart: channel 0 holds start logits, channel 1 end logits
        start_logits = span_logits[..., 0]
        end_logits = span_logits[..., 1]

        return start_logits, end_logits

In [None]:
# Build the RoBERTa QA network, then overwrite its weights with the fine-tuned checkpoint
roberta = RobertaQA()
#load the model
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
roberta.load_state_dict(torch.load(DIR+"/model/roberta_qa2.bin", map_location=DEVICE))
# Move the weights to the inference device (as the last expression this also prints the module)
roberta.to(DEVICE)

In [12]:
class DebertaQA(nn.Module):
    """Span-extraction head on top of a pretrained DeBERTa-v2/v3 encoder.

    Each token's final hidden state goes through dropout and a linear layer
    that emits two numbers: a start logit and an end logit.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are part of the checkpoint's state_dict keys; do not rename them.
        self.deberta = DebertaV2Model.from_pretrained(DEBERTA_MODEL)
        self.dropout = nn.Dropout(0.2)
        # 1024 input features (must match the encoder's hidden size) -> 2 outputs per token
        self.linear = nn.Linear(1024, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return `(start_logits, end_logits)`, one value per token of every sequence."""
        encoded = self.deberta(input_ids, attention_mask, token_type_ids)

        # (batch_size, num_tokens, embedding_size)
        token_states = self.dropout(encoded['last_hidden_state'])

        # (batch_size, num_tokens, 2)
        span_logits = self.linear(token_states)

        # Peel the last axis apart: channel 0 holds start logits, channel 1 end logits
        start_logits = span_logits[..., 0]
        end_logits = span_logits[..., 1]

        return start_logits, end_logits

In [None]:
# Build the DeBERTa QA network, then overwrite its weights with the fine-tuned checkpoint
deberta = DebertaQA()
#load the model
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
deberta.load_state_dict(torch.load(DIR+"/model/deberta_qa1.bin", map_location=DEVICE))
# Move the weights to the inference device (as the last expression this also prints the module)
deberta.to(DEVICE)

## Inference

In [None]:
def predict(sample, n_best=20, model=None, tokenizer=None):
    """Extract the best-scoring text span for every entry of a batch.

    Each text is tokenized into fixed-length, overlapping chunks. All chunks
    are scored by the QA model, and for every text the highest-scoring
    (start, end) span across its chunks is kept.

    Args:
        sample: batch mapping whose ``'Text'`` entry is a list of strings
            (the layout produced by ``Dataset.map(..., batched=True)``).
        n_best: number of top start and top end positions combined per chunk.
        model: QA module returning ``(start_logits, end_logits)``. Defaults to
            the module-level ``deberta``.
        tokenizer: tokenizer matching ``model``. Defaults to the module-level
            ``deberta_tokenizer``.

    Returns:
        ``{"score_span": array}`` with one ``(score, span)`` row per input
        text. NumPy coerces the mixed (number, text) pairs into a string
        array, so consumers must convert the score back with ``float()``.
        A row whose best candidate scores below 0 keeps score 0.0 and an
        empty span.
    """
    # Fall back to the DeBERTa pair the function was originally hard-wired to
    model = deberta if model is None else model
    tokenizer = deberta_tokenizer if tokenizer is None else tokenizer
    model.eval()

    context = sample['Text']

    preds = [None] * len(context)

    #tokenizer input data
    input_toks = tokenizer(
        context,
        padding='max_length',
        truncation="only_first",
        max_length=MAX_LENGTH,
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_tensors='pt'
        )

    offsets = input_toks['offset_mapping']
    sample_map = input_toks["overflow_to_sample_mapping"]

    # Forward only the inputs the tokenizer produced: models whose tokenizer
    # emits no token_type_ids do not receive that argument.
    model_inputs = {
        key: input_toks[key].to(DEVICE)
        for key in ("input_ids", "attention_mask", "token_type_ids")
        if key in input_toks
    }

    with torch.no_grad():
        #get the logits
        start_logits, end_logits = model(**model_inputs)
        #converting to numpy arrays
        start_logits, end_logits = start_logits.cpu().numpy(), end_logits.cpu().numpy()

    for idx, offset in tqdm(enumerate(offsets), total=len(offsets)):
        sample_idx = int(sample_map[idx])
        chunk_offsets = offset.tolist()

        #get the top n_best logit indexes for a chunk, best first
        #(the previous slice [-1:n_best-1:-1] kept everything except the n_best lowest)
        start_idxs = np.argsort(start_logits[idx])[::-1][:n_best]
        end_idxs = np.argsort(end_logits[idx])[::-1][:n_best]

        # Threshold: only spans whose summed logits reach 0 can replace the empty answer
        best_answer = (0.0, "")
        #try all valid combinations of start and end
        for start_idx in start_idxs:
            for end_idx in end_idxs:
                # Offsets come back as a tensor (never None), so the only
                # invalid combination is a span that ends before it starts.
                if start_idx > end_idx:
                    continue

                score = start_logits[idx][start_idx] + end_logits[idx][end_idx]

                start_char, end_char = chunk_offsets[start_idx][0], chunk_offsets[end_idx][1]
                span = context[sample_idx][start_char:end_char]

                if best_answer[0] <= score:
                    best_answer = (score, span)

        #for an example, we will store the answer with max score from all its chunks.
        if preds[sample_idx] is None or preds[sample_idx][0] < best_answer[0]:
            preds[sample_idx] = best_answer

    return {"score_span": np.array(preds)}

In [None]:
# Wrap the test frame in a datasets.Dataset so inference can be run batch-wise with .map
test_ds = Dataset.from_pandas(test_df)

In [None]:
# NOTE(review): predict is called here without specifying a model, and in that case it runs the
# DeBERTa model and tokenizer — so these are NOT RoBERTa predictions and the later "ensemble"
# compares DeBERTa against itself. To really use RoBERTa, predict has to be given the `roberta`
# model and `roberta_tokenizer` (RobertaQA.forward takes no token_type_ids).
roberta_preds = test_ds.map(predict, batched=True, batch_size=64)

In [None]:
# Run span prediction over the test set in batches of 32 texts; adds a 'score_span' column
deberta_preds = test_ds.map(predict, batched=True, batch_size=32)

In [53]:
# Per-model prediction columns, DeBERTa first; each entry is a (score, span) pair for one test row
preds = (deberta_preds['score_span'], roberta_preds['score_span'])
def get_best_span(preds, idx):
  """Return the span of the model with the highest score for row `idx`.

  Args:
    preds: sequence with one prediction column per model; `pred[idx]` is a
      `(score, span)` pair.
    idx: row position to look up in every column.

  Returns:
    The span text belonging to the numerically largest score.
  """
  # Scores arrive as strings (predict packs score and span into one NumPy
  # array), so convert them: comparing the raw strings would be
  # lexicographic, e.g. '9.5' > '10.2'.
  scores = [float(pred[idx][0]) for pred in preds]
  spans = [pred[idx][1] for pred in preds]

  span_idx = np.argsort(scores)[-1]
  return spans[span_idx]

In [54]:
# Choose the winning span for every test row. A span only counts as code when it
# has at least two whitespace-separated words; otherwise the row is marked code-free.
n_rows = len(deberta_preds['score_span'])
best_spans = [get_best_span(preds, row) for row in range(n_rows)]

contains_code = [len(candidate.split()) > 1 for candidate in best_spans]
code_list = [
    candidate if is_code else ""
    for candidate, is_code in zip(best_spans, contains_code)
]

In [55]:
# Write the predicted labels into the (previously empty) target columns of the test frame
test_df = test_df.assign(ContainsCode=contains_code, CodeList=code_list)

In [56]:
# Sanity-check the predicted ContainsCode / CodeList values on the first test rows
test_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,1,Test-driven development (TDD) is a software de...,True,@Test public void testAdd() { Calculator calcu...
1,3,The development process often starts with a co...,True,for(int i = 0 i < arr.length i++) System.out.p...
2,6,There are many tools and frameworks available ...,True,\nclass App extends React.Component render() r...
3,7,"In the world of software development, the most...",True,var input = document.createElement('input'); ...
4,14,"In the world of software development, testing ...",False,


In [57]:
# Give the training frame the same column set and order as the test frame before stacking them
trn_df = train_df.loc[:, test_df.columns]
trn_df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,2,Software development is an exciting field that...,False,
1,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
2,5,"In the world of software development, language...",False,
3,8,Software development is an intricate process t...,False,
4,9,"In this modern era, software development has i...",True,public class HelloWorld { public static void m...


In [58]:
# Stack train and test rows, then order the combined frame by ID with a fresh 0..n-1 index
df = (
    pd.concat([trn_df, test_df], axis=0)
    .sort_values(by='ID', ignore_index=True)
)
df.head()

Unnamed: 0,ID,Text,ContainsCode,CodeList
0,1,Test-driven development (TDD) is a software de...,True,@Test public void testAdd() { Calculator calcu...
1,2,Software development is an exciting field that...,False,
2,3,The development process often starts with a co...,True,for(int i = 0 i < arr.length i++) System.out.p...
3,4,Another important aspect of software developme...,True,git commit -m 'Initial commit'
4,5,"In the world of software development, language...",False,


In [59]:
from sklearn.preprocessing import MultiLabelBinarizer

In [60]:
def generate_submission(df):
  """Turn the CodeList column into a binary indicator matrix.

  Missing CodeList values are treated as empty strings. The binarizer iterates
  over each entry, and since entries are strings the resulting classes are the
  individual characters occurring in the column.

  Returns:
    (matrix, classes): the fitted indicator matrix (one row per row of `df`,
    one column per class) and the array of class labels. `df` is not modified.
  """
  code_strings = df["CodeList"].fillna("")
  binarizer = MultiLabelBinarizer()
  indicator = binarizer.fit_transform(code_strings)
  return indicator, binarizer.classes_

In [61]:
# Indicator matrix for all rows (train + test, ordered by ID) and the matching class labels
sub, classes = generate_submission(df)

In [62]:
# Inspect how many classes were found and which characters they are
len(classes), classes

(95,
 array(['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
        '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D',
        'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
        'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_',
        '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
        'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y',
        'z', '{', '|', '}'], dtype=object))

In [63]:
# Drop the first two indicator columns; per the class listing displayed above these are '\t' and '\n'.
# NOTE(review): the slice is positional and silently depends on those two classes sorting first —
# presumably the expected submission format excludes them; confirm.
submission = pd.DataFrame(sub[:, 2:])

In [64]:
# Display the submission matrix (pandas truncates the rendering of large frames)
submission

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,83,84,85,86,87,88,89,90,91,92
0,1,0,0,0,0,0,0,0,1,1,...,1,1,1,1,0,0,0,1,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,1,1,...,1,1,0,0,0,1,0,0,0,0
3,1,0,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1903,1,0,0,0,0,0,0,0,1,1,...,1,0,0,1,0,1,0,0,0,0
1904,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1905,1,1,0,0,0,0,0,1,1,1,...,1,1,0,1,0,0,0,1,0,1
1906,1,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,1,0,0,0,0


In [65]:
# Persist the submission next to the project data, without the row index column
submission.to_csv(DIR+"/submission.csv", index=False)

In [None]:
# We have an extra class of '\n'
# NOTE(review): the saved output of this cell reports 94 classes while the earlier
# `len(classes)` cell reports 95 — the outputs stem from different runs; re-run to confirm.
classes.shape

(94,)