In [1]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
import pandas as pd
from tokenizers.processors import BertProcessing, RobertaProcessing
from tree_sitter import Language, Parser
import os
from pathlib import Path
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import AdamW, RobertaModel, AutoModel,RobertaTokenizer, AutoModelForMaskedLM, RobertaForMaskedLM

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
def create_java_only_dataset():
    if not os.path.isfile("Data/Java_Unified_Data_with_SHA.csv"):
        df = pd.read_csv("Data/Unified_Data_with_SHA.csv")
        df2 = df[df["language_name"]=='Java']
        df2.reset_index(drop=True,inplace=True)
        df2.to_csv("Data/Java_Unified_Data_with_SHA.csv",index=False)

In [4]:
# create_java_only_dataset()

In [5]:
def get_uuid(text):
    return text.split("/")[-1].split(".")[0]

In [6]:
def remove_comments_and_docstrings(source):

    def replacer(match):
        s = match.group(0)
        if s.startswith('/'):
            return " " # note: a space and not an empty string
        else:
            return s
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )
    temp=[]
    for x in re.sub(pattern, replacer, source).split('\n'):
        if x.strip()!="":
            temp.append(x)
    return '\n'.join(temp)

In [7]:
def create_report_files():
    if not os.path.isdir("Data/Report_Files/"):
        Path("Data/Report_Files/").mkdir(parents=True, exist_ok=True)
        df = pd.read_csv("Data/Java_Unified_Data_with_SHA.csv")
        for item in df.iterrows():
            uuid_name = item[1]['before_fix_uuid_file_path'].split("/")[-1].split(".")[0]
            file = open ("Data/Report_Files/{}.txt".format(uuid_name),"w")
            file.write(item[1]['title'] + " " + item[1]['description'])
            file.close()

In [8]:
# create_report_files()

In [9]:
def convert_file_to_ast(file_path, parser):
    file = open(file_path,"r")
    file_content = file.read()
    tree = parser.parse(bytes(file_content,"utf-8"))
    return tree.root_node.sexp()

In [10]:
def create_ast_files():
    if not os.path.isdir("Data/AST_Files/"):
        Path("Data/AST_Files/").mkdir(parents=True, exist_ok=True)
        df = pd.read_csv("Data/Java_Unified_Data_with_SHA.csv")
        JAVA_LANGUAGE = Language('build/my-languages.so', 'java')
        parser = Parser()
        parser.set_language(JAVA_LANGUAGE)
        for item in df.iterrows():
            before_fix_uuid_name = item[1]['before_fix_uuid_file_path'].split("/")[-1].split(".")[0]
            before_fix_file = open ("Data/AST_Files/{}.txt".format(before_fix_uuid_name),"w")
            before_fix_file.write(convert_file_to_ast(item[1]['before_fix_uuid_file_path'],parser))
            before_fix_file.close()
            
            after_fix_uuid_name = item[1]['after_fix_uuid_file_path'].split("/")[-1].split(".")[0]
            after_fix_file = open ("Data/AST_Files/{}.txt".format(after_fix_uuid_name),"w")
            after_fix_file.write(convert_file_to_ast(item[1]['after_fix_uuid_file_path'],parser))
            after_fix_file.close()

In [11]:
# create_ast_files()

In [31]:
def file_reader(before_fix_ast_paths,after_fix_ast_path, report_paths):
    if not isinstance(before_fix_ast_paths, str):
        accumulate  = [[],[],[]]
        for before_ast, after_ast, report in zip(before_fix_ast_paths,after_fix_ast_path, report_paths):
            with open(report,"r") as file:
                accumulate[0].append(file.read())
            with open(before_ast,"r") as file:
                accumulate[1].append(file.read())  
            with open(after_ast,"r") as file:
                accumulate[2].append(file.read())  
    else:
        accumulate = []
        with open(report_paths,"r") as file:
            accumulate.append(file.read())
        with open(before_fix_ast_paths,"r") as file:
            accumulate.append(file.read()) 
        with open(after_fix_ast_path,"r") as file:
            accumulate.append(file.read()) 
    return accumulate

In [120]:
def find_difference(before, after):
    before, after = np.array(before), np.array(after)
    maxlength =  max(len(before), len(after))
    padded_before = before if len(before) == maxlength else np.pad(before,(0,maxlength - len(before)), constant_values=-1)
    padded_after = after if len(after) == maxlength else np.pad(after,(0,maxlength - len(after)), constant_values=-1)
    difference = np.where(padded_before!= padded_after)
    start = difference[0][0]
    end = len(before) - 1 if len(before) < maxlength else difference[0][-1] 
    return torch.tensor([start]),torch.tensor([end])

In [121]:
class BugDataset(Dataset):

    def __init__(self, file_path=None,dataframe=None,tokenizer=None):
        if file_path is not None:
            self.dataset = pd.read_csv(file_path)
        else:
            self.dataset =dataframe
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.dataset)


    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        rows = self.dataset.iloc[idx,:]
        if isinstance(idx, int):
            before_fix_ast_path = "Data/AST_Files/" + get_uuid(rows['before_fix_uuid_file_path']) + ".txt"
            after_fix_ast_path = "Data/AST_Files/" + get_uuid(rows['after_fix_uuid_file_path']) + ".txt"
            report_files = "Data/Report_Files/" + get_uuid(rows['before_fix_uuid_file_path']) + ".txt"
        else:
            before_fix_ast_path = rows['before_fix_uuid_file_path'].map(lambda x:"Data/AST_Files/" + get_uuid(x) + ".txt").tolist()
            after_fix_ast_path = rows['after_fix_uuid_file_path'].map(lambda x:"Data/AST_Files/" + get_uuid(x) + ".txt").tolist()
            report_files = rows['before_fix_uuid_file_path'].map(lambda x:"Data/Report_Files/" + get_uuid(x) + ".txt").tolist()
        temp = file_reader(before_fix_ast_path, after_fix_ast_path, report_files)
        report, before, after = self.tokenizer.encode_plus(temp[0],truncation=True, max_length=512)['input_ids'], self.tokenizer.encode_plus(temp[1],truncation=True, max_length=512)['input_ids'], self.tokenizer.encode_plus(temp[2],truncation=True, max_length=512)['input_ids']
        start, end = find_difference(before, after)
        return  report, before, start, end #self.tokenizer.encode_plus(temp,truncation=True, max_length=512)['input_ids']

In [14]:
train_data,val_data = train_test_split(pd.read_csv("Data/Java_Train_Data.csv"),test_size=0.125)

In [15]:
before_fix_ast_paths = train_data['before_fix_uuid_file_path'].map(lambda x:"Data/AST_Files/" + get_uuid(x) + ".txt").tolist()
after_fix_ast_paths = train_data['after_fix_uuid_file_path'].map(lambda x:"Data/AST_Files/" + get_uuid(x) + ".txt").tolist()
report_files = train_data['before_fix_uuid_file_path'].map(lambda x:"Data/Report_Files/" + get_uuid(x) + ".txt").tolist()

In [16]:
all_file_path = before_fix_ast_paths + report_files

In [17]:
if not os.path.isfile("aster-vocab.json"):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=all_file_path, min_frequency=2, special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ])
    tokenizer.save_model(".", "./aster")

In [18]:
# tokenizer = ByteLevelBPETokenizer(
#     "aster-vocab.json",
#     "aster-merges.txt",
# )
# tokenizer._tokenizer.post_processor = RobertaProcessing(
#     ("</s>", tokenizer.token_to_id("</s>")),
#     ("<s>", tokenizer.token_to_id("<s>")),
# )
# tokenizer.enable_truncation(max_length=3000)

In [19]:
tokenizer = RobertaTokenizer("aster-vocab.json","aster-merges.txt",max_length=510)

In [20]:
temp_dataset = BugDataset("Data/Java_Train_Data.csv",tokenizer=tokenizer)

In [21]:
temp_dataloader = DataLoader(temp_dataset, batch_size=4, num_workers=1)

In [22]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [23]:
 features = [{"input_ids": [0, 1, 2, 3, 4], "label": i} for i in range(4)]
data_collator(features)

{'input_ids': tensor([[0, 1, 2, 3, 4],
        [0, 4, 2, 3, 4],
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4]]), 'label': tensor([0, 1, 2, 3]), 'attention_mask': tensor([[1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, -100, -100],
        [-100,    1, -100, -100,    4],
        [-100, -100, -100, -100, -100],
        [-100, -100, -100, -100, -100]])}

In [115]:
import numpy as np

before =  np.array([1,2,3,4,5,6,7,8])
after = np.array([1,4,3,4,7])
find_difference(after, before)

(1, 4)

In [122]:
next(iter(BugDataset(dataframe=train_data,tokenizer=tokenizer)))

([0,
  16884,
  1252,
  1216,
  531,
  2842,
  10113,
  6003,
  2913,
  698,
  1567,
  1424,
  1663,
  675,
  589,
  984,
  18,
  22,
  18,
  20,
  18,
  49,
  26,
  18,
  2],
 [0,
  12,
  760,
  261,
  429,
  13,
  261,
  752,
  67,
  336,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  271,
  13,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  326,
  261,
  412,
  67,
  336,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  331,
  67,
  271,
  335,
  30,
  261,
  271,
  13,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  266,
  286,
  30,
  261,
  271,
  326,
  261,
  412,
  67,
  336,

In [None]:
data_collator(temp_dict)

In [None]:
training_args = TrainingArguments(
    output_dir="./aster",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

In [None]:
class AsterTrainer(Trainer):
    def get_train_dataloader(self):
        temp_dataset = BugDataset(dataframe=train_data)
        return DataLoader(temp_dataset, batch_size=4, num_workers=1)
    def get_eval_dataloader(self):
        temp_dataset = BugDataset(dataframe=val_data)
        return DataLoader(temp_dataset, batch_size=4, num_workers=1)
    def get_test_dataloader(self):
        temp_dataset = BugDataset(file_path="Data/Java_Test_Data.csv")
        return DataLoader(temp_dataset, batch_size=4, num_workers=1)

In [36]:
model = RobertaForMaskedLM.from_pretrained("roberta-base")
trainer = Trainer(
    model= model,
    args= training_args,
    data_collator= data_collator,
    train_dataset= BugDataset(dataframe=train_data,tokenizer=tokenizer),
#     eval_dataset= BugDataset(dataframe=val_data,tokenizer=tokenizer)
)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
trainer.train()

RuntimeError: CUDA error: no kernel image is available for execution on the device

In [17]:
# model = AutoModel.from_pretrained("roberta-base")
# model = RobertaForMaskedLM.from_pretrained("roberta-base")
# model.to(device)
# model.train()

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

In [20]:
# batch = next(iter(temp_dataloader))

In [144]:
# tokenizer.encode_batch(batch).ids

TypeError: 'ByteLevelBPETokenizer' object is not callable

In [137]:
optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    for batch in temp_dataloader:
        optim.zero_grad()
        batch = tokenizer.encode_batch(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

model.eval()

TypeError: list indices must be integers or slices, not str