### NLBSE'24 RoBERTa Baseline

In [1]:
BASE_MODEL = "roberta-base"
RANDOM_SEED = 42
OUTPUT_PATH = 'output/roberta'

!mkdir -p $OUTPUT_PATH

In [2]:
from datasets import Dataset

ds = Dataset.from_csv({ "train": "data/issues_train.csv", "test": "data/issues_test.csv" })
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
})

In [3]:
repos = ds["train"].unique("repo")
print(repos)

['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode', 'bitcoin/bitcoin', 'opencv/opencv']


In [4]:
ds["train"].to_pandas().groupby(["repo", "label"]).size().unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [5]:
import re

label2id = { "bug": 0, "feature": 1, "question": 2 }
id2label = { 0: "bug", 1: "feature", 2: "question" }

def process_dataset(example):

    # concatenate title and body
    text = example['title'] or "" + " " + example['body'] or ""

    # Remove strings between triple quotes
    text = re.sub(r'```.*?```', ' ', text, flags=re.DOTALL)

    # Remove new lines
    text = re.sub(r'\n', ' ', text)

    # Remove links
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)

    # Remove digits
    text = re.sub(r'\d+', ' ', text)

    # Remove special characters except the question marks
    text = re.sub(r'[^a-zA-Z0-9?\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    example['text'] = text

    return example

ds = ds.shuffle(seed=RANDOM_SEED)
ds = ds.map(process_dataset)
ds = ds.select_columns(['repo', 'label', 'text'])


In [6]:
from transformers import RobertaTokenizer, RobertaConfig, RobertaForSequenceClassification, TextClassificationPipeline

def create_model(max_length=256, truncation=True, padding="max_length", device="cuda"):
  # The tokenizer is based on Roberta. The configurations are: Max_length = 256, truncation = true, padding = max_length.
  tokenizer = RobertaTokenizer.from_pretrained(BASE_MODEL, device=device, max_length=max_length, truncation=truncation, padding=padding)

  # Configuration: We have 3 labels: Bug, Enhancment, Question.
  config = RobertaConfig.from_pretrained(BASE_MODEL, device=device, num_labels=3)

  # Configuration of the Adapter model.
  model = RobertaForSequenceClassification.from_pretrained(BASE_MODEL, config=config)
  
  # This part is for inferencing
  classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device, max_length=max_length, padding=padding, truncation=truncation)

  return tokenizer, model, classifier

In [7]:
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding

def repo_eq(repo: str):
    return lambda example: example['repo'] == repo

references = {}
predictions = {}

for repo in repos:

    tokenizer, model, classifier = create_model(max_length=512)

    train_set = ds.filter(repo_eq(repo))["train"]
    train_set = train_set.class_encode_column("label")
    train_set = train_set.map(lambda batch: tokenizer(batch["text"]), batched=True)
    train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    training_args = TrainingArguments(
        output_dir=OUTPUT_PATH,
        num_train_epochs=20,
        per_device_train_batch_size=256,
        save_strategy="no",
        evaluation_strategy="no",
        seed=RANDOM_SEED
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=None,
        
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    trainer.train()

    test_set = ds.filter(repo_eq(repo))["test"]
    references[repo] = test_set['label']
    test_set = test_set.map(lambda batch: tokenizer(batch["text"]), batched=True)
    test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    predictions[repo] = classifier(test_set['text'])
    predictions[repo] = [pred['label'] for pred in predictions[repo]]
    predictions[repo] = [model.config.label2id[label] for label in predictions[repo]]
    predictions[repo] = [id2label[id] for id in predictions[repo]]


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5793.56 examples/s]


Step,Training Loss


Map: 100%|██████████| 300/300 [00:00<00:00, 5593.52 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5458.30 examples/s]


Step,Training Loss


Map: 100%|██████████| 300/300 [00:00<00:00, 6181.15 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5579.58 examples/s]


Step,Training Loss


Map: 100%|██████████| 300/300 [00:00<00:00, 6818.60 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5296.66 examples/s]


Step,Training Loss


Map: 100%|██████████| 300/300 [00:00<00:00, 6588.50 examples/s]
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5740.96 examples/s]


Step,Training Loss


Map: 100%|██████████| 300/300 [00:00<00:00, 6792.32 examples/s]


In [8]:
from sklearn.metrics import classification_report
from numpy import mean

results = {}
metrics = ['precision', 'recall', 'f1-score']
labels = ['bug', 'feature', 'question']

for repo in repos:
  results[repo] = classification_report(references[repo], predictions[repo], digits=4, output_dict=True)
  results[repo]['average'] = results[repo]['weighted avg']
  results[repo] = {label: {metric: results[repo][label][metric] for metric in metrics} for label in labels + ['average']}

results['overall'] = {label: {metric: mean([results[repo][label][metric] for repo in repos]) for metric in metrics} for label in labels + ['average']}


In [9]:
import json
import os


output_file_name = 'results.json'

with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp, indent=2)

print(f"Repository{' '*15}Label     Precision  Recall     F1")
for repo in repos + ['overall']:
  print("-"*63)
  for label in labels + ['average']:
    out = f"{repo:<25}{label:<10}"
    for metric in metrics:
      out += f"{results[repo][label][metric]:<10.4f} "
    print(out)

Repository               Label     Precision  Recall     F1
---------------------------------------------------------------
facebook/react           bug       0.8585     0.9100     0.8835     
facebook/react           feature   0.7170     0.7600     0.7379     
facebook/react           question  0.7386     0.6500     0.6915     
facebook/react           average   0.7714     0.7733     0.7709     
---------------------------------------------------------------
tensorflow/tensorflow    bug       0.6875     0.7700     0.7264     
tensorflow/tensorflow    feature   0.6514     0.7100     0.6794     
tensorflow/tensorflow    question  0.5443     0.4300     0.4804     
tensorflow/tensorflow    average   0.6277     0.6367     0.6288     
---------------------------------------------------------------
microsoft/vscode         bug       0.7250     0.5800     0.6444     
microsoft/vscode         feature   0.6518     0.7300     0.6887     
microsoft/vscode         question  0.6667     0.7200     0