### NLBSE'24 RoBERTa Baseline

In [1]:
from pathlib import Path

# Hugging Face checkpoint that every per-repository classifier is fine-tuned from.
BASE_MODEL = "roberta-base"
# Seed passed to the dataset shuffle and to the training arguments.
RANDOM_SEED = 42
# Directory that receives the per-repository training folders and results.json.
OUTPUT_PATH = 'output/roberta'

# Create the output directory (including parents) in pure Python rather than
# shelling out to `mkdir -p`; this is a no-op when the directory already exists.
Path(OUTPUT_PATH).mkdir(parents=True, exist_ok=True)

In [2]:
from datasets import Dataset

# Load both CSV files in one call; the mapping keys become the split names.
split_files = {
    "train": "data/issues_train.csv",
    "test": "data/issues_test.csv",
}
ds = Dataset.from_csv(split_files)
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
})

In [3]:
# Distinct repository names found in the training split; the training cell
# fits one model per entry.
train_split = ds["train"]
repos = train_split.unique("repo")
print(repos)

['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode', 'bitcoin/bitcoin', 'opencv/opencv']


In [4]:
# Count training issues per (repo, label) pair and pivot the labels into columns.
train_df = ds["train"].to_pandas()
train_df.groupby(["repo", "label"]).size().unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [5]:
import re

# Mapping between the textual issue labels and their integer class ids.
label2id = { "bug": 0, "feature": 1, "question": 2 }
id2label = { class_id: name for name, class_id in label2id.items() }

def process_dataset(example):
    """Encode the label and build a cleaned ``text`` field for one issue.

    The title and body (missing values count as empty strings) are joined
    with a single space and then scrubbed, in this order: fenced code blocks
    (triple backticks), newlines, URLs, digit runs, and every remaining
    character other than ASCII letters, ``?`` and whitespace are each
    replaced by a space. Finally, whitespace runs collapse to one space.

    Args:
        example: Row with ``label``, ``title`` and ``body`` entries.

    Returns:
        The same mapping, with ``label`` replaced by its integer id and a new
        ``text`` entry holding the cleaned string.

    Raises:
        KeyError: If ``label`` is not a key of ``label2id``.
    """
    example['label'] = label2id[example['label']]

    title = example['title'] or ""
    body = example['body'] or ""

    # (pattern, flags) pairs applied strictly in this order; every match is
    # replaced by a single space.
    substitutions = (
        (r'```.*?```', re.DOTALL),
        (r'\n', 0),
        (r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 0),
        (r'\d+', 0),
        (r'[^a-zA-Z0-9?\s]', 0),
        (r'\s+', 0),
    )

    cleaned = title + " " + body
    for pattern, flags in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned, flags=flags)

    example['text'] = cleaned
    return example

# Shuffle with the fixed seed, clean every row, then keep only the columns the
# training cell reads.
ds = (
    ds.shuffle(seed=RANDOM_SEED)
      .map(process_dataset)
      .select_columns(['repo', 'label', 'text'])
)


In [6]:
import wandb
from datetime import datetime
import evaluate
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import RobertaTokenizerFast, RobertaConfig, RobertaForSequenceClassification, TextClassificationPipeline
import numpy as np

# One group name per execution of this cell (UTC time, whole seconds) so the
# per-repository W&B runs started below share the same group.
run_started_at = datetime.utcnow().replace(microsecond=0)
group = run_started_at.isoformat()

# F1 scorer used by compute_metrics during evaluation.
metric = evaluate.load("f1")

def repo_eq(repo: str):
    """Build a predicate that accepts examples belonging to ``repo``.

    Args:
        repo: Repository name to match.

    Returns:
        A one-argument callable that returns ``example['repo'] == repo``.
    """
    def matches(example):
        return example['repo'] == repo

    return matches

# Gold labels and predicted labels (as label names) of each repository's test
# split; filled by the loop below and read by the reporting cells.
references = {}
predictions = {}

# Settings and helpers that are identical for every repository are created
# once here instead of on every loop iteration.
device = "cuda"
truncation = True
padding = "max_length"
max_length = 512

tokenizer = RobertaTokenizerFast.from_pretrained(BASE_MODEL)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def preprocess_function(examples):
    """Tokenize a batch of cleaned issue texts with the shared settings."""
    return tokenizer(examples['text'], truncation=truncation, padding=padding, max_length=max_length)


def compute_metrics(eval_pred):
    """Return the weighted F1 of the arg-max class predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair as handed over by the Trainer.

    Returns:
        The result of ``metric.compute`` with ``average="weighted"``.
    """
    logits, label_ids = eval_pred
    # Named `predicted_ids` so it cannot be confused with the module-level
    # `predictions` dict.
    predicted_ids = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predicted_ids, references=label_ids, average="weighted")


for repo in repos:
    wandb.init(
        project="NLBSE'24 Issue Report Classification - RoBERTa",
        group=group,
        name=repo,
    )

    try:
        # A fresh model per repository, so nothing learned on one repository
        # carries over to the next.
        config = RobertaConfig.from_pretrained(BASE_MODEL, num_labels=3)
        model = RobertaForSequenceClassification.from_pretrained(BASE_MODEL, config=config)

        classifier = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device, truncation=truncation, padding=padding, max_length=max_length)

        # Filter both splits once and reuse the result for train and test.
        repo_splits = ds.filter(repo_eq(repo))

        train_set = repo_splits["train"]
        train_set = train_set.map(preprocess_function, batched=True)
        train_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        test_set = repo_splits["test"]
        # Read the gold labels before the torch format is applied.
        references[repo] = [id2label[label_id] for label_id in test_set['label']]
        test_set = test_set.map(preprocess_function, batched=True)
        test_set.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

        training_args = TrainingArguments(
            output_dir=f"{OUTPUT_PATH}/{repo.replace('/', '-')}",
            evaluation_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="no",
            report_to="wandb",
            run_name=repo,
            seed=RANDOM_SEED,
            num_train_epochs=10,
            per_device_train_batch_size=16,
        )

        trainer = Trainer(
            model=model,
            tokenizer=tokenizer,
            args=training_args,
            train_dataset=train_set,
            eval_dataset=test_set,
            compute_metrics=compute_metrics,
            data_collator=data_collator,
        )

        trainer.train()

        # The pipeline reports labels under the names stored in the model
        # config; map each one to its class id and then to the issue label.
        raw_predictions = classifier(test_set['text'])
        predictions[repo] = [
            id2label[model.config.label2id[pred['label']]]
            for pred in raw_predictions
        ]
    except BaseException:
        # Close the run and mark it as failed instead of leaving it open when
        # training or inference raises (or the cell is interrupted).
        wandb.finish(exit_code=1)
        raise
    else:
        wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.889,0.683371,0.505019
2,0.6823,0.613906,0.639636
3,0.5468,0.547347,0.789345
4,0.4322,0.551864,0.760293
5,0.3111,0.65039,0.75004
6,0.1133,0.607732,0.839547
7,0.0769,0.873719,0.8274
8,0.0212,0.854718,0.841026
9,0.0039,0.868349,0.843438
10,0.003,0.881902,0.844094




0,1
eval/f1,▁▄▇▆▆█████
eval/loss,▄▂▁▁▃▂█▇██
eval/runtime,▁▂▄▅▆▆▇▇▇█
eval/samples_per_second,█▆▅▄▃▃▂▂▂▁
eval/steps_per_second,█▇▅▄▃▃▂▂▂▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▅▄▃▂▂▁▁▁
train/total_flos,▁

0,1
eval/f1,0.84409
eval/loss,0.8819
eval/runtime,2.3665
eval/samples_per_second,126.768
eval/steps_per_second,16.057
train/epoch,10.0
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.003
train/total_flos,789340253184000.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 2123.36 examples/s]
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,0.9898,0.835612,0.59405
2,0.7204,0.732002,0.674492
3,0.592,0.624695,0.732933
4,0.4808,0.479299,0.822421
5,0.2855,0.473744,0.869514
6,0.1939,0.498568,0.865892
7,0.1269,0.722005,0.831306
8,0.0865,0.708205,0.844257
9,0.0696,0.739826,0.858154
10,0.0408,0.724963,0.858454




0,1
eval/f1,▁▃▅▇██▇▇██
eval/loss,█▆▄▁▁▁▆▆▆▆
eval/runtime,▁▃▄▂▃▄▅▆▆█
eval/samples_per_second,█▆▅▇▆▅▄▃▃▁
eval/steps_per_second,█▆▅▇▆▅▄▃▃▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▅▄▃▂▂▁▁▁
train/total_flos,▁

0,1
eval/f1,0.85845
eval/loss,0.72496
eval/runtime,2.3871
eval/samples_per_second,125.675
eval/steps_per_second,15.919
train/epoch,10.0
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.0408
train/total_flos,789340253184000.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 2127.63 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2272.63 examples/s]
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.0297,0.975167,0.566626
2,0.8855,0.836753,0.666281
3,0.6882,0.802347,0.671059
4,0.5958,0.66413,0.740537
5,0.3891,0.606848,0.789923
6,0.2192,0.6537,0.777
7,0.1208,0.760761,0.790686
8,0.0754,0.962399,0.760114
9,0.0444,0.996846,0.767218
10,0.0347,1.037996,0.764312




0,1
eval/f1,▁▄▄▆███▇▇▇
eval/loss,▇▅▄▂▁▂▃▇▇█
eval/runtime,▁▃▄▆▆▇▇▇██
eval/samples_per_second,█▆▅▃▃▂▂▂▁▁
eval/steps_per_second,█▆▅▃▃▂▂▂▁▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▇▆▅▃▂▂▁▁▁
train/total_flos,▁

0,1
eval/f1,0.76431
eval/loss,1.038
eval/runtime,2.3838
eval/samples_per_second,125.851
eval/steps_per_second,15.941
train/epoch,10.0
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.0347
train/total_flos,789340253184000.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 2067.63 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 1932.63 examples/s]
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.0814,0.970499,0.445159
2,0.7716,0.764188,0.653771
3,0.5335,0.667387,0.71079
4,0.3252,0.815736,0.737051
5,0.2522,0.812353,0.726487
6,0.1199,1.13398,0.735907
7,0.0693,1.336177,0.725464
8,0.0367,1.468985,0.707075
9,0.0252,1.412592,0.734412
10,0.038,1.354312,0.752635




0,1
eval/f1,▁▆▇█▇█▇▇██
eval/loss,▄▂▁▂▂▅▇██▇
eval/runtime,▁▃▅▆▇▇██▇▇
eval/samples_per_second,█▆▄▃▂▂▁▁▂▂
eval/steps_per_second,█▆▄▃▂▂▁▁▂▂
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▆▄▃▃▂▁▁▁▁
train/total_flos,▁

0,1
eval/f1,0.75263
eval/loss,1.35431
eval/runtime,2.3901
eval/samples_per_second,125.515
eval/steps_per_second,15.899
train/epoch,10.0
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.038
train/total_flos,789340253184000.0


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 1869.55 examples/s]
Map: 100%|██████████| 300/300 [00:00<00:00, 2020.36 examples/s]
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1
1,1.0751,0.968205,0.593196
2,0.9398,0.817039,0.561382
3,0.7271,0.83295,0.642537
4,0.6326,0.727519,0.628608
5,0.5643,0.866814,0.556447
6,0.4725,0.845133,0.640861
7,0.3589,0.818358,0.67732
8,0.3223,0.881727,0.670452
9,0.2494,0.882026,0.722118
10,0.222,0.845077,0.742175




0,1
eval/f1,▂▁▄▄▁▄▆▅▇█
eval/loss,█▄▄▁▅▄▄▅▅▄
eval/runtime,▁▃▄▅▆▇▇█▇█
eval/samples_per_second,█▆▅▄▃▂▂▁▂▁
eval/steps_per_second,█▆▅▄▃▂▂▁▂▁
train/epoch,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/global_step,▁▁▂▂▃▃▃▃▄▄▅▅▆▆▆▆▇▇███
train/learning_rate,█▇▆▆▅▄▃▃▂▁
train/loss,█▇▅▄▄▃▂▂▁▁
train/total_flos,▁

0,1
eval/f1,0.74218
eval/loss,0.84508
eval/runtime,2.4025
eval/samples_per_second,124.872
eval/steps_per_second,15.817
train/epoch,10.0
train/global_step,190.0
train/learning_rate,0.0
train/loss,0.222
train/total_flos,789340253184000.0


In [7]:
from sklearn.metrics import classification_report
from numpy import mean

results = {}
metrics = ['precision', 'recall', 'f1-score']
labels = ['bug', 'feature', 'question']

# Rows of every score table: the three labels plus the weighted average.
report_rows = labels + ['average']

for repo in repos:
    report = classification_report(references[repo], predictions[repo], digits=4, output_dict=True)
    # Expose the weighted average under the shorter 'average' key.
    report['average'] = report['weighted avg']
    # Keep only the selected rows and metrics of the full report.
    results[repo] = {
        row: {name: report[row][name] for name in metrics}
        for row in report_rows
    }

# Unweighted mean over repositories of every (row, metric) score.
results['overall'] = {
    row: {name: mean([results[repo][row][name] for repo in repos]) for name in metrics}
    for row in report_rows
}


In [8]:
import json
import os

output_file_name = 'results.json'

# Persist the per-repository and overall scores inside the output directory.
with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp, indent=2)

# Fixed-width table: one section per repository, followed by the overall mean.
print(f"Repository{' '*15}Label     Precision  Recall     F1")
for repo in repos + ['overall']:
  print("-"*63)
  for label in labels + ['average']:
    out = f"{repo:<25}{label:<10}"
    # `metric_name`, not `metric`: a module-level loop variable called `metric`
    # would overwrite the F1 metric object that compute_metrics relies on, so
    # re-running the training cell after this one would fail.
    for metric_name in metrics:
      out += f"{results[repo][label][metric_name]:<10.4f} "
    print(out)

Repository               Label     Precision  Recall     F1
---------------------------------------------------------------
facebook/react           bug       0.8909     0.9800     0.9333     
facebook/react           feature   0.8200     0.8200     0.8200     
facebook/react           question  0.8222     0.7400     0.7789     
facebook/react           average   0.8444     0.8467     0.8441     
---------------------------------------------------------------
tensorflow/tensorflow    bug       0.9659     0.8500     0.9043     
tensorflow/tensorflow    feature   0.8641     0.8900     0.8768     
tensorflow/tensorflow    question  0.7615     0.8300     0.7943     
tensorflow/tensorflow    average   0.8638     0.8567     0.8585     
---------------------------------------------------------------
microsoft/vscode         bug       0.7895     0.7500     0.7692     
microsoft/vscode         feature   0.6803     0.8300     0.7477     
microsoft/vscode         question  0.8554     0.7100     0