### NLBSE'24 SetFit Baseline

In [1]:
import os

# Checkpoint handed to SetFitModel.from_pretrained for every repository.
BASE_MODEL = "sentence-transformers/all-mpnet-base-v2"
# Seed used both for shuffling the dataset and for the SetFit training arguments.
RANDOM_SEED = 42
# Root directory for per-repository training output and the final results.json.
OUTPUT_PATH = 'output/setfit'

# Pure-Python equivalent of `mkdir -p`: creates intermediate directories and is
# a no-op when the directory already exists; works without a POSIX shell.
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [2]:
from datasets import Dataset

# Split name -> CSV file; passing the mapping yields one dataset per split.
data_files = {
    "train": "data/issues_train.csv",
    "test": "data/issues_test.csv",
}
ds = Dataset.from_csv(data_files)
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
})

In [3]:
# Distinct repository names found in the training split; every later
# per-repository loop iterates over this list.
train_split = ds["train"]
repos = train_split.unique("repo")
print(repos)

['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode', 'bitcoin/bitcoin', 'opencv/opencv']


In [4]:
# Count training issues per (repo, label) pair and pivot the labels into
# columns so class balance can be checked at a glance.
train_df = ds["train"].to_pandas()
label_counts = train_df.groupby(["repo", "label"]).size()
label_counts.unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [5]:
import re

def process_dataset(example):
    """Add a 'text' field holding the issue title and body joined by one space.

    A falsy title or body (e.g. None or an empty string) is treated as an
    empty string. The example is modified in place and also returned.
    """
    title = example['title'] or ""
    body = example['body'] or ""

    example['text'] = " ".join([title, body])
    return example

In [6]:
# Shuffle reproducibly, build the combined 'text' column, then keep only the
# columns used for training and evaluation.
ds = (
    ds.shuffle(seed=RANDOM_SEED)
    .map(process_dataset)
    .select_columns(['repo', 'label', 'text'])
)
ds

DatasetDict({
    train: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
})

In [7]:
import wandb
from datetime import datetime
from setfit import SetFitModel, Trainer, TrainingArguments

from datetime import timezone

# W&B group name shared by all per-repository runs of this execution: the
# current UTC time at second resolution, e.g. "2024-01-31T12:34:56".
# datetime.utcnow() is deprecated since Python 3.12; take an aware UTC "now"
# and drop tzinfo so the string keeps the same format (no "+00:00" suffix).
group = datetime.now(timezone.utc).replace(microsecond=0, tzinfo=None).isoformat()

# Per-repository ground-truth labels and predicted labels, keyed by repo name
# and filled in by the training loop.
references = {}
predictions = {}

def repo_eq(repo: str):
    """Return a predicate that is true for examples whose 'repo' equals `repo`."""
    def _matches(example):
        return example['repo'] == repo

    return _matches

# Train and evaluate one SetFit model per repository, logging each as its own
# W&B run inside the shared group.
for repo in repos:
    # Filter only the split that is actually needed, instead of filtering the
    # whole DatasetDict (train and test) and discarding one half each time.
    train_set = ds['train'].filter(repo_eq(repo))
    test_set = ds['test'].filter(repo_eq(repo))

    # Using the run as a context manager closes it even when training or
    # prediction raises, so a failed repository does not leave a dangling run
    # that the next iteration would log into.
    with wandb.init(
        project="NLBSE'24 Issue Report Classification - SetFit",
        group=group,
        name=repo,
    ):
        # Fresh model per repository so no weights carry over between runs.
        model = SetFitModel.from_pretrained(BASE_MODEL)

        args = TrainingArguments(
            # '/' in the repo name would create a nested directory.
            output_dir=f'{OUTPUT_PATH}/{repo.replace("/", "-")}',
            save_strategy="no",
            report_to="wandb",
            run_name=repo,
            logging_steps=1,
            seed=RANDOM_SEED,
            batch_size=(16, 2),
            num_epochs=1,
            num_iterations=20,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_set,
        )

        trainer.train()

        # Store ground truth and predictions for the metrics cell.
        references[repo] = list(test_set['label'])
        predictions[repo] = list(model.predict(test_set['text'], batch_size=8, show_progress_bar=True))

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16


Step,Training Loss


Batches: 100%|██████████| 38/38 [00:01<00:00, 22.72it/s]
  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,█▆▆▆▅▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0007
train/epoch,1.0
train/global_step,750.0
train/learning_rate,0.0
train/train_runtime,403.2109
train/train_samples_per_second,29.761
train/train_steps_per_second,1.86


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


Filter: 100%|██████████| 1500/1500 [00:00<00:00, 174699.58 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 195314.04 examples/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 5668.39 examples/s]
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16


Step,Training Loss


Batches: 100%|██████████| 38/38 [00:01<00:00, 19.99it/s]
  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,██▇█▃▃▃▂▃▁▁▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▂▂▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0001
train/epoch,1.0
train/global_step,750.0
train/learning_rate,0.0
train/train_runtime,417.4734
train/train_samples_per_second,28.744
train/train_steps_per_second,1.797


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


Filter: 100%|██████████| 1500/1500 [00:00<00:00, 168975.26 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 172927.71 examples/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 7208.28 examples/s]
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16


Step,Training Loss


Batches: 100%|██████████| 38/38 [00:01<00:00, 26.28it/s]
  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,█▆▆▃▃▄▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0001
train/epoch,1.0
train/global_step,750.0
train/learning_rate,0.0
train/train_runtime,409.5611
train/train_samples_per_second,29.3
train/train_steps_per_second,1.831


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


Filter: 100%|██████████| 1500/1500 [00:00<00:00, 163869.87 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 168009.61 examples/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 6401.43 examples/s]
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16


Step,Training Loss


Batches: 100%|██████████| 38/38 [00:01<00:00, 23.26it/s]
  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,█▇▇▇▃▅▆▄▃▅▂▂▂▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0003
train/epoch,1.0
train/global_step,750.0
train/learning_rate,0.0
train/train_runtime,421.5564
train/train_samples_per_second,28.466
train/train_steps_per_second,1.779


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


Filter: 100%|██████████| 1500/1500 [00:00<00:00, 125135.87 examples/s]
Filter: 100%|██████████| 1500/1500 [00:00<00:00, 132714.34 examples/s]
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Map: 100%|██████████| 300/300 [00:00<00:00, 4178.23 examples/s]
***** Running training *****
  Num examples = 750
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16


Step,Training Loss


Batches: 100%|██████████| 38/38 [00:01<00:00, 22.01it/s]
  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,▇█▅▆▅▃▅▄▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0003
train/epoch,1.0
train/global_step,750.0
train/learning_rate,0.0
train/train_runtime,427.1565
train/train_samples_per_second,28.093
train/train_steps_per_second,1.756


In [8]:
from sklearn.metrics import classification_report
from numpy import mean

metrics = ['precision', 'recall', 'f1-score']
labels = ['bug', 'feature', 'question']
# Rows reported per repository: one per label plus the weighted average.
rows = labels + ['average']

results = {}

for repo in repos:
    report = classification_report(references[repo], predictions[repo], digits=4, output_dict=True)
    # Expose sklearn's 'weighted avg' entry under the shorter 'average' key.
    report['average'] = report['weighted avg']
    # Keep only the selected rows and metrics.
    results[repo] = {
        row: {metric: report[row][metric] for metric in metrics}
        for row in rows
    }

# 'overall' is the unweighted mean of each metric across repositories.
results['overall'] = {
    row: {
        metric: mean([results[repo][row][metric] for repo in repos])
        for metric in metrics
    }
    for row in rows
}

In [9]:
import json
import os

output_file_name = 'results.json'
results_path = os.path.join(OUTPUT_PATH, output_file_name)

# Persist the full metrics dictionary next to the training output.
with open(results_path, 'w') as fp:
    json.dump(results, fp, indent=2)

# Fixed-width table: one section per repository followed by the overall mean.
print(f"Repository{' '*15}Label     Precision  Recall     F1")
for repo in repos + ['overall']:
    print("-"*63)
    for label in labels + ['average']:
        cells = "".join(f"{results[repo][label][metric]:<10.4f} " for metric in metrics)
        out = f"{repo:<25}{label:<10}" + cells
        print(out)

Repository               Label     Precision  Recall     F1
---------------------------------------------------------------
facebook/react           bug       0.9065     0.9700     0.9372     
facebook/react           feature   0.8627     0.8800     0.8713     
facebook/react           question  0.8571     0.7800     0.8168     
facebook/react           average   0.8755     0.8767     0.8751     
---------------------------------------------------------------
tensorflow/tensorflow    bug       0.9375     0.9000     0.9184     
tensorflow/tensorflow    feature   0.8958     0.8600     0.8776     
tensorflow/tensorflow    question  0.7778     0.8400     0.8077     
tensorflow/tensorflow    average   0.8704     0.8667     0.8679     
---------------------------------------------------------------
microsoft/vscode         bug       0.8061     0.7900     0.7980     
microsoft/vscode         feature   0.7565     0.8700     0.8093     
microsoft/vscode         question  0.8621     0.7500     0