### NLBSE'24 SetFit Baseline

In [1]:
BASE_MODEL = "sentence-transformers/all-mpnet-base-v2"
# BASE_MODEL = "sentence-transformers/all-MiniLM-L12-v2"

RANDOM_SEED = 42
OUTPUT_PATH = 'output/setfit'

!mkdir -p $OUTPUT_PATH

In [2]:
from datasets import Dataset

ds = Dataset.from_csv({ "train": "data/issues_train.csv", "test": "data/issues_test.csv" })
ds

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
})

In [3]:
repos = ds["train"].unique("repo")
print(repos)

['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode', 'bitcoin/bitcoin', 'opencv/opencv']


In [4]:
ds["train"].to_pandas().groupby(["repo", "label"]).size().unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [5]:
import re

def process_dataset(example):

    # concatenate title and body
    text = example['title'] or "" + " " + example['body'] or ""

    # Remove strings between triple quotes
    # text = re.sub(r'```.*?```', ' ', text, flags=re.DOTALL)

    # Remove new lines
    # text = re.sub(r'\n', ' ', text)

    # Remove links
    # text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', ' ', text)

    # Remove digits
    # text = re.sub(r'\d+', ' ', text)

    # Remove special characters except the question marks
    # text = re.sub(r'[^a-zA-Z0-9?\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    
    example['text'] = text
    return example

In [6]:
ds = ds.shuffle(seed=RANDOM_SEED)
ds = ds.map(process_dataset)
ds = ds.select_columns(['repo', 'label', 'text'])
ds

DatasetDict({
    train: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
})

In [7]:
import wandb
from setfit import SetFitModel, Trainer, TrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss

references = {}
predictions = {}

def repo_eq(repo: str):
    return lambda example: example['repo'] == repo

for repo in repos:
    wandb.init(
        project="NLBSE'24 Issue Report Classification - SetFit", 
        name=repo, 
    )

    train_set = ds.filter(repo_eq(repo))['train']

    # encoded class labels needed for differentiable head
    # label2id = { "bug": 0, "feature": 1, "question": 2 }
    # train_set = train_set.map(lambda example: { 'label': label2id[example['label']] })

    model = SetFitModel.from_pretrained(BASE_MODEL)

    # model = SetFitModel.from_pretrained(
    #     BASE_MODEL, 
    #     labels=["bug", "feature", "question"],
    #     use_differentiable_head=True, 
    #     head_params={"out_features": 3},
    # )

    args = TrainingArguments(
        output_dir=f'{OUTPUT_PATH}/{repo.replace("/", "-")}',
        save_strategy="no",
        report_to="wandb",
        run_name=repo,
        logging_steps=1,
        seed=RANDOM_SEED,
        loss=CosineSimilarityLoss,
        batch_size=(64, 2),
        num_epochs=1,
        num_iterations=40,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_set,
    )

    trainer.train()

    test_set = ds.filter(repo_eq(repo))['test']

    references[repo] = test_set['label']

    predictions[repo] = model.predict(test_set['text'], batch_size=2)

    wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrafaelkallis[0m. Use [1m`wandb login --relogin`[0m to force relogin
  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 188
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 64


Step,Training Loss


  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,██▇▅▅▅▅▄▃▄▃▃▃▂▂▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0022
train/epoch,1.0
train/global_step,188.0
train/learning_rate,0.0
train/train_runtime,35.7226
train/train_samples_per_second,336.818
train/train_steps_per_second,5.263


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 188
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 64


Step,Training Loss


  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,█▇▇▆▅▆▅▄▅▄▃▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0942
train/epoch,1.0
train/global_step,188.0
train/learning_rate,0.0
train/train_runtime,60.7666
train/train_samples_per_second,198.004
train/train_steps_per_second,3.094


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 188
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 64


Step,Training Loss


  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,██▇▆▅▆▅▅▄▃▄▂▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0049
train/epoch,1.0
train/global_step,188.0
train/learning_rate,0.0
train/train_runtime,31.846
train/train_samples_per_second,377.818
train/train_steps_per_second,5.903


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 188
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 64


Step,Training Loss


  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,▇█▇▅▅▆▅▅▅▄▄▄▄▅▄▄▄▄▄▄▃▃▃▃▂▃▂▂▂▂▁▂▂▁▁▁▂▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0135
train/epoch,1.0
train/global_step,188.0
train/learning_rate,0.0
train/train_runtime,40.6841
train/train_samples_per_second,295.742
train/train_steps_per_second,4.621


  from IPython.core.display import HTML, display  # type: ignore


  from IPython.core.display import HTML, display  # type: ignore


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
***** Running training *****
  Num examples = 188
  Num epochs = 1
  Total optimization steps = 188
  Total train batch size = 64


Step,Training Loss


  from IPython.core.display import HTML, display  # type: ignore


0,1
train/embedding_loss,███▆▇▆▅▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▃▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
train/learning_rate,▂▃▅▇███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/embedding_loss,0.0266
train/epoch,1.0
train/global_step,188.0
train/learning_rate,0.0
train/train_runtime,61.3443
train/train_samples_per_second,196.139
train/train_steps_per_second,3.065


In [8]:
from sklearn.metrics import classification_report
from numpy import mean

results = {}
metrics = ['precision', 'recall', 'f1-score']
labels = ['bug', 'feature', 'question']

for repo in repos:
  results[repo] = classification_report(references[repo], predictions[repo], digits=4, output_dict=True)
  results[repo]['average'] = results[repo]['weighted avg']
  results[repo] = {label: {metric: results[repo][label][metric] for metric in metrics} for label in labels + ['average']}

results['overall'] = {label: {metric: mean([results[repo][label][metric] for repo in repos]) for metric in metrics} for label in labels + ['average']}

In [9]:
import json
import os

output_file_name = 'results.json'

with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp, indent=2)

print(f"Repository{' '*15}Label     Precision  Recall     F1")
for repo in repos + ['overall']:
  print("-"*63)
  for label in labels + ['average']:
    out = f"{repo:<25}{label:<10}"
    for metric in metrics:
      out += f"{results[repo][label][metric]:<10.4f} "
    print(out)

Repository               Label     Precision  Recall     F1
---------------------------------------------------------------
facebook/react           bug       0.8440     0.9200     0.8804     
facebook/react           feature   0.7549     0.7700     0.7624     
facebook/react           question  0.7079     0.6300     0.6667     
facebook/react           average   0.7689     0.7733     0.7698     
---------------------------------------------------------------
tensorflow/tensorflow    bug       0.7113     0.6900     0.7005     
tensorflow/tensorflow    feature   0.7087     0.7300     0.7192     
tensorflow/tensorflow    question  0.5100     0.5100     0.5100     
tensorflow/tensorflow    average   0.6434     0.6433     0.6432     
---------------------------------------------------------------
microsoft/vscode         bug       0.7980     0.7900     0.7940     
microsoft/vscode         feature   0.7568     0.8400     0.7962     
microsoft/vscode         question  0.8111     0.7300     0