### NLBSE'24 FastText Baseline

In [33]:
RANDOM_SEED = 42
OUTPUT_PATH = 'output/fasttext'

!mkdir -p $OUTPUT_PATH

In [34]:
from datasets import Dataset

ds = Dataset.from_csv({ "train": "data/issues_train.csv", "test": "data/issues_test.csv" })
ds

DatasetDict({
    train: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'created_at', 'label', 'title', 'body'],
        num_rows: 1500
    })
})

In [35]:
repos = ds["train"].unique("repo")
print(repos)

['facebook/react', 'tensorflow/tensorflow', 'microsoft/vscode', 'bitcoin/bitcoin', 'opencv/opencv']


In [36]:
ds["train"].to_pandas().groupby(["repo", "label"]).size().unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [37]:
import gensim

def process_dataset(example):

    # concatenate title and body
    text = example['title'] or "" + " " + example['body'] or ""

    # escape fasttext special sequences
    text = text.replace("__label__", "")

    # remove consecutive whitespace characters and convert tabs to spaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    
    example['text'] = text

    return example

ds = ds.shuffle(seed=RANDOM_SEED)
ds = ds.map(process_dataset)
ds = ds.select_columns(['repo', 'label','text'])
ds

DatasetDict({
    train: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
    test: Dataset({
        features: ['repo', 'label', 'text'],
        num_rows: 1500
    })
})

In [38]:
import fasttext

def repo_eq(repo: str):
    return lambda example: example['repo'] == repo

def to_fasttext_train_file(dataset, o_path):
    with open(o_path, "w", encoding='utf-8') as o_f:
        for example in dataset:
            o_f.write(f"__label__{example['label']} {example['text']}\n")

references = {}
predictions = {}

for repo in repos:
    train_file = f"{OUTPUT_PATH}/issues-{repo.replace('/','-')}.train"
    model_file = f"{OUTPUT_PATH}/issues-{repo.replace('/','-')}.bin"

    train_set = ds.filter(repo_eq(repo))["train"]

    to_fasttext_train_file(train_set, train_file)

    model = fasttext.train_supervised(input=train_file, epoch=100, seed=RANDOM_SEED)

    model.save_model(model_file)
    
    test_set = ds.filter(repo_eq(repo))["test"]
    
    references[repo] = test_set['label']
    predictions[repo] = model.predict(test_set['text'], k=1)

    predictions[repo] = predictions[repo][0]
    predictions[repo] = [pred[0] for pred in predictions[repo]]
    predictions[repo] = [pred.replace('__label__', '') for pred in predictions[repo]]


Read 0M words
Number of words:  1081
Number of labels: 3
Progress: 100.0% words/sec/thread:  301565 lr:  0.000000 avg.loss:  0.141648 ETA:   0h 0m 0s
Read 0M words
Number of words:  1180
Number of labels: 3
Progress: 100.0% words/sec/thread:  268792 lr:  0.000000 avg.loss:  0.195273 ETA:   0h 0m 0s
Read 0M words
Number of words:  1186
Number of labels: 3
Progress: 100.0% words/sec/thread:  270802 lr:  0.000000 avg.loss:  0.194122 ETA:   0h 0m 0s
Read 0M words
Number of words:  1236
Number of labels: 3
Progress: 100.0% words/sec/thread:  246496 lr:  0.000000 avg.loss:  0.198439 ETA:   0h 0m 0s
Read 0M words
Number of words:  1241
Number of labels: 3
Progress: 100.0% words/sec/thread:  260606 lr:  0.000000 avg.loss:  0.187643 ETA:   0h 0m 0s


In [39]:
from sklearn.metrics import classification_report
from numpy import mean

results = {}
metrics = ['precision', 'recall', 'f1-score']
labels = ['bug', 'feature', 'question']

for repo in repos:
  results[repo] = classification_report(references[repo], predictions[repo], digits=4, output_dict=True)
  results[repo]['average'] = results[repo]['weighted avg']
  results[repo] = {label: {metric: results[repo][label][metric] for metric in metrics} for label in labels + ['average']}

results['overall'] = {label: {metric: mean([results[repo][label][metric] for repo in repos]) for metric in metrics} for label in labels + ['average']}


In [40]:
import json
import os

output_file_name = 'results.json'

with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp, indent=2)

print(f"Repository{' '*15}Label     Precision  Recall     F1")
for repo in repos + ['overall']:
  print("-"*63)
  for label in labels + ['average']:
    out = f"{repo:<25}{label:<10}"
    for metric in metrics:
      out += f"{results[repo][label][metric]:<10.4f} "
    print(out)

Repository               Label     Precision  Recall     F1
---------------------------------------------------------------
facebook/react           bug       0.9184     0.9000     0.9091     
facebook/react           feature   0.6577     0.7300     0.6919     
facebook/react           question  0.6593     0.6000     0.6283     
facebook/react           average   0.7451     0.7433     0.7431     
---------------------------------------------------------------
tensorflow/tensorflow    bug       0.6122     0.6000     0.6061     
tensorflow/tensorflow    feature   0.5596     0.6100     0.5837     
tensorflow/tensorflow    question  0.4624     0.4300     0.4456     
tensorflow/tensorflow    average   0.5447     0.5467     0.5451     
---------------------------------------------------------------
microsoft/vscode         bug       0.5761     0.5300     0.5521     
microsoft/vscode         feature   0.6154     0.7200     0.6636     
microsoft/vscode         question  0.6044     0.5500     0