In [None]:
# Install the third-party packages this notebook imports into the kernel's environment.
# NOTE(review): versions are unpinned, so results may differ between installs — consider pinning.
%pip install pandas sentence-transformers setfit scikit-learn datasets

In [None]:
import pandas as pd
import json
import os
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.metrics import classification_report
from collections import defaultdict

In [None]:
# Name of the pretrained checkpoint passed to SetFitModel.from_pretrained.
BASE_MODEL = "all-mpnet-base-v2"
# Seed value for reproducible runs.
RANDOM_SEED = 42
# Directory that the results JSON file is written into.
OUTPUT_PATH = 'output'

In [None]:
# Load the train and test splits from CSV.
# NOTE(review): later cells read the title, body, label and repo columns — confirm both files provide them.
train_set = pd.read_csv("data/issues_train.csv")
test_set = pd.read_csv("data/issues_test.csv")

In [None]:
# Count training rows per (repo, label) pair; labels become columns, with 0 for absent pairs.
train_set.groupby(["repo", "label"]).size().unstack(fill_value=0)

In [None]:
def process_dataset(dataset):
    """Build the model-input frame from raw issue records.

    Joins ``title`` and ``body`` with a single space into a ``text`` column
    and keeps only the ``text``, ``label`` and ``repo`` columns.

    A missing title or body is treated as an empty string; without this a
    NaN in either column would turn the whole ``text`` value into NaN. The
    input frame is left unmodified, so the function can be called again on
    the same raw data.

    Args:
        dataset: DataFrame with ``title``, ``body``, ``label`` and ``repo``
            columns.

    Returns:
        A new DataFrame with the columns ``text``, ``label``, ``repo`` (in
        that order).
    """
    text = dataset['title'].fillna("") + " " + dataset['body'].fillna("")
    # assign() returns a copy, so the caller's frame does not gain a column.
    return dataset.assign(text=text)[['text', 'label', 'repo']]

In [None]:
# Replace each raw frame with the frame returned by process_dataset.
train_set = process_dataset(train_set)
test_set = process_dataset(test_set)

In [None]:
def group_by_repo(dataset):
    """Split a DataFrame into one ``Dataset`` per repository.

    For every distinct ``repo`` value (in order of first appearance) the
    matching rows are converted with ``Dataset.from_pandas`` and the
    ``label`` column is class-encoded on that per-repository dataset.

    Args:
        dataset: DataFrame with at least ``repo`` and ``label`` columns.

    Returns:
        Dict mapping each repo value to its encoded ``Dataset``.
    """
    per_repo = {}
    for repo in dataset["repo"].unique():
        repo_rows = dataset[dataset["repo"] == repo]
        per_repo[repo] = Dataset.from_pandas(repo_rows).class_encode_column("label")
    return per_repo


train_sets = group_by_repo(train_set)
test_sets = group_by_repo(test_set)

In [None]:
# Pair each repository's train split with its test split.
# Indexing test_sets[repo] raises KeyError for a repo that has no test split.
datasets = {}
for repo, repo_train in train_sets.items():
    datasets[repo] = {'train': repo_train, 'test': test_sets[repo]}

In [None]:
# Fine-tune and evaluate one SetFit classifier per repository.
# results[repo] collects that repository's metrics, predictions and label mapping.
results = defaultdict(dict)
for repo, splits in datasets.items():
    # Loop-local names: the notebook-level `train_set` / `test_set` DataFrames
    # read by the grouping cell are not replaced by Dataset objects.
    repo_train, repo_test = splits['train'], splits['test']

    # Train and test labels were class-encoded independently. If their class
    # names differ, the integer ids do not line up and the metrics below would
    # be silently wrong, so fail loudly instead.
    label_feature = repo_train.features["label"]
    test_label_names = repo_test.features["label"].names
    if test_label_names != label_feature.names:
        raise ValueError(
            f"Label sets differ for repo {repo!r}: "
            f"train={label_feature.names}, test={test_label_names}"
        )

    # A fresh base model per repository so no weights carry over between runs.
    model = SetFitModel.from_pretrained(BASE_MODEL)

    trainer = SetFitTrainer(
        model=model,
        train_dataset=repo_train,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_epochs=1,
        num_iterations=20,
        seed=RANDOM_SEED,  # make the configured seed explicit
    )
    trainer.train()

    y_pred = trainer.model.predict(repo_test['text'])
    results[repo]['metrics'] = classification_report(repo_test['label'], y_pred, digits=4, output_dict=True)
    results[repo]['predictions'] = y_pred.tolist()
    # Stored per repository: the predictions above are ids from this
    # repository's own encoding, and a top-level key would show up as a bogus
    # "repo" when iterating over `results`.
    results[repo]['label_mapping'] = {name: idx for idx, name in enumerate(label_feature.names)}

In [None]:
# Show the classification report of every repository.
for repo, entry in results.items():
    # `results` may also hold non-repository entries (e.g. a top-level
    # 'label_mapping'); they carry no metrics and would raise KeyError.
    if not isinstance(entry, dict) or 'metrics' not in entry:
        continue
    print(repo)
    print(entry['metrics'])

In [None]:
# Macro-averaged F1 of each repository; entries without metrics (e.g. a
# top-level 'label_mapping') are not repositories and are left out.
f1_scores = [
    entry['metrics']['macro avg']['f1-score']
    for entry in results.values()
    if isinstance(entry, dict) and 'metrics' in entry
]
# Report NaN instead of dividing by zero when no repository was evaluated.
mean_score = sum(f1_scores) / len(f1_scores) if f1_scores else float('nan')

print(f"Mean F1 score: {mean_score}")

In [None]:
# Persist all results as JSON inside OUTPUT_PATH.
output_file_name = 'results.json'
# open() does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_PATH, exist_ok=True)
with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp)