In [12]:
%pip install pandas sentence-transformers setfit scikit-learn datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import json
import os
from setfit import SetFitModel, SetFitTrainer
from sentence_transformers.losses import CosineSimilarityLoss
from datasets import Dataset
from sklearn.metrics import classification_report
from collections import defaultdict

In [14]:
BASE_MODEL = "all-mpnet-base-v2"
RANDOM_SEED = 42
OUTPUT_PATH = 'output'

In [15]:
train_set = pd.read_csv("data/issues_train.csv")
test_set = pd.read_csv("data/issues_test.csv")

In [16]:
train_set.groupby(["repo", "label"]).size().unstack(fill_value=0)

label,bug,feature,question
repo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bitcoin/bitcoin,100,100,100
facebook/react,100,100,100
microsoft/vscode,100,100,100
opencv/opencv,100,100,100
tensorflow/tensorflow,100,100,100


In [17]:
def process_dataset(dataset):
    dataset['text'] = dataset['title'] + " " + dataset['body']
    dataset = dataset[['text', 'label', 'repo']]
    return dataset

In [18]:
train_set = process_dataset(train_set)
test_set = process_dataset(test_set)

In [19]:
group_by_repo = lambda dataset: {
    repo: Dataset.from_pandas(dataset[dataset["repo"] == repo]).class_encode_column("label")
    for repo in dataset["repo"].unique()
}

train_sets = group_by_repo(train_set)
test_sets = group_by_repo(test_set)

Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 60219.73 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 92209.53 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 112588.69 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 97716.18 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 67295.50 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 98681.77 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 89082.56 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 109226.67 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 96665.22 examples/s]
Casting to class labels: 100%|██████████| 300/300 [00:00<00:00, 99430.36 examples/s]


In [20]:
datasets = {
    repo: {'train': train_sets[repo], 'test': test_sets[repo]} for repo in train_sets.keys()
}

In [21]:
results = defaultdict(dict)
for repo in datasets.keys():
    train_set, test_set = datasets[repo]['train'], datasets[repo]['test']
    model = SetFitModel.from_pretrained(BASE_MODEL)

    trainer = SetFitTrainer(
        model=model,
        train_dataset=train_set,
        loss_class=CosineSimilarityLoss,
        metric="accuracy",
        batch_size=16,
        num_epochs=1,
        num_iterations=20,
    )
    trainer.train()
    y_pred = trainer.model.predict(test_set['text'])
    results[repo]['metrics'] = classification_report(test_set['label'], y_pred, digits=4, output_dict=True)
    results[repo]['predictions'] = y_pred.tolist()
    results['label_mapping'] = {train_set.features["label"].int2str(x): x for x in range(train_set.features["label"].num_classes)}

config.json not found in HuggingFace Hub.


model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Generating Training Pairs: 100%|██████████| 20/20 [00:00<00:00, 43.69it/s]
***** Running training *****
  Num examples = 12000
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16
Iteration: 100%|██████████| 750/750 [06:37<00:00,  1.89it/s]
Epoch: 100%|██████████| 1/1 [06:37<00:00, 397.83s/it]
config.json not found in HuggingFace Hub.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Generating Training Pairs: 100%|██████████| 20/20 [00:01<00:00, 11.46it/s]
***** Running training *****
  Num examples = 12000
  Num epochs = 1
  Total optimization steps = 750
  Total train batch size = 16
Iteration: 100%|██████████| 750/750 [06:44<00:00,  

KeyboardInterrupt: 

In [None]:
for repo in results.keys():
    print(repo)
    print(results[repo]['metrics'])

facebook/react
{'0': {'precision': 0.897196261682243, 'recall': 0.96, 'f1-score': 0.927536231884058, 'support': 100.0}, '1': {'precision': 0.839622641509434, 'recall': 0.89, 'f1-score': 0.8640776699029127, 'support': 100.0}, '2': {'precision': 0.8620689655172413, 'recall': 0.75, 'f1-score': 0.8021390374331552, 'support': 100.0}, 'accuracy': 0.8666666666666667, 'macro avg': {'precision': 0.8662959562363062, 'recall': 0.8666666666666667, 'f1-score': 0.8645843130733754, 'support': 300.0}, 'weighted avg': {'precision': 0.8662959562363061, 'recall': 0.8666666666666667, 'f1-score': 0.8645843130733752, 'support': 300.0}}
label_mapping


KeyError: 'metrics'

In [None]:
f1_scores = [results[repo]['metrics']['macro avg']['f1-score'] for repo in results.keys()]
mean_score = sum(f1_scores) / len(f1_scores)

print(f"Mean F1 score: {mean_score}")

In [None]:
output_file_name = 'results.json'
with open(os.path.join(OUTPUT_PATH, output_file_name), 'w') as fp:
    json.dump(results, fp)