In [1]:
# pip install setfit==1.1.2 datasets==3.6.0
# Expects the ./models folder to exist (trained models are saved there).
import pandas as pd
import time
from setfit import SetFitModel, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict, load_dataset
from tqdm.auto import tqdm
import numpy as np
import torch
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Category names per language, used to name each label column when scoring.
# NOTE(review): the order presumably has to match the column order of the
# dataset's `labels` vectors — confirm against the dataset card.
labels = {
    'java': [
        'summary', 'Ownership', 'Expand', 'usage',
        'Pointer', 'deprecation', 'rational',
    ],
    'python': [
        'Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints', 'Example', 'Responsibilities',
        'Intent', 'Keymessages', 'Collaborators',
    ],
}

# Languages to process: java, python, pharo (the insertion order of `labels`).
langs = list(labels)

# Dataset with `<lang>_train` / `<lang>_test` splits for every language.
ds = load_dataset('NLBSE/nlbse26-code-comment-classification')

In [None]:
# Fine-tune one SetFit model per language and save it under ./models/<lang>.
for lang in langs:
    train_split = ds[f'{lang}_train']
    eval_split = ds[f'{lang}_test']
    output_dir = f'./models/{lang}'

    # Start every language from the same pretrained sentence-transformer body,
    # configured for multi-target ("multi-output") classification.
    model = SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-MiniLM-L6-v2",
        multi_target_strategy="multi-output",
    )

    # Java is trained for fewer epochs than the other languages.
    epochs = 5 if lang == 'java' else 10
    args = TrainingArguments(num_epochs=epochs, batch_size=32, num_iterations=20)

    # The dataset stores the input text in `combo` and the targets in `labels`;
    # map them to the `text` / `label` names given in column_mapping.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_split,
        eval_dataset=eval_split,
        column_mapping={"combo": "text", "labels": "label"},
    )
    trainer.train()

    trainer.model.save_pretrained(output_dir)

In [20]:
# Evaluate one model per language:
#   1. run inference on the test split N_RUNS times under the torch profiler,
#      accumulating wall-clock time and profiled FLOPs;
#   2. compute per-category precision / recall / F1 from the last pass.
# Produces `total_time` (seconds), `total_flops` (GFLOPs) — both summed over
# all languages and all N_RUNS passes — and `scores` (one row per category).
N_RUNS = 10  # the scoring cell divides the totals by a literal 10; keep in sync

total_flops = 0
total_time = 0
scores = []
for lan in langs:
    # to load trained models:
    # model = SetFitModel.from_pretrained(f'./models/{lan}')
    # to load pretrained models from Hub:
    model = SetFitModel.from_pretrained(f'NLBSE/nlbse26_{lan}')
    with torch.profiler.profile(with_flops=True) as p:
        x = ds[f'{lan}_test'][:]["combo"]
        # perf_counter is monotonic and high-resolution, unlike time.time().
        begin = time.perf_counter()
        for _ in range(N_RUNS):
            y_pred = model(x)
            # Transpose to (n_categories, n_samples) so each row is one category.
            y_pred = np.asarray(y_pred).T
        total_time += time.perf_counter() - begin
    total_flops += sum(k.flops for k in p.key_averages()) / 1e9

    y_true = np.asarray(ds[f'{lan}_test']['labels']).T
    assert y_pred.shape == y_true.shape, (
        f"{lan}: prediction shape {y_pred.shape} != label shape {y_true.shape}"
    )
    assert len(labels[lan]) == len(y_pred), (
        f"{lan}: {len(labels[lan])} category names for {len(y_pred)} label columns"
    )

    # Per-category confusion counts, computed for all categories at once.
    pred_pos, true_pos = (y_pred == 1), (y_true == 1)
    pred_neg, true_neg = (y_pred == 0), (y_true == 0)
    tp_counts = (true_pos & pred_pos).sum(axis=1).tolist()
    fp_counts = (true_neg & pred_pos).sum(axis=1).tolist()
    fn_counts = (true_pos & pred_neg).sum(axis=1).tolist()

    for cat, tp, fp, fn in zip(labels[lan], tp_counts, fp_counts, fn_counts):
        # A metric with an empty denominator is reported as 0.0 rather than
        # dividing by zero: a NaN here would be skipped by `scores.f1.mean()`
        # and silently inflate the average F1.
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = (2 * tp) / (2 * tp + fp + fn) if (2 * tp + fp + fn) else 0.0
        scores.append({'lan': lan, 'cat': cat, 'precision': precision, 'recall': recall, 'f1': f1})

print("Compute in GFLOPs:", total_flops / N_RUNS)
print("Avg runtime in seconds:", total_time / N_RUNS)
scores = pd.DataFrame(scores)

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*ar

Compute in GFLOPs: 0.355315968
Avg runtime in seconds: 0.9584410905838012


In [21]:
# Display the per-language, per-category precision / recall / F1 table.
scores

Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.871224,0.886731,0.878909
1,java,Ownership,1.0,1.0,1.0
2,java,Expand,0.330097,0.43038,0.373626
3,java,usage,0.883803,0.850847,0.867012
4,java,Pointer,0.775641,0.968,0.86121
5,java,deprecation,0.875,0.7,0.777778
6,java,rational,0.311688,0.413793,0.355556
7,python,Usage,0.666667,0.681319,0.673913
8,python,Parameters,0.688172,0.752941,0.719101
9,python,DevelopmentNotes,0.219178,0.5,0.304762


In [23]:
# Display the dataset splits with their features and row counts.
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 5394
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1201
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 900
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 208
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1368
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 290
    })
})

In [19]:
# Upper bounds used to normalise the efficiency terms of the submission score.
max_avg_runtime = 5   # seconds
max_avg_flops = 5000  # GFLOPs

# submission_score(model) =
#       0.60 * avg_F1
#     + 0.20 * max((max_avg_runtime - measured_avg_runtime) / max_avg_runtime, 0)
#     + 0.20 * max((max_GFLOPs - measured_GFLOPs) / max_GFLOPs, 0)
def score(avg_f1, avg_runtime, avg_flops):
    """Compute the submission score from quality and efficiency measurements.

    Args:
        avg_f1: Mean F1 across all categories.
        avg_runtime: Measured average runtime in seconds.
        avg_flops: Measured average compute in GFLOPs.

    Returns:
        The weighted score: 60% F1, 20% runtime headroom, 20% compute headroom.
        Each headroom term is the fraction of the budget left unused, floored
        at 0 when the measurement exceeds its budget.
    """
    runtime_term = max((max_avg_runtime - avg_runtime) / max_avg_runtime, 0)
    # The `, 0` floor belongs inside max(); the original closed the call too
    # early, which raised TypeError (max() of a single float).
    flops_term = max((max_avg_flops - avg_flops) / max_avg_flops, 0)
    return 0.6 * avg_f1 + 0.2 * runtime_term + 0.2 * flops_term

# Mean F1 over every (language, category) row of the scores table.
avg_f1 = scores["f1"].mean()
# The accumulated totals are divided by 10 to obtain the averages that
# score() expects.
avg_runtime, avg_flops = total_time / 10, total_flops / 10

round(score(avg_f1, avg_runtime, avg_flops), 2)

np.float64(0.76)