In [None]:
# %pip install openai datasets

In [210]:
from openai import OpenAI
import numpy as np
import random

In [211]:
from datasets import Dataset, DatasetDict, load_dataset
# dataset from hf_hub
langs = ['java', 'python', 'pharo']
labels = {
    'java': ['summary', 'Ownership', 'Expand', 'usage', 'Pointer', 'deprecation', 'rational'],
    'python': ['Usage', 'Parameters', 'DevelopmentNotes', 'Expand', 'Summary'],
    'pharo': ['Keyimplementationpoints', 'Example', 'Responsibilities', 'Classreferences', 'Intent', 'Keymessages', 'Collaborators']
}
ds = load_dataset('NLBSE/nlbse25-code-comment-classification')
ds

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [212]:
client = OpenAI(
    base_url = 'http://localhost:11434/v1',
    api_key='ollama',
)

In [None]:
training_data = {}
for lan in langs:
  training_data[lan] = []
  for row in ds[f'{lan}_train']:
    comment = row['combo']
    classes = row['labels']
    for i in range(len(classes)):
      question = {"role": "user", "content": f"Does the following comment belong to the class '{labels['java'][i]}'?: {comment}"}
      if classes[i] == 1:
        response = {"role": "assistant", "content": "yes"}
      else:
        response = {"role": "assistant", "content": "no"}
      training_data[lan].append((question, response))
training_data

{'java': [({'role': 'user',
    'content': "Does the following comment belong to the class 'summary'?: azure blob file system implementation of abstractfilesystem. | Abfss.java"},
   {'role': 'assistant', 'content': 'yes'}),
  ({'role': 'user',
    'content': "Does the following comment belong to the class 'Ownership'?: azure blob file system implementation of abstractfilesystem. | Abfss.java"},
   {'role': 'assistant', 'content': 'no'}),
  ({'role': 'user',
    'content': "Does the following comment belong to the class 'Expand'?: azure blob file system implementation of abstractfilesystem. | Abfss.java"},
   {'role': 'assistant', 'content': 'no'}),
  ({'role': 'user',
    'content': "Does the following comment belong to the class 'usage'?: azure blob file system implementation of abstractfilesystem. | Abfss.java"},
   {'role': 'assistant', 'content': 'no'}),
  ({'role': 'user',
    'content': "Does the following comment belong to the class 'Pointer'?: azure blob file system implementa

In [217]:
predictions = {}

for lan in langs:
  sample = random.sample(training_data[lan], 25)
  sample = [x for tup in sample for x in tup]
  print(len(sample))

  predictions[lan] = []
  for row in ds[f'{lan}_test']:
    comment = row['combo']
    y = []
    for label in labels[lan]:
      query = f"Does the following comment belong to the class '{label}'?: {comment}"
      
      print(query)

      response = client.chat.completions.create(
        model="llama3.2",
        messages=[
          {"role": "system", "content": "You are a code comment classifier. When asked if a code comment belongs to a given class, you will answer 'yes' if the comment belongs to the class, or 'no' if it does not. Do not output anything other than 'yes' or 'no'."},
          *sample,
          {"role": "user", "content": query},
        ]
      )
      message = response.choices[0].message.content.lower()
      y.append(1 if message.startswith('yes') else 0)
    predictions[lan].append(y)

predictions

50
Does the following comment belong to the class 'summary'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'Ownership'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'Expand'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'usage'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'Pointer'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'deprecation'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'rational'?: accept everything. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the class 'summary'?: accept nothing. | AbstractContractGetFileStatusTest.java
Does the following comment belong to the cla

{'java': [[1, 1, 1, 1, 0, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [0, 1, 1, 1, 1, 1, 0],
  [1, 0, 1, 0, 1, 1, 0],
  [1, 1, 1, 1, 1, 1, 0],
  [1, 1, 1, 1, 1, 0, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 0, 1, 1, 0, 1],
  [1, 1, 1, 1, 1, 0, 1],
  [1, 0, 1, 1, 1, 0, 0],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 0, 0],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [0, 1, 1, 1, 1, 1, 1],
  [0, 1, 0, 1, 0, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 0, 1, 1, 0, 1],
  [1, 0, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 0, 0, 1],
  [1, 0, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 0],
  [1, 1, 0, 0, 1, 0, 1],
  [0, 1, 1, 1, 1, 1, 1],
  [1, 1, 0, 1, 1, 1, 1],
  [1, 1, 0, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 0, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 0, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 0],
  [1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1],
  [0, 1, 1, 1, 1,

In [218]:
import pandas as pd

# Copied nearly verbatim from https://github.com/nlbse2025/code-comment-classification/blob/main/SetFit_baseline.ipynb
# to ensure statistics are calculated the same way.
scores = []
for lan in langs:
    y_pred = np.array(predictions[lan]).T
    y_true = np.array(ds[f'{lan}_test']['labels']).T

    for i in range(len(y_pred)):
        assert(len(y_pred[i]) == len(y_true[i]))
        tp = sum([true == pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
        tn = sum([true == pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
        fp = sum([true == 0 and pred == 1 for (true,pred) in zip(y_true[i], y_pred[i])])
        fn = sum([true == 1 and pred == 0 for (true,pred) in zip(y_true[i], y_pred[i])])
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2*tp) / (2*tp + fp + fn)
        scores.append({'lan': lan, 'cat': labels[lan][i],'precision': precision,'recall': recall,'f1': f1})
scores = pd.DataFrame(scores)
scores

Unnamed: 0,lan,cat,precision,recall,f1
0,java,summary,0.513198,0.93722,0.663229
1,java,Ownership,0.025586,0.8,0.049587
2,java,Expand,0.05663,0.803922,0.105806
3,java,usage,0.253795,0.969838,0.40231
4,java,Pointer,0.113948,0.994565,0.204469
5,java,deprecation,0.010226,0.933333,0.020231
6,java,rational,0.039801,0.823529,0.075932
7,python,Usage,0.303371,0.892562,0.45283
8,python,Parameters,0.3,0.796875,0.435897
9,python,DevelopmentNotes,0.115556,0.634146,0.195489
