In [1]:
from datasets import load_dataset, \
    concatenate_datasets

In [2]:
# Identifier of the code-comment classification dataset handed to `load_dataset`.
DATASET_ID = 'NLBSE/nlbse25-code-comment-classification'
data = load_dataset(DATASET_ID)

In [3]:
# Display the loaded splits together with their columns and row counts.
data

DatasetDict({
    java_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 7614
    })
    java_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1725
    })
    python_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1884
    })
    python_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 406
    })
    pharo_train: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 1298
    })
    pharo_test: Dataset({
        features: ['index', 'class', 'comment_sentence', 'partition', 'combo', 'labels'],
        num_rows: 289
    })
})

In [10]:
# Label names per language. `split_list_into_columns` pairs them positionally
# with the entries of each row's `labels` list, so their order matters.
labels = {
    'java': [
        'summary',
        'Ownership',
        'Expand',
        'usage',
        'Pointer',
        'deprecation',
        'rational',
    ],
    'python': [
        'Usage',
        'Parameters',
        'DevelopmentNotes',
        'Expand',
        'Summary',
    ],
    'pharo': [
        'Keyimplementationpoints',
        'Example',
        'Responsibilities',
        'Classreferences',
        'Intent',
        'Keymessages',
        'Collaborators',
    ],
}

# Languages in the insertion order of the mapping above.
langs = list(labels)

def split_list_into_columns(row, lang):
    """Expand a row's ``labels`` list into one named entry per label.

    Args:
        row: Mapping with a ``labels`` entry that can be indexed by position.
        lang: Key into the module-level ``labels`` mapping; selects which
            label names are paired with the positions of ``row['labels']``.

    Returns:
        A new dict mapping each name in ``labels[lang]`` to the value found
        at the same position in ``row['labels']``.

    Raises:
        KeyError: If ``lang`` is not a key of ``labels`` or ``row`` has no
            ``labels`` entry.
        IndexError: If ``row['labels']`` holds fewer values than there are
            label names for ``lang``.
    """
    values_list = row['labels']
    # enumerate() supplies the position directly, so there is no repeated
    # list.index() scan, and the builtin `dict` is no longer shadowed by a local.
    return {name: values_list[position] for position, name in enumerate(labels[lang])}

def print_labels(data, lang):
    """Print positive/negative row counts for every label of ``lang``.

    For each name in ``labels[lang]`` two lines are printed: the label name,
    then the number of rows whose label column equals 1 (positive), the
    number whose column equals 0 (negative), their sum, and the positive
    share of that sum.

    Args:
        data: pandas DataFrame with a ``class`` column and one column per
            name in ``labels[lang]``.
        lang: Key into the module-level ``labels`` mapping.
    """
    for label in labels[lang]:
        # Non-null `class` entries per distinct value of the label column.
        counts = data.groupby(label)["class"].count()
        # .get() falls back to 0 so a label without any positive (or any
        # negative) rows is reported instead of raising KeyError.
        positive = int(counts.get(1, 0))
        negative = int(counts.get(0, 0))
        total = positive + negative
        # Guard the ratio against an empty frame.
        ratio = positive / total if total else float("nan")
        print("label " + label)
        print("positive", positive, ",negative", negative, "---", total, "-", ratio)

def _language_frame(lang):
    """Merge the train and test splits of ``lang`` and return them as a pandas DataFrame
    with one extra column per label name."""
    merged = concatenate_datasets([data[lang + "_train"], data[lang + "_test"]])
    expanded = merged.map(lambda row: split_list_into_columns(row, lang))
    return expanded.to_pandas()


java = _language_frame("java")
python = _language_frame("python")
pharo = _language_frame("pharo")


In [11]:
# Label distribution over the combined Java train and test rows.
print_labels(data=java, lang="java")

label summary
positive 4502 ,negative 4837 --- 9339 - 0.48206446086304744
label Ownership
positive 312 ,negative 9027 --- 9339 - 0.033408287825248954
label Expand
positive 611 ,negative 8728 --- 9339 - 0.06542456365777921
label usage
positive 2524 ,negative 6815 --- 9339 - 0.27026448227861655
label Pointer
positive 1088 ,negative 8251 --- 9339 - 0.11650069600599636
label deprecation
positive 132 ,negative 9207 --- 9339 - 0.014134275618374558
label rational
positive 379 ,negative 8960 --- 9339 - 0.04058250348002998


In [12]:
# Label distribution over the combined Python train and test rows.
print_labels(data=python, lang="python")

label Usage
positive 699 ,negative 1591 --- 2290 - 0.3052401746724891
label Parameters
positive 700 ,negative 1590 --- 2290 - 0.3056768558951965
label DevelopmentNotes
positive 251 ,negative 2039 --- 2290 - 0.10960698689956332
label Expand
positive 407 ,negative 1883 --- 2290 - 0.1777292576419214
label Summary
positive 429 ,negative 1861 --- 2290 - 0.1873362445414847


In [13]:
# Label distribution over the combined Pharo train and test rows.
print_labels(data=pharo, lang="pharo")

label Keyimplementationpoints
positive 221 ,negative 1366 --- 1587 - 0.13925645872715817
label Example
positive 666 ,negative 921 --- 1587 - 0.41965973534971646
label Responsibilities
positive 297 ,negative 1290 --- 1587 - 0.18714555765595464
label Classreferences
positive 50 ,negative 1537 --- 1587 - 0.0315059861373661
label Intent
positive 181 ,negative 1406 --- 1587 - 0.11405166981726528
label Keymessages
positive 257 ,negative 1330 --- 1587 - 0.16194076874606175
label Collaborators
positive 86 ,negative 1501 --- 1587 - 0.05419029615626969


label summary
positive 3610 ,negative 4004 --- 7614 - 0.4741266088783819
label Ownership
positive 267 ,negative 7347 --- 7614 - 0.035066981875492516
label Expand
positive 509 ,negative 7105 --- 7614 - 0.06685053848174416
label usage
positive 2093 ,negative 5521 --- 7614 - 0.27488836354084584
label Pointer
positive 904 ,negative 6710 --- 7614 - 0.11872865773574993
label deprecation
positive 117 ,negative 7497 --- 7614 - 0.015366430260047281
label rational
positive 311 ,negative 7303 --- 7614 - 0.04084581034935645
label Usage
positive 578 ,negative 1306 --- 1884 - 0.3067940552016985
label Parameters
positive 572 ,negative 1312 --- 1884 - 0.3036093418259023
label DevelopmentNotes
positive 210 ,negative 1674 --- 1884 - 0.11146496815286625
label Expand
positive 343 ,negative 1541 --- 1884 - 0.18205944798301485
label Summary
positive 347 ,negative 1537 --- 1884 - 0.18418259023354563
label Keyimplementationpoints
positive 178 ,negative 1120 --- 1298 - 0.13713405238828968
label Example
positiv