# fastText Commit Classification

In [1]:
!pip install fasttext



In [2]:
import fasttext
# Train a supervised fastText classifier on the training split.
model = fasttext.train_supervised(
    input="commits.train",
    lr=0.1,
    epoch=25,
    wordNgrams=2,
)

# Persist the trained model to disk so it can be reloaded later.
model.save_model("fasttext.model")

# Evaluate on the validation split; the result is kept in `stats` and is not
# displayed by this cell.
stats = model.test("commits.valid")

# Sanity check on a single commit message (this is the cell's displayed value).
model.predict(['fixed some ClassInfo bugsSummary:for better re.'])

Read 0M words
Number of words:  11972
Number of labels: 5
Progress: 100.0% words/sec/thread:  732459 lr:  0.000000 avg.loss:  0.183071 ETA:   0h 0m 0s


([['__label__corrective']], [array([0.99983376], dtype=float32)])

In [3]:
import pandas as pd

# Path of the labelled commits file that this cell opens and parses
# ("arquivo" is Portuguese for "file").
arquivo = "commits.valid" 

def process_line(line):
    """Split one fastText-formatted line into its label and its text.

    Args:
        line: A single line of text, expected to look like
            ``__label__<name> <text>``.

    Returns:
        A ``(label, comment)`` tuple when ``line`` starts with ``__label__``;
        ``comment`` is ``''`` when nothing follows the label.
        ``None`` when ``line`` does not start with ``__label__``.
    """
    if not line.startswith("__label__"):
        return None

    # sep=None splits on any run of whitespace, so a tab (or several spaces)
    # between the label and the text is handled as well as a single space.
    label, *rest = line.split(None, 1)
    comment = rest[0] if rest else ''
    return label, comment

def load_labeled_commits(path):
    """Read a ``__label__``-prefixed text file into ``(label, comment)`` pairs.

    A line starting with ``__label__`` opens a new record. Every following
    non-empty line without that prefix is treated as a continuation of the
    current comment and is appended to it, separated by a single space.

    Blank lines are skipped so they do not inject stray spaces, and text that
    appears before the first labelled line is ignored because there is no
    record to attach it to.

    Args:
        path: Path of the UTF-8 text file to read.

    Returns:
        A list of ``(label, comment)`` tuples in file order.
    """
    records = []
    label = None
    fragments = []

    with open(path, "r", encoding="utf-8") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                continue

            parsed = process_line(line)  # None for lines without a label
            if parsed is not None:
                # A new labelled line closes the record collected so far.
                if label is not None:
                    records.append((label, " ".join(fragments)))
                label, fragments = parsed[0], [parsed[1]]
            elif label is not None:
                fragments.append(line)

    # Flush the last record, which has no following label to close it.
    if label is not None:
        records.append((label, " ".join(fragments)))
    return records


data = load_labeled_commits(arquivo)
df = pd.DataFrame(data, columns=['label', 'comment'])

print(df.head())


                    label                                            comment
0        __label__unknown  infra .  chart: fix socket info items duplicat...
1  __label__nonfunctional  performance improvements to get the existing f...
2       __label__features  enable action item text display on pre-android...
3  __label__nonfunctional  issue 55 enhanced integration tests to support...
4        __label__unknown  add a simple duplicate method to maintain cons...


In [4]:
# Classify every comment in one batch. For a list input, model.predict returns
# a pair: (list of label lists, list of probability arrays), one entry per text.
commits = list(df['comment'].astype(str))
labels = model.predict(commits)
labels_per_commit, _probabilities = labels

# Keep only the first (top) label predicted for each comment.
df['label_predicted'] = [top_labels[0] for top_labels in labels_per_commit]
df.head()

Unnamed: 0,label,comment,label_predicted
0,__label__unknown,infra . chart: fix socket info items duplicat...,__label__unknown
1,__label__nonfunctional,performance improvements to get the existing f...,__label__nonfunctional
2,__label__features,enable action item text display on pre-android...,__label__features
3,__label__nonfunctional,issue 55 enhanced integration tests to support...,__label__nonfunctional
4,__label__unknown,add a simple duplicate method to maintain cons...,__label__unknown


In [5]:
# Rows where the predicted label differs from the label read from the file.
is_misclassified = df['label'] != df['label_predicted']
df_filtered = df[is_misclassified]
df_filtered

Unnamed: 0,label,comment,label_predicted
54,__label__features,"""implemented optional create-on-push if this ...",__label__corrective
246,__label__features,"""change separator for shards preference the s...",__label__corrective
436,__label__features,"""what ' s new and migrating are prominently fe...",__label__perfective
484,__label__nonfunctional,"""[iphone] now decoders need to be injected int...",__label__corrective
605,__label__features,"""enable validation of optional values to vali...",__label__corrective
...,...,...,...
2264,__label__unknown,clutter-1 . 0: update to 1 . 2 . 4,__label__features
2265,__label__unknown,fixes bug 614268 .,__label__corrective
2266,__label__unknown,gtksourceview-2 . 0: update to 2 . 10 . 0,__label__features
2267,__label__unknown,"""rest-0 . 6: rename rest to rest-0 . 6""""""",__label__features


In [6]:
# Rows that the model classified as "corrective", whatever their true label.
is_predicted_corrective = df['label_predicted'] == '__label__corrective'
df_corrective = df[is_predicted_corrective]
df_corrective

Unnamed: 0,label,comment,label_predicted
7,__label__corrective,bug 10496: fixed various problems in scandirec...,__label__corrective
11,__label__corrective,cloudstack-9002: vm deployment is successful e...,__label__corrective
12,__label__corrective,"""ip address page - pf lb tab - fix a bug that...",__label__corrective
13,__label__corrective,new ui - restore vm - fix a bug that vm info d...,__label__corrective
14,__label__corrective,"""[fixed hudson-3875] according to the bug repo...",__label__corrective
...,...,...,...
2245,__label__perfective,do not use `weak ' modifier in *-custom . val...,__label__corrective
2257,__label__unknown,fixes bug 577352 .,__label__corrective
2259,__label__unknown,fixes bug 610330 .,__label__corrective
2261,__label__unknown,gstreamer-video-0 . 10: don ' t hide gst . vid...,__label__corrective


In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score

# True labels from the file vs. the labels predicted by the fastText model.
y_true, y_pred = df['label'], df['label_predicted']

# Compute the metrics, macro-averaged (average over the classes).
precision = precision_score(y_true, y_pred, average="macro")
recall = recall_score(y_true, y_pred, average="macro")
f1 = f1_score(y_true, y_pred, average="macro")

print(
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-score: {f1:.2f}",
    sep="\n",
)

Precision: 0.81
Recall: 0.81
F1-score: 0.81


In [8]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 and support, printed as a text table.
report = classification_report(y_true, y_pred)
print(report)

                        precision    recall  f1-score   support

   __label__corrective       0.85      0.74      0.79       765
     __label__features       0.63      0.75      0.68       530
__label__nonfunctional       0.91      1.00      0.95       246
   __label__perfective       0.80      0.78      0.79       640
      __label__unknown       0.88      0.78      0.83        88

              accuracy                           0.78      2269
             macro avg       0.81      0.81      0.81      2269
          weighted avg       0.79      0.78      0.78      2269

