### exp004 - Fine-tuning LLaMA 3.1 8B Instruct with PEFT on ~3,065 labeled headlines
https://colab.research.google.com/drive/13S73CHubLU7n8V8GJsXseeWFnNwt1LpE?usp=drive_link

In [None]:
import pandas as pd
from pathlib import Path
import re

def clean_prediction_text(text):
    """
    Reduce a raw model output to a bare label ("Aumento" or "Diminuição").

    The input is coerced to ``str`` first, so non-string values (e.g. ``None``
    or NaN) are handled without raising.

    Resolution order:
      1. If the whitespace-stripped text is exactly one of the two labels,
         return it.
      2. Otherwise return the first label that appears anywhere after the
         first occurrence of the word 'assistant' (newlines may sit between).
      3. Otherwise return a label that sits at the very start of the text
         (leading whitespace allowed).

    Returns:
        The extracted label, or ``None`` when none of the rules apply.
    """
    valid_labels = ("Aumento", "Diminuição")
    raw = str(text)

    # Rule 1: already clean.
    stripped = raw.strip()
    if stripped in valid_labels:
        return stripped

    # Rule 2: first label following 'assistant'; DOTALL lets '.' span newlines.
    after_assistant = re.search(r'assistant.*?(Aumento|Diminuição)', raw, flags=re.DOTALL)
    if after_assistant is not None:
        return after_assistant.group(1)

    # Rule 3: no label after 'assistant' (or no 'assistant' at all) -> label at start.
    leading = re.match(r'\s*(Aumento|Diminuição)', raw)
    return leading.group(1) if leading else None

def clean_predictions_csv(file_path: str):
    """
    Clean the 'Prediction' column of a predictions CSV and save the result
    next to the input under a name without the "original" marker.

    Each value in 'Prediction' is passed through ``clean_prediction_text``;
    rows for which no valid label ("Aumento" / "Diminuição") could be
    extracted are dropped. The input file itself is left untouched unless its
    name contains no "original" marker, in which case input and output paths
    coincide.

    Example: ``predictions-original.csv`` -> ``predictions.csv``.

    Args:
        file_path: Path to the raw predictions CSV. It must contain a
            'Prediction' column.

    Returns:
        None. The cleaned frame is written to disk (without the index).
    """
    original_path = Path(file_path)

    df = pd.read_csv(original_path)
    df['Prediction'] = df['Prediction'].apply(clean_prediction_text)
    # Filter out rows where no label was found
    # (original author note: this did not happen for exp004).
    df = df[df['Prediction'].isin(['Aumento', 'Diminuição'])]

    # Work on the stem, not the full name: with the extension still attached,
    # strip("-_") cannot remove the separator left in front of ".csv"
    # ("predictions-original.csv" would become "predictions-.csv").
    new_stem = original_path.stem.replace("original", "").replace("--", "-").strip("-_")
    if not new_stem:
        # The name consisted only of the marker; keep the original stem
        # rather than producing an extension-only file name.
        new_stem = original_path.stem
    new_path = original_path.with_name(new_stem + original_path.suffix)

    # Write to the derived path; the previous version computed new_path but
    # then overwrote the raw input file instead.
    df.to_csv(new_path, index=False)

# Clean the raw exp004 model predictions
raw_predictions_csv = "../../results/exp004/predictions-original.csv"
clean_predictions_csv(raw_predictions_csv)

In [None]:
# Load the cleaned predictions used by the evaluation cells below.
predictions_csv = "../../results/exp004/predictions.csv"
df = pd.read_csv(predictions_csv)

In [3]:
from sklearn.metrics import confusion_matrix
# Layout of the printed matrix when "Aumento" is taken as the positive class
# (rows follow the true label, columns the predicted label, both in the
# order given by `labels`):
#   [[TP FN]
#    [FP TN]]
cm = confusion_matrix(
    y_true=df["Direção"],
    y_pred=df["Prediction"],
    labels=["Aumento", "Diminuição"],
)
print(cm)

[[200   0]
 [  0 200]]


In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# Normalise both label columns in place (trim whitespace, capitalise the first
# letter) so that stray formatting differences do not count as errors.
for column in ("Prediction", "Direção"):
    df[column] = df[column].str.strip().str.capitalize()

# Per-class precision / recall / F1, reported to three decimals.
class_labels = ["Aumento", "Diminuição"]
report = classification_report(
    df["Direção"],
    df["Prediction"],
    labels=class_labels,
    target_names=class_labels,
    digits=3,
)

print("\nClassification Report:\n")
print(report)


Classification Report:

              precision    recall  f1-score   support

     Aumento      1.000     1.000     1.000       200
  Diminuição      1.000     1.000     1.000       200

    accuracy                          1.000       400
   macro avg      1.000     1.000     1.000       400
weighted avg      1.000     1.000     1.000       400



## Check for Data Leakage

In [None]:
import pandas as pd

# Load the stage-3 train/test splits.
test_df = pd.read_csv("../../data/processed/bdm-corpus-2/stage-3/test.csv")
train_df = pd.read_csv("../../data/processed/bdm-corpus-2/stage-3/train.csv")

# A row counts as leaked when the same (DataHora, Manchete) pair is present in
# both splits. With the default merge suffixes, columns ending in "_x" come
# from the test split (left) and columns ending in "_y" from the train split (right).
matching_rows = pd.merge(test_df, train_df, on=["DataHora", "Manchete"], how="inner")

# Print number of matches
print(f"Number of overlapping rows: {len(matching_rows)}")

# Show the complete overlapping rows (timestamp, headline and both labels),
# not just the timestamps, so the leaked headlines can be inspected directly.
display(matching_rows)

Number of overlapping rows: 3


Unnamed: 0,DataHora,Manchete,Direção_x,Direção_y
0,2024-11-26 10:02:00,Reação ao IPCA-15,Aumento,Aumento
1,2024-12-06 12:54:00,"Fed/Goolsbee: Inflação cai, mas se estabilizar...",Diminuição,Diminuição
2,2024-11-28 09:02:00,"Com essa fórmula de cálculo, a suposta renúnci...",Diminuição,Diminuição
