In [None]:
# Mount Google Drive at /content/drive; the rest of the notebook reads and
# writes its data under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Configura√ß√£o inicial para o Google Colab
!pip install -q torch accelerate transformers datasets evaluate scikit-learn
import torch
from accelerate import Accelerator

# Verificar se h√° GPU dispon√≠vel e configurar para uso h√≠brido (GPU+CPU)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
accelerator = Accelerator(device_placement=True, mixed_precision="fp16")
print(f"‚úÖ Dispositivo ativo: {device}")


In [None]:
! pip install unbabel-comet
! pip install evaluate tensorflow
! pip install bert-score
! pip install evaluate
! pip install git+https://github.com/google-research/bleurt.git

from bert_score import score
from comet import download_model, load_from_checkpoint
import evaluate
import json
import torch
import gc


In [None]:
def _salvar_json_atomico(caminho, conteudo):
    """Dump ``conteudo`` as JSON to ``caminho`` via a temporary sibling file.

    The data is first written to ``<caminho>.tmp`` and only then moved over the
    destination with ``os.replace``, so the existing checkpoint is replaced only
    after the dump has finished. An interruption during the dump therefore
    leaves the previous checkpoint intact.
    """
    import os  # local import: the notebook's import cell does not import `os`

    caminho_tmp = f"{caminho}.tmp"
    with open(caminho_tmp, 'w', encoding='utf-8') as file:
        json.dump(conteudo, file, ensure_ascii=False, indent=4)
    os.replace(caminho_tmp, caminho)


def calcular_metricas(pasta, lote_salvamento=10):
    """Score every translated item of one dataset split and checkpoint the results.

    Reads ``{pasta}_data_junto.json`` from the split folder on Drive and, for the
    ``text`` and ``description`` fields of each item, compares
    ``item['traduzido'][field]`` against ``item['original'][field]`` with
    BERTScore, BLEURT, COMET-Kiwi and chrF. Each item gets a ``'metricas'`` entry
    and the list of processed items is written to ``metricas.json`` in the same
    folder. When ``metricas.json`` already exists, its items are kept and
    processing resumes right after them.

    Args:
        pasta: Name of the split folder under ``/content/drive/MyDrive/DIMEMEX``;
            also used as the prefix of the input file name.
        lote_salvamento: Number of processed items between two checkpoint writes.
            The last item always triggers a write.

    Returns:
        None. Results are persisted to ``metricas.json``.
    """
    import os  # local import: `os` is needed below but is not imported at file level

    torch.cuda.empty_cache()
    gc.collect()

    path = f"/content/drive/MyDrive/DIMEMEX/{pasta}"
    arquivo_saida = f"{path}/metricas.json"

    # === Read the input data ===
    with open(f'{path}/{pasta}_data_junto.json', 'r', encoding='utf-8') as file:
        dados = json.load(file)

    # === If a partial output file exists, continue from where it stopped ===
    vetor = []
    if os.path.exists(arquivo_saida):
        with open(arquivo_saida, 'r', encoding='utf-8') as f:
            try:
                vetor = json.load(f)
            except json.JSONDecodeError:
                vetor = None
        if isinstance(vetor, list):
            print(f"üìÇ Retomando: {len(vetor)} itens j√° processados.")
        else:
            # Unparseable file, or valid JSON that is not the expected list.
            print("‚ö†Ô∏è Arquivo de sa√≠da corrompido ‚Äî come√ßando do zero.")
            vetor = []

    inicio = len(vetor)
    total = len(dados)

    # Nothing left to process: skip the expensive model loading entirely.
    if inicio >= total:
        print(f"üìò Resultado salvo em: {arquivo_saida}")
        return

    # === Load the models once ===
    print("üîÑ Carregando modelos...")
    bleurt = evaluate.load("bleurt", "bleurt-large-512")
    chrf = evaluate.load("chrf")
    bertscore = evaluate.load("bertscore")
    model_path = download_model("Unbabel/wmt22-cometkiwi-da")
    model = load_from_checkpoint(model_path)
    print("‚úÖ Modelos carregados!\n")

    # COMET is asked for a GPU only when one exists, so a CPU runtime still works.
    gpus_comet = 1 if torch.cuda.is_available() else 0

    # === Main loop ===
    for i, objeto in enumerate(dados[inicio:], inicio + 1):
        result = objeto
        result['metricas'] = {'text': {}, 'description': {}}

        for termo in ["text", "description"]:
            original = result['original'][termo]
            traduzido = result['traduzido'][termo]

            # BERTScore
            result_bertscore = bertscore.compute(
                predictions=[traduzido],
                references=[original],
                model_type="xlm-roberta-large",
                lang="pt",
            )

            # BLEURT
            result_bleurt = bleurt.compute(predictions=[traduzido], references=[original])

            # COMET (called with source and translation only, no reference)
            data = [{"src": original, "mt": traduzido}]
            result_comet = model.predict(data, batch_size=8, gpus=gpus_comet)

            # CHRF
            result_chrf = chrf.compute(predictions=[traduzido], references=[original])

            # Cast to plain floats so json.dump never receives a non-native scalar.
            metricas = {
                'bertscore': float(result_bertscore["f1"][0]),
                'bleurt': float(result_bleurt['scores'][0]),
                'comet': float(result_comet['scores'][0]),
                'chrf': float(result_chrf['score'])
            }

            result['metricas'][termo] = metricas

        vetor.append(result)
        print(f"‚úÖ {i}/{total} processado ‚Äî {result['metricas']}")

        # === Save in batches (and always after the last item) ===
        if i % lote_salvamento == 0 or i == total:
            _salvar_json_atomico(arquivo_saida, vetor)
            print(f"üíæ Progresso salvo ({i}/{total})")

    print(f"üìò Resultado salvo em: {arquivo_saida}")


In [None]:
# Compute the metrics for each dataset split, one after another, in this order.
for pasta in ("test", "validation", "train"):
    calcular_metricas(pasta)