In [1]:
MODEL_PATH = "llama-1b-wikipedia/checkpoint-2500"
TOKENIZER_PATH = "meta-llama/Llama-3.2-1B"
SYSTEM_PROMPT = "You are a corrector of French texts. Correct the text without explaining."
MAX_LENGTH = 1024

In [2]:
import sys
sys.path.append('..')
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from jiwer import cer, wer

from src.evaluate import correct_text

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from datasets import load_dataset

test_dataset = dataset = load_dataset("wikipedia", "20220301.fr", split="train")
test_dataset = test_dataset.shuffle(seed=50)
test_dataset = test_dataset.select(range(100))
test_dataset = test_dataset.filter(lambda example: len(example['text']) <= 2500)

In [4]:
from src.data_utils import introduce_errors

def process_example(example):
    text = example['text']
    return {'input': introduce_errors(text), 'output': text}

test_dataset = test_dataset.map(process_example, batched=False, remove_columns=['title', 'text', 'id', 'url'])

In [5]:
cer_list = []
wer_list = []
for example in test_dataset:
    input_text = example['input']
    output_text = example['output']
    cer_list.append(cer(output_text, input_text))
    wer_list.append(wer(output_text, input_text))

print("Introduced CER:", sum(cer_list) / len(cer_list))
print("Introduced WER:", sum(wer_list) / len(wer_list))
print("Cer std dev:", torch.tensor(cer_list).std().item())
print("Wer std dev:", torch.tensor(wer_list).std().item())

Introduced CER: 0.02397686974832509
Introduced WER: 0.13989397660558023
Cer std dev: 0.023085787892341614
Wer std dev: 0.10908369719982147


In [6]:
model_path = "../models/" + MODEL_PATH
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)
model = AutoModelForCausalLM.from_pretrained(model_path)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [7]:
from tqdm import tqdm

cer_scores = []
wer_scores = []

for example in tqdm(test_dataset, desc="Evaluating"):
    input_text = example["input"]
    target_text = example["output"]
    corrected_text = correct_text(input_text, model, tokenizer, max_length=MAX_LENGTH, system_prompt=SYSTEM_PROMPT)
    cer_scores.append(cer(target_text, corrected_text))
    wer_scores.append(wer(target_text, corrected_text))

print("CER:", sum(cer_scores) / len(cer_scores))
print("WER:", sum(wer_scores) / len(wer_scores))


Evaluating: 100%|██████████| 73/73 [06:00<00:00,  4.94s/it]

CER: 0.032882601001905126
WER: 0.062977366347162





In [8]:
print("Cer std dev:", torch.tensor(cer_scores).std().item())
print("Wer std dev:", torch.tensor(wer_scores).std().item())

Cer std dev: 0.08759099990129471
Wer std dev: 0.09981656074523926
