In [1]:
from transformers import pipeline
from rouge_score import rouge_scorer

classifier = pipeline(
    "text-classification", 
    model="AbdullahBarayan/ModernBERT-base-doc_en-Cefr",
    device=0,
    top_k=None,
)

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

  from .autonotebook import tqdm as notebook_tqdm
2026-01-26 07:27:08.133884: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Device set to use cuda:0


In [2]:
import json
from tqdm import tqdm
from bert_score import score

cefr_labels = ["A1", "A2", "B1", "B2", "C1", "C2"]
prediction_file = "dataset/predictions/oob/ca_test_data_final_OFFICIAL.jsonl"

cefr_scores = {label: [] for label in cefr_labels}
bert_scores = {label: [] for label in cefr_labels}

dataset = [line.strip() for line in open(prediction_file)]

for i in tqdm(range(len(dataset))):
    
    pred_json = json.loads(dataset[i])

    predictions = pred_json["predictions"]
    texts = [predictions[label] for label in cefr_labels]
    
    cefr_probs = classifier(texts, batch_size=6)

    candidates = []
    references = [pred_json["summary"]] * 6
    for label, single_probs in zip(cefr_labels, cefr_probs):
        single_probs = {p["label"]: p["score"] for p in single_probs}
        cefr_scores[label].append(single_probs[label])
        candidates.append(predictions[label])

    _, _, f1 = score(candidates, references, model_type="microsoft/deberta-xlarge-mnli", lang="en", verbose=False)
    single_scores = f1.data.tolist()

    for label, bscore in zip(cefr_labels, single_scores):
        bert_scores[label].append(bscore)
        

 10%|█         | 10/100 [00:35<05:04,  3.38s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [05:31<00:00,  3.32s/it]


In [3]:
for k, scores in cefr_scores.items():
    cefr = sum(scores) * 100 / len(scores)
    bscores = bert_scores[k]
    bscore = sum(bscores) * 100 / len(bscores)
    print(k, cefr, bscore)


A1 0.5656654936283303 54.79496845602989
A2 2.902866041836816 55.84937039017677
B1 13.985654603304283 57.72564893960953
B2 39.01431962731294 57.5932794213295
C1 76.89376731868833 57.33822947740555
C2 0.13895702163608803 57.35818266868591
