# Legal document classification in zero-shot cross lingual transfer setting

# Part II: Results reproduction

Date: May 2025

Project of course: Natural Language Processing - ENSAE 3A S2

Author: Noémie Guibé

In [1]:
# imports
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import pandas as pd 
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import f1_score
import os

2025-05-03 01:49:03.325097: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-03 01:49:03.326106: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 01:49:03.333318: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 01:49:03.430722: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746236943.538909  116523 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746236943.61

In [2]:
# import data base
df = pd.read_parquet('https://minio.lab.sspcloud.fr/nguibe/NLP/multi_eurlex_reduced.parquet', engine='pyarrow')

# 1 - First result reproduction: Performance drop from English-only fine-tuning

In [12]:
# Define R-Precision computation
def r_precision(y_true, y_pred, top_k=10):
    """
    R-Precision: Precision at top-k (where k is the number of relevant labels).
    """
    precision_list = []
    for i in range(len(y_true)):
        true_labels = y_true[i]
        predicted_scores = y_pred[i]
        
        # Get the indices of top-k predicted labels based on predicted scores
        top_k_indices = predicted_scores.argsort()[-top_k:][::-1]
        
        # Calculate the number of relevant labels in the top-k predictions
        relevant_in_top_k = sum([1 for idx in top_k_indices if true_labels[idx] == 1])
        precision = relevant_in_top_k / top_k
        precision_list.append(precision)
    
    return np.mean(precision_list)

# Add additional metrics (Micro, Macro F1)
def evaluate_model(model, test_dataset, batch_size=batch_size):
    start_time = time.time()
    
    # Evaluate the model
    y_true = []
    y_pred = []
    
    for batch in test_dataset.batch(batch_size):
        #input_ids = batch['input_ids']
        #attention_mask = batch['attention_mask']
        #(input_ids, attention_mask), labels = batch
        #labels = batch['labels']
        inputs, labels = batch
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        
        # Get model predictions
        logits = model(input_ids, attention_mask=attention_mask)[0]  # Directly access the first element
        predictions = tf.sigmoid(logits).numpy()  # Apply sigmoid to get probabilities
        #logits = model(input_ids, attention_mask=attention_mask)[0]
        #predictions = tf.sigmoid(logits.logits).numpy()
        
        y_true.extend(labels.numpy())
        y_pred.extend(predictions)
    
    # Convert to numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    # Calculate R-Precision
    r_precision_score = r_precision(y_true, y_pred)
    
    # Calculate Micro and Macro F1 Scores
    micro_f1 = f1_score(y_true, (y_pred > 0.5), average='micro',zero_division=0)
    macro_f1 = f1_score(y_true, (y_pred > 0.5), average='macro',zero_division=0)
    
    # Calculate Label Ranking Average Precision (LRAP)
    lrap_score = label_ranking_average_precision_score(y_true, y_pred)
    
    # Log the results
    print(f"R-Precision: {r_precision_score:.4f}")
    print(f"Micro F1: {micro_f1:.4f}")
    print(f"Macro F1: {macro_f1:.4f}")
    print(f"LRAP: {lrap_score:.4f}")
    
    # Calculate evaluation time
    evaluation_time = time.time() - start_time
    print(f"Evaluation time: {evaluation_time:.2f} seconds")
    
    return r_precision_score, micro_f1, macro_f1, lrap_score, evaluation_time

# Function to track training time and memory usage
def track_training_time_and_memory(model, train_dataset, batch_size=batch_size, epochs=epochs):
    # Track training time
    start_time = time.time()
    
    # Use psutil to track memory usage
    process = psutil.Process(os.getpid())
    
    # Get initial memory usage
    initial_memory = process.memory_info().rss / 1024 ** 2  # in MB
    
    # Train the model
    model.fit(train_dataset.batch(batch_size), epochs=epochs)
    
    # Get final memory usage
    final_memory = process.memory_info().rss / 1024 ** 2  # in MB
    
    # Track training time
    training_time = time.time() - start_time
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Initial memory usage: {initial_memory:.2f} MB")
    print(f"Final memory usage: {final_memory:.2f} MB")
    print(f"Memory used during training: {final_memory - initial_memory:.2f} MB")
    
    return training_time, initial_memory, final_memory


In [13]:
# model_runner/train_and_evaluate.py

import os
import time
import psutil
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, label_ranking_average_precision_score
from datasets import Dataset

def run_training_pipeline(data, train_sample_size=1000, test_sample_size=5000, batch_size=8, epochs=2):
    df = data.copy()

    # Preprocess training and test sets
    df['level_1_labels'] = df['eurovoc_concepts'].apply(lambda d: d.get('level_1', []))
    train_df = df[df['split'] == 'train']
    train_df['text'] = train_df['text'].apply(lambda x: x.get("en") if isinstance(x, dict) else "")
    test_df = df[df['split'] == 'test']

    test_langs = ["en", "fr", "de", "pl", "fi"]
    test_dfs = []
    for lang in test_langs:
        df_lang = test_df[test_df['text'].apply(lambda x: isinstance(x, dict) and lang in x)].copy()
        df_lang['text'] = df_lang['text'].apply(lambda x: x[lang])
        df_lang['lang'] = lang
        test_dfs.append(df_lang)

    final_test_df = pd.concat(test_dfs, ignore_index=True)

    train_df = train_df.sample(train_sample_size, random_state=42)
    final_test_df = final_test_df.sample(test_sample_size, random_state=42)

    mlb = MultiLabelBinarizer()
    mlb.fit(df["level_1_labels"])
    train_df["label_vector"] = list(mlb.transform(train_df["level_1_labels"]))
    final_test_df["label_vector"] = list(mlb.transform(final_test_df["level_1_labels"]))

    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
    def tokenize(batch): 
        encodings = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)
        encodings['labels'] = batch['label_vector']
        return encodings

    train_dataset = Dataset.from_pandas(train_df[["text", "label_vector"]]).map(tokenize, batched=True)
    test_datasets = {
        lang: Dataset.from_pandas(df[["text", "label_vector"]]).map(tokenize, batched=True)
        for lang, df in final_test_df.groupby("lang")
    }

    def dataset_to_tf(dataset):
        def gen():
            for ex in dataset:
                yield {"input_ids": ex["input_ids"], "attention_mask": ex["attention_mask"]}, ex["labels"]
        return tf.data.Dataset.from_generator(
            gen,
            output_signature=(
                {
                    "input_ids": tf.TensorSpec(shape=(512,), dtype=tf.int64),
                    "attention_mask": tf.TensorSpec(shape=(512,), dtype=tf.int64),
                },
                tf.TensorSpec(shape=(len(mlb.classes_),), dtype=tf.float32)
            )
        )

    train_tf = dataset_to_tf(train_dataset)
    test_tf = {lang: dataset_to_tf(ds) for lang, ds in test_datasets.items()}

    num_labels = len(mlb.classes_)
    model = TFAutoModelForSequenceClassification.from_pretrained(
        'xlm-roberta-base',
        num_labels=num_labels,
        problem_type='multi_label_classification'
    )
    model.compile(optimizer='adam', loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), metrics=['accuracy'])

    # Train and evaluate with reusable functions
    training_time, initial_memory, final_memory = track_training_time_and_memory(
        model, train_tf, batch_size=batch_size, epochs=epochs
    )

    # Evaluate on all test languages using existing evaluate_model()
    results = {}
    for lang, lang_dataset in test_tf.items():
        print(f"[INFO] Evaluating on language: {lang}")
        r_prec, micro_f1, macro_f1, lrap, eval_time = evaluate_model(
        model, lang_dataset, batch_size=batch_size
        )
        results[lang] = {
        "R-Precision": r_prec,
        "Micro F1": micro_f1,
        "Macro F1": macro_f1,
        "LRAP": lrap,
        "Eval Time (s)": eval_time
    }

    # Optionally return training stats
    results["training"] = {
    "Training Time (s)": training_time,
    "Initial Memory (MB)": initial_memory,
    "Final Memory (MB)": final_memory,
    "Memory Used (MB)": final_memory - initial_memory
    }
    return results


In [None]:
# Choose your parameters
train_size = 10
test_size = 5
batch_size = 32
epochs = 1

# Run training and evaluation
results = run_training_pipeline(data=df,train_sample_size=train_size,
                                test_sample_size=test_size,
                                batch_size=batch_size,
                                epochs=epochs)

# Display results
import pprint
pprint.pprint(results)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df['text'].apply(lambda x: x.get("en") if isinstance(x, dict) else "")
Map: 100%|██████████| 10/10 [00:00<00:00, 49.62 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 10.76 examples/s]
Map: 100%|██████████| 2/2 [00:00<00:00, 21.12 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 127.60 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00,  7.45 examples/s]
All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_