# Legal document classification in zero-shot cross lingual transfer setting

# Part II: Results reproduction

Date: May 2025

Project of course: Natural Language Processing - ENSAE 3A S2

Author: Noémie Guibé

In [3]:
# imports
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import pandas as pd 
from datasets import Dataset
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
from sklearn.metrics import f1_score
import os

2025-04-28 06:30:59.300984: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-28 06:30:59.400575: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745821859.511197  128579 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745821859.520609  128579 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745821859.599597  128579 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
os.chdir('NLP-Legal-document-classification')

FileNotFoundError: [Errno 2] No such file or directory: 'NLP-Legal-document-classification'

In [5]:
# import data base
df = pd.read_parquet('data/dataset/multi_eurlex_reduced.parquet', engine='pyarrow')

# Get the data ready

In [None]:
# keep only level 3 labels
df['level_3_labels'] = df['eurovoc_concepts'].apply(lambda d: d['level_3'] if 'level_3' in d else [])
df.head()

In [9]:
train_df = df[df['split']=='train']
# English-only training set
train_df.loc[:,'text'] = train_df["text"].apply(lambda x: isinstance(x, dict) and x.get("en"))
print(train_df.head())

     celex_id publication_date  \
0  32006D0213       2006-03-06   
1  32003R1330       2003-07-25   
2  32003R1786       2003-09-29   
3  31985R2590       1985-09-13   
4  31993R1103       1993-04-30   

                                                text  \
0  COMMISSION DECISION\nof 6 March 2006\nestablis...   
1  Commission Regulation (EC) No 1330/2003\nof 25...   
2  Council Regulation (EC) No 1786/2003\nof 29 Se...   
3  *****\nCOMMISSION REGULATION (EEC) No 2590/85\...   
4  COMMISSION REGULATION (EEC) No 1103/93 of 30 A...   

                                    eurovoc_concepts  split  \
0  {'all_levels': ['1706', '1826', '2754', '3690'...  train   
1  {'all_levels': ['1117', '1118', '1605', '2635'...  train   
2  {'all_levels': ['2173', '4854', '614', '797'],...  train   
3  {'all_levels': ['1201', '1261', '5334', '755',...  train   
4  {'all_levels': ['1309', '2159', '2192', '235',...  train   

                                         doc_lengths  max_doc_length  \
0  {'de

In [11]:
# test 
test_df = df[df['split']=='test']

# Test set in multiple languages
test_langs = ["fr", "de", "pl",'fi'] 
test_dfs = []

for lang in test_langs:
    # Filter rows where the language exists in the text dictionary
    df_lang = test_df[test_df["text"].apply(lambda x: isinstance(x, dict) and lang in x)]
    
    # Now extract the respective language text, and add the 'lang' column
    df_lang.loc[:,"text"] = df_lang["text"].apply(lambda x: x[lang])  # Extract the language text
    df_lang["lang"] = lang  # Add a new column for language
    
    # Append to test_dfs
    test_dfs.append(df_lang)

# Combine the list of DataFrames into one (exploded test set)
final_test_df = pd.concat(test_dfs, ignore_index=True)
print(final_test_df.head())

     celex_id publication_date  \
0  32013R1390       2013-12-16   
1  32015R0176       2015-02-05   
2  32015R0596       2015-04-15   
3  32015D0041       2014-12-17   
4  32013D0785       2013-12-16   

                                                text  \
0  RÈGLEMENT (UE) No 1390/2013 DU CONSEIL\ndu 16 ...   
1  RÈGLEMENT D'EXÉCUTION (UE) 2015/176 DE LA COMM...   
2  RÈGLEMENT D'EXÉCUTION (UE) 2015/596 DE LA COMM...   
3  DÉCISION (UE) 2015/41 DU PARLEMENT EUROPÉEN ET...   
4  DÉCISION DU CONSEIL\ndu 16 décembre 2013\nrela...   

                                    eurovoc_concepts split  \
0  {'all_levels': ['1085', '1474', '2329', '2556'...  test   
1  {'all_levels': ['2749', '3173', '5573', '5898'...  test   
2  {'all_levels': ['1239', '1318', '1878', '2972'...  test   
3  {'all_levels': ['1361', '1647', '2543', '2910'...  test   
4  {'all_levels': ['1474', '1819', '2329', '2556'...  test   

                                         doc_lengths  max_doc_length  \
0  {'de': 372

In [18]:
# Fit on all labels
mlb = MultiLabelBinarizer()
mlb.fit(df["level_3_labels"])

# Now safely transform
train_df["label_vector"] = [row.tolist() for row in mlb.transform(train_df["level_3_labels"])]
final_test_df["label_vector"] = [row.tolist() for row in mlb.transform(final_test_df["level_3_labels"])]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["label_vector"] = [row.tolist() for row in mlb.transform(train_df["level_3_labels"])]


In [None]:
len(mlb.classes_)

512

In [None]:
row_index = 5  # Change to the row you want to inspect
label_vector = train_df["label_vector"].iloc[row_index]

active_labels = [
    (i, mlb.classes_[i]) for i, val in enumerate(label_vector) if val == 1
]

print(f"Active labels for row {row_index}:")
for idx, label in active_labels:
    print(f"Index: {idx}, Label: {label}")


Active labels for row 5:
Index: 90, Label: 1810
Index: 173, Label: 2487
Index: 270, Label: 3489
Index: 338, Label: 4314
Index: 450, Label: 614


In [23]:
# check same index for same label in test dataset
label_id = "1810"
label_index = list(mlb.classes_).index(label_id)

# Get row indices in the test set where label 1810 is present
matching_indices = [
    i for i, row in enumerate(final_test_df["label_vector"]) if row[label_index] == 1
]

print(f"Rows in test set with label {label_id}: {matching_indices}")

Rows in test set with label 1810: [32, 181, 381, 480, 619, 708, 881, 940, 964, 1338, 1412, 1807, 2484, 2921, 3125, 3203, 3603, 3620, 3860, 4167, 4491, 4911, 4949, 5028, 5177, 5377, 5476, 5615, 5704, 5877, 5936, 5960, 6334, 6408, 6803, 7480, 7917, 8121, 8199, 8599, 8616, 8856, 9163, 9487, 9907, 9945, 10024, 10173, 10373, 10472, 10611, 10700, 10873, 10932, 10956, 11330, 11404, 11799, 12476, 12913, 13117, 13195, 13595, 13612, 13852, 14159, 14483, 14903, 14941, 15020, 15169, 15369, 15468, 15607, 15696, 15869, 15928, 15952, 16326, 16400, 16795, 17472, 17909, 18113, 18191, 18591, 18608, 18848, 19155, 19479, 19899, 19937]


In [24]:
row_index = 32  # Change to the row you want to inspect
label_vector = final_test_df["label_vector"].iloc[row_index]

active_labels = [
    (i, mlb.classes_[i]) for i, val in enumerate(label_vector) if val == 1
]

print(f"Active labels for row {row_index}:")
for idx, label in active_labels:
    print(f"Index: {idx}, Label: {label}")


Active labels for row 32:
Index: 90, Label: 1810
Index: 128, Label: 2188
Index: 265, Label: 3461


In [25]:
train_dataset = Dataset.from_pandas(train_df[["text", "label_vector"]])
test_datasets = {
    lang: Dataset.from_pandas(df[["text", "label_vector"]]) 
    for lang, df in final_test_df.groupby("lang")
}

In [None]:
# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')

def tokenize_and_format_tf(batch):
    encodings = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)
    encodings['labels'] = batch['label_vector']
    return encodings

# Tokenize the datasets (train and test)
train_dataset = train_dataset.map(tokenize_and_format_tf, batched=True)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(tokenize_and_format_tf, batched=True)

Map:  71%|███████   | 39000/54994 [02:54<01:06, 241.52 examples/s]

In [None]:
# Convert datasets to TensorFlow Dataset
def dataset_to_tf(dataset):
    def gen():
        for example in dataset:
            yield {
                "input_ids": example['input_ids'],
                "attention_mask": example['attention_mask']
            }, example['labels']

    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            {
                "input_ids": tf.TensorSpec(shape=(512,), dtype=tf.int64),
                "attention_mask": tf.TensorSpec(shape=(512,), dtype=tf.int64)
            },
            tf.TensorSpec(shape=(len(mlb.classes_),), dtype=tf.float32)
        )
    )

# Convert both train and test datasets to TensorFlow Dataset
train_tf_dataset = dataset_to_tf(train_dataset)
test_tf_datasets = {lang: dataset_to_tf(test_datasets[lang]) for lang in test_datasets}


In [None]:
# Model for multi-label classification
# Get the number of labels
num_labels = len(mlb.classes_)
model = TFAutoModelForSequenceClassification.from_pretrained(
    'xlm-roberta-base', num_labels=num_labels, problem_type='multi_label_classification'
)

# Compile the model with appropriate loss and optimizer
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
# Train the model
model.fit(train_tf_dataset.batch(8), epochs=5)

In [None]:
# Evaluate the model for each language
for lang, tf_dataset in test_tf_datasets.items():
    results = model.evaluate(tf_dataset.batch(8))
    print(f"Language: {lang}")
    print("Evaluation results:", results)

In [24]:
training_args = TrainingArguments(
    output_dir="./xlm-roberta-eurovoc",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    logging_dir="./logs",                    # Log directory
    report_to="tensorboard"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_datasets["fr"],  # Or "de", "es" — you can loop through them too
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


RuntimeError: result type Float can't be cast to the desired output type Long