# Legal document classification in zero-shot cross lingual transfer setting

# Part II: Results reproduction

Date: May 2025

Project of course: Natural Language Processing - ENSAE 3A S2

Author: Noémie Guibé

In [1]:
# imports
import pandas as pd
import json

In [None]:
# import data base
df = pd.read_parquet('data/dataset/multi_eurlex_reduced.parquet', engine='pyarrow')

In [4]:
langs_to_keep = ['en', 'de', 'fr', 'pl', 'fi'] 

In [3]:
df_reduced = df.copy()

In [6]:
# Calculate the length of the document for each language
def compute_lengths(text_dict):
    lengths = {lang: len(text_dict[lang]) for lang in langs_to_keep if text_dict.get(lang) is not None}
    return lengths
# Apply the function to the 'text' column and store the result in a new column 'doc_lengths'
df_reduced['doc_lengths'] = df_reduced['text'].apply(compute_lengths)

In [8]:
df_reduced['max_doc_length'] = df_reduced['doc_lengths'].apply(lambda d: max(d.values(), default=0))

In [7]:
df_reduced = df_reduced[df_reduced['max_doc_length']<500000]

In [11]:
len(df_reduced[df_reduced['max_doc_length']<500000]), len(df_reduced)

(64990, 64990)

In [9]:
df_reduced.to_parquet('data/dataset/multi_eurlex_reduced.parquet')

# Get the data ready

In [4]:
# keep only level 3 labels
df_reduced['level_3_labels'] = df_reduced['eurovoc_concepts'].apply(lambda d: d['level_3'] if 'level_3' in d else [])

In [13]:
df_reduced.head()

Unnamed: 0,celex_id,publication_date,text,eurovoc_concepts,split,doc_lengths,max_doc_length,level_3_labels
0,32006D0213,2006-03-06,{'de': 'ENTSCHEIDUNG DER KOMMISSION vom 6. Mär...,"{'all_levels': ['1706', '1826', '2754', '3690'...",train,"{'en': 3233, 'de': 3302, 'fr': 3642, 'pl': 332...",3642,"[1386, 2825, 138, 2475, 3879, 3641]"
1,32003R1330,2003-07-25,{'de': 'Verordnung (EG) Nr. 1330/2003 der Komm...,"{'all_levels': ['1117', '1118', '1605', '2635'...",train,"{'en': 1328, 'de': 1430, 'fr': 1437, 'fi': 1366}",1437,"[1115, 2656, 1602]"
2,32003R1786,2003-09-29,{'de': 'Verordnung (EG) Nr. 1786/2003 des Rate...,"{'all_levels': ['2173', '4854', '614', '797'],...",train,"{'en': 17741, 'de': 19641, 'fr': 19133, 'pl': ...",19641,"[614, 712, 1277, 2443]"
3,31985R2590,1985-09-13,{'de': '***** VERORDNUNG (EWG) Nr. 2590/85 DER...,"{'all_levels': ['1201', '1261', '5334', '755',...",train,"{'en': 2525, 'de': 2720, 'fr': 2684, 'fi': 2527}",2720,"[2413, 712, 2477, 4488, 2443]"
4,31993R1103,1993-04-30,{'de': 'VERORDNUNG (EWG) Nr. 1103/93 DER KOMMI...,"{'all_levels': ['1309', '2159', '2192', '235',...",train,"{'en': 27992, 'de': 29436, 'fr': 32297}",32297,"[539, 956, 1847, 2106, 614, 2858, 6205, 1845, ..."


In [5]:
train_df = df_reduced[df_reduced['split']=='train']
# English-only training set
train_df.loc[:,'text'] = train_df["text"].apply(lambda x: isinstance(x, dict) and x.get("en"))

In [6]:
# test 
test_df = df_reduced[df_reduced['split']=='test']

# Test set in multiple languages
test_langs = ["fr", "de", "pl",'fi']  # whatever languages you want
test_dfs = []

for lang in test_langs:
    # Filter rows where the language exists in the text dictionary
    df_lang = test_df[test_df["text"].apply(lambda x: isinstance(x, dict) and lang in x)]
    
    # Now extract the respective language text, and add the 'lang' column
    df_lang.loc[:,"text"] = df_lang["text"].apply(lambda x: x[lang])  # Extract the language text
    df_lang["lang"] = lang  # Add a new column for language
    
    # Append to test_dfs
    test_dfs.append(df_lang)

# Combine the list of DataFrames into one (exploded test set)
final_test_df = pd.concat(test_dfs, ignore_index=True)

In [82]:
train_df.head()

Unnamed: 0,celex_id,publication_date,text,eurovoc_concepts,split,doc_lengths,max_doc_length,level_3_labels
0,32006D0213,2006-03-06,COMMISSION DECISION\nof 6 March 2006\nestablis...,"{'all_levels': ['1706', '1826', '2754', '3690'...",train,"{'en': 3233, 'de': 3302, 'fr': 3642, 'pl': 332...",3642,"[1386, 2825, 138, 2475, 3879, 3641]"
1,32003R1330,2003-07-25,Commission Regulation (EC) No 1330/2003\nof 25...,"{'all_levels': ['1117', '1118', '1605', '2635'...",train,"{'en': 1328, 'de': 1430, 'fr': 1437, 'fi': 1366}",1437,"[1115, 2656, 1602]"
2,32003R1786,2003-09-29,Council Regulation (EC) No 1786/2003\nof 29 Se...,"{'all_levels': ['2173', '4854', '614', '797'],...",train,"{'en': 17741, 'de': 19641, 'fr': 19133, 'pl': ...",19641,"[614, 712, 1277, 2443]"
3,31985R2590,1985-09-13,*****\nCOMMISSION REGULATION (EEC) No 2590/85\...,"{'all_levels': ['1201', '1261', '5334', '755',...",train,"{'en': 2525, 'de': 2720, 'fr': 2684, 'fi': 2527}",2720,"[2413, 712, 2477, 4488, 2443]"
4,31993R1103,1993-04-30,COMMISSION REGULATION (EEC) No 1103/93 of 30 A...,"{'all_levels': ['1309', '2159', '2192', '235',...",train,"{'en': 27992, 'de': 29436, 'fr': 32297}",32297,"[539, 956, 1847, 2106, 614, 2858, 6205, 1845, ..."


In [7]:
train_label_matrix = mlb.transform(train_df["level_3_labels"])

# Check shape consistency
assert train_label_matrix.shape[0] == train_df.shape[0], "Mismatch in rows!"

# Now assign safely
train_df = train_df.copy()  # Avoid SettingWithCopyWarning
train_df["label_vector"] = [row.tolist() for row in train_label_matrix]

NameError: name 'mlb' is not defined

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
label_matrix = mlb.fit_transform(train_df["level_3_labels"])
train_df["label_vector"] = [row.tolist() for row in label_matrix]

# Apply same transformation to test sets
final_test_df["label_vector"] = [row.tolist() for row in mlb.transform(final_test_df["level_3_labels"])]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df["label_vector"] = [row.tolist() for row in label_matrix]


In [88]:
train_df["label_vector"].iloc[0]

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [9]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[["text", "label_vector"]])
test_datasets = {
    lang: Dataset.from_pandas(df[["text", "label_vector"]]) 
    for lang, df in final_test_df.groupby("lang")
}

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Tokenization function (ensure it handles both text and labels correctly)
def tokenize(batch):
    if isinstance(batch["text"], list):
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
    else:
        texts = [str(item) for item in batch["text"]]  # Ensure the text is in the correct format
        return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(tokenize, batched=True)

# Prepare labels (float32 for multi-label classification)
def prepare_dataset(example):
    example["labels"] = example["label_vector"]  # Your label_vector is the multi-label encoding
    return example

train_dataset = train_dataset.map(prepare_dataset)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(prepare_dataset)

# Set format for input_ids and attention_mask (int64) for both train and test datasets
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], dtype=torch.int64)
# Set format for labels (float32) for multi-label classification
train_dataset.set_format(type="torch", columns=["labels"], dtype=torch.float32)

# Do the same for test datasets
for lang in test_datasets:
    test_datasets[lang].set_format(type="torch", columns=["input_ids", "attention_mask"], dtype=torch.int64)
    test_datasets[lang].set_format(type="torch", columns=["labels"], dtype=torch.float32)

print(train_dataset[0])  # Check the first example after format adjustment


Map: 100%|██████████| 54994/54994 [00:42<00:00, 1279.24 examples/s]
Map: 100%|██████████| 4996/4996 [00:09<00:00, 537.85 examples/s]
Map: 100%|██████████| 4996/4996 [00:08<00:00, 609.07 examples/s]
Map: 100%|██████████| 4996/4996 [00:09<00:00, 518.93 examples/s]
Map: 100%|██████████| 4996/4996 [00:09<00:00, 525.25 examples/s]
Map: 100%|██████████| 54994/54994 [00:19<00:00, 2775.05 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 3530.64 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 2505.32 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 2523.33 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 2500.61 examples/s]


NameError: name 'torch' is not defined

In [13]:
import torch

In [14]:
# Set format for input_ids and attention_mask (int64) for both train and test datasets
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"], dtype=torch.int64)
# Set format for labels (float32) for multi-label classification
train_dataset.set_format(type="torch", columns=["labels"], dtype=torch.float32)

# Do the same for test datasets
for lang in test_datasets:
    test_datasets[lang].set_format(type="torch", columns=["input_ids", "attention_mask"], dtype=torch.int64)
    test_datasets[lang].set_format(type="torch", columns=["labels"], dtype=torch.float32)

print(train_dataset[0])  # Check the first example after format adjustment


{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0

In [21]:
num_labels = len(mlb.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    problem_type="multi_label_classification",  # Ensure this is correct for multi-label task
    num_labels=num_labels,  # The number of labels from the MultiLabelBinarizer
    id2label={i: label for i, label in enumerate(mlb.classes_)},
    label2id={label: i for i, label in enumerate(mlb.classes_)}
)


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
training_args = TrainingArguments(
    output_dir="./xlm-roberta-eurovoc",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    logging_dir="./logs",                    # Log directory
    report_to="tensorboard"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_datasets["fr"],  # Or "de", "es" — you can loop through them too
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


RuntimeError: result type Float can't be cast to the desired output type Long

In [23]:
def tokenize_and_format(batch):
    # Tokenize the texts (make sure to include padding and truncation)
    encodings = tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)
    
    # Add labels to the encoded data
    encodings['labels'] = batch['label_vector']
    return encodings

# Tokenize and add labels to the training and test datasets
train_dataset = train_dataset.map(tokenize_and_format, batched=True)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(tokenize_and_format, batched=True)
    
# Check if all the required columns are present in train and test datasets
print(train_dataset[0])  # Should show input_ids, attention_mask, and labels

# Now set the format correctly:
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"], dtype=torch.int64)
for lang in test_datasets:
    test_datasets[lang].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"], dtype=torch.int64)


Map: 100%|██████████| 54994/54994 [01:02<00:00, 873.04 examples/s]
Map: 100%|██████████| 4996/4996 [00:12<00:00, 397.19 examples/s]
Map: 100%|██████████| 4996/4996 [00:11<00:00, 424.15 examples/s]
Map: 100%|██████████| 4996/4996 [00:12<00:00, 394.05 examples/s]
Map: 100%|██████████| 4996/4996 [00:13<00:00, 369.92 examples/s]

{'labels': tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0




In [90]:
print(train_dataset)

Dataset({
    features: ['text', 'label_vector', '__index_level_0__'],
    num_rows: 54994
})


In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

def tokenize(batch):
    # Make sure batch["text"] is a list of strings, not a list of dictionaries
    if isinstance(batch["text"], list):
        # If already a list of strings, continue
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
    else:
        # If it's not, extract the correct string from each entry (e.g., handling dicts)
        texts = [str(item) for item in batch["text"]]  # Convert each item to string (adjust if it's a dictionary)
        return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

In [41]:
# Save the tokenizer to a specified directory
tokenizer.save_pretrained("model/tokenizer")

('model/tokenizer/tokenizer_config.json',
 'model/tokenizer/special_tokens_map.json',
 'model/tokenizer/tokenizer.json')

In [15]:
from transformers import AutoTokenizer

# Load the tokenizer from the local directory
tokenizer = AutoTokenizer.from_pretrained("model/tokenizer/")


ValueError: Unrecognized model in model/tokenizer/. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, aria, aria_text, audio-spectrogram-transformer, autoformer, aya_vision, bamba, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, cohere2, colpali, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dab-detr, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deepseek_v3, deformable_detr, deit, depth_anything, depth_pro, deta, detr, diffllama, dinat, dinov2, dinov2_with_registers, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, emu3, encodec, encoder-decoder, ernie, ernie_m, esm, falcon, falcon_mamba, fastspeech2_conformer, flaubert, flava, fnet, focalnet, fsmt, funnel, fuyu, gemma, gemma2, gemma3, gemma3_text, git, glm, glm4, glpn, got_ocr2, gpt-sw3, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gpt_neox_japanese, gptj, gptsan-japanese, granite, granitemoe, granitemoeshared, granitevision, graphormer, grounding-dino, groupvit, helium, hiera, hubert, ibert, idefics, idefics2, idefics3, idefics3_vision, ijepa, imagegpt, informer, instructblip, instructblipvideo, jamba, jetmoe, jukebox, kosmos-2, layoutlm, layoutlmv2, layoutlmv3, led, levit, lilt, llama, llama4, llama4_text, llava, llava_next, llava_next_video, llava_onevision, longformer, longt5, luke, lxmert, m2m_100, mamba, mamba2, marian, markuplm, mask2former, maskformer, maskformer-swin, mbart, mctct, mega, megatron-bert, mgp-str, mimi, mistral, mistral3, mixtral, mllama, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, modernbert, moonshine, moshi, mpnet, mpt, mra, mt5, musicgen, musicgen_melody, mvp, nat, nemotron, nezha, nllb-moe, nougat, nystromformer, olmo, olmo2, olmoe, omdet-turbo, oneformer, open-llama, openai-gpt, opt, owlv2, owlvit, paligemma, patchtsmixer, patchtst, pegasus, pegasus_x, perceiver, persimmon, phi, phi3, phi4_multimodal, phimoe, pix2struct, pixtral, plbart, poolformer, pop2piano, prompt_depth_anything, prophetnet, pvt, pvt_v2, qdqbert, qwen2, qwen2_5_vl, qwen2_audio, qwen2_audio_encoder, qwen2_moe, qwen2_vl, qwen3, qwen3_moe, rag, realm, recurrent_gemma, reformer, regnet, rembert, resnet, retribert, roberta, roberta-prelayernorm, roc_bert, roformer, rt_detr, rt_detr_resnet, rt_detr_v2, rwkv, sam, sam_vision_model, seamless_m4t, seamless_m4t_v2, segformer, seggpt, sew, sew-d, shieldgemma2, siglip, siglip2, siglip_vision_model, smolvlm, smolvlm_vision, speech-encoder-decoder, speech_to_text, speech_to_text_2, speecht5, splinter, squeezebert, stablelm, starcoder2, superglue, superpoint, swiftformer, swin, swin2sr, swinv2, switch_transformers, t5, table-transformer, tapas, textnet, time_series_transformer, timesformer, timm_backbone, timm_wrapper, trajectory_transformer, transfo-xl, trocr, tvlt, tvp, udop, umt5, unispeech, unispeech-sat, univnet, upernet, van, video_llava, videomae, vilt, vipllava, vision-encoder-decoder, vision-text-dual-encoder, visual_bert, vit, vit_hybrid, vit_mae, vit_msn, vitdet, vitmatte, vitpose, vitpose_backbone, vits, vivit, wav2vec2, wav2vec2-bert, wav2vec2-conformer, wavlm, whisper, xclip, xglm, xlm, xlm-prophetnet, xlm-roberta, xlm-roberta-xl, xlnet, xmod, yolos, yoso, zamba, zamba2, zoedepth

In [17]:
# Apply tokenization to the training dataset
def tokenize(batch):
    # batch["text"] is already List[str]
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize, batched=True)

# Apply tokenization to each language-specific test dataset
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(tokenize, batched=True)

Map: 100%|██████████| 54994/54994 [00:49<00:00, 1106.69 examples/s]
Map: 100%|██████████| 4996/4996 [00:11<00:00, 444.51 examples/s]
Map: 100%|██████████| 4996/4996 [00:10<00:00, 487.82 examples/s]
Map: 100%|██████████| 4996/4996 [00:11<00:00, 416.63 examples/s]
Map: 100%|██████████| 4996/4996 [00:11<00:00, 426.73 examples/s]


In [92]:
# still not empty?
print(train_dataset)

Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 54994
})


In [93]:
print(train_dataset[0])

{'text': 'COMMISSION DECISION\nof 6 March 2006\nestablishing the classes of reaction-to-fire performance for certain construction products as regards wood flooring and solid wood panelling and cladding\n(notified under document number C(2006) 655)\n(Text with EEA relevance)\n(2006/213/EC)\nTHE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Directive 89/106/EEC of 21 December 1988, on the approximation of laws, regulations and administrative provisions of the Member States relating to construction products (1), and in particular Article 20(2) thereof,\nWhereas:\n(1)\nDirective 89/106/EEC envisages that in order to take account of different levels of protection for construction works at national, regional or local level, it may be necessary to establish in the interpretative documents classes corresponding to the performance of products in respect of each essential requirement. Those documents have been publishe

In [95]:
print(test_datasets)

{'de': Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4996
}), 'fi': Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4996
}), 'fr': Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4996
}), 'pl': Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4996
})}


In [97]:
print(test_datasets['de'][0])

{'text': 'VERORDNUNG (EU) Nr. 1390/2013 DES RATES\nvom 16. Dezember 2013\nüber die Aufteilung der Fangmöglichkeiten nach dem zwischen der Europäischen Union und der Union der Komoren vereinbarten Protokoll zur Festlegung der Fangmöglichkeiten und der finanziellen Gegenleistung nach dem partnerschaftlichen Fischereiabkommen zwischen den beiden Vertragsparteien\nDER RAT DER EUROPÄISCHEN UNION -\ngestützt auf den Vertrag über die Arbeitsweise der Europäischen Union, insbesondere auf Artikel 43 Absatz 3,\nauf Vorschlag der Europäischen Kommission,\nin Erwägung nachstehender Gründe:\n(1)\nAm 5. Oktober 2006 hat der Rat die Verordnung (EG) Nr. 1563/2006 (1) über den Abschluss des partnerschaftlichen Fischereiabkommens zwischen der Europäischen Gemeinschaft und der Union der Komoren (im Folgenden „partnerschaftliches Fischereiabkommen“) angenommen.\n(2)\nDie Europäische Union hat mit der Union der Komoren ein neues Protokoll zum partnerschaftlichen Fischereiabkommen (nachstehend „neues Protok

In [20]:
import torch
print(torch.__version__)
print(torch.cuda.is_available())

2.7.0+cu126
False


# with pytorch

In [21]:
train_dataset

Dataset({
    features: ['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 54994
})

In [16]:
from transformers import AutoModelForSequenceClassification

In [45]:
print(train_dataset[0]["label_vector"])
print(type(train_dataset[0]["label_vector"]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [46]:
from tqdm import tqdm

for ex in tqdm(train_dataset):
    lv = ex["label_vector"]
    if not isinstance(lv, list) or not all(isinstance(i, int) for i in lv):
        print("Invalid label_vector:", lv)
        break


100%|██████████| 54994/54994 [00:39<00:00, 1377.72it/s]


In [67]:
def prepare_dataset(example):
    example["labels"] = example["label_vector"]
    return example
train_dataset = train_dataset.map(prepare_dataset)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(prepare_dataset)

Map: 100%|██████████| 54994/54994 [00:17<00:00, 3112.98 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 3596.54 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 2886.22 examples/s]


In [68]:
# Verify column names before setting format
print(train_dataset.column_names)  # This should show the column names if the dataset is valid

# Set format for PyTorch
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [49]:
print(train_dataset[0]["label_vector"])
print(type(train_dataset[0]["label_vector"]))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [51]:
train_dataset = train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [69]:
print(train_dataset.column_names)

['text', 'label_vector', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [56]:
empty_labels = train_df[train_df['label_vector'].apply(lambda x: len(x) == 0)]
print(empty_labels)

Empty DataFrame
Columns: [celex_id, publication_date, text, eurovoc_concepts, split, doc_lengths, max_doc_length, level_3_labels, label_vector]
Index: []


In [70]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=4)

for batch in train_loader:
    for k, v in batch.items():
        print(f"{k}: {type(v)} - shape: {v.shape if hasattr(v, 'shape') else 'N/A'}")
    break


input_ids: <class 'torch.Tensor'> - shape: torch.Size([4, 512])
attention_mask: <class 'torch.Tensor'> - shape: torch.Size([4, 512])
labels: <class 'torch.Tensor'> - shape: torch.Size([4, 500])


In [71]:
for lang in test_datasets:
    test_datasets[lang].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"], dtype={"labels": torch.float32})


In [74]:
print(test_datasets.column_names)

AttributeError: 'dict' object has no attribute 'column_names'

In [73]:
test_datasets_loader = DataLoader(test_datasets, batch_size=4)

for batch in test_datasets_loader:
    for k, v in batch.items():
        print(f"{k}: {type(v)} - shape: {v.shape if hasattr(v, 'shape') else 'N/A'}")
    break


KeyError: 0

In [18]:
# last modif
def prepare_dataset(example):
    example["labels"] = example["label_vector"]
    return example
train_dataset = train_dataset.map(prepare_dataset)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(prepare_dataset)
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
    dtype={"input_ids": torch.int64, "attention_mask": torch.int64, "labels": torch.float32}
)
for lang in test_datasets:
    test_datasets[lang].set_format(type="torch", columns=["input_ids", "attention_mask", "labels"], dtype={"labels": torch.float32, "attention_mask": torch.int64, "labels": torch.float32})


Map: 100%|██████████| 54994/54994 [00:20<00:00, 2667.03 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 3516.65 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 3459.28 examples/s]
Map: 100%|██████████| 4996/4996 [00:01<00:00, 2560.81 examples/s]
Map: 100%|██████████| 4996/4996 [00:02<00:00, 2080.74 examples/s]


In [21]:
# Get the number of labels
num_labels = len(mlb.classes_)
# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    problem_type="multi_label_classification",
    num_labels=num_labels,
    id2label={i: label for i, label in enumerate(mlb.classes_)},
    label2id={label: i for i, label in enumerate(mlb.classes_)}
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
from transformers import TrainingArguments

In [33]:
from transformers import pipeline
print(pipeline('sentiment-analysis')('we love you'))

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998704195022583}]


In [23]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./xlm-roberta-eurovoc",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    logging_dir="./logs",                    # Log directory
    report_to="tensorboard"
)

In [18]:
from transformers import Trainer

In [19]:
def compute_metrics(pred):
    logits = pred.predictions  # ← grab predictions from the object
    labels = pred.label_ids    # ← grab true labels

    probs = torch.sigmoid(torch.from_numpy(logits))  # safely convert to tensor
    preds = (probs > 0.5).int().numpy()

    return {
        "micro_f1": f1_score(labels, preds, average="micro"),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

In [26]:
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_datasets["fr"],  # Or "de", "es" — you can loop through them too
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

  trainer = Trainer(


TypeError: tensor(): argument 'dtype' must be torch.dtype, not dict

In [130]:
train_dataset['labels'][0][0].dtype

AttributeError: 'int' object has no attribute 'dtype'

In [134]:
print(torch.Tensor(train_dataset['labels']).dtype)

ValueError: expected sequence of length 5 at dim 1 (got 2)

# with TF

In [31]:
import os
os.environ["TRANSFORMERS_BACKEND"] = "pt"

In [34]:
from transformers import TFAutoModelForSequenceClassification


# Get the number of labels
num_labels = len(mlb.classes_)

# Load model
model = TFAutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    problem_type="multi_label_classification",
    num_labels=num_labels,
    id2label={i: label for i, label in enumerate(mlb.classes_)},
    label2id={label: i for i, label in enumerate(mlb.classes_)}
)




Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
def prepare_dataset(example):
    example["labels"] = example["label_vector"]
    return example

# Apply the transformation for training and test datasets
train_dataset = train_dataset.map(prepare_dataset)
for lang in test_datasets:
    test_datasets[lang] = test_datasets[lang].map(prepare_dataset)

# Set format for TensorFlow
train_dataset.set_format(type="tensorflow", columns=["input_ids", "attention_mask", "labels"])
for lang in test_datasets:
    test_datasets[lang].set_format(type="tensorflow", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|██████████| 54994/54994 [00:42<00:00, 1288.60 examples/s]
Map: 100%|██████████| 4996/4996 [00:04<00:00, 1242.97 examples/s]
Map: 100%|██████████| 4996/4996 [00:03<00:00, 1313.88 examples/s]


In [36]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    logits, labels = pred
    probs = torch.sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    return {
        "micro_f1": f1_score(labels, preds, average="micro"),
        "macro_f1": f1_score(labels, preds, average="macro")
    }

In [46]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=2,  # Adjust as per your specific task
    problem_type="multi_label_classification"
)


ImportError: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.


In [47]:
# Load pre-trained TensorFlow model and tokenizer
from transformers import TFAutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
model = TFAutoModelForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./test_output",      
    evaluation_strategy="epoch",             
    save_strategy="epoch",                   
    learning_rate=2e-5,                      
    per_device_train_batch_size=8,           
    per_device_eval_batch_size=8,            
    num_train_epochs=3,                      
    weight_decay=0.01,                       
    save_total_limit=1,                      
    load_best_model_at_end=True,
    logging_dir="./logs",                    # Log directory
    report_to="tensorboard"                  # Use tensorboard for logging
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [44]:
# Define your training arguments (adjust hyperparameters as needed)
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./xlm-roberta-eurovoc",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
)


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_datasets["fr"],  # Or "de", "es" — you can loop through them too
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()


In [None]:
for lang, dataset in test_datasets.items():
    results = trainer.evaluate(dataset)
    print(f"Language: {lang}")
    print(results)


## Test with article code

In [None]:
# label encoding
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
mlb.fit(train_df['level_3_labels'])

train_df['label_vector'] = list(mlb.transform(train_df['level_3_labels']))
test_df['label_vector'] = list(mlb.transform(test_df['level_3_labels']))

label_index = {label: idx for idx, label in enumerate(mlb.classes_)}

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['label_vector'] = list(mlb.transform(train_df['level_3_labels']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['label_vector'] = list(mlb.transform(test_df['level_3_labels']))


In [None]:
from datasets import load_dataset

streamed_dataset = load_dataset("path/to/your/data.csv", split="train", streaming=True)


In [46]:
from datasets import Dataset as HFDataset

train_dataset = HFDataset.from_pandas(train_df[['text', 'label_vector', 'celex_id']])
test_dataset = HFDataset.from_pandas(test_df[['text', 'label_vector', 'celex_id']])

ArrowMemoryError: realloc of size 826277888 failed

In [51]:
small_train_df = train_df.sample(1000).copy()
train_dataset = HFDataset.from_pandas(small_train_df[['text', 'label_vector', 'celex_id']])

In [52]:
small_test_df = test_df.sample(500).copy()
test_dataset = HFDataset.from_pandas(small_test_df[['text', 'label_vector', 'celex_id']])

In [53]:
from experiments.model import Classifier

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
model = Classifier(bert_model_path='xlm-roberta-base', num_labels=len(label_index))
model.adapt_model(use_adapters=True, num_frozen_layers=None)  # Or skip this for baseline


##

In [None]:
train_gen = SampleGenerator(train_dataset, label_index, 'xlm-roberta-base', lang='en', multilingual_train=False)
test_gen = SampleGenerator(test_dataset, label_index, 'xlm-roberta-base', lang=['fr', 'de', 'es', 'it'], multilingual_train=True)


Suggested Steps to Fix the Issue:
Downgrade Python to a Compatible Version (3.7–3.10): To resolve the issue, I recommend downgrading Python to a version that TensorFlow supports, ideally Python 3.10. Here's how to do it:

Step 1: Install Python 3.10
Download Python 3.10: Go to the Python 3.10 download page and download the installer for your operating system.

Install Python 3.10: During installation, make sure to check the box that says "Add Python to PATH" to make it accessible from the command line.

Step 2: Create a Virtual Environment with Python 3.10
After installing Python 3.10, create a new virtual environment:

Windows:

bash
Copier
Modifier
python3.10 -m venv tf_env
.\tf_env\Scripts\activate
macOS/Linux:

bash
Copier
Modifier
python3.10 -m venv tf_env
source tf_env/bin/activate
Step 3: Install TensorFlow in the New Virtual Environment
After creating and activating your new environment, install TensorFlow:

bash
Copier
Modifier
pip install tensorflow
Verify TensorFlow Installation: Once TensorFlow is installed, verify that the installation is successful by running:

python
Copier
Modifier
import tensorflow as tf
print(tf.__version__)
This should print the TensorFlow version without errors.

Alternative: Using Docker for Isolation (Optional)
If you prefer not to downgrade Python globally or create a new Python installation, you can use Docker to run a TensorFlow-compatible environment in an isolated container. Docker allows you to run a specific version of Python and TensorFlow without affecting your system-wide Python installation.

# Debug

In [103]:
# train
train_df = df_reduced[df_reduced['split']=='train']
train_df['text'] = train_df["text"].apply(lambda x: isinstance(x, dict) and x.get("en"))

# test 
test_df = df_reduced[df_reduced['split']=='test']
test_langs = ["fr", "de", "pl","fi"] 
test_dfs = []

for lang in test_langs:
    # Filter rows where the language exists in the text dictionary
    df_lang = test_df[test_df["text"].apply(lambda x: isinstance(x, dict) and lang in x)]
    
    # Now extract the respective language text, and add the 'lang' column
    df_lang["text"] = df_lang["text"].apply(lambda x: x[lang])  # Extract the language text
    df_lang["lang"] = lang  # Add a new column for language
    
    # Append to test_dfs
    test_dfs.append(df_lang)

# Combine the list of DataFrames into one (exploded test set)
final_test_df = pd.concat(test_dfs, ignore_index=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['text'] = train_df["text"].apply(lambda x: isinstance(x, dict) and x.get("en"))


In [105]:
from sklearn.preprocessing import MultiLabelBinarizer

# Initialize the MultiLabelBinarizer for the 'level_3_labels'
mlb = MultiLabelBinarizer()

# Encode the 'level_3_labels' column
train_df['label_vector'] = mlb.fit_transform(train_df['level_3_labels']).tolist()

# You can optionally do this for the test set as well if you have labels for the test data
final_test_df['label_vector'] = mlb.transform(final_test_df['level_3_labels']).tolist()

# Check the resulting label vectors (optional)
print(train_df['label_vector'].head())
print(final_test_df['label_vector'].head())



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['label_vector'] = mlb.fit_transform(train_df['level_3_labels']).tolist()


0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: label_vector, dtype: object
0    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: label_vector, dtype: object


In [106]:
# Initialize the tokenizer (for example, using BERT)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Define a function to tokenize the text
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Convert the train and test DataFrames to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(final_test_df)

# Tokenize the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Check the tokenized datasets
print(train_dataset[0])
print(test_dataset[0])


Map: 100%|██████████| 54994/54994 [00:39<00:00, 1390.91 examples/s]
Map: 100%|██████████| 19984/19984 [00:26<00:00, 756.52 examples/s]

{'celex_id': '32006D0213', 'publication_date': '2006-03-06', 'text': 'COMMISSION DECISION\nof 6 March 2006\nestablishing the classes of reaction-to-fire performance for certain construction products as regards wood flooring and solid wood panelling and cladding\n(notified under document number C(2006) 655)\n(Text with EEA relevance)\n(2006/213/EC)\nTHE COMMISSION OF THE EUROPEAN COMMUNITIES,\nHaving regard to the Treaty establishing the European Community,\nHaving regard to Directive 89/106/EEC of 21 December 1988, on the approximation of laws, regulations and administrative provisions of the Member States relating to construction products (1), and in particular Article 20(2) thereof,\nWhereas:\n(1)\nDirective 89/106/EEC envisages that in order to take account of different levels of protection for construction works at national, regional or local level, it may be necessary to establish in the interpretative documents classes corresponding to the performance of products in respect of ea




In [109]:
from transformers import Trainer, TrainingArguments, BertForSequenceClassification

# Initialize the model (for example, using BERT)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(mlb.classes_))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # You can use the validation set if available
    compute_metrics=compute_metrics,  # Add metrics if needed
)

# Train the model
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Trainer is attempting to log a value of "{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2', 3: 'LABEL_3', 4: 'LABEL_4', 5: 'LABEL_5', 6: 'LABEL_6', 7: 'LABEL_7', 8: 'LABEL_8', 9: 'LABEL_9', 10: 'LABEL_10', 11: 'LABEL_11', 12: 'LABEL_12', 13: 'LABEL_13', 14: 'LABEL_14', 15: 'LABEL_15', 16: 'LABEL_16', 17: 'LABEL_17', 18: 'LABEL_18', 19: 'LABEL_19', 20: 'LABEL_20', 21: 'LABEL_21', 22: 'LABEL_22', 23: 'LABEL_23', 24: 'LABEL_24', 25: 'LABEL_25', 26: 'LABEL_26', 27: 'LABEL_27', 28: 'LABEL_28', 29: 'LABEL_29', 30: 'LABEL_30', 31: 'LABEL_31', 32: 'LABEL_32', 33: 'LABEL_33', 34: 'LABEL_34', 35: 'LABEL_35', 36: 'LABEL_36', 37: 'LABEL_37', 38: 'LABEL_38', 39: 'LABEL_39', 40: 'LABEL_40', 41: 'LABEL_41', 42: 'LABEL_42

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.

In [112]:
from transformers import Trainer, TrainingArguments
from transformers import BertForSequenceClassification
from torch.nn import BCEWithLogitsLoss
import torch

class MultiLabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # Make sure the model returns logits and not just predictions
        outputs = model(**inputs)
        logits = outputs.logits
        labels = inputs.get("labels")

        # Ensure the labels are in the correct format (binary vector for multi-label)
        if labels is not None:
            # BCEWithLogitsLoss works directly with logits (no need to apply sigmoid)
            loss_fct = BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())  # Convert to float for BCE
        else:
            loss = None

        if return_outputs:
            return loss, outputs
        return loss

# Prepare the model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=86)  # Adjust num_labels

# Prepare the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
)

trainer = MultiLabelTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,  # Optional, if you have metrics to compute
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: MultiLabelTrainer.compute_loss() got an unexpected keyword argument 'num_items_in_batch'

# en réutilisant code

In [114]:
from datasets import load_dataset

In [122]:
load_dataset('multi_eurlex', language='en', label_level='level_3')

Generating train split: 55000 examples [01:28, 618.59 examples/s] 
Generating test split: 2876 examples [00:12, 239.55 examples/s]


KeyboardInterrupt: 

In [None]:
train_dataset = load_dataset('multi_eurlex', language='en', label_level='level_3', split='train')

TypeError: MultiEURLEXConfig.__init__() missing 1 required positional argument: 'language'

In [125]:
load_dataset('multi_eurlex', language=['fr','de','pl','fi'], label_level='level_3')

Generating train split: 0 examples [00:11, ? examples/s]


DatasetGenerationError: An error occurred while generating the dataset

In [None]:

eval_dataset = load_dataset('multi_eurlex', language='all_languages',
                                languages=['fr','de','pl','fi'], label_level='level_3')

In [117]:
train_dataset

Dataset({
    features: ['celex_id', 'text', 'labels'],
    num_rows: 55000
})