# Legal document classification in a zero-shot cross-lingual transfer setting

# Part III: Performance improvement and pattern analysis

Date: May 2025

Project of course: Natural Language Processing - ENSAE 3A S2

Author: Noémie Guibé

In [1]:
# import 
import itertools
from collections import defaultdict, Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from tqdm import tqdm

In [2]:
# Load the reduced dataset (multi_eurlex_reduced.parquet) from remote storage.
# `df` is the raw frame: process_dataframe below reads its "text" and
# "level_1_labels" columns, and the classification cells receive it as `df`.
df = pd.read_parquet('https://minio.lab.sspcloud.fr/nguibe/NLP/multi_eurlex_reduced.parquet', engine='pyarrow')

# 1 - Original model through token analysis

This section was intended to explore token patterns across languages in the specific legal field.

Due to time constraints, this analysis was not completed. However, the following code sketch could be used to pursue this direction later on.

## Cleaning and lemmatization

In [None]:
# Download the small spaCy pipelines for the five languages loaded in the
# next cell (en, fr, de, pl, fi). Setup step: the `spacy.load(...)` calls
# below need these packages to be installed.
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download de_core_news_sm
!python -m spacy download pl_core_news_sm
!python -m spacy download fi_core_news_sm

In [2]:
# One spaCy pipeline per language code, loaded once and reused by
# clean_and_lemmatize.
spacy_models = {
    lang_code: spacy.load(model_name)
    for lang_code, model_name in (
        ("en", "en_core_web_sm"),
        ("fr", "fr_core_news_sm"),
        ("de", "de_core_news_sm"),
        ("pl", "pl_core_news_sm"),
        ("fi", "fi_core_news_sm"),
    )
}

# Languages to process: exactly the ones a pipeline was loaded for,
# in the same order.
languages = list(spacy_models)

# Function to clean and lemmatize text
def clean_and_lemmatize(text, lang_code, remove_stopwords=True):
    """Lemmatize ``text`` with the spaCy pipeline registered for ``lang_code``.

    Punctuation and whitespace tokens are always dropped; stop words are
    dropped as well unless ``remove_stopwords`` is False. The remaining
    lemmas are lower-cased and joined with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.
    lang_code : str
        Key into the module-level ``spacy_models`` mapping.
    remove_stopwords : bool, default True
        Whether tokens flagged ``is_stop`` are filtered out.

    Returns
    -------
    str or None
        The space-joined lemmas, or ``None`` when ``text`` is not a string
        or no pipeline is loaded for ``lang_code``.
    """
    # Non-string input (e.g. None or NaN where a translation is missing)
    # cannot be parsed by the pipeline; treat it like "nothing to process"
    # instead of letting the call raise.
    if not isinstance(text, str):
        return None

    if lang_code not in spacy_models:
        return None

    nlp = spacy_models[lang_code]
    doc = nlp(text)

    tokens = [
        token.lemma_.lower()
        for token in doc
        if not token.is_punct
        and not token.is_space
        and not (remove_stopwords and token.is_stop)
    ]
    return " ".join(tokens)

# Apply cleaning/lemmatization across the dataframe
def process_dataframe(df, languages):
    """Lemmatize every available translation of every row of ``df``.

    Each row is expected to carry a ``"text"`` cell (a ``{lang: text}``
    dict) and a ``"level_1_labels"`` cell. For every requested language
    present in the dict, the text is passed through ``clean_and_lemmatize``;
    non-empty results are collected.

    Returns a DataFrame with one record per kept (row, language) pair and
    the columns ``lang``, ``text_lemmatized`` and ``labels``.
    """
    records = []

    for _, record in tqdm(df.iterrows(), total=len(df)):
        texts_by_lang = record["text"]  # {lang: text}
        doc_labels = record["level_1_labels"]

        # Rows whose "text" cell is not a dict contribute nothing.
        if not isinstance(texts_by_lang, dict):
            continue

        for code in languages:
            if code not in texts_by_lang:
                continue

            cleaned = clean_and_lemmatize(texts_by_lang[code], code)
            # Skip unsupported languages (None) and texts that end up empty.
            if not cleaned:
                continue

            records.append(
                {"lang": code, "text_lemmatized": cleaned, "labels": doc_labels}
            )

    return pd.DataFrame(records)



In [None]:
# Usage
# Lemmatize every row of `df` for the configured `languages`. The result has
# one row per (row, language) pair whose lemmatized text is non-empty, with
# the columns "lang", "text_lemmatized" and "labels".
processed_df = process_dataframe(df, languages)

In [None]:
# Save for later use.
# The target folder is created first: to_parquet does not create missing
# parent directories and would fail on a fresh checkout without "data/".
processed_path = Path("data/processed_legal_texts.parquet")
processed_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_parquet(processed_path, index=False)

## Token - Label co-occurrence analysis

In [None]:
# Load preprocessed lemmatized texts.
# Bound to its own name so the raw `df` loaded at the top of the notebook
# (passed to the classification cells further down) is not overwritten.
lemmatized_df = pd.read_parquet("data/processed_legal_texts.parquet")

# Build token-label co-occurrence mapping: token -> Counter({label: n_rows})
token_label_counts = defaultdict(Counter)

for text_lemmatized, labels in zip(
    lemmatized_df["text_lemmatized"], lemmatized_df["labels"]
):
    # set(): a token is counted at most once per row, against all its labels
    for token in set(text_lemmatized.split()):
        token_label_counts[token].update(labels)

# Keep the 100 tokens with the largest total count over all labels
top_tokens = sorted(
    token_label_counts.items(),
    key=lambda item: sum(item[1].values()),
    reverse=True,
)[:100]

# Convert to a long-format DataFrame (one row per token/label pair) for inspection
rows = [
    {"token": token, "label": label, "count": count}
    for token, label_counter in top_tokens
    for label, count in label_counter.items()
]

df_token_label = pd.DataFrame(rows)

# Create "output/" if needed: to_csv does not create missing directories.
cooccurrence_path = Path("output/token_label_cooccurrence.csv")
cooccurrence_path.parent.mkdir(parents=True, exist_ok=True)
df_token_label.to_csv(cooccurrence_path, index=False)


In [None]:
# Visualize token-label counts for one label
def plot_top_tokens_for_label(label, top_n=10):
    """Draw a bar chart of the ``top_n`` highest-count tokens for ``label``.

    Reads the module-level ``df_token_label`` table (columns ``token``,
    ``label``, ``count``), keeps the rows matching ``label`` and plots the
    ones with the largest ``count``. Displays the figure; returns nothing.
    """
    matches_label = df_token_label["label"] == label
    ranked = df_token_label[matches_label].sort_values("count", ascending=False)
    best = ranked.head(top_n)

    plt.figure(figsize=(10, 5))
    plt.bar(best["token"], best["count"])
    plt.title(f"Top {top_n} tokens for label {label}")
    plt.xticks(rotation=45)  # tilt token labels
    plt.tight_layout()
    plt.show()


In [None]:
# Example: plot_top_tokens_for_label("xx")

## Token distribution across languages

In [None]:
# Load the same preprocessed file.
# Bound to its own name so the raw `df` loaded at the top of the notebook
# (passed to the classification cells further down) is not overwritten.
lemmatized_df = pd.read_parquet("data/processed_legal_texts.parquet")

# Count token frequencies by language: lang -> Counter({token: occurrences})
lang_token_freq = defaultdict(Counter)

for lang, text_lemmatized in zip(
    lemmatized_df["lang"], lemmatized_df["text_lemmatized"]
):
    lang_token_freq[lang].update(text_lemmatized.split())

# Create a DataFrame with token frequencies across languages
def get_freq_df(top_tokens=None, min_freq=50):
    """Tabulate per-language frequencies for a selection of tokens.

    Parameters
    ----------
    top_tokens : iterable of str, optional
        Tokens to report. Duplicates are dropped and the given order is
        kept. When omitted or empty, every token whose frequency exceeds
        ``min_freq`` in at least one language is reported, sorted
        alphabetically.
    min_freq : int, default 50
        Strict lower bound used for the automatic selection only.

    Returns
    -------
    pandas.DataFrame
        One row per token, with a ``token`` column followed by one count
        column per language found in the module-level ``lang_token_freq``.
        The columns are present even when no token is selected.
    """
    langs = list(lang_token_freq)

    if top_tokens:
        # dict.fromkeys de-duplicates while keeping the caller's order, so
        # the table (and any plot built from it) is reproducible across runs.
        selected = list(dict.fromkeys(top_tokens))
    else:
        frequent = set()
        for counter in lang_token_freq.values():
            frequent.update(
                token for token, freq in counter.items() if freq > min_freq
            )
        # Sets have no stable order for strings; sort for determinism.
        selected = sorted(frequent)

    data = [
        # Counter lookups return 0 for tokens a language never saw.
        {"token": token, **{lang: lang_token_freq[lang][token] for lang in langs}}
        for token in selected
    ]

    # Explicit columns keep the schema stable when `data` is empty, so
    # callers can still do set_index("token").
    return pd.DataFrame(data, columns=["token", *langs])


In [None]:
# Plot distribution of selected tokens
def plot_token_distribution(tokens):
    """Plot, as grouped bars, how often each of ``tokens`` occurs per language.

    The frequency table comes from ``get_freq_df(tokens)``; it is transposed
    so that languages sit on the x-axis and each token becomes one bar
    series. Displays the figure; returns nothing.
    """
    freq_table = get_freq_df(tokens)
    per_language = freq_table.set_index("token").T  # rows: languages, columns: tokens
    per_language.plot(kind='bar', figsize=(10, 5))

    plt.title("Token frequency across languages")
    plt.ylabel("Frequency")
    plt.xlabel("Language")
    plt.xticks(rotation=0)  # keep language codes horizontal
    plt.tight_layout()
    plt.show()


In [None]:
# Example plot_token_distribution(["regulation", "union", "market", "recht", "union", "protection"])

# 2 - Other strategies

In [None]:
from src import label_embedding, prompt_model

## Prompt-based classification

In [7]:
# Run the prompt-based classifier from src.prompt_model on `df`.
# The returned mapping (one entry per evaluation language, holding its
# metrics) is stored in `results` and displayed in the next cell.
results =prompt_model.run_prompt_classification(
    df=df,
    train_size=5000,
    test_size=5000,
    batch_size=32,
    epochs=2,
    prompt_type="guided",  # or "generic"
    # freeze_layers=6        # optional
)


Final test set size: 5000


Map: 100%|██████████| 5000/5000 [00:05<00:00, 922.15 examples/s] 
Map: 100%|██████████| 987/987 [00:02<00:00, 469.74 examples/s]
Map: 100%|██████████| 1024/1024 [00:01<00:00, 723.11 examples/s]
Map: 100%|██████████| 1003/1003 [00:02<00:00, 494.85 examples/s]
Map: 100%|██████████| 1014/1014 [00:02<00:00, 453.33 examples/s]
Map: 100%|██████████| 972/972 [00:01<00:00, 488.07 examples/s]
All PyTorch model weights were used when initializing TFXLMRobertaForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFXLMRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/2
Epoch 2/2
Training time: 4727.13 seconds
Initial memory usage: 14895.63 MB
Final memory usage: 41035.94 MB
Memory used during training: 26140.31 MB
[INFO] Evaluating on language: de
R-Precision: 0.2700
Micro F1: 0.2289
Macro F1: 0.0329
LRAP: 0.5418
Evaluation time: 161.53 seconds
[INFO] Evaluating on language: en


2025-05-04 14:52:19.536348: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


R-Precision: 0.2688
Micro F1: 0.2366
Macro F1: 0.0335
LRAP: 0.5420
Evaluation time: 164.89 seconds
[INFO] Evaluating on language: fi
R-Precision: 0.2687
Micro F1: 0.2309
Macro F1: 0.0331
LRAP: 0.5333
Evaluation time: 164.10 seconds
[INFO] Evaluating on language: fr
R-Precision: 0.2740
Micro F1: 0.2419
Macro F1: 0.0339
LRAP: 0.5532
Evaluation time: 166.29 seconds
[INFO] Evaluating on language: pl
R-Precision: 0.2711
Micro F1: 0.2399
Macro F1: 0.0342
LRAP: 0.5391
Evaluation time: 158.41 seconds


In [8]:
# Display the per-language metrics returned by the prompt-based run.
results

{'de': {'R-Precision': 0.27001013171225935,
  'Micro F1': 0.22891036906854131,
  'Macro F1': 0.032903877731463936,
  'LRAP': 0.5418273646792338,
  'Eval Time (s)': 161.53033113479614},
 'en': {'R-Precision': 0.26884765625,
  'Micro F1': 0.23659574468085104,
  'Macro F1': 0.03351416515973478,
  'LRAP': 0.5419704551780962,
  'Eval Time (s)': 164.89226126670837},
 'fi': {'R-Precision': 0.26869391824526423,
  'Micro F1': 0.23091891891891894,
  'Macro F1': 0.033088577005297895,
  'LRAP': 0.5332799534823319,
  'Eval Time (s)': 164.10067582130432},
 'fr': {'R-Precision': 0.27396449704142006,
  'Micro F1': 0.2419146183699871,
  'Macro F1': 0.03392290249433107,
  'LRAP': 0.5531542290513365,
  'Eval Time (s)': 166.28934144973755},
 'pl': {'R-Precision': 0.27109053497942387,
  'Micro F1': 0.23991179713340685,
  'Macro F1': 0.03417514763161201,
  'LRAP': 0.5390594264784088,
  'Eval Time (s)': 158.4082236289978}}

## Label-based embedding

In [None]:
# Evaluate the label-embedding classifier once per language and gather the
# per-language results into a single table.
all_results = []

for lang in ("en", "fr", "de", "pl", "fi"):
    all_results.append(
        label_embedding.run_label_embedding_classification(
            df, top_k=5, batch_size=32, eval_lang=lang
        )
    )

final_df = pd.DataFrame(all_results)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
100%|██████████| 157/157 [45:19<00:00, 17.32s/it]



[RESULTS]
Language: en
Top-5 Micro F1: 0.2631
Top-5 Macro F1: 0.1001
Top-5 LRAP:     0.2179


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
100%|██████████| 157/157 [35:48<00:00, 13.68s/it]



[RESULTS]
Language: fr
Top-5 Micro F1: 0.2649
Top-5 Macro F1: 0.0941
Top-5 LRAP:     0.2174


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
100%|██████████| 157/157 [27:45<00:00, 10.61s/it]



[RESULTS]
Language: de
Top-5 Micro F1: 0.2637
Top-5 Macro F1: 0.0923
Top-5 LRAP:     0.2170


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
100%|██████████| 157/157 [27:55<00:00, 10.67s/it]



[RESULTS]
Language: pl
Top-5 Micro F1: 0.2634
Top-5 Macro F1: 0.0907
Top-5 LRAP:     0.2167


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
100%|██████████| 157/157 [27:22<00:00, 10.46s/it]



[RESULTS]
Language: fi
Top-5 Micro F1: 0.2654
Top-5 Macro F1: 0.0938
Top-5 LRAP:     0.2176
