# Legal document classification in a zero-shot cross-lingual transfer setting

# Part III: Performance improvement and pattern analysis

Date: May 2025

Project of course: Natural Language Processing - ENSAE 3A S2

Author: Noémie Guibé

In [1]:
# import
import itertools
from collections import defaultdict, Counter
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import spacy
from tqdm import tqdm

ModuleNotFoundError: No module named 'spacy'

In [7]:
# Download the small spaCy pipelines (English, French, German, Polish, Finnish)
# that are loaded further down in this notebook.
# NOTE(review): `!python` is resolved by the shell, which may not be the
# interpreter the kernel runs on — confirm the models are installed into the
# kernel's environment.
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm
!python -m spacy download de_core_news_sm
!python -m spacy download pl_core_news_sm
!python -m spacy download fi_core_news_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.1/12.8 MB 2.2 MB/s eta 0:00:06
      --------------------------------------- 0.2/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.3/12.8 MB 2.0 MB/s eta 0:00:07
     - -------------------------------------- 0.5/12.8 MB 2.2 MB/s eta 0:00:06
     - -------------------------------------- 0.6/12.8 MB 2.5 MB/s eta 0:00:05
     -- ------------------------------------- 0.7/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.7/12.8 MB 2.4 MB/s eta 0:00:06
     -- ------------------------------------- 0.9/12.8 MB 2.2 MB/s eta 0:00:06
     --- ------------------------------------ 1.2/12.8 MB 2.7 MB/s eta 0:00:05
     ---- -------------------------------

In [6]:
# Read the parquet file straight from its remote URL using the pyarrow engine.
# The file name suggests a reduced MultiEURLEX extract — TODO confirm its schema.
df = pd.read_parquet('https://minio.lab.sspcloud.fr/nguibe/NLP/multi_eurlex_reduced.parquet', engine='pyarrow')

# 1 - Original model through token analysis

## Cleaning and lemmatization

In [2]:
# Instantiate one small spaCy pipeline per supported language, keyed by language code
spacy_models = {
    iso_code: spacy.load(package_name)
    for iso_code, package_name in (
        ("en", "en_core_web_sm"),
        ("fr", "fr_core_news_sm"),
        ("de", "de_core_news_sm"),
        ("pl", "pl_core_news_sm"),
        ("fi", "fi_core_news_sm"),
    )
}

# Language codes to process, in the same order as the pipelines above
languages = list(spacy_models)

# Function to clean and lemmatize text
def clean_and_lemmatize(text, lang_code, remove_stopwords=True):
    """Lemmatize ``text`` with the spaCy pipeline registered for ``lang_code``.

    Punctuation and whitespace tokens are always dropped; stop words are
    dropped too unless ``remove_stopwords`` is False. The remaining lemmas are
    lower-cased and joined with single spaces.

    Args:
        text: Raw document text. Any value that is not a ``str`` (for example
            ``None`` or NaN standing in for a missing translation) is treated
            as absent.
        lang_code: Key into the module-level ``spacy_models`` mapping.
        remove_stopwords: Whether tokens flagged ``is_stop`` are discarded.

    Returns:
        The space-joined lemmas, or ``None`` when no pipeline is loaded for
        ``lang_code`` or when ``text`` is not a string.
    """
    if lang_code not in spacy_models:
        return None

    # Missing translations must not reach the pipeline: it only accepts text.
    if not isinstance(text, str):
        return None

    doc = spacy_models[lang_code](text)

    def is_kept(token):
        # Punctuation and whitespace never carry topical information here.
        if token.is_punct or token.is_space:
            return False
        return not (remove_stopwords and token.is_stop)

    return " ".join(token.lemma_.lower() for token in doc if is_kept(token))

# Apply cleaning/lemmatization across the dataframe
def process_dataframe(df, languages, label_col=None):
    """Lemmatize every requested translation of every document in ``df``.

    Args:
        df: Frame with a ``"text"`` column holding ``{lang: text}`` dicts and
            a column of labels. Rows whose ``"text"`` value is not a dict are
            skipped.
        languages: Language codes to extract from each ``"text"`` dict.
        label_col: Name of the label column. When left to ``None`` the
            function reads ``"level_1_labels"`` if that column exists and
            otherwise falls back to ``"labels"`` (the column name of the frame
            built from the streamed dataset in this notebook).

    Returns:
        A DataFrame with one row per (document, language) pair that produced a
        non-empty lemmatized text, with columns ``lang``, ``text_lemmatized``
        and ``labels``. The columns are present even when no row was produced.

    Raises:
        KeyError: If a row is processed and the label column is missing.
    """
    if label_col is None:
        has_level_1 = "level_1_labels" in df.columns
        label_col = "level_1_labels" if has_level_1 or "labels" not in df.columns else "labels"

    records = []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        text_by_lang = row["text"]  # {lang: text}
        labels = row[label_col]

        if not isinstance(text_by_lang, dict):
            continue

        for lang in languages:
            raw_text = text_by_lang.get(lang)
            # A translation may be absent or stored as None; skip it instead
            # of handing a non-string to the lemmatizer.
            if not isinstance(raw_text, str):
                continue

            lemmatized = clean_and_lemmatize(raw_text, lang)
            if lemmatized:
                records.append({
                    "lang": lang,
                    "text_lemmatized": lemmatized,
                    "labels": labels
                })

    # Explicit columns keep the schema stable when no record was collected.
    return pd.DataFrame(records, columns=["lang", "text_lemmatized", "labels"])



In [4]:
# Read a local parquet file with the same file name as the remote copy loaded earlier.
# NOTE(review): absolute, machine-specific Windows path — it will not resolve on
# another machine; consider a path relative to a configurable data directory.
df = pd.read_parquet('C:/Users/guibe/OneDrive/Documents/ENSAE/3A/S2/NLP/projet/multi_eurlex_reduced.parquet')
# Keep 10 randomly drawn rows; no random_state is passed, so the subset differs between runs.
df = df.sample(10)

ArrowMemoryError: realloc of size 731906048 failed

In [6]:
from datasets import load_dataset
# Open the train split of the "multi_eurlex" dataset ("all_languages" configuration)
# with streaming=True, so records are iterated over rather than loaded in one go.
dataset = load_dataset("multi_eurlex", 'all_languages',split="train", streaming=True)

In [None]:
# Alternative to the streaming load: request only the first 1% of the train split.
# Running this cell rebinds `dataset`.
dataset = load_dataset("multi_eurlex", 'all_languages',split="train[:1%]")

In [11]:
from itertools import islice

In [13]:
# Take the first 5 records from `dataset` and wrap them in a DataFrame.
# Running this cell rebinds `df`.
batch = list(islice(dataset, 5))  # Adjust number as needed
df = pd.DataFrame(batch)

In [14]:
df['labels']

0    [1, 20, 7, 3, 0]
1             [2, 17]
2          [3, 19, 6]
3     [12, 17, 19, 6]
4       [18, 3, 4, 1]
Name: labels, dtype: object

In [None]:
# Usage: lemmatize the texts in `df` for every language code in `languages`
processed_df = process_dataframe(df, languages)

In [None]:
# Save for later use; create the target folder first so the write does not
# fail when "data/" does not exist yet.
processed_path = Path("data/processed_legal_texts.parquet")
processed_path.parent.mkdir(parents=True, exist_ok=True)
processed_df.to_parquet(processed_path, index=False)

## Token–label co-occurrence analysis

In [None]:
# Load preprocessed lemmatized texts
df = pd.read_parquet("data/processed_legal_texts.parquet")

# Build token-label co-occurrence mapping: token -> Counter({label: count})
token_label_counts = defaultdict(Counter)

for _, row in df.iterrows():
    tokens = row["text_lemmatized"].split()
    labels = row["labels"]

    # Count each distinct token of the row once against every label of that row
    for token in set(tokens):
        token_label_counts[token].update(labels)

# Keep the 100 tokens with the largest total count summed over all labels
top_tokens = sorted(token_label_counts.items(), key=lambda x: sum(x[1].values()), reverse=True)[:100]

# Convert to a long-format DataFrame for inspection
rows = [
    {"token": token, "label": label, "count": count}
    for token, label_counter in top_tokens
    for label, count in label_counter.items()
]

# Explicit columns keep the schema stable even when no row was produced
df_token_label = pd.DataFrame(rows, columns=["token", "label", "count"])

# Create the output folder first so the write does not fail when it is missing
cooccurrence_path = Path("output/token_label_cooccurrence.csv")
cooccurrence_path.parent.mkdir(parents=True, exist_ok=True)
df_token_label.to_csv(cooccurrence_path, index=False)


In [None]:
# Optional: Visualize token-label counts for one label
def plot_top_tokens_for_label(label, top_n=10):
    """Show a bar chart of the ``top_n`` highest-count tokens for ``label``.

    The data comes from the module-level ``df_token_label`` frame (columns
    ``token``, ``label``, ``count``). The figure is displayed; nothing is
    returned.
    """
    rows_for_label = df_token_label.loc[df_token_label["label"] == label]
    ranked = rows_for_label.sort_values(by="count", ascending=False)
    best = ranked.iloc[:top_n]

    plt.figure(figsize=(10, 5))
    plt.bar(best["token"], best["count"])
    plt.title(f"Top {top_n} tokens for label {label}")
    # Tilt the token names so neighbouring tick labels do not overlap
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
# Example: plot_top_tokens_for_label("0806")

## Token distribution across languages

In [None]:
# Reload the lemmatized corpus from the preprocessed parquet file
df = pd.read_parquet("data/processed_legal_texts.parquet")

# Token frequency table per language: language code -> Counter of tokens
lang_token_freq = defaultdict(Counter)

for lang, lemmatized_text in zip(df["lang"], df["text_lemmatized"]):
    lang_token_freq[lang].update(lemmatized_text.split())

# Create a DataFrame with token frequencies across languages
def get_freq_df(top_tokens=None, min_freq=50):
    """Tabulate per-language frequencies for a selection of tokens.

    Args:
        top_tokens: Tokens to report. Duplicates are dropped and the caller's
            order is kept. When empty or ``None``, the selection is every
            token whose frequency is strictly greater than ``min_freq`` in at
            least one language, in sorted order.
        min_freq: Threshold used only when ``top_tokens`` is not given.

    Returns:
        A DataFrame with a ``token`` column followed by one column per
        language found in the module-level ``lang_token_freq`` mapping; a
        token unseen in a language gets 0. The columns are present even when
        no token is selected.
    """
    if top_tokens:
        # dict.fromkeys de-duplicates while preserving first-seen order,
        # so the row order no longer depends on set iteration order.
        selected = list(dict.fromkeys(top_tokens))
    else:
        frequent = {
            token
            for counter in lang_token_freq.values()
            for token, freq in counter.items()
            if freq > min_freq
        }
        # Sort for a reproducible row order across runs.
        selected = sorted(frequent)

    langs = list(lang_token_freq)
    data = []
    for token in selected:
        row = {"token": token}
        for lang in langs:
            row[lang] = lang_token_freq[lang][token]
        data.append(row)

    # Explicit columns so an empty selection still exposes "token".
    return pd.DataFrame(data, columns=["token", *langs])


In [None]:
# Plot distribution of selected tokens
def plot_token_distribution(tokens):
    """Show a grouped bar chart of how often each of ``tokens`` occurs per language.

    Frequencies come from ``get_freq_df``; the table is transposed before
    plotting, so languages are on the x-axis. Nothing is returned.
    """
    freq_by_language = get_freq_df(tokens).set_index("token").transpose()
    freq_by_language.plot.bar(figsize=(10, 5))

    plt.title("Token frequency across languages")
    plt.xlabel("Language")
    plt.ylabel("Frequency")
    # Keep the language codes horizontal
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()


In [None]:
# Example usage:
# plot_token_distribution(["regulation", "union", "market", "recht", "union", "protection"])


What you can explore with these two analyses:
Token–Label Co-occurrence:
•	Find tokens that strongly signal a specific legal topic (label).
•	Detect domain-specific terms frequently co-occurring with multi-label groups.
Cross-Language Token Comparison:
•	Discover shared Latin-root legal vocabulary (e.g., regulation, directive, protection).
•	Identify language-specific phrasing or underrepresentation of concepts.



# 2 - Other strategies

In [4]:
from src.label_embedding import run_label_embedding_classification

2025-05-03 09:05:17.217370: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-03 09:05:17.223079: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 09:05:17.233750: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 09:05:17.416622: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746263117.537261  188376 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746263117.62

## Prompt-based classification

## Label-based embedding

In [None]:
# Run the label-embedding classification once per evaluation language
all_results = [
    run_label_embedding_classification(df, top_k=5, batch_size=32, eval_lang=eval_language)
    for eval_language in ("en", "fr", "de", "pl", "fi")
]

# Gather the per-language results into a single DataFrame
final_df = pd.DataFrame(all_results)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFXLMRobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing TFXLMRobertaModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFXLMRobertaModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLMRobertaModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLMRobertaModel for predictions without further training.
  0%|          | 0/157 [00:00<?, ?it/s]