# 0. Instructions and setup

## 0.1. Instructions. Part 3: State of the Art Comparison (2 points)

- **Objective:** Benchmark your model against the SOA with the full dataset now available.

- **Tasks:**
  - **a. Full Dataset Training (0.25 points):** Incrementally train your model with varying percentages of the full dataset (1%, 10%, 25%, 50%, 75%, and 100%). Record the results.
  - **b. Learning Curve (0.25 points):** Plot a learning curve based on the training data percentages.
  - **c. Technique Comparison (0.5 points):** Incorporate the techniques tested in Part 2 into your training schema for comparison.
  - **d. Methodology Analysis (1 point):** Analyze and compare all methods employed. Discuss the effectiveness and limitations observed.

## 0.2. Libraries

In [1]:
import numpy as np
import polars as pl
from library.utilities import set_seed, sample_balanced_dataset

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.metrics import Recall, Precision
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)
from library.incremental_train.doc import run_incremental_training, train_with_percentage

✓ SpaCy model 'fr_core_news_sm' loaded successfully
Downloading WordNet...
Downloading Open Multilingual WordNet...


[nltk_data] Downloading package wordnet to /home/pablo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/pablo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
2025-06-15 19:32:59.895636: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-15 19:32:59.904233: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750008779.915095  419911 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750008779.918116  419911 cuda_blas.cc:1407] Unable to register cuB

In [72]:
# # specific for mac users with M1 chip (That do not have CUDA)
# # !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
# import torch
# print(torch.backends.mps.is_available())      # Is Metal available?
# print(torch.backends.mps.is_built()) 

In [73]:
# # detect if MPS (GPU in Mac) is available
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# x = torch.ones(3, 3).to(device)
# print(x)

In [2]:
# Define the random seed once and pass that same variable to set_seed, so that
# changing `seed` here changes both the global seeding and every later call
# that receives `seed` as an argument.
seed = 42
set_seed(seed)

Seed set to 42. This ensures reproducibility of results across runs.


# 1. Full dataset training: Incrementally train your model

In [3]:
# Read the cleaned, combined dataset (one Parquet file with a `split` column)
df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_combined.parquet')

# Report the shape and schema, then preview the first rows as the cell output
print("\nLoaded DataFrame shape:", df.shape)
print("\nLoaded DataFrame schema:", df.schema, sep="\n")
print("\nFirst few rows of the loaded DataFrame:")
df.head()


Loaded DataFrame shape: (31094, 9)

Loaded DataFrame schema:
Schema({'id': Int32, 'year': Int32, 'text': String, 'labels': Int64, 'language': String, 'region': String, 'canton': String, 'legal area': String, 'split': String})

First few rows of the loaded DataFrame:


id,year,text,labels,language,region,canton,legal area,split
i32,i32,str,i64,str,str,str,str,str
0,2000,"""A.- Par contrat d'entreprise s…",0,"""fr""",,,"""civil law""","""train"""
1,2000,"""A.- Le 12 avril 1995, A._ a su…",0,"""fr""",,,"""insurance law""","""train"""
2,2000,"""A.- En février 1994, M._ a été…",0,"""fr""","""Région lémanique""","""ge""","""insurance law""","""train"""
3,2000,"""A.- M._ a travaillé en qualité…",0,"""fr""",,,"""insurance law""","""train"""
6,2000,"""A.- Le 29 septembre 1997, X._ …",0,"""fr""","""Espace Mittelland""","""ne""","""penal law""","""train"""


In [4]:
# Partition the combined frame by the value of its `split` column:
# one frame each for training, validation and test.
train_df, val_df, test_df = (
    df.filter(pl.col('split') == split_name)
    for split_name in ('train', 'validation', 'test')
)

# The combined frame is no longer needed; drop the reference to free memory.
# (Re-running this cell therefore requires re-running the loading cell first.)
del df

In [5]:
# # Load the train, validation and test Parquet files
# train_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_train.parquet')
# valid_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_valid.parquet')
# test_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_test.parquet')

# # Display the loaded DataFrames
# print("\nTrain DataFrame shape:", train_df.shape)
# print("\nValidation DataFrame shape:", valid_df.shape)
# print("\nTest DataFrame shape:", test_df.shape)

# print("\nTrain DataFrame schema:")
# print(train_df.schema)
# print("\nFirst few rows of the train DataFrame:")
# train_df.head()

In [5]:
model_name = "almanach/camembert-base"  # Hub identifier of the pre-trained model
num_labels = 2  # Number of labels for the classification task (binary classification)

# Maximum input length in tokens (longer inputs are truncated). It is read from the
# checkpoint's configuration and capped at 512. Only the configuration is fetched
# (AutoConfig) instead of instantiating the whole model (AutoModel), which would
# load all weights into memory just to read a single integer.
max_length = min(int(AutoConfig.from_pretrained(model_name).max_position_embeddings), 512)

print(f"Model: {model_name}, Max Length: {max_length}")

Model: almanach/camembert-base, Max Length: 512


In [None]:
# Single run using 1% of the training split (third positional argument),
# evaluated against the validation split; the returned result is kept in `result1`.
result1 = train_with_percentage(train_df, val_df, 1, model_name, max_length, num_labels, seed)

In [None]:
# Single run using 10% of the training split (third positional argument),
# evaluated against the validation split; the returned result is kept in `result10`.
result10 = train_with_percentage(train_df, val_df, 10, model_name, max_length, num_labels, seed)

In [None]:
# Single run using 25% of the training split; the percentage argument must
# match the variable name so the recorded result really is the 25% run.
result25 = train_with_percentage(train_df, val_df, 25, model_name, max_length, num_labels, seed)

In [None]:
# Note that, due to the class imbalance, we will have to do sampling with replacement
# to get enough samples for the minority class.
# NOTE(review): the recorded incremental run in this notebook stops at 50% with a
# ShapeError about `with_replacement=false`; presumably the library's balanced
# sampling must enable replacement before this cell can succeed - confirm in `library`.
result50 = train_with_percentage(train_df, val_df, 50, model_name, max_length, num_labels, seed)

In [None]:
# Single run using 75% of the training split; the percentage argument must
# match the variable name so the recorded result really is the 75% run.
result75 = train_with_percentage(train_df, val_df, 75, model_name, max_length, num_labels, seed)

In [None]:
# Single run using the full (100%) training split. Stored under its own name so
# it does not overwrite the 10% result held in `result10`.
result100 = train_with_percentage(train_df, val_df, 100, model_name, max_length, num_labels, seed)

In [6]:
# Train all the percentages incrementally with a single library call and collect
# the per-percentage results in `summary_df`.
# NOTE(review): in the recorded output below, the 1%, 10% and 25% runs complete but
# the 50% run aborts with
#   ShapeError: cannot take a larger sample than the total population when `with_replacement=false`
# so `summary_df` is never assigned in that run. The sampling lives in the library,
# not in this cell - presumably it needs replacement enabled; confirm there.
summary_df = run_incremental_training(
    train_df=train_df,
    valid_df=val_df,  
    model_name=model_name,
    max_length=max_length,
    num_labels=num_labels,
    seed=seed
)

# Show the aggregated results using the notebook's rich display
print("\nFinal summary of results across all percentages:")
display(summary_df)


# Or to train specific percentage only:
# result = train_with_percentage(train_df, val_df, 1, model_name, max_length, num_labels, seed)


Training with 1% of the data...


  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/210 [00:00<?, ? examples/s]

Map:   0%|          | 0/3095 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.673184,0.787399,0.0,0.0,0.0
2,No log,0.70154,0.317932,0.351459,0.219493,0.881356
3,No log,0.697207,0.438449,0.340167,0.225693,0.690293
4,No log,0.722294,0.253958,0.347924,0.213001,0.949153
5,No log,0.71787,0.312439,0.348039,0.217208,0.875193
6,No log,0.673108,0.654281,0.24435,0.225554,0.266564
7,No log,0.672467,0.634249,0.272494,0.233738,0.326656
8,0.681200,0.657576,0.685299,0.223285,0.231405,0.215716
9,0.681200,0.677178,0.59063,0.304992,0.236797,0.428351
10,0.681200,0.699906,0.494669,0.328179,0.227516,0.588598



Best metrics for 1% of data:
shape: (1, 18)
┌───────────┬───────────┬──────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ eval_loss ┆ eval_accu ┆ eval_f1  ┆ eval_prec ┆ … ┆ train_sam ┆ train_ste ┆ total_flo ┆ train_los │
│ ---       ┆ racy      ┆ ---      ┆ ision     ┆   ┆ ples_per_ ┆ ps_per_se ┆ s         ┆ s         │
│ f64       ┆ ---       ┆ f64      ┆ ---       ┆   ┆ second    ┆ cond      ┆ ---       ┆ ---       │
│           ┆ f64       ┆          ┆ f64       ┆   ┆ ---       ┆ ---       ┆ f64       ┆ f64       │
│           ┆           ┆          ┆           ┆   ┆ f64       ┆ f64       ┆           ┆           │
╞═══════════╪═══════════╪══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 0.632334  ┆ 0.670436  ┆ 0.234234 ┆ 0.228404  ┆ … ┆ null      ┆ null      ┆ null      ┆ null      │
└───────────┴───────────┴──────────┴───────────┴───┴───────────┴───────────┴───────────┴───────────┘

Training with 10% of the data...


  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/2116 [00:00<?, ? examples/s]

Map:   0%|          | 0/3095 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6955,0.704335,0.394184,0.367195,0.235091,0.838213
2,0.6819,0.656064,0.666882,0.263045,0.245333,0.283513
3,0.669,0.625096,0.644911,0.371641,0.295455,0.50077
4,0.6135,0.57981,0.707593,0.386441,0.345036,0.439137
5,0.5855,0.626186,0.673021,0.396901,0.323615,0.513097
6,0.4281,0.740511,0.620355,0.400816,0.299543,0.605547
7,0.3414,1.015419,0.518578,0.392333,0.266778,0.74114
8,0.2738,1.218639,0.498546,0.386076,0.259713,0.751926
9,0.1991,1.066191,0.583845,0.392453,0.282801,0.640986



Best metrics for 10% of data:
shape: (1, 18)
┌──────┬───────────┬─────────────┬───────┬───┬─────────────┬─────────────┬────────────┬────────────┐
│ loss ┆ grad_norm ┆ learning_ra ┆ epoch ┆ … ┆ train_sampl ┆ train_steps ┆ total_flos ┆ train_loss │
│ ---  ┆ ---       ┆ te          ┆ ---   ┆   ┆ es_per_seco ┆ _per_second ┆ ---        ┆ ---        │
│ f64  ┆ f64       ┆ ---         ┆ f64   ┆   ┆ nd          ┆ ---         ┆ f64        ┆ f64        │
│      ┆           ┆ f64         ┆       ┆   ┆ ---         ┆ f64         ┆            ┆            │
│      ┆           ┆             ┆       ┆   ┆ f64         ┆             ┆            ┆            │
╞══════╪═══════════╪═════════════╪═══════╪═══╪═════════════╪═════════════╪════════════╪════════════╡
│ null ┆ null      ┆ null        ┆ 4.0   ┆ … ┆ null        ┆ null        ┆ null       ┆ null       │
└──────┴───────────┴─────────────┴───────┴───┴─────────────┴─────────────┴────────────┴────────────┘

Training with 25% of the data...


  obj.co_lnotab,  # for < python 3.10 [not counted in args]


Map:   0%|          | 0/5294 [00:00<?, ? examples/s]

Map:   0%|          | 0/3095 [00:00<?, ? examples/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6838,0.693649,0.556704,0.326791,0.239741,0.513097
2,0.6611,0.735433,0.488853,0.379608,0.254603,0.745763
3,0.6012,0.577443,0.685299,0.383544,0.325456,0.466872
4,0.5242,0.627165,0.67916,0.391917,0.325203,0.493066
5,0.4976,0.696368,0.66559,0.399304,0.320298,0.530046
6,0.3803,0.82221,0.631987,0.402727,0.305246,0.59168
7,0.3189,0.849376,0.658158,0.370238,0.301649,0.479199
8,0.2658,0.988017,0.652342,0.401557,0.314186,0.55624



Best metrics for 25% of data:
shape: (1, 18)
┌──────┬───────────┬─────────────┬───────┬───┬─────────────┬─────────────┬────────────┬────────────┐
│ loss ┆ grad_norm ┆ learning_ra ┆ epoch ┆ … ┆ train_sampl ┆ train_steps ┆ total_flos ┆ train_loss │
│ ---  ┆ ---       ┆ te          ┆ ---   ┆   ┆ es_per_seco ┆ _per_second ┆ ---        ┆ ---        │
│ f64  ┆ f64       ┆ ---         ┆ f64   ┆   ┆ nd          ┆ ---         ┆ f64        ┆ f64        │
│      ┆           ┆ f64         ┆       ┆   ┆ ---         ┆ f64         ┆            ┆            │
│      ┆           ┆             ┆       ┆   ┆ f64         ┆             ┆            ┆            │
╞══════╪═══════════╪═════════════╪═══════╪═══╪═════════════╪═════════════╪════════════╪════════════╡
│ null ┆ null      ┆ null        ┆ 3.0   ┆ … ┆ null        ┆ null        ┆ null       ┆ null       │
└──────┴───────────┴─────────────┴───────┴───┴─────────────┴─────────────┴────────────┴────────────┘

Training with 50% of the data...


ShapeError: cannot take a larger sample than the total population when `with_replacement=false`

# 2.  Learning Curve

In [None]:
import matplotlib.pyplot as plt

def plot_learning_curve(summary_df, metric='eval_accuracy'):
    """
    Draw one learning curve: the chosen metric against the training-data percentage.

    Args:
        summary_df: Table indexable by column name that provides a 'percentage'
            column and a column named after `metric`.
        metric: Name of the column to plot on the y-axis; also used (with
            underscores replaced by spaces and title-cased) for the axis label
            and the figure title.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    plt.figure(figsize=(8, 5))

    # Look the x-values up once; they double as the tick positions.
    percentages = summary_df['percentage']
    pretty_metric = metric.replace('_', ' ').title()

    plt.plot(percentages, summary_df[metric], marker='o', label=metric)
    plt.xticks(percentages)

    plt.title(f'Learning Curve: {pretty_metric} vs. Training Set Size')
    plt.xlabel('Percentage of Training Data Used')
    plt.ylabel(pretty_metric)

    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
# Draw one learning curve per metric, in order: accuracy first, then F1 and loss.
for curve_metric in ('eval_accuracy', 'eval_f1', 'eval_loss'):
    plot_learning_curve(summary_df, metric=curve_metric)

# 3. Technique Comparison

In [None]:
import matplotlib.pyplot as plt
from library.incremental_train.doc import run_incremental_training

# Pre-trained checkpoints to compare against each other
model_names = [
    "almanach/camembert-base",
    "dascim/juribert-base",
    "google-bert/bert-base-multilingual-cased"
]

# Maps each checkpoint name to the summary returned by its incremental-training run
results = {}

# Run the incremental-training schedule once per checkpoint.
# Dedicated loop names are used on purpose: reusing `model_name` / `summary_df`
# here would silently overwrite the notebook-level variables set in section 1
# (and read by the learning-curve cells), making results depend on cell order.
# NOTE(review): `max_length` was derived from the section-1 checkpoint's
# configuration; confirm it is also valid for the other checkpoints.
for candidate_name in model_names:
    print(f"\nEntrenando modelo: {candidate_name}")
    results[candidate_name] = run_incremental_training(
        train_df=train_df,
        valid_df=val_df,
        model_name=candidate_name,
        max_length=max_length,
        num_labels=num_labels,
        seed=seed
    )

# Overlay the learning curves of all compared models
def plot_comparison(results, metric='eval_accuracy'):
    """
    Plot the chosen metric against training-data percentage, one line per model.

    Args:
        results: Mapping from model name to a table that provides a 'percentage'
            column and a column named after `metric`.
        metric: Name of the column to plot on the y-axis; also used (with
            underscores replaced by spaces and title-cased) for the axis label
            and the figure title.

    Returns:
        None. The figure is shown with `plt.show()`.
    """
    plt.figure(figsize=(8, 5))

    for full_name, curve_df in results.items():
        # Label each line with the part of the name after the last '/'
        short_name = full_name.rpartition('/')[2]
        plt.plot(curve_df['percentage'], curve_df[metric], marker='o', label=short_name)

    metric_title = metric.replace('_', ' ').title()
    plt.title(f'Learning Curve Comparison: {metric_title}')
    plt.xlabel('Percentage of Training Data Used')
    plt.ylabel(metric_title)

    plt.xticks([1, 10, 25, 50, 75, 100])
    plt.grid(True)
    plt.legend()
    plt.show()

# Compare the models on accuracy first, then on F1
for comparison_metric in ('eval_accuracy', 'eval_f1'):
    plot_comparison(results, metric=comparison_metric)

# 4. Methodology Analysis