# 0. Instructions and setup

## 0.1. Instructions. Part 3: State of the Art Comparison (2 points)

- **Objective:** Benchmark your model against the state of the art (SOTA), now that the full dataset is available.

- **Tasks:**
  - **a. Full Dataset Training (0.25 points):** Incrementally train your model with varying percentages of the full dataset (1%, 10%, 25%, 50%, 75%, and 100%). Record the results.
  - **b. Learning Curve (0.25 points):** Plot a learning curve based on the training data percentages.
  - **c. Technique Comparison (0.5 points):** Incorporate the techniques tested in Part 2 into your training schema for comparison.
  - **d. Methodology Analysis (1 point):** Analyze and compare all methods employed. Discuss the effectiveness and limitations observed.

## 0.2. Libraries

In [71]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import tensorflow as tf
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.metrics import Recall, Precision
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EarlyStoppingCallback,
    Trainer,
    TrainingArguments,
)

from library.utilities import set_seed, sample_balanced_dataset

In [72]:
# # specific for mac users with M1 chip (That do not have CUDA)
# # !pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
# import torch
# print(torch.backends.mps.is_available())      # Is Metal available?
# print(torch.backends.mps.is_built()) 

In [73]:
# # detect if MPS (GPU in Mac) is available
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# x = torch.ones(3, 3).to(device)
# print(x)

In [74]:
# Set random seed for reproducibility.
# `seed` is also handed to the training helpers further down, so the value is
# defined once and passed to set_seed by name rather than repeated as a literal.
seed = 42
set_seed(seed)

Seed set to 42. This ensures reproducibility of results across runs.


# 1. Full dataset training: Incrementally train your model

In [75]:
# Read the combined Parquet file; train/validation/test rows are told apart
# by the `split` column shown in the schema below.
df = pl.read_parquet(
    'data/FRENCH_swiss_judgment_prediction_combined.parquet'
)

# Report the shape and schema, then preview the first rows as the cell output
print("\nLoaded DataFrame shape:", df.shape)
print("\nLoaded DataFrame schema:", df.schema, sep="\n")
print("\nFirst few rows of the loaded DataFrame:")
df.head()


Loaded DataFrame shape: (31094, 9)

Loaded DataFrame schema:
Schema([('id', Int32), ('year', Int32), ('text', String), ('labels', Int64), ('language', String), ('region', String), ('canton', String), ('legal area', String), ('split', String)])

First few rows of the loaded DataFrame:


id,year,text,labels,language,region,canton,legal area,split
i32,i32,str,i64,str,str,str,str,str
0,2000,"""A.- Par contrat d'entreprise s…",0,"""fr""",,,"""civil law""","""train"""
1,2000,"""A.- Le 12 avril 1995, A._ a su…",0,"""fr""",,,"""insurance law""","""train"""
2,2000,"""A.- En février 1994, M._ a été…",0,"""fr""","""Région lémanique""","""ge""","""insurance law""","""train"""
3,2000,"""A.- M._ a travaillé en qualité…",0,"""fr""",,,"""insurance law""","""train"""
6,2000,"""A.- Le 29 septembre 1997, X._ …",0,"""fr""","""Espace Mittelland""","""ne""","""penal law""","""train"""


In [76]:
# Carve the combined frame into its three partitions using the `split` column.
# The generator is consumed by the tuple unpacking, in train/validation/test order.
train_df, val_df, test_df = (
    df.filter(pl.col('split') == split_name)
    for split_name in ('train', 'validation', 'test')
)

# The combined frame is no longer needed once the partitions exist; drop the
# reference to free memory. (Re-running this cell requires reloading `df` first.)
del df

In [77]:
# # Load the train, validation and test Parquet files
# train_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_train.parquet')
# valid_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_valid.parquet')
# test_df = pl.read_parquet('data/FRENCH_swiss_judgment_prediction_test.parquet')

# # Display the loaded DataFrames
# print("\nTrain DataFrame shape:", train_df.shape)
# print("\nValidation DataFrame shape:", valid_df.shape)
# print("\nTest DataFrame shape:", test_df.shape)

# print("\nTrain DataFrame schema:")
# print(train_df.schema)
# print("\nFirst few rows of the train DataFrame:")
# train_df.head()

In [78]:
model_name = "almanach/camembert-base"  # Path to the pre-trained model
num_labels = 2  # Number of labels for the classification task (in this case, binary classification)

# Maximum length of the input sequences (truncation if larger than this), capped at 512.
# Only the checkpoint's configuration is fetched here: reading
# `max_position_embeddings` does not require downloading or instantiating the
# model weights, which is what AutoModel.from_pretrained would do.
max_length = min(int(AutoConfig.from_pretrained(model_name).max_position_embeddings), 512)

print(f"Model: {model_name}, Max Length: {max_length}")

Model: almanach/camembert-base, Max Length: 512


In [None]:
from library.incremental_train.doc import run_incremental_training, train_with_percentage

# Run the whole incremental schedule in one call; the helper returns a summary
# table that the learning-curve section below plots by its `percentage` column.
summary_df = run_incremental_training(
    train_df=train_df, valid_df=val_df,
    model_name=model_name, max_length=max_length,
    num_labels=num_labels, seed=seed,
)

# Show the collected metrics for every training-set percentage
print("\nFinal summary of results across all percentages:")
display(summary_df)

# Or to train specific percentage only:
# result = train_with_percentage(train_df, val_df, 1, model_name, max_length, num_labels, seed)


Map: 100%|██████████| 7/7 [00:00<00:00, 195.56 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 756.32 examples/s]
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at almanach/camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,,0.9,0.0,0.0,0.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


KeyboardInterrupt: 

# 2.  Learning Curve

In [None]:
import matplotlib.pyplot as plt

def plot_learning_curve(summary_df, metric='eval_accuracy'):
    """
    Draw a single learning curve: `metric` against the share of training data used.

    Args:
        summary_df: Table-like object supporting ``summary_df['percentage']``
            and ``summary_df[metric]`` lookups.
        metric: Name of the column plotted on the y-axis. Underscores are
            replaced by spaces and the result is title-cased for the axis
            label and the figure title.

    The figure is shown immediately; nothing is returned.
    """
    plt.figure(figsize=(8, 5))

    # The same percentages serve as x values and as the tick positions
    train_percentages = summary_df['percentage']
    plt.plot(train_percentages, summary_df[metric], marker='o', label=metric)

    plt.xlabel('Percentage of Training Data Used')
    metric_label = metric.replace('_', ' ').title()
    plt.ylabel(metric_label)
    plt.title(f'Learning Curve: {metric_label} vs. Training Set Size')

    plt.grid(True)
    plt.xticks(train_percentages)
    plt.legend()
    plt.show()

In [None]:
# One figure per metric, in this order: accuracy first, then F1 and loss.
# Any other column of the summary table can be added to the tuple.
for metric_name in ('eval_accuracy', 'eval_f1', 'eval_loss'):
    plot_learning_curve(summary_df, metric=metric_name)

# 3. Technique Comparison

In [None]:
import matplotlib.pyplot as plt
from library.incremental_train.doc import run_incremental_training

# Model names
model_names = [
    "almanach/camembert-base",
    "dascim/juribert-base",
    "google-bert/bert-base-multilingual-cased"
]

# Dictionary to store the results: model name -> summary table of its incremental run
results = {}

# Train each model.
# The loop deliberately uses its own variable name and writes straight into
# `results`, so the notebook-level `model_name` and `summary_df` defined in the
# earlier sections keep their values and those cells stay re-runnable.
# NOTE(review): `max_length` was derived from the CamemBERT configuration in the
# setup cell and is reused for every checkpoint here; confirm the other two
# models accept the same sequence length.
for candidate_name in model_names:
    print(f"\nEntrenando modelo: {candidate_name}")
    results[candidate_name] = run_incremental_training(
        train_df=train_df,
        valid_df=val_df,
        model_name=candidate_name,
        max_length=max_length,
        num_labels=num_labels,
        seed=seed
    )

# Graph the learning curves
def plot_comparison(results, metric='eval_accuracy'):
    """
    Overlay one learning curve per model for `metric` on a single figure.

    Args:
        results: Mapping of model name -> summary table. Each table must
            support ``table['percentage']`` and ``table[metric]`` lookups.
        metric: Name of the column plotted on the y-axis. Underscores are
            replaced by spaces and the result is title-cased for the axis
            label and the figure title.

    Each curve is labelled with the part of the model name after the last '/'.
    X-axis ticks are the sorted union of the percentages found in `results`,
    matching how `plot_learning_curve` picks its ticks; when `results` holds
    no percentages the ticks fall back to 1, 10, 25, 50, 75 and 100.

    The figure is shown immediately; nothing is returned.
    """
    metric_label = metric.replace('_', ' ').title()
    tick_values = set()

    plt.figure(figsize=(8, 5))
    for model_name, summary_df in results.items():
        percentages = summary_df['percentage']
        plt.plot(percentages, summary_df[metric], marker='o', label=model_name.split('/')[-1])
        # Collect every percentage that was actually plotted
        tick_values.update(percentages)

    plt.xlabel('Percentage of Training Data Used')
    plt.ylabel(metric_label)
    plt.title(f'Learning Curve Comparison: {metric_label}')
    plt.grid(True)
    plt.xticks(sorted(tick_values) or [1, 10, 25, 50, 75, 100])
    plt.legend()
    plt.show()

# Compare the models on accuracy and then on F1, one figure per metric
for metric_name in ('eval_accuracy', 'eval_f1'):
    plot_comparison(results, metric=metric_name)

# 4. Methodology Analysis