In [None]:
# Installation
!pip install -q "sentence-transformers[train]" datasets accelerate huggingface_hub
!pip install -q git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview

# Login HuggingFace (Kaggle secrets)
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

secrets = UserSecretsClient()
hf_token = secrets.get_secret("HF_TOKEN")
login(token=hf_token)
print("HuggingFace login OK")

In [None]:
# Télécharger les données d'entraînement
!wget -q https://raw.githubusercontent.com/pierrealexandreguillemin-a11y/pocket_arbiter/main/data/training/triplets_training.jsonl

import json
triplets = [json.loads(l) for l in open("triplets_training.jsonl") if l.strip()]
print(f"Triplets chargés: {len(triplets)}")

In [None]:
import torch
import gc
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from datasets import Dataset

# Free memory left over from earlier runs: collect unreachable Python objects
# first, then release the CUDA allocator's cached blocks.
gc.collect()
torch.cuda.empty_cache()

# Report which device will be used
has_cuda = torch.cuda.is_available()
device = "cuda" if has_cuda else "cpu"
print(f"Device: {device}")
if has_cuda:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    total_memory_bytes = torch.cuda.get_device_properties(0).total_memory
    vram = total_memory_bytes / 1e9  # bytes -> GB (decimal)
    print(f"VRAM: {vram:.1f} GB")

# Load EmbeddingGemma on the selected device
model = SentenceTransformer("google/embeddinggemma-300M", device=device)
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model loaded: {embedding_dim} dims")

# Turn on gradient checkpointing on the underlying transformer
# (first module of the SentenceTransformer pipeline).
transformer_module = model[0]
transformer_module.auto_model.gradient_checkpointing_enable()
print("Gradient checkpointing enabled")

In [None]:
# Training configuration tuned for a Kaggle GPU (P100/T4).
# Kaggle provides more system RAM, hence a bit more headroom.

# Hyper-parameters shared by the training arguments and the step estimate
# printed at the end of this cell (single source of truth).
NUM_EPOCHS = 3
PER_DEVICE_BATCH_SIZE = 2   # Kaggle can handle 2
GRAD_ACCUM_STEPS = 8        # 2 x 8 = 16 samples per optimizer step

# Input columns, in the order the loss expects them.
TRIPLET_COLUMNS = ("anchor", "positive", "negative")

# sentence-transformers feeds dataset columns to the loss by position, not by
# name, and treats every non-label column as an input. Projecting each record
# onto the three expected keys fixes the column order and drops any extra
# field present in the JSONL. A record missing one of the keys raises KeyError.
train_dataset = Dataset.from_list(
    [{column: triplet[column] for column in TRIPLET_COLUMNS} for triplet in triplets]
)

training_args = SentenceTransformerTrainingArguments(
    output_dir="embeddinggemma-chess-arbiter-fr",
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=PER_DEVICE_BATCH_SIZE,
    # NOTE(review): accumulation enlarges the optimizer batch only. The
    # in-batch negatives used by MultipleNegativesRankingLoss still come from
    # the per-device batch of PER_DEVICE_BATCH_SIZE samples.
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,                 # EmbeddingGemma does not support fp16
    bf16=False,
    logging_steps=50,
    save_strategy="epoch",      # one checkpoint-* folder per epoch in output_dir
    report_to="none",
    dataloader_drop_last=True,
    optim="adamw_torch_fused",
    dataloader_num_workers=2,   # Kaggle has more CPU cores
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=MultipleNegativesRankingLoss(model),
)

print("Démarrage training...")
# Rough estimate for a single device; the Trainer's own step count may differ
# slightly (rounding, multi-GPU).
steps_per_epoch = len(triplets) // PER_DEVICE_BATCH_SIZE // GRAD_ACCUM_STEPS
print(f"Steps par epoch: {steps_per_epoch}")
print(f"Total steps: {steps_per_epoch * NUM_EPOCHS}")

In [None]:
# Run the fine-tuning, then write the final model to disk
trainer.train()

final_model_dir = "embeddinggemma-chess-arbiter-fr"
model.save(final_model_dir)
print("Training terminé!")

In [None]:
# Quick evaluation (ISO 42001): for each sampled triplet, check that the anchor
# is closer (cosine similarity) to its positive than to its negative.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import random

EVAL_SAMPLE_SIZE = 10
EVAL_SEED = 42        # fixed seed so the same triplets are drawn on every run
PASS_THRESHOLD = 80   # minimum accuracy, in percent

# NOTE(review): the samples are drawn from the triplets used for training, so
# this is a sanity check rather than a held-out quality measure.
test_samples = random.Random(EVAL_SEED).sample(
    triplets, min(EVAL_SAMPLE_SIZE, len(triplets))
)
# Fail with a clear message instead of a ZeroDivisionError further down
if not test_samples:
    raise ValueError("Aucun triplet disponible pour l'évaluation")

finetuned = SentenceTransformer("embeddinggemma-chess-arbiter-fr")

print("=" * 60)
print("ÉVALUATION QUALITÉ (ISO 42001 - Recall)")
print("=" * 60)

# Encode each column in one batched call instead of one call per string
anchor_embs = finetuned.encode([t["anchor"] for t in test_samples])
positive_embs = finetuned.encode([t["positive"] for t in test_samples])
negative_embs = finetuned.encode([t["negative"] for t in test_samples])

correct = 0
for i, (q_emb, pos_emb, neg_emb) in enumerate(
    zip(anchor_embs, positive_embs, negative_embs)
):
    sim_pos = cos_sim(q_emb, pos_emb).item()
    sim_neg = cos_sim(q_emb, neg_emb).item()

    is_correct = sim_pos > sim_neg
    correct += int(is_correct)

    status = "OK" if is_correct else "FAIL"
    print(f"[{i+1}] {status} | pos={sim_pos:.3f} neg={sim_neg:.3f}")

accuracy = correct / len(test_samples) * 100
print("=" * 60)
print(f"RÉSULTAT: {correct}/{len(test_samples)} = {accuracy:.0f}%")
print(f"Cible: >={PASS_THRESHOLD}% → {'PASS' if accuracy >= PASS_THRESHOLD else 'FAIL'}")

In [None]:
# Export the fine-tuned model as a zip archive for download
import os
import shutil  # no longer used in this cell; kept in case a later cell relies on it
import zipfile

MODEL_DIR = "embeddinggemma-chess-arbiter-fr"
ARCHIVE_PATH = f"{MODEL_DIR}.zip"

if not os.path.isdir(MODEL_DIR):
    raise FileNotFoundError(f"Dossier modèle introuvable: {MODEL_DIR}")

# MODEL_DIR is also the Trainer's output_dir, so besides the final model it
# holds one `checkpoint-*` folder per saved epoch (weights + optimizer state).
# Only the final model files are archived; the checkpoints are skipped.
with zipfile.ZipFile(ARCHIVE_PATH, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    for current_dir, subdirs, filenames in os.walk(MODEL_DIR):
        # Prune in place so os.walk never descends into top-level checkpoints
        subdirs[:] = sorted(
            name
            for name in subdirs
            if not (current_dir == MODEL_DIR and name.startswith("checkpoint-"))
        )
        for filename in sorted(filenames):
            file_path = os.path.join(current_dir, filename)
            # Store paths relative to MODEL_DIR so the model sits at the zip root
            archive.write(file_path, os.path.relpath(file_path, MODEL_DIR))

# On Kaggle the file ends up in /kaggle/working/
print("Modèle exporté: embeddinggemma-chess-arbiter-fr.zip")
print("Télécharge depuis l'onglet 'Output' à droite →")