In [None]:
# Installation avec version transformers recommandée par Google
!pip install -q "sentence-transformers[train]" datasets accelerate huggingface_hub
!pip install -q git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview

from google.colab import userdata
from huggingface_hub import login
login(token=userdata.get('HF_TOKEN'))

In [None]:
!wget -q https://raw.githubusercontent.com/pierrealexandreguillemin-a11y/pocket_arbiter/main/data/training/triplets_training.jsonl
import json
triplets = [json.loads(l) for l in open("triplets_training.jsonl") if l.strip()]
print(f"Triplets: {len(triplets)}")

In [None]:
import torch
from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from datasets import Dataset

# Fine-tune EmbeddingGemma on the `triplets` list loaded by the previous cell and
# save the resulting model to "embeddinggemma-chess-arbiter-fr".

# Check for a GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Device: {device}")
if device == "cuda":
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Load the EmbeddingGemma 300M model
model = SentenceTransformer("google/embeddinggemma-300M", device=device)
print(f"Model: {model.get_sentence_embedding_dimension()} dims")

# Configuration tuned for a T4 with 16 GB
# IMPORTANT (original author's note): EmbeddingGemma does NOT support fp16 (see Google docs)
# batch_size=4 with gradient_accumulation=4 gives an effective optimizer batch of 16.
# NOTE: MultipleNegativesRankingLoss draws its in-batch negatives from each
# per-device batch (4 here); gradient accumulation does not enlarge that pool.
# NOTE(review): the loss reads dataset columns positionally, so this presumes every
# row's keys are ordered anchor, positive, negative and nothing else — confirm
# against the JSONL schema.
trainer = SentenceTransformerTrainer(
    model=model,
    args=SentenceTransformerTrainingArguments(
        output_dir="embeddinggemma-chess-arbiter-fr",
        num_train_epochs=3,
        per_device_train_batch_size=4,           # Reduced for T4 (original note: Google recommends 1)
        gradient_accumulation_steps=4,            # Effective optimizer batch = 16
        learning_rate=2e-5,
        warmup_ratio=0.1,
        fp16=False,                               # CRITICAL: EmbeddingGemma does not support fp16!
        bf16=False,                               # No bf16 either on a T4
        batch_sampler=BatchSamplers.NO_DUPLICATES,  # A duplicated text in a batch would act as a false negative
        logging_steps=50,
        save_strategy="epoch",
        report_to="none",
        dataloader_drop_last=True,                # Avoids an incomplete final batch
    ),
    train_dataset=Dataset.from_list(triplets),
    loss=MultipleNegativesRankingLoss(model)
)

print("Training...")
trainer.train()
# Persist the final weights at the top level of the output directory, next to the
# per-epoch checkpoints the trainer writes there.
model.save("embeddinggemma-chess-arbiter-fr")
print("Training terminé!")

In [None]:
# Quick sanity check of the fine-tuned model (ISO 42001 compliance): for a few
# triplets, verify the anchor is closer to its positive than to its negative.
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import random

EVAL_SAMPLE_SIZE = 5        # number of triplets to spot-check
EVAL_SEED = 42              # fixed seed so a re-run reports the same score
TARGET_ACCURACY_PCT = 80    # pass threshold printed in the summary

# Load the fine-tuned model from disk
finetuned = SentenceTransformer("embeddinggemma-chess-arbiter-fr")

# NOTE(review): the samples are drawn from `triplets`, the same list the model was
# trained on, so this score is optimistic; use a held-out split for a real estimate.
# A dedicated Random instance keeps the draw reproducible without touching the
# global random state.
test_samples = random.Random(EVAL_SEED).sample(triplets, min(EVAL_SAMPLE_SIZE, len(triplets)))
if not test_samples:
    # Fail with a clear message instead of a ZeroDivisionError in the summary below.
    raise ValueError("No triplets available for evaluation")

print("=" * 60)
print("ÉVALUATION QUALITÉ (ISO 42001 - AI-R03: Recall)")
print("=" * 60)

correct = 0
for i, t in enumerate(test_samples):
    q_emb = finetuned.encode(t["anchor"])
    pos_emb = finetuned.encode(t["positive"])
    neg_emb = finetuned.encode(t["negative"])

    sim_pos = cos_sim(q_emb, pos_emb).item()
    sim_neg = cos_sim(q_emb, neg_emb).item()

    # A triplet counts as correct when the positive outranks the negative.
    is_correct = sim_pos > sim_neg
    correct += int(is_correct)

    print(f"\n[{i+1}] Question: {t['anchor'][:60]}...")
    print(f"    Sim(positive): {sim_pos:.4f}")
    print(f"    Sim(negative): {sim_neg:.4f}")
    print(f"    {'✅ CORRECT' if is_correct else '❌ INCORRECT'}")

accuracy = correct / len(test_samples) * 100
print("\n" + "=" * 60)
print(f"RÉSULTAT: {correct}/{len(test_samples)} = {accuracy:.0f}%")
print(f"Cible ISO: ≥80%  →  {'✅ CONFORME' if accuracy >= TARGET_ACCURACY_PCT else '⚠️ À AMÉLIORER'}")
print("=" * 60)

In [None]:
import shutil
import tempfile
from pathlib import Path
from google.colab import files

# Zip the fine-tuned model and trigger a browser download.
MODEL_DIR = "embeddinggemma-chess-arbiter-fr"

# The model directory doubles as the trainer's output_dir, so it also holds the
# per-epoch checkpoint-* folders. Stage a copy without them so the archive only
# contains the final model rather than every intermediate checkpoint.
with tempfile.TemporaryDirectory() as staging_root:
    staged_model = Path(staging_root) / MODEL_DIR
    shutil.copytree(MODEL_DIR, staged_model, ignore=shutil.ignore_patterns("checkpoint-*"))
    # Entries are stored relative to the model directory, as in a plain make_archive call.
    shutil.make_archive(MODEL_DIR, "zip", staged_model)
files.download("embeddinggemma-chess-arbiter-fr.zip")