In [None]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

!pip install -q sentence-transformers datasets accelerate huggingface_hub bitsandbytes
from google.colab import userdata
from huggingface_hub import login
login(token=userdata.get('HF_TOKEN'))

In [None]:
!wget -q https://raw.githubusercontent.com/pierrealexandreguillemin-a11y/pocket_arbiter/main/data/training/triplets_training.jsonl
import json
triplets = [json.loads(l) for l in open("triplets_training.jsonl") if l.strip()]
print(f"Triplets: {len(triplets)}")

In [None]:
import torch
import gc

# Drop unreachable Python objects and return cached CUDA memory before loading
# a model, so that re-running this cell does not stack allocations.
gc.collect()
torch.cuda.empty_cache()

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from datasets import Dataset

# The weights are loaded in the default full precision:
#  * float16 parameters cannot be combined with fp16 mixed precision in the
#    trainer (the gradient scaler refuses to unscale fp16 gradients), and the
#    EmbeddingGemma model card states that float16 activations are unsupported;
#  * 8-bit bitsandbytes weights only support training extra adapter parameters,
#    not the full fine-tuning performed here.
model = SentenceTransformer("google/embeddinggemma-300m-qat-q4_0-unquantized")
print(f"Model: {model.get_sentence_embedding_dimension()} dims")

# Mixed precision is enabled only as bfloat16, and only when the GPU reports
# support for it; otherwise training runs in plain float32.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# NOTE(review): assumes every JSONL row contains only the text columns the loss
# consumes, in (anchor, positive[, negative]) order - confirm against the data file.
train_dataset = Dataset.from_list(triplets)

training_args = SentenceTransformerTrainingArguments(
    output_dir="embeddinggemma-chess-arbiter-fr",
    num_train_epochs=3,
    # One sample per forward pass, 16 accumulated steps per optimizer update.
    # NOTE(review): MultipleNegativesRankingLoss draws negatives from the other
    # rows of the same batch; with a per-device batch size of 1 there are none,
    # and gradient accumulation does not add any. Raise the batch size if
    # memory allows.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=False,
    bf16=use_bf16,
    logging_steps=50,
    save_strategy="epoch",
    report_to="none",
    gradient_checkpointing=True,
    optim="adafactor",
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=MultipleNegativesRankingLoss(model),
)
trainer.train()

# Save the fine-tuned model into the same directory the trainer writes its
# per-epoch checkpoints to.
model.save("embeddinggemma-chess-arbiter-fr")
print("OK!")

In [None]:
import shutil
from google.colab import files

# Pack the contents of the saved model directory into
# "embeddinggemma-chess-arbiter-fr.zip" in the current working directory.
shutil.make_archive(
    base_name="embeddinggemma-chess-arbiter-fr",
    format="zip",
    root_dir="embeddinggemma-chess-arbiter-fr",
)

# Pass the archive to Colab's file-download helper.
files.download("embeddinggemma-chess-arbiter-fr.zip")