# Fine-tune BGE-M3 for Arabic Islamic Texts (Memory-Optimized)

This notebook fine-tunes BAAI/bge-m3 on Quran and Hadith data for improved Arabic Islamic text search.

## Memory-Optimized Strategy
- **CachedMultipleNegativesRankingLoss** - properly handles hard negatives
- **Gradient accumulation** (batch 4 × 8 = 32 effective)
- **Sequence length limit** (256 tokens) for memory efficiency
- **3 hard negatives per query** (similarity 0.65-0.85 zone)
- **Target metrics**: Precision@5 > 0.85, MRR > 0.80

## Instructions
1. Go to **Runtime → Change runtime type → Select H100 GPU**
2. Upload your training files when prompted:
   - `combined_training.jsonl` (required - 161MB with hard negatives)
   - `gold_standard_evaluation.jsonl` (recommended)
3. Run all cells
4. Download the fine-tuned model at the end

## Verification Checklist
- [ ] GPU memory should stay under 30GB (check after first training step)
- [ ] Initial loss should be ~2.5-4.0 (NOT 0.000000)
- [ ] Loss should decrease over training

**Estimated time**: ~30-45 minutes on H100

In [None]:
# Install required packages
!pip install -q sentence-transformers>=3.4.0 datasets accelerate

In [None]:
# Check GPU availability
import torch

# Report the runtime and pick the device used by the rest of the notebook.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

if DEVICE == "cpu":
    print("WARNING: No GPU detected. Training will be slow.")
else:
    # Device 0 is the only GPU this notebook looks at.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Upload training data file
from google.colab import files

# Ask for the data files, then record which of the expected ones arrived.
print("Please upload your training files:")
print("=" * 50)
print("\nREQUIRED:")
print("  - combined_training.jsonl (training data with hard negatives)")
print("\nRECOMMENDED:")
print("  - gold_standard_evaluation.jsonl (200 curated test queries)")
print("\nThese are in: training/data/")
print("\nYou can select multiple files at once.")

uploaded = files.upload()

# Check what was uploaded (later cells look files up in `uploaded` by name)
has_eval_file = 'gold_standard_evaluation.jsonl' in uploaded
has_training_file = 'combined_training.jsonl' in uploaded

if has_training_file:
    print("\n✓ Training data uploaded")
else:
    print("\n⚠ Warning: combined_training.jsonl not found")

# The evaluation file is only read by the gold standard evaluation cell that
# runs after training; it does not influence training or checkpointing.
if has_eval_file:
    print("✓ Gold standard evaluation set uploaded (used by the evaluation cell after training)")
else:
    print("⚠ No evaluation file - the gold standard evaluation cell will be skipped")

In [None]:
# Load training data
import json
from sentence_transformers import InputExample

# Configuration - Keep all 3 hard negatives for best quality
MAX_HARD_NEGATIVES = 3


def _as_text_list(value):
    """Return *value* as a list of non-empty strings.

    A bare string becomes a one-item list (otherwise iterating or slicing it
    would yield single characters); ``None`` becomes an empty list; entries
    that are not strings or are blank are dropped.
    """
    if isinstance(value, str):
        value = [value]
    return [text for text in (value or []) if isinstance(text, str) and text.strip()]


def load_jsonl(filename, max_negatives=3):
    """Load training examples from a JSONL file.

    Each non-blank line must be a JSON object with a ``query`` field, a
    ``pos`` field and optionally a ``neg`` field. ``pos`` and ``neg`` may be
    either a list of strings or a single string. Records without a query or
    without any usable positive are skipped.

    One ``InputExample`` is produced per positive:
    ``[query, positive]`` followed by at most *max_negatives* hard negatives
    when the record has any.

    Args:
        filename: Path of the JSONL file to read (UTF-8).
        max_negatives: Maximum number of hard negatives kept per example.

    Returns:
        ``(examples, has_negatives)`` where *has_negatives* is True when at
        least one record that produced examples carried hard negatives.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    examples = []
    has_negatives = False

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            data = json.loads(line)
            query = data.get('query', '')
            positives = _as_text_list(data.get('pos', []))
            negatives = _as_text_list(data.get('neg', []))

            if not query or not positives:
                continue

            # With hard negatives: [query, positive, neg1, neg2, neg3]
            # Without negatives:   [query, positive]
            neg_subset = negatives[:max_negatives]
            if negatives:
                has_negatives = True

            examples.extend(
                InputExample(texts=[query, pos] + neg_subset) for pos in positives
            )

    return examples, has_negatives

# Load all uploaded files
train_examples = []
has_hard_negatives = False

for filename in uploaded:
    # Only JSONL training files; the gold standard evaluation set is not training data.
    if not filename.endswith('.jsonl') or 'evaluation' in filename:
        continue
    print(f"Loading {filename}...")
    examples, has_neg = load_jsonl(filename, MAX_HARD_NEGATIVES)
    train_examples.extend(examples)
    has_hard_negatives = has_hard_negatives or has_neg
    print(f"  Loaded {len(examples)} examples")

print(f"\nTotal training examples: {len(train_examples)}")
print(f"Hard negatives present: {has_hard_negatives}")
if not train_examples:
    print("WARNING: no training examples were loaded - upload combined_training.jsonl and re-run.")
elif has_hard_negatives:
    # Report the largest example actually loaded instead of assuming every
    # record carries MAX_HARD_NEGATIVES negatives.
    n_texts = max(len(ex.texts) for ex in train_examples)
    print(f"Texts per example: {n_texts} (query + pos + {n_texts - 2} neg)")

In [None]:
# Load BGE-M3 model
import gc
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
import random

# Clear any existing memory
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

MODEL_NAME = 'BAAI/bge-m3'

# Memory optimization: limit sequence length
MAX_SEQ_LENGTH = 256  # Reduces memory significantly (default is often 512+)

print(f"Loading {MODEL_NAME} model...")
model = SentenceTransformer(MODEL_NAME, device=DEVICE)

# Apply sequence length limit for memory optimization
model.max_seq_length = MAX_SEQ_LENGTH

print(f"Model loaded on: {model.device}")
print(f"Embedding dimension: {model.get_sentence_embedding_dimension()}")
print(f"Max sequence length: {model.max_seq_length}")
# The memory report queries CUDA device 0, so it is skipped on the CPU
# fallback (DEVICE == "cpu"), where get_device_properties(0) would raise.
if torch.cuda.is_available():
    print(f"GPU memory used: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"GPU memory available: {(torch.cuda.get_device_properties(0).total_memory - torch.cuda.memory_allocated()) / 1e9:.1f} GB")

In [None]:
# Training configuration (OPTIMIZED FOR H100 80GB - MEMORY-SAFE)
from sentence_transformers.losses import CachedMultipleNegativesRankingLoss

# Memory-optimized settings with gradient accumulation
BATCH_SIZE = 4                    # Small batch for memory safety
GRADIENT_ACCUMULATION_STEPS = 8   # Gradients accumulated over 4 x 8 = 32 examples
# NOTE(review): MINI_BATCH_SIZE is larger than BATCH_SIZE, so the loss's
# internal chunking presumably never splits a batch - confirm against the
# CachedMultipleNegativesRankingLoss documentation.
MINI_BATCH_SIZE = 16              # Internal batching for CachedMNRL

EPOCHS = 2              # 2 epochs with quality data
LEARNING_RATE = 1e-5    # Conservative learning rate
WARMUP_RATIO = 0.1      # 10% warmup
OUTPUT_DIR = './arabic-islamic-bge-m3'

# Shuffle training data
random.shuffle(train_examples)

# Use CachedMultipleNegativesRankingLoss - designed for hard negatives
# This loss properly handles [query, pos, neg1, neg2, neg3] format
# and caches embeddings to reduce memory during loss computation
train_loss = CachedMultipleNegativesRankingLoss(
    model,
    mini_batch_size=MINI_BATCH_SIZE,
)

# Dataloader is only used here to count micro-batches per epoch
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=BATCH_SIZE)

# Calculate steps in OPTIMIZER steps: one update happens every
# GRADIENT_ACCUMULATION_STEPS micro-batches, and warmup_steps is later handed
# to the trainer, which counts optimizer steps. Counting micro-batches here
# would stretch the warmup to ~GRADIENT_ACCUMULATION_STEPS times its intended length.
micro_batches_per_epoch = len(train_dataloader)
steps_per_epoch = max(1, -(-micro_batches_per_epoch // GRADIENT_ACCUMULATION_STEPS))  # ceil division
effective_batch_size = BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

# Calculate negatives.
# Gradient accumulation does not enlarge the pool of in-batch negatives: each
# loss call only sees one batch of BATCH_SIZE examples. Within that batch every
# query is scored against all positives and all hard negatives, i.e.
# BATCH_SIZE * (1 + hard negatives) candidates, one of which is its own positive.
# Assumes every example carries MAX_HARD_NEGATIVES negatives.
candidates_per_query = BATCH_SIZE * (1 + MAX_HARD_NEGATIVES)
total_negatives = candidates_per_query - 1
# Negatives contributed by the OTHER examples in the batch (their positives
# and their hard negatives); the query's own hard negatives make up the rest.
in_batch_negatives = total_negatives - MAX_HARD_NEGATIVES
# Print a summary of the training configuration chosen above.
print(f"\n{'='*60}")
print("MEMORY-OPTIMIZED Training Configuration (H100 80GB)")
print(f"{'='*60}")
# GPU details query CUDA device 0 and would raise on the CPU fallback.
if torch.cuda.is_available():
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.0f} GB")
else:
    print("  GPU: none (running on CPU)")
print(f"  Max sequence length: {MAX_SEQ_LENGTH}")
print(f"  Actual batch size: {BATCH_SIZE}")
print(f"  Gradient accumulation: {GRADIENT_ACCUMULATION_STEPS}")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Loss function: CachedMultipleNegativesRankingLoss")
print(f"  Mini-batch size (for loss): {MINI_BATCH_SIZE}")
print(f"  In-batch negatives: {in_batch_negatives}")
print(f"  Hard negatives: {MAX_HARD_NEGATIVES}")
print(f"  Total negatives per sample: {total_negatives}")
print(f"  Epochs: {EPOCHS}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Warmup steps: {warmup_steps}")
print(f"  Steps per epoch: {steps_per_epoch}")
print(f"  Total steps: {total_steps}")
print(f"  Training examples: {len(train_examples)}")
print(f"  Mixed precision (FP16): ENABLED")
print(f"{'='*60}")
print(f"\n💡 Memory estimate: ~20-30GB (vs 80GB+ before)")
# The doubled braces make this print a literal code snippet the user can paste.
print(f"   Check after first step: print(f'GPU: {{torch.cuda.memory_allocated()/1e9:.1f}}GB')")

In [None]:
# Train with SentenceTransformerTrainer (supports gradient accumulation)
import os
from datasets import Dataset
from sentence_transformers import SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from sentence_transformers.training_args import BatchSamplers

# Re-declared here so this cell can be re-run on its own.
OUTPUT_DIR = './arabic-islamic-bge-m3'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Banner describing the run that is about to start.
print(f"Starting training on H100...")
print(f"  {len(train_examples)} examples")
print(f"  Batch size: {BATCH_SIZE} × {GRADIENT_ACCUMULATION_STEPS} = {effective_batch_size} effective")
print(f"  {total_negatives} negatives per sample ({in_batch_negatives} in-batch + {MAX_HARD_NEGATIVES} hard)")
print(f"  Loss: CachedMultipleNegativesRankingLoss (handles hard negatives properly)")
print(f"\n⚠️  IMPORTANT: Initial loss should be ~2.5-4.0 (NOT 0.000000)")
print(f"   If loss = 0.000, stop training - something is wrong.\n")

# Convert InputExample list to HuggingFace Dataset
# SentenceTransformerTrainer expects columns: sentence_0, sentence_1, ...
def convert_to_dataset(examples):
    """Convert a list of InputExample objects to a HuggingFace Dataset.

    The dataset gets one column per text position (``sentence_0`` = first
    text, ``sentence_1`` = second, ...). Only examples whose number of texts
    equals the maximum found across *examples* are kept, so every row has
    the same columns; the number of dropped examples is printed.

    Args:
        examples: Non-empty list of objects exposing a ``texts`` list.

    Returns:
        A ``datasets.Dataset`` with ``sentence_<i>`` columns.

    Raises:
        ValueError: If *examples* is empty.
    """
    if not examples:
        raise ValueError(
            "No training examples to convert - load the training data before building the dataset."
        )

    # Find the expected number of texts (max across all examples)
    expected_num_texts = max(len(ex.texts) for ex in examples)

    # Keep only examples with the expected number of texts
    # (CachedMultipleNegativesRankingLoss needs consistent format)
    kept = [ex for ex in examples if len(ex.texts) == expected_num_texts]

    dropped = len(examples) - len(kept)
    if dropped:
        print(f"  Filtered {dropped} examples with inconsistent text count")
        print(f"  Using {len(kept)} examples with {expected_num_texts} texts each")

    columns = {
        f"sentence_{i}": [ex.texts[i] for ex in kept]
        for i in range(expected_num_texts)
    }
    return Dataset.from_dict(columns)

train_dataset = convert_to_dataset(train_examples)
print(f"Converted to Dataset with columns: {train_dataset.column_names}")
print(f"Dataset size: {len(train_dataset)} examples")

# Training arguments with gradient accumulation
training_args = SentenceTransformerTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=warmup_steps,  # Use warmup_steps instead of deprecated warmup_ratio
    # Mixed precision for memory efficiency; only requested when CUDA is
    # present so the notebook's CPU fallback does not fail on this flag.
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=1,  # Only keep latest checkpoint
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    report_to="none",  # Disable wandb/tensorboard
)

# Create trainer with proper Dataset
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    loss=train_loss,
)

# Train
trainer.train()

# Save the final model explicitly, before announcing where it was saved
model.save(OUTPUT_DIR)

print(f"\n{'='*60}")
print("✓ Training complete!")
print(f"{'='*60}")
# Check GPU memory after training (CUDA only)
if torch.cuda.is_available():
    print(f"Final GPU memory: {torch.cuda.memory_allocated() / 1e9:.1f} GB")
print(f"Model saved to: {OUTPUT_DIR}")

In [None]:
# Save training configuration
import json

# Each loss call sees one batch of BATCH_SIZE examples; every query is scored
# against all positives and hard negatives in that batch, one of which is its
# own positive. Gradient accumulation does not add in-batch negatives.
# Assumes every example carries MAX_HARD_NEGATIVES negatives.
negatives_per_sample = BATCH_SIZE * (1 + MAX_HARD_NEGATIVES) - 1

config = {
    "model_name": MODEL_NAME,
    "batch_size": BATCH_SIZE,
    "gradient_accumulation_steps": GRADIENT_ACCUMULATION_STEPS,
    "effective_batch_size": BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS,
    "max_seq_length": MAX_SEQ_LENGTH,
    "epochs": EPOCHS,
    "learning_rate": LEARNING_RATE,
    "warmup_steps": warmup_steps,
    "warmup_ratio": WARMUP_RATIO,
    "loss_function": "CachedMultipleNegativesRankingLoss",
    "mini_batch_size": MINI_BATCH_SIZE,
    "hard_negatives_per_query": MAX_HARD_NEGATIVES,
    # Negatives contributed by the other examples in the batch
    "in_batch_negatives": negatives_per_sample - MAX_HARD_NEGATIVES,
    "total_negatives_per_sample": negatives_per_sample,
    "num_training_examples": len(train_examples),
    # Record what the trainer was actually configured with
    "fp16": bool(training_args.fp16),
    # get_device_name(0) raises without CUDA, so fall back to a plain label
    "gpu": torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu",
    "strategy": "memory-optimized-h100",
    "target_metrics": {
        "precision_at_5": "> 0.85",
        "mrr": "> 0.80",
        "false_positive_rate": "< 15%"
    }
}

with open(f'{OUTPUT_DIR}/training_config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print("Training configuration saved:")
print(json.dumps(config, indent=2))

In [None]:
# Test the fine-tuned model: encode a few Arabic and English queries and
# report the size of each resulting vector.
print("Testing fine-tuned model...\n")

test_queries = [
    # Arabic queries
    "إنما الأعمال بالنيات",  # Actions are by intentions
    "الصلاة في وقتها",  # Prayer on time
    "آية الكرسي",  # Ayat al-Kursi
    "ما حكم الصيام في رمضان؟",  # What is the ruling on fasting in Ramadan?
    # English queries
    "What is the reward for patience?",
    "hadith about charity",
    "fasting in Ramadan",
    "importance of good intentions",
]

for query in test_queries:
    embedding = model.encode(query)
    print(f"'{query[:50]}' → {len(embedding)}-dim vector")

In [None]:
# Optional: Quick evaluation with sample queries
import numpy as np

def cosine_similarity(a, b):
    """Return the cosine similarity of vectors *a* and *b* as a float.

    Returns 0.0 when either vector has zero norm, instead of dividing by
    zero and producing NaN.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return float(np.dot(a, b) / denominator)

# Test semantic similarity between English queries and Arabic phrases
test_pairs = [
    # Should be similar
    ("Actions are judged by intentions", "إنما الأعمال بالنيات"),
    ("What does Islam say about patience?", "الصبر في الإسلام"),
    # Should be less similar
    ("Actions are judged by intentions", "الصلاة في وقتها"),
]

print("Semantic similarity test:\n")
for q1, q2 in test_pairs:
    e1 = model.encode(q1)
    e2 = model.encode(q2)
    sim = cosine_similarity(e1, e2)
    print(f"'{q1[:30]}...' vs '{q2[:30]}...': {sim:.4f}")

In [None]:
# Evaluate on Gold Standard (if uploaded)
import numpy as np
from collections import defaultdict

def evaluate_on_gold_standard(model, eval_file='gold_standard_evaluation.jsonl',
                              corpus_file='combined_training.jsonl'):
    """Score gold standard queries against the training positives.

    This is a similarity-based proxy, NOT a true Precision@K / MRR
    evaluation: the corpus passages carry no IDs that could be matched to
    the gold standard's relevant IDs. For historical reasons the result keys
    are still named ``precision_at_5`` / ``precision_at_10`` / ``mrr``, but
    they hold, per query, the mean similarity of the top 5 / top 10 retrieved
    passages and the similarity of the top-1 passage.

    Args:
        model: Object with a SentenceTransformer-style ``encode`` method.
        eval_file: JSONL file with one query object per line (``query``,
            optional ``category`` and ``difficulty``).
        corpus_file: JSONL training file whose ``pos`` passages form the
            retrieval corpus (duplicates removed, first occurrence kept).

    Returns:
        Dict with per-query score lists under ``precision_at_5``,
        ``precision_at_10`` and ``mrr``, plus top-1 scores grouped under
        ``by_category`` and ``by_difficulty``; or ``None`` when a file is
        missing or there is nothing to evaluate.
    """
    import json
    import os

    if not os.path.exists(eval_file):
        print(f"⚠ Evaluation file not found: {eval_file}")
        print("Upload gold_standard_evaluation.jsonl for detailed evaluation.")
        return None
    if not os.path.exists(corpus_file):
        print(f"⚠ Corpus file not found: {corpus_file}")
        return None

    print(f"\n{'='*50}")
    print("Gold Standard Evaluation")
    print(f"{'='*50}\n")

    # Load evaluation queries
    with open(eval_file, 'r', encoding='utf-8') as f:
        eval_queries = [json.loads(line) for line in f if line.strip()]

    print(f"Loaded {len(eval_queries)} evaluation queries")

    # Use the training positives as a simple retrieval corpus
    # (In a real evaluation, you'd use the actual Qdrant index).
    # A dict gives O(1) duplicate checks while preserving first-seen order.
    unique_passages = {}
    with open(corpus_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                for pos in json.loads(line).get('pos') or []:
                    unique_passages.setdefault(pos, None)
    corpus = list(unique_passages)

    print(f"Corpus size: {len(corpus)} passages")
    if not eval_queries or not corpus:
        print("⚠ Nothing to evaluate (no queries or empty corpus).")
        return None

    print("Encoding corpus (this may take a few minutes)...")

    # Normalized embeddings make the dot products below cosine similarities.
    corpus_embeddings = np.asarray(
        model.encode(corpus, show_progress_bar=True, batch_size=32,
                     normalize_embeddings=True)
    )
    # Encode all queries in one batched call instead of one call per query.
    query_embeddings = np.asarray(
        model.encode([eq['query'] for eq in eval_queries], batch_size=32,
                     normalize_embeddings=True)
    )

    results = {
        'precision_at_5': [],
        'precision_at_10': [],
        'mrr': [],
        'by_category': defaultdict(list),
        'by_difficulty': defaultdict(list),
    }

    print("\nEvaluating queries...")
    for eq, query_embedding in zip(eval_queries, query_embeddings):
        category = eq.get('category', 'unknown')
        difficulty = eq.get('difficulty', 'medium')

        # Similarity of this query to every corpus passage, best ten first.
        similarities = np.dot(corpus_embeddings, query_embedding)
        top_sims = np.sort(similarities)[::-1][:10]
        top1 = float(top_sims[0])  # corpus is non-empty, so top_sims is too

        # Record proxy metrics (full evaluation needs proper ID mapping)
        results['precision_at_5'].append(float(np.mean(top_sims[:5])))
        results['precision_at_10'].append(float(np.mean(top_sims[:10])))
        results['mrr'].append(top1)
        results['by_category'][category].append(top1)
        results['by_difficulty'][difficulty].append(top1)

    # Print results
    print(f"\n{'='*50}")
    print("EVALUATION RESULTS")
    print(f"{'='*50}")
    print(f"\nOverall Metrics (similarity-based proxy):")
    print(f"  Avg Top-5 Similarity:  {np.mean(results['precision_at_5']):.4f}")
    print(f"  Avg Top-10 Similarity: {np.mean(results['precision_at_10']):.4f}")
    print(f"  Avg Top-1 Similarity:  {np.mean(results['mrr']):.4f}")

    print(f"\nBy Category:")
    for cat, scores in sorted(results['by_category'].items()):
        print(f"  {cat}: {np.mean(scores):.4f} (n={len(scores)})")

    print(f"\nBy Difficulty:")
    for diff, scores in sorted(results['by_difficulty'].items()):
        print(f"  {diff}: {np.mean(scores):.4f} (n={len(scores)})")

    print(f"\n{'='*50}")
    print("Note: For full Precision@K and MRR evaluation, run the")
    print("evaluate-precision.ts script locally with your Qdrant index.")
    print(f"{'='*50}")

    return results

# Run evaluation if gold standard was uploaded
if has_eval_file:
    eval_results = evaluate_on_gold_standard(model)
else:
    # Keep the name defined so later cells can test it without a NameError.
    eval_results = None
    print("⚠ No gold standard file uploaded.")
    print("For detailed evaluation metrics, upload gold_standard_evaluation.jsonl")

In [None]:
# Download the fine-tuned model
import shutil
from google.colab import files

# Package the output directory as arabic-islamic-bge-m3.zip
print("Creating zip file...")
shutil.make_archive('arabic-islamic-bge-m3', 'zip', OUTPUT_DIR)

# Show the post-download steps, then hand the archive to the browser.
print(
    "Downloading fine-tuned model...",
    "\nAfter download:",
    "1. Extract: unzip arabic-islamic-bge-m3.zip -d training/outputs/arabic-islamic-bge-m3/",
    "2. Start server: CUSTOM_WEIGHTS_PATH=./training/outputs/arabic-islamic-bge-m3 python embedding-server/main.py",
    "3. Regenerate: bun run scripts/generate-embeddings.ts --model=bge-m3",
    "4. Evaluate: bun run training/scripts/evaluate-precision.ts --model=bge-m3",
    sep="\n",
)

files.download('arabic-islamic-bge-m3.zip')