In [2]:
!nvidia-smi

Thu May 22 03:49:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 570.133.20             Driver Version: 570.133.20     CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          Off |   00000000:00:10.0 Off |                    0 |
| N/A   29C    P0             41W /  400W |      14MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [3]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import transformers
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from huggingface_hub import login
import wandb
from dotenv import load_dotenv
import logging
import random

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [13]:
# Load environment variables from .env file
load_dotenv()

# Authenticate with Hugging Face using environment variables
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
    logger.info("Successfully authenticated with Hugging Face")
else:
    logger.warning("HF_TOKEN not found in environment variables. Some operations may fail.")
    
# Set Wandb API key from environment variables
wandb_api_key = os.environ.get("WANDB_API_KEY")
if wandb_api_key:
    os.environ["WANDB_API_KEY"] = wandb_api_key
    logger.info("WANDB_API_KEY set successfully")
else:
    logger.warning("WANDB_API_KEY not found in environment variables. Wandb logging may not work.")

# Set random seeds for reproducibility
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
2025-05-22 04:02:11,533 - INFO - Successfully authenticated with Hugging Face
2025-05-22 04:02:11,535 - INFO - WANDB_API_KEY set successfully


In [5]:
!pip install bitsandbytes
!pip install accelerate
!pip install transformers torch
!pip install datasets
!pip install tqdm
!pip install wandb
!pip install python-dotenv



In [None]:
class Config:
    """
    Configuration class to store all training parameters
    """
    # Model parameters
    SOURCE_MODEL = "facebook/nllb-200-1.3B"  # Teacher model
    TARGET_MODEL = "facebook/nllb-200-distilled-600M"  # Student model
    SOURCE_LANG = "eng_Latn"  # English
    TARGET_LANG = "khm_Khmr"  # Khmer
    TARGET_LANG_TOKEN_ID = 256092  # Khmer language token ID
    
    # Distillation parameters
    TARGET_ENCODER_LAYERS = 3  # Number of encoder layers to keep
    TARGET_DECODER_LAYERS = 3  # Number of decoder layers to keep
    TEMPERATURE = 5  # Temperature for knowledge distillation
    LAMBDA_PARAM = 0.5  # Weight balancing between student loss and distillation loss
    
    # Training hyperparameters
    BATCH_SIZE = 48
    GRADIENT_ACCUMULATION_STEPS = 1
    WARMUP_RATIO = 0.06
    NUM_EPOCHS = 1
    LEARNING_RATE = 3e-5
    FP16 = True # Set False for standard precision; set True for GPU speedup
    LOGGING_STEPS = 1000
    OPTIM = "adamw_torch"
    EVAL_STRATEGY = 'epoch'
    SAVE_STRATEGY = "epoch"
    MAX_GRAD_NORM = 1.0
    LR_SCHEDULER_TYPE = 'cosine'
    LOAD_BEST_MODEL_AT_END = True  # Changed to True to save best model
    SAVE_TOTAL_LIMIT = 1
    DDP_FIND_UNUSED_PARAMETERS = False
    GROUP_BY_LENGTH = False
    REPORT_TO = 'wandb'
    
    # Early stopping parameters
    EARLY_STOPPING_PATIENCE = 3
    EARLY_STOPPING_THRESHOLD = 0.01
    
    # Data parameters
    MAX_SEQ_LENGTH = 128
    DATA_PATH = os.environ.get("TRAINING_DATA_PATH", "/home/ubuntu/distill/316K-synthetic.json")
    
    # Output parameters
    OUTPUT_DIR = './nllb_350M'
    MODEL_NAME = "nllb_350M_en_km_v1"
    WANDB_PROJECT = "NLLB_DISTILLATION_v1"

# Create config instance
config = Config()

# Configure Wandb
os.environ["WANDB_PROJECT"] = config.WANDB_PROJECT
os.environ["WANDB_LOG_MODEL"] = config.MODEL_NAME

In [7]:
def create_distilled_model(teacher_model_name, student_model_name, num_encoder_layers, num_decoder_layers, target_lang_token_id):
    """
    Initialize teacher and student models, and prepare the student model for distillation
    
    Args:
        teacher_model_name (str): Hugging Face model ID for the teacher model
        student_model_name (str): Hugging Face model ID for the student model
        num_encoder_layers (int): Number of encoder layers to keep in the student model
        num_decoder_layers (int): Number of decoder layers to keep in the student model
        target_lang_token_id (int): Token ID for the target language
        
    Returns:
        tuple: (teacher_model, student_model, tokenizer)
    """
    logger.info(f"Loading teacher model from {teacher_model_name}")
    teacher_model = AutoModelForSeq2SeqLM.from_pretrained(teacher_model_name)
    
    logger.info(f"Loading student model from {student_model_name}")
    student_model = AutoModelForSeq2SeqLM.from_pretrained(student_model_name)
    
    logger.info(f"Loading tokenizer from {student_model_name}")
    tokenizer = AutoTokenizer.from_pretrained(
        student_model_name, 
        src_lang=config.SOURCE_LANG, 
        tgt_lang=config.TARGET_LANG
    )
    
    # Reduce the model size by keeping only selected layers
    logger.info(f"Reducing model size: keeping {num_encoder_layers} encoder layers and {num_decoder_layers} decoder layers")
    student_model.model.encoder.layers = student_model.model.encoder.layers[:num_encoder_layers]
    student_model.model.decoder.layers = student_model.model.decoder.layers[:num_decoder_layers]
    
    # Update the model config to reflect the reduced architecture
    student_model.config.encoder_layers = num_encoder_layers
    student_model.config.decoder_layers = num_decoder_layers
    
    # Set forced BOS token ID for both models
    logger.info(f"Setting forced BOS token ID to {target_lang_token_id} (Khmer)")
    teacher_model.config.forced_bos_token_id = target_lang_token_id
    student_model.config.forced_bos_token_id = target_lang_token_id
    
    # Log model size
    num_params = student_model.num_parameters() / 1_000_000
    logger.info(f"Student model has {num_params:.2f}M parameters")
    
    return teacher_model, student_model, tokenizer

# Initialize models
teacher_model, model, tokenizer = create_distilled_model(
    config.SOURCE_MODEL,
    config.TARGET_MODEL,
    config.TARGET_ENCODER_LAYERS,
    config.TARGET_DECODER_LAYERS,
    config.TARGET_LANG_TOKEN_ID
)

2025-05-22 03:49:51,840 - INFO - Loading teacher model from facebook/nllb-200-1.3B
2025-05-22 03:49:53,735 - INFO - Loading student model from facebook/nllb-200-distilled-600M
2025-05-22 03:49:54,833 - INFO - Loading tokenizer from facebook/nllb-200-distilled-600M
2025-05-22 03:49:56,473 - INFO - Reducing model size: keeping 3 encoder layers and 3 decoder layers
2025-05-22 03:49:56,511 - INFO - Setting forced BOS token ID to 256092 (Khmer)
2025-05-22 03:49:56,514 - INFO - Student model has 350.54M parameters


In [8]:
def data_prepare(dataset):
    """
    Tokenize the dataset for training and evaluation
    
    Args:
        dataset (dict): Dataset containing source and target text pairs
        
    Returns:
        dict: Tokenized model inputs
    """
    model_inputs = tokenizer(
        dataset[config.SOURCE_LANG], 
        text_target=dataset[config.TARGET_LANG], 
        max_length=config.MAX_SEQ_LENGTH,
        padding="max_length",
        truncation=True,
    )
    
    return model_inputs

def load_and_prepare_datasets(data_path):
    """
    Load and prepare datasets for training and evaluation
    
    Args:
        data_path (str): Path to the training data file
        
    Returns:
        tuple: (train_dataset, eval_dataset)
    """
    try:
        # Load the training dataset
        logger.info(f"Loading training data from {data_path}")
        train_dataset = load_dataset("json", data_files=data_path)
        
        # Load the ALT dataset for evaluation
        logger.info("Loading ALT dataset for evaluation")
        alt_dataset = load_dataset("mutiyama/alt")
        
        # Debug dataset structure
        logger.info(f"Available ALT dataset splits: {list(alt_dataset.keys())}")
        
        # Create evaluation data with English-Khmer pairs from all splits
        eval_data = []
        for split_name in alt_dataset.keys():
            logger.info(f"Processing {split_name} split")
            for item in alt_dataset[split_name]:
                # 'translation' is a dict with language codes as keys
                translations = item.get("translation", {})
                if "en" in translations and "khm" in translations and translations["en"] and translations["khm"]:
                    eval_data.append({
                        config.SOURCE_LANG: translations["en"],
                        config.TARGET_LANG: translations["khm"]
                    })
        
        # Extract training data
        try:
            train_data = train_dataset["train"]["train"][0]["data"]
            train_dataset = Dataset.from_list(train_data)
        except (KeyError, IndexError) as e:
            logger.error(f"Error extracting training data: {e}")
            raise ValueError("Training data format is not as expected. Please check the JSON structure.")

        # Create the evaluation dataset
        eval_dataset = Dataset.from_list(eval_data)
        
        # Apply the same processing to both datasets
        logger.info("Tokenizing training dataset")
        train_dataset = train_dataset.map(
            data_prepare, 
            remove_columns=train_dataset.column_names,
            desc="Processing training data"
        )
        
        logger.info("Tokenizing evaluation dataset")
        if len(eval_data) > 0:
            eval_dataset = eval_dataset.map(
                data_prepare, 
                remove_columns=eval_dataset.column_names,
                desc="Processing evaluation data"
            )
        
        logger.info(f"Training dataset size: {len(train_dataset)}")
        logger.info(f"Evaluation dataset size: {len(eval_dataset)}")
        
        if len(eval_dataset) == 0:
            logger.warning("Evaluation dataset is empty! Consider using a different evaluation dataset.")
            
        return train_dataset, eval_dataset
        
    except Exception as e:
        logger.error(f"Error loading datasets: {e}")
        raise

# Load and prepare datasets
train_dataset, eval_dataset = load_and_prepare_datasets(config.DATA_PATH)

2025-05-22 03:49:56,540 - INFO - Loading training data from /home/ubuntu/distill/316K-synthetic.json


Generating train split: 0 examples [00:00, ? examples/s]

2025-05-22 03:50:03,208 - INFO - Loading ALT dataset for evaluation
2025-05-22 03:50:08,756 - INFO - Available ALT dataset splits: ['train', 'validation', 'test']
2025-05-22 03:50:08,757 - INFO - Processing train split
2025-05-22 03:50:10,098 - INFO - Processing validation split
2025-05-22 03:50:10,172 - INFO - Processing test split
2025-05-22 03:50:14,710 - INFO - Tokenizing training dataset


Processing training data:   0%|          | 0/316110 [00:00<?, ? examples/s]

2025-05-22 03:54:09,735 - INFO - Tokenizing evaluation dataset


Processing evaluation data:   0%|          | 0/20106 [00:00<?, ? examples/s]

2025-05-22 03:54:22,047 - INFO - Training dataset size: 316110
2025-05-22 03:54:22,049 - INFO - Evaluation dataset size: 20106


In [9]:
class DistilTrainer(Seq2SeqTrainer):
    """
    Custom trainer for knowledge distillation from a teacher model to a student model.
    
    This trainer implements temperature-based knowledge distillation, combining
    supervised learning loss with distillation loss during training.
    """
    
    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs):
        """
        Initialize the distillation trainer.
        
        Args:
            teacher_model: Pre-trained model to use as the teacher
            student_model: Smaller model to be trained through distillation
            temperature: Temperature parameter for softening probability distributions
            lambda_param: Weight balancing supervised loss vs distillation loss
            *args: Additional arguments passed to Seq2SeqTrainer
            **kwargs: Additional keyword arguments passed to Seq2SeqTrainer
        """
        super().__init__(model=student_model, *args, **kwargs)
        
        # Set device and store as instance variable
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"DistilTrainer using device: {self.device}")
        
        # Move models to device and update references
        self.teacher = teacher_model.to(self.device)
        self.student = student_model.to(self.device)
        
        # Move loss function to device
        self.loss_function = nn.CrossEntropyLoss().to(self.device)
        
        # Set teacher to evaluation mode
        self.teacher.eval()
        
        # Store hyperparameters
        self.temperature = temperature
        self.lambda_param = lambda_param
        
        # Verify models are on correct device
        logger.info(f"Teacher model device: {next(self.teacher.parameters()).device}")
        logger.info(f"Student model device: {next(self.student.parameters()).device}")

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """
        Compute combined loss using both supervised and distillation objectives.
        
        Args:
            model: The student model being trained
            inputs: Input data dictionary containing 'input_ids', 'attention_mask', etc.
            return_outputs: Whether to return model outputs along with the loss
            num_items_in_batch: Number of items in the current batch (unused)
            
        Returns:
            Loss tensor or tuple of (loss, outputs) if return_outputs is True
        """
        # Move inputs to the same device as the model
        inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}
        
        # Get student model outputs
        student_output = model(**inputs)
        
        # Get teacher model outputs (without gradient computation)
        with torch.no_grad():
            teacher_output = self.teacher(**inputs)
        
        # Apply temperature scaling and compute softmax distributions
        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)
        
        # Compute distillation loss and scale by temperature squared
        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)
        
        # Get standard supervised learning loss
        student_target_loss = student_output.loss
        
        # Combine the two losses with lambda weighting
        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
        
        return (loss, student_output) if return_outputs else loss


In [10]:
def create_training_arguments():
    """
    Create training arguments for the Seq2SeqTrainer
    
    Returns:
        Seq2SeqTrainingArguments: Training arguments object
    """
    # Add early stopping callback if enabled
    callbacks = []
    if config.EARLY_STOPPING_PATIENCE > 0:
        from transformers.trainer_callback import EarlyStoppingCallback
        callbacks.append(
            EarlyStoppingCallback(
                early_stopping_patience=config.EARLY_STOPPING_PATIENCE,
                early_stopping_threshold=config.EARLY_STOPPING_THRESHOLD
            )
        )
    
    # Set up training arguments
    args = Seq2SeqTrainingArguments(
        per_device_train_batch_size=config.BATCH_SIZE,
        per_device_eval_batch_size=max(1, min(8, len(eval_dataset))),
        gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
        remove_unused_columns=False,
        warmup_ratio=config.WARMUP_RATIO,
        num_train_epochs=config.NUM_EPOCHS,
        learning_rate=config.LEARNING_RATE,
        fp16=config.FP16,
        logging_steps=config.LOGGING_STEPS,
        optim=config.OPTIM,
        evaluation_strategy=config.EVAL_STRATEGY,  
        save_strategy=config.SAVE_STRATEGY,
        max_grad_norm=config.MAX_GRAD_NORM,
        lr_scheduler_type=config.LR_SCHEDULER_TYPE,
        output_dir=config.OUTPUT_DIR,
        load_best_model_at_end=config.LOAD_BEST_MODEL_AT_END,
        save_total_limit=config.SAVE_TOTAL_LIMIT,
        ddp_find_unused_parameters=config.DDP_FIND_UNUSED_PARAMETERS,
        group_by_length=config.GROUP_BY_LENGTH,
        report_to=config.REPORT_TO,
    )
    
    return args, callbacks

In [11]:
# Create training arguments
training_args, callbacks = create_training_arguments()

# Create trainer
trainer = DistilTrainer(
    teacher_model=teacher_model,
    student_model=model,
    temperature=config.TEMPERATURE,
    lambda_param=config.LAMBDA_PARAM,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=callbacks,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer, 
        pad_to_multiple_of=8, 
        return_tensors="pt", 
        padding=True
    ),
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
2025-05-22 03:54:23,590 - INFO - DistilTrainer using device: cuda
2025-05-22 03:54:24,814 - INFO - Teacher model device: cuda:0
2025-05-22 03:54:24,816 - INFO - Student model device: cuda:0


In [14]:
def train_and_export_model(trainer, model, export_name, private=True, resume_from_checkpoint=False):
    """
    Train the model and export it to Hugging Face Hub
    
    Args:
        trainer: Trainer instance
        model: Model to be exported
        export_name: Name for the exported model on HF Hub
        private: Whether to make the model private on HF Hub
        resume_from_checkpoint: Whether to resume from checkpoint if available
        
    Returns:
        Model: Trained model
    """
    try:
        # Try to resume from checkpoint if specified
        logger.info("Starting training...")
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
        logger.info("Training completed successfully")
        
        # Push model to Hub
        logger.info(f"Pushing model to Hugging Face Hub as {export_name}")
        result = model.push_to_hub(export_name, private=private)
        logger.info(f"Model pushed successfully: {result}")
        
        return model
    except Exception as e:
        logger.error(f"Error during training or export: {e}")
        raise

# Train and export the model
trained_model = train_and_export_model(
    trainer, 
    model,
    config.MODEL_NAME, 
    private=True,
    resume_from_checkpoint=False
)

2025-05-22 04:02:15,913 - INFO - Starting training...


Epoch,Training Loss,Validation Loss
1,0.8131,0.808127


Non-default generation parameters: {'max_length': 200, 'forced_bos_token_id': 256092}
There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].
2025-05-22 07:04:28,435 - INFO - Training completed successfully
2025-05-22 07:04:28,437 - INFO - Pushing model to Hugging Face Hub as nllb_350M_en_km_v1


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Non-default generation parameters: {'max_length': 200, 'forced_bos_token_id': 256092}


model.safetensors:   0%|          | 0.00/1.40G [00:00<?, ?B/s]

2025-05-22 07:05:49,329 - INFO - Model pushed successfully: https://huggingface.co/lyfeyvutha/nllb_350M_en_km_v1/commit/1e76c33107d06e6715b6adfc57b3ee1dbe3df19d
