# NLLB-200 FP16 vs FP32 

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import time
import psutil
import os

In [None]:
def get_model_size_mb(model):
    """Return the in-memory size of a model's parameters and buffers in MB.

    The size is the sum, over every tensor yielded by ``model.parameters()``
    and ``model.buffers()``, of element count times bytes per element,
    divided by 1024 twice.
    """

    def _total_bytes(tensors):
        # Bytes occupied by each tensor = number of elements * bytes per element.
        return sum(t.nelement() * t.element_size() for t in tensors)

    byte_count = _total_bytes(model.parameters()) + _total_bytes(model.buffers())
    return byte_count / 1024 / 1024  # bytes -> MB

In [None]:
def load_fp32_model():
    """Load the NLLB-200 distilled 600M checkpoint with FP32 weights.

    Returns:
        A tuple ``(model, tokenizer, load_time, model_size)``: ``load_time``
        is the wall-clock seconds spent loading the tokenizer and the model,
        and ``model_size`` is the value reported by ``get_model_size_mb``.
    """
    checkpoint = "facebook/nllb-200-distilled-600M"
    began = time.time()

    tok = AutoTokenizer.from_pretrained(checkpoint)
    net = AutoModelForSeq2SeqLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.float32,
        device_map="auto",
    )

    # Stop the clock before measuring size so only loading is timed.
    elapsed = time.time() - began

    return net, tok, elapsed, get_model_size_mb(net)

In [None]:
def load_fp16_model():
    """Load the NLLB-200 distilled 600M checkpoint with FP16 weights.

    The weights are requested as ``torch.float16`` at load time; no
    quantization config is involved.

    Returns:
        A tuple ``(model, tokenizer, load_time, model_size)``: ``load_time``
        is the wall-clock seconds spent loading the tokenizer and the model,
        and ``model_size`` is the size in MB from ``get_model_size_mb``.
    """
    model_name = "facebook/nllb-200-distilled-600M"
    start_time = time.time()

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
    )

    load_time = time.time() - start_time

    # Measure the model's own tensors, exactly as the FP32 loader does, so the
    # FP32/FP16 size ratio compares like with like.
    # torch.cuda.memory_allocated() reports memory held by all live tensors on
    # the device, not just this model, so it is not a comparable metric.
    model_size = get_model_size_mb(model)

    return model, tokenizer, load_time, model_size

In [None]:
def benchmark_translation(model, tokenizer, model_type):
    """Translate a fixed set of English sentences to French and time the run.

    Args:
        model: Seq2seq model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching ``model``; its ``src_lang`` attribute is
            set to ``"eng_Latn"`` as a side effect.
        model_type: Label of the model variant. Not used by the benchmark;
            kept so existing callers keep working.

    Returns:
        A tuple ``(translations, tokens_per_second, translation_time)`` where
        ``translations`` holds one decoded string per test sentence,
        ``translation_time`` is the elapsed seconds for the whole loop
        (tokenization, generation and decoding), and ``tokens_per_second`` is
        the number of generated token ids divided by that time.
    """
    test_sentences = [
        "The rapid advancement of artificial intelligence is transforming industries worldwide.",
        "Despite challenging economic conditions, small businesses demonstrate remarkable resilience.",
        "Climate change represents one of the most pressing challenges of our generation.",
        "Digital transformation has revolutionized how we communicate and collaborate.",
        "Sustainable development requires balancing economic growth with environmental protection.",
    ]

    src_lang = "eng_Latn"
    tgt_lang = "fra_Latn"  # Translate to French

    tokenizer.src_lang = src_lang
    # The target-language token id is the same for every sentence, so look it
    # up once instead of inside the timed loop.
    target_token_id = tokenizer.convert_tokens_to_ids(tgt_lang)

    translations = []
    total_tokens = 0

    # perf_counter is monotonic and high-resolution, which suits interval timing.
    start_time = time.perf_counter()

    for sentence in test_sentences:
        inputs = tokenizer(sentence, return_tensors="pt", padding=True)

        # Always place inputs on the model's device rather than only when CUDA
        # is available: the model may sit on a non-CUDA device.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                forced_bos_token_id=target_token_id,
                max_length=100,
                num_beams=2,
                do_sample=False,
                pad_token_id=tokenizer.pad_token_id,
            )

        translation = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        translations.append(translation)
        # Counts every id in the generated sequence, special tokens included.
        total_tokens += len(outputs[0])

    translation_time = time.perf_counter() - start_time
    tokens_per_second = total_tokens / translation_time

    return translations, tokens_per_second, translation_time

In [None]:
def compare_translations(fp32_translations, fp16_translations):
    """Print side-by-side FP32/FP16 translations with a word-overlap score.

    At most the first three translation pairs are shown, each next to its
    English source sentence. The overlap score is the Jaccard similarity of
    the lower-cased, whitespace-split word sets, as a percentage.

    Args:
        fp32_translations: Translations produced by the FP32 model.
        fp16_translations: Translations produced by the FP16 model, in the
            same order.
    """
    test_sentences = [
        "The rapid advancement of artificial intelligence is transforming industries worldwide.",
        "Despite challenging economic conditions, small businesses demonstrate remarkable resilience.",
        "Climate change represents one of the most pressing challenges of our generation.",
    ]

    # zip stops at the shortest input, so the three source sentences already
    # cap the number of examples shown.
    examples = zip(test_sentences, fp32_translations, fp16_translations)
    for number, (original, fp32, fp16) in enumerate(examples, start=1):
        print(f"\n--- Example {number} ---")
        print(f"English: {original}")
        print(f"FP32:    {fp32}")
        print(f"FP16:    {fp16}")

        fp32_words = set(fp32.lower().split())
        fp16_words = set(fp16.lower().split())
        all_words = fp32_words | fp16_words

        if all_words:
            similarity = len(fp32_words & fp16_words) / len(all_words) * 100
        else:
            # Both translations are empty: they are identical, and dividing by
            # the empty union would raise ZeroDivisionError.
            similarity = 100.0
        print(f"Word overlap: {similarity:.1f}%")

In [None]:
def main_comparison():
    """Run the full FP32 vs FP16 comparison and print a summary.

    Loads and benchmarks the FP32 model, releases it, then does the same for
    the FP16 model, and finally prints size, load-time and speed figures plus
    sample translations. Any exception raised along the way is caught and
    reported on stdout together with an installation hint.
    """
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    try:
        fp32_model, fp32_tokenizer, fp32_load_time, fp32_size = load_fp32_model()
        fp32_translations, fp32_speed, _ = benchmark_translation(fp32_model, fp32_tokenizer, "FP32")

        # Release the FP32 model before loading the FP16 one so both are not
        # resident at the same time.
        del fp32_model, fp32_tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        fp16_model, fp16_tokenizer, fp16_load_time, fp16_size = load_fp16_model()
        fp16_translations, fp16_speed, _ = benchmark_translation(fp16_model, fp16_tokenizer, "FP16")

        # All measurements are captured; the FP16 model is no longer needed.
        del fp16_model, fp16_tokenizer
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        # Compute each ratio once; they are printed in two places below.
        size_ratio = fp32_size / fp16_size
        speed_ratio = fp16_speed / fp32_speed
        load_ratio = fp32_load_time / fp16_load_time

        print("\n PERFORMANCE SUMMARY")
        print("=" * 60)
        print(f"Model Size:      FP32: {fp32_size/1024:.2f} GB  |  FP16: {fp16_size/1024:.2f} GB")
        print(f"Size Reduction:  {size_ratio:.1f}x smaller")
        print(f"Load Time:       FP32: {fp32_load_time:.1f}s  |  FP16: {fp16_load_time:.1f}s")
        print(f"Speed:           FP32: {fp32_speed:.1f} tok/s  |  FP16: {fp16_speed:.1f} tok/s")
        print(f"Speed Increase:  {speed_ratio:.1f}x faster")

        compare_translations(fp32_translations, fp16_translations)

        print("\n Comparison Complete")
        print(" FP16 quantization provides:")
        print(f"   • {size_ratio:.1f}x memory reduction")
        print(f"   • {speed_ratio:.1f}x speed improvement")
        print(f"   • {load_ratio:.1f}x faster loading")

    except Exception as e:
        # Top-level boundary of the script: report the failure instead of
        # crashing the notebook cell.
        print(f" Error during comparison: {e}")
        print("Make sure you have the required packages installed:")
        print("pip install torch transformers accelerate bitsandbytes")

# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main_comparison()
