# NLLB-200 (Distilled INT4, 600M Parameters)

In [None]:
!pip install torch transformers bitsandbytes sentencepiece

In [None]:

from transformers import BitsAndBytesConfig

def get_quant_config(mode="int4"):
    """Return the bitsandbytes quantization config for the requested mode.

    Args:
        mode: "int4" selects 4-bit loading with double quantization and
            bfloat16 compute; "int8" selects 8-bit loading. Any other
            value means "no quantization".

    Returns:
        A ``BitsAndBytesConfig`` for "int4" or "int8", otherwise ``None``.
    """
    if mode == "int4":
        # Imported locally: this cell does not import torch itself, and the
        # notebook-level import only happens in a later cell.
        import torch

        return BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    if mode == "int8":
        return BitsAndBytesConfig(load_in_8bit=True)
    # Unrecognised modes deliberately fall through to "load unquantized".
    return None


In [None]:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Run on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_ID = "facebook/nllb-200-distilled-600M"

# NLLB is a many-to-many model: the source language is set on the tokenizer
# and the target language is forced as the first generated token. Both use
# FLORES-200 codes. The sample sentence below is French; English is assumed
# as the intended target -- change TGT_LANG to translate elsewhere.
SRC_LANG = "fra_Latn"
TGT_LANG = "eng_Latn"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, src_lang=SRC_LANG)

if DEVICE == "cuda":
    # Quantized weights are placed at load time via device_map; calling
    # .to() on a bitsandbytes-quantized model is rejected by transformers
    # (always for 8-bit, and for 4-bit with older bitsandbytes releases).
    model = AutoModelForSeq2SeqLM.from_pretrained(
        MODEL_ID,
        quantization_config=get_quant_config("int4"),
        device_map={"": 0},
    )
else:
    # NOTE(review): bitsandbytes has historically required a CUDA device, so
    # the CPU fallback loads the unquantized checkpoint -- confirm this is
    # the desired behaviour for CPU-only runs.
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_ID).to(DEVICE)

# Send the inputs to wherever the model actually ended up.
inputs = tokenizer("Bonjour tout le monde", return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(TGT_LANG),
        max_length=128,
    )

print("Translation:", tokenizer.decode(outputs[0], skip_special_tokens=True))
