# English → Hindi Translation with IndicTrans2

In [None]:
# Install (or upgrade, via -U) the Python packages this notebook depends on.
!pip install -U transformers torch torchvision torchaudio bitsandbytes scipy accelerate datasets sentencepiece nltk sacremoses pandas regex mock mosestokenizer

In [None]:
# Fetch IndicTransToolkit from GitHub and install it in editable mode from
# inside the checkout, then return to the original working directory.
!git clone https://github.com/VarunGumma/IndicTransToolkit
%cd IndicTransToolkit
!python -m pip install --editable ./
%cd ..

In [None]:
import nltk
# Download the NLTK "punkt" tokenizer data.
# NOTE(review): recent NLTK releases may additionally require the "punkt_tab"
# resource for sentence tokenization — confirm against the installed version.
nltk.download('punkt')

In [None]:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from IndicTransToolkit.processor import IndicProcessor

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

def initialize_model_and_tokenizer(ckpt_dir, quantization="4-bit"):
    """Load the tokenizer and seq2seq model stored at ``ckpt_dir``.

    Args:
        ckpt_dir: Hub repository id or local directory handed to
            ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the weights through
            bitsandbytes; any other value loads the model unquantized.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    # IndicTrans2 checkpoints rely on custom model/tokenizer classes hosted
    # alongside the weights, so loading needs trust_remote_code=True.
    # NOTE: this executes code from the checkpoint repository -- only pass
    # checkpoints you trust.
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        quantization_config=qconfig,
    )

    # Transformers places bitsandbytes-quantized models on their device at
    # load time and rejects a later .to() (always for 8-bit, and for 4-bit
    # with older bitsandbytes releases), so only move unquantized models.
    if qconfig is None:
        model = model.to(DEVICE)
    return tokenizer, model

# Shared IndicTransToolkit processor, constructed in inference mode; used to
# pre-process sentences before they are tokenized for the model.
ip = IndicProcessor(inference=True)


In [None]:

print("=" * 50)
print("English to Hindi Translation")
print("=" * 50)

en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-dist-200M"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir)

en_sents = [
    "India is a country in South Asia.",
    "I love working on Artificial Intelligence.",
]

# IndicTrans2 identifies languages by FLORES-style "<language>_<Script>" codes;
# bare ISO codes such as "eng"/"hin" are not recognised by the processor.
src_lang = "eng_Latn"
tgt_lang = "hin_Deva"

# Pre-process the sentences and attach the source/target language tags.
batch = ip.preprocess_batch(en_sents, src_lang=src_lang, tgt_lang=tgt_lang)
inputs = en_indic_tokenizer(batch, return_tensors="pt", padding=True).to(DEVICE)

# Inference only: no gradients are needed for generation.
with torch.no_grad():
    generated = en_indic_model.generate(**inputs, max_length=128)

decoded = en_indic_tokenizer.batch_decode(generated, skip_special_tokens=True)

# Run the toolkit's post-processing on the decoded text so the printed
# translations are final text rather than raw model output.
output = ip.postprocess_batch(decoded, lang=tgt_lang)
print("Translations:", output)
