# Hindi → Marathi Translation with IndicTrans2

In [None]:
# Install (or upgrade, via -U) the Python packages this notebook depends on.
!pip install -U transformers torch torchvision torchaudio bitsandbytes scipy accelerate datasets sentencepiece nltk sacremoses pandas regex mock mosestokenizer

In [None]:
# Clone IndicTransToolkit and install it in editable mode from the checkout.
!git clone https://github.com/VarunGumma/IndicTransToolkit
%cd IndicTransToolkit
!python -m pip install --editable ./
# Return to the directory the notebook started in.
%cd ..

In [None]:
import nltk

# Fetch the Punkt sentence-tokenizer data. Recent NLTK releases (which the
# `pip install -U` cell pulls in) load the "punkt_tab" resource instead of the
# pickled "punkt" models, so both are downloaded to work across versions.
# nltk.download() reports a failed/unknown resource without raising.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from IndicTransToolkit.processor import IndicProcessor

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

def initialize_model_and_tokenizer(ckpt_dir, quantization="4-bit"):
    """Load a seq2seq checkpoint and its tokenizer.

    Args:
        ckpt_dir: Hugging Face model id or local path of the checkpoint.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the weights through
            bitsandbytes; any other value loads the unquantized model.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    # IndicTrans2 checkpoints ship their own tokenizer/model classes on the
    # Hub, so both loaders must be allowed to execute that repository code.
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        quantization_config=qconfig,
    )

    # A bitsandbytes-quantized model is placed on its device while loading and
    # transformers rejects a later .to() on it, so only move unquantized models.
    if qconfig is None:
        model = model.to(DEVICE)

    return tokenizer, model

# Shared IndicProcessor instance (inference mode); the translation cell uses it
# to preprocess the source sentences before tokenization.
ip = IndicProcessor(inference=True)


In [None]:

print("=" * 50)
print("Hindi to Marathi Translation")
print("=" * 50)

indic_indic_ckpt_dir = "ai4bharat/indictrans2-indic-indic-dist-320M"
indic_indic_tokenizer, indic_indic_model = initialize_model_and_tokenizer(indic_indic_ckpt_dir)

# IndicTrans2 identifies languages by FLORES-style "<lang>_<Script>" codes;
# bare ISO codes such as "hin"/"mar" are not recognised by the processor.
src_lang, tgt_lang = "hin_Deva", "mar_Deva"

hi_sents = [
    "मुझे स्कूल जाना पसंद है।",
    "वह एक अच्छा खिलाड़ी है।",
]

# Normalise the sentences and prepend the source/target language tags.
batch = ip.preprocess_batch(hi_sents, src_lang=src_lang, tgt_lang=tgt_lang)
inputs = indic_indic_tokenizer(
    batch,
    return_tensors="pt",
    padding=True,
    truncation=True,  # guard against inputs longer than the model accepts
).to(DEVICE)

with torch.no_grad():
    generated = indic_indic_model.generate(**inputs, max_length=128)

decoded = indic_indic_tokenizer.batch_decode(generated, skip_special_tokens=True)
# Undo the preprocessing on the decoded text for the target language, as the
# IndicTransToolkit usage guide prescribes, before showing the result.
output = ip.postprocess_batch(decoded, lang=tgt_lang)
print("Translations:", output)
