In [None]:
# Install essential libraries for deep learning, model loading, and evaluation.
# transformers, datasets, evaluate and sacrebleu are pinned to exact versions;
# torch and pandas are not, so those two resolve to whatever pip selects at
# install time.
# NOTE(review): consider `%pip` instead of `!pip` so the install is guaranteed
# to target the running kernel's environment — confirm on the target platform.
!pip install torch --quiet
!pip install transformers==4.53.2 --quiet
!pip install datasets==2.18.0 --quiet
!pip install evaluate==0.4.1 --quiet
!pip install sacrebleu==2.4.2 --quiet
!pip install pandas --quiet

In [None]:
# Install the custom toolkit required for IndicTrans2 pre-processing and post-processing.
# No version is pinned here, so the installed release may change between runs.
!pip install indictranstoolkit --quiet

In [None]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from Kaggle Secrets (stored under the label
# "HF_TOKEN") so the token itself never appears in the notebook source.
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")

# Authenticate this session with the Hugging Face Hub using that token
login(token=hf_token)

In [None]:
import torch
import pandas as pd

# Define the device for computation (GPU if available, otherwise CPU)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Language codes evaluated in this notebook. The same strings are used as
# dataset column names and as the src/tgt language tags handed to the
# processor, and every ordered pair of distinct codes is evaluated later on.
LANGS = ["hin_Deva", "pan_Guru", "tam_Taml", "ben_Beng", "mar_Deva", "tel_Telu"]


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Hugging Face Hub id of the checkpoint used for every translation below
MODEL_NAME = "ai4bharat/indictrans2-indic-indic-dist-320M"

# Load the tokenizer
# trust_remote_code is required for custom tokenization logic
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

# Load the model
# trust_remote_code is required for custom model architecture
# NOTE(review): no torch_dtype argument is passed here, so half precision is
# NOT being requested; pass torch_dtype explicitly if float16 is wanted.
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
).to(DEVICE)

model.eval() # Set the model to evaluation mode

In [None]:
from IndicTransToolkit.processor import IndicProcessor

# Instantiate the processor for inference tasks. `ip` is the shared instance
# whose preprocess_batch / postprocess_batch methods the later cells call.
ip = IndicProcessor(inference=True)

In [None]:
from datasets import load_dataset

# Load the 'test' split of the IN22-Gen dataset. No configuration name is
# passed, so whatever the dataset's default configuration is gets loaded.
# NOTE(review): later cells index this dataset by language code (e.g.
# "hin_Deva"); confirm the default configuration exposes one column per language.
in22_gen_dataset = load_dataset("ai4bharat/IN22-Gen", split="test")

# Show the dataset summary first (columns / row count) ...
print("Dataset loaded. Summary:")
print(in22_gen_dataset)

# ... then the first example, so the per-row structure can be inspected
print("Example entry:")
print(in22_gen_dataset[0])

In [None]:
import math
from tqdm.notebook import tqdm

def translate_batch(sentences, src_lang, tgt_lang, batch_size=16):
    """Translate ``sentences`` from ``src_lang`` to ``tgt_lang`` in mini-batches.

    The result always contains exactly one entry per input sentence, in input
    order, so it can be scored position-by-position against references.
    Inputs that are ``None`` or blank, and sentences belonging to a batch whose
    preprocessing yields nothing, are reported as the empty string ``""``
    instead of being dropped (dropping them would shift every later
    prediction against the wrong reference).

    Args:
        sentences: Sequence of source sentences; entries may be ``None``.
        src_lang: Source language tag passed to ``ip.preprocess_batch``.
        tgt_lang: Target language tag passed to the processor and used to
            look up the forced BOS token.
        batch_size: Number of sentences sent through the model per step
            (must be >= 1).

    Returns:
        list: Translations with ``len(result) == len(sentences)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # One output slot per input keeps predictions aligned with references.
    translations = [""] * len(sentences)

    # Only non-blank inputs are worth sending to the model; remember where
    # each one came from so its translation can be written back in place.
    pending = [
        (position, str(text))
        for position, text in enumerate(sentences)
        if text is not None and str(text).strip() != ""
    ]
    if not pending:
        return translations

    # Loop-invariant, so look it up once instead of once per batch.
    # NOTE(review): the id is resolved from the token string "<2{tgt_lang}>";
    # confirm this tokenizer's vocabulary really contains such a token —
    # otherwise the lookup presumably falls back to the unknown-token id.
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(f"<2{tgt_lang}>")

    for start in tqdm(range(0, len(pending), batch_size), desc=f"Translating {src_lang} -> {tgt_lang}"):
        chunk = pending[start:start + batch_size]
        positions = [position for position, _ in chunk]
        batch = [text for _, text in chunk]

        # Preprocess
        preprocessed_batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        if preprocessed_batch is None or len(preprocessed_batch) == 0:
            print(f"⚠️ Preprocessing returned None/empty for {src_lang}->{tgt_lang}")
            continue  # the affected slots keep their "" placeholder

        # Tokenize
        inputs = tokenizer(
            preprocessed_batch,
            padding=True,
            truncation=True,
            return_tensors="pt"
        ).to(DEVICE)

        # Generate
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                num_beams=5,
                max_length=256,
                forced_bos_token_id=forced_bos_token_id
            )

        # Decode
        decoded = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)

        # Postprocess safely: fall back to the raw decoded text on failure
        try:
            postprocessed = ip.postprocess_batch(decoded, lang=tgt_lang)
            if postprocessed is None:
                raise ValueError("postprocess_batch returned None")
        except Exception as e:
            print(f"⚠️ Postprocessing failed for {src_lang}->{tgt_lang}: {e}")
            postprocessed = decoded  # fallback

        if len(postprocessed) != len(positions):
            print(
                f"⚠️ Expected {len(positions)} translations for {src_lang}->{tgt_lang}, "
                f"got {len(postprocessed)}"
            )

        # Write each translation back to the slot of its source sentence.
        for position, translated in zip(positions, postprocessed):
            translations[position] = translated

    return translations


In [None]:
import evaluate
def evaluate_direction(src_lang, tgt_lang, dataset, num_samples=200):
    """
    Evaluate translation performance from src_lang -> tgt_lang
    using chrF++ on a subset of the dataset.

    Args:
        src_lang: Column of ``dataset`` holding the source sentences; also
            passed to ``translate_batch`` as the source language tag.
        tgt_lang: Column of ``dataset`` holding the reference translations;
            also passed to ``translate_batch`` as the target language tag.
        dataset: Object indexable by column name whose columns can be sliced.
        num_samples: Number of leading examples to evaluate. ``None`` selects
            every example, because the columns are sliced with ``[:num_samples]``.

    Returns:
        The corpus-level chrF++ score, or ``None`` when no predictions were
        produced.

    Raises:
        ValueError: If the number of predictions differs from the number of
            references, since the score would then compare misaligned pairs.
    """
    # Imported first so a missing package fails before any translation work.
    from sacrebleu.metrics import CHRF

    src_sentences = dataset[src_lang][:num_samples]
    tgt_sentences = dataset[tgt_lang][:num_samples]

    # Translate
    preds = translate_batch(src_sentences, src_lang, tgt_lang)

    # Ensure preds is a flat list of strings
    preds = [p if isinstance(p, str) else "" for p in preds]

    if len(preds) == 0:
        print(f"⚠️ No predictions produced for {src_lang}->{tgt_lang}")
        return None

    # Hypotheses and references are paired by position, so a length mismatch
    # means at least some predictions would be scored against the wrong line.
    if len(preds) != len(tgt_sentences):
        raise ValueError(
            f"Got {len(preds)} predictions for {len(tgt_sentences)} references "
            f"({src_lang}->{tgt_lang}); refusing to score misaligned data"
        )

    chrf = CHRF(word_order=2)  # chrF++
    return chrf.corpus_score(preds, [tgt_sentences]).score


In [None]:
# Smoke test: run the preprocess -> tokenize -> generate -> decode ->
# postprocess pipeline by hand on one language pair (hin_Deva -> pan_Guru),
# printing every intermediate result so a failing stage is easy to spot.

# Pick 5 sentences for testing
sample_src = in22_gen_dataset["hin_Deva"][:5]
sample_tgt = in22_gen_dataset["pan_Guru"][:5]

print("RAW source examples:", sample_src)

# Step 1: Preprocess
preprocessed = ip.preprocess_batch(sample_src, src_lang="hin_Deva", tgt_lang="pan_Guru")
print("\nAfter preprocess:", preprocessed)

# Step 2: Tokenize
inputs = tokenizer(
    preprocessed,
    padding=True,
    truncation=True,
    return_tensors="pt"
).to(DEVICE)

print("\nTokenizer output keys:", inputs.keys())
print("Shape of input_ids:", inputs["input_ids"].shape if "input_ids" in inputs else "MISSING")

# Step 3: Generate (only if inputs look good)
if "input_ids" in inputs:
    # Look up the id of the target-language token that generation is forced
    # to start with.
    # NOTE(review): confirm the tokenizer's vocabulary contains "<2pan_Guru>";
    # this cell does not check what id the lookup actually returns.
    tgt_lang = "pan_Guru"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(f"<2{tgt_lang}>")

    with torch.no_grad():
        # Beam search with the same settings translate_batch uses
        outputs = model.generate(
            **inputs, 
            num_beams=5, 
            max_length=256,
            forced_bos_token_id=forced_bos_token_id  # id looked up above
        )

    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    post = ip.postprocess_batch(decoded, lang="pan_Guru")

    # Show raw decoded text, post-processed text and the reference side by side
    print("\nDecoded:", decoded)
    print("Postprocessed:", post)
    print("\nTarget (for comparison):", sample_tgt)

In [None]:
# Initialize a list to store the results (one row per translation direction)
results_data = []

# Loop through all ordered pairs of your LANGS (excluding same-source/target)
for src in LANGS:
    for tgt in LANGS:
        if src == tgt:
            continue

        print(f"\n--- Evaluating direction: {src} -> {tgt} ---")
        try:
            score = evaluate_direction(src, tgt, in22_gen_dataset)
            if score is None:
                # evaluate_direction signals "nothing to score" with None
                raise ValueError("no score was produced")
            reproduced = round(score, 1)
        except Exception as e:
            # Best effort: report the failure and carry on with the next pair.
            print(f"An error occurred during {src} -> {tgt}: {e}")
            # NaN (not a string marker) keeps the score column numeric, so the
            # averages computed from this table do not fail on a text value.
            reproduced = float("nan")

        results_data.append({
            "Source": src,
            "Target": tgt,
            "Reproduced chrF++": reproduced
        })

# Convert the results list to a DataFrame
reproduced_df = pd.DataFrame(results_data)

print("\n--- Full Experiment Complete ---")
reproduced_df

In [None]:
import numpy as np

# Calculate average scores for each language (xx->lang and lang->xx)
def calculate_average_scores(results_df, langs=None):
    """Average the per-direction chrF++ scores into and out of each language.

    Args:
        results_df: DataFrame with 'Source', 'Target' and 'Reproduced chrF++'
            columns, one row per translation direction. Score cells that are
            not numeric (for example an "Error" marker or NaN) are ignored.
        langs: Optional iterable of language codes to summarise. Defaults to
            the module-level ``LANGS`` list.

    Returns:
        dict: ``{lang: {'xx-lang': mean, 'lang-xx': mean}}`` where 'xx-lang'
        averages all rows whose Target is ``lang`` and 'lang-xx' averages all
        rows whose Source is ``lang``. A mean is 0 when no usable score exists.
    """
    langs = LANGS if langs is None else langs

    # Coerce non-numeric markers to NaN so one failed direction cannot break
    # the averaging for every other direction.
    scores = pd.to_numeric(results_df['Reproduced chrF++'], errors='coerce')

    def _mean_or_zero(row_mask):
        """Mean of the usable scores selected by row_mask, or 0 if there are none."""
        usable = scores[row_mask].dropna()
        return float(usable.mean()) if not usable.empty else 0

    return {
        lang: {
            # xx->lang: all sources to this target language
            'xx-lang': _mean_or_zero(results_df['Target'] == lang),
            # lang->xx: this source to all target languages
            'lang-xx': _mean_or_zero(results_df['Source'] == lang),
        }
        for lang in langs
    }

# Calculate our reproduced average scores
reproduced_avg_scores = calculate_average_scores(reproduced_df)

# Create comparison table with paper scores from the image.
# NOTE(review): the reproduced numbers come from evaluate_direction's default
# num_samples subset; confirm the paper figures are comparable before reading
# too much into small differences.
comparison_data = []
for lang in LANGS:
    # Paper scores from the table image (IT2-Dist-M2M columns).
    # The lookup table is rebuilt on every iteration and immediately indexed,
    # so paper_xx_lang ends up holding a single number for the current lang.
    paper_xx_lang = {
        'hin_Deva': 47.1, 'pan_Guru': 40.9, 'tam_Taml': 42.6, 
        'ben_Beng': 43.2, 'mar_Deva': 41.5, 'tel_Telu': 42.9
    }[lang]

    # Same pattern for the lang->xx direction
    paper_lang_xx = {
        'hin_Deva': 42.3, 'pan_Guru': 39.1, 'tam_Taml': 39.3, 
        'ben_Beng': 41.2, 'mar_Deva': 42.4, 'tel_Telu': 41.9
    }[lang]

    # One row per language: paper value next to the reproduced average,
    # the latter rounded to one decimal
    comparison_data.append({
        'Language': lang,
        'Paper xx-lang': paper_xx_lang,
        'Reproduced xx-lang': round(reproduced_avg_scores[lang]['xx-lang'], 1),
        'Paper lang-xx': paper_lang_xx,
        'Reproduced lang-xx': round(reproduced_avg_scores[lang]['lang-xx'], 1)
    })

# Create final comparison DataFrame
comparison_df = pd.DataFrame(comparison_data)

# Calculate differences as reproduced minus paper, so a negative value means
# the reproduction scored below the paper
comparison_df['Diff xx-lang'] = comparison_df['Reproduced xx-lang'] - comparison_df['Paper xx-lang']
comparison_df['Diff lang-xx'] = comparison_df['Reproduced lang-xx'] - comparison_df['Paper lang-xx']

print("Comparison of Average Scores (IT2-Dist-M2M)")
print("=" * 80)
display(comparison_df)

# Calculate overall statistics: mean of the per-language differences
print(f"\nOverall Statistics:")
print(f"Average xx-lang difference: {comparison_df['Diff xx-lang'].mean():.1f}")
print(f"Average lang-xx difference: {comparison_df['Diff lang-xx'].mean():.1f}")