# IndicTrans2 




## Setup

Please run the cells below to install the necessary dependencies.


In [1]:
!pip install torch transformers sentencepiece numpy pandas requests



In [2]:
!pip install -U bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


In [3]:
!pip install git+https://github.com/VarunGumma/IndicTransToolKit

Collecting git+https://github.com/VarunGumma/IndicTransToolKit
  Cloning https://github.com/VarunGumma/IndicTransToolKit to /tmp/pip-req-build-5ywavcdo
  Running command git clone --filter=blob:none --quiet https://github.com/VarunGumma/IndicTransToolKit /tmp/pip-req-build-5ywavcdo
  Resolved https://github.com/VarunGumma/IndicTransToolKit to commit 9ca9208435d2d24514b592286e89eb115483d2fb
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting indic-nlp-library-IT2@ git+https://github.com/VarunGumma/indic_nlp_library.git (from IndicTransToolkit==1.0.3)
  Cloning https://github.com/VarunGumma/indic_nlp_library.git to /tmp/pip-install-egec76k3/indic-nlp-library-it2_077fa2f08cdc48ce86d06be4c0b77880
  Running command git clone --filter=blob:none --quiet https://github.com/VarunGumma/indic_nlp_library.git /tmp/pip-install-egec76k3/indic-nlp-library-it2_077fa2f08cdc4

**IMPORTANT : Restart your run-time first and then run the cells below.**

## Translation 



In [4]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from IndicTransToolkit import IndicProcessor
import pandas as pd
from torch.cuda.amp import autocast

BATCH_SIZE = 88  # Adjust based on memory usage
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"Number of GPUs available: {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

Number of GPUs available: 2
GPU 0: Tesla T4
GPU 1: Tesla T4


In [5]:
def initialize_model_and_tokenizer(ckpt_dir):
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    num_gpus = torch.cuda.device_count()  # Should be 2
    models = []
    
    for dev_id in range(num_gpus):
        model = AutoModelForSeq2SeqLM.from_pretrained(
            ckpt_dir,
            trust_remote_code=True,
            low_cpu_mem_usage=True,
            torch_dtype=torch.bfloat16  # Use BF16 to reduce memory usage
        )
        model.to(f"cuda:{dev_id}")  # Explicitly place on cuda:0 or cuda:1
        model.eval()
        models.append(model)
    return tokenizer, models



def batch_translate(input_sentences, src_lang, tgt_lang, tokenizer, models, ip, batch_size=BATCH_SIZE):
    translations = []
    num_gpus = len(models)  # Should be 2
    device0 = torch.device("cuda:0")
    device1 = torch.device("cuda:1")

    for i in range(0, len(input_sentences), batch_size):
        batch = input_sentences[i:i + batch_size]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        
        # Ensure at least one sentence per GPU, pad if necessary
        while len(batch) < batch_size and len(batch) < 2 * num_gpus:
            batch.append("")  # Pad with empty string to ensure even split
        
        # Split batch evenly between GPUs
        sub_batch_size = max(1, len(batch) // num_gpus)
        sub_batch_0 = batch[:sub_batch_size]
        sub_batch_1 = batch[sub_batch_size:2 * sub_batch_size] if len(batch) >= 2 * sub_batch_size else batch[sub_batch_size:]
        
        # Tokenize for each GPU
        inputs0 = tokenizer(
            sub_batch_0,
            max_length=256,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True
        ).to(device0)
        
        inputs1 = tokenizer(
            sub_batch_1,
            max_length=256,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True
        ).to(device1)
        
        # Use concurrent execution to run generate() on both GPUs concurrently
        import concurrent.futures
        with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:
            future0 = executor.submit(models[0].generate,
                                        **inputs0,
                                        use_cache=True,
                                        max_length=256,
                                        num_beams=2,
                                        num_return_sequences=1)
            future1 = executor.submit(models[1].generate,
                                        **inputs1,
                                        use_cache=True,
                                        max_length=256,
                                        num_beams=2,
                                        num_return_sequences=1)
            outputs0 = future0.result()
            outputs1 = future1.result()
        
        # Decode outputs
        with tokenizer.as_target_tokenizer():
            decoded0 = tokenizer.batch_decode(outputs0, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            decoded1 = tokenizer.batch_decode(outputs1, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            batch_translations = ip.postprocess_batch(decoded0 + decoded1, lang=tgt_lang)
        
        translations.extend(batch_translations[:len(batch)])  # Trim to original batch size
        
        # Clear memory
        del inputs0, outputs0, inputs1, outputs1
        torch.cuda.empty_cache()
    
    return translations


### English to Telugu synthetic dataset creation 



In [6]:
import pandas as pd

# Initialize model and tokenizer
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
en_indic_tokenizer, en_indic_models = initialize_model_and_tokenizer(en_indic_ckpt_dir)
ip = IndicProcessor(inference=True)
src_lang, tgt_lang = "eng_Latn", "tel_Telu"

# Process CSV in chunks
output_path = "/kaggle/working/Translated_Suicide_Detection.csv"
chunk_size = 1000  # Process 1000 rows at a time
total_rows_processed = 0
first_chunk = True

for chunk in pd.read_csv("/kaggle/input/suicide-watch/Suicide_Detection.csv", encoding="ISO-8859-1", chunksize=chunk_size):
    if total_rows_processed >= 50000:
        break

    # Limit chunk to remaining rows up to 50,000
    rows_to_take = min(chunk_size, 50000 - total_rows_processed)
    df_subset = chunk.head(rows_to_take).copy()

    # Translate text using the updated batch_translate function
    translated_texts = batch_translate(
        df_subset["text"].tolist(),
        src_lang,
        tgt_lang,
        en_indic_tokenizer,
        en_indic_models,
        ip,
        batch_size=BATCH_SIZE
    )
    df_subset["text"] = translated_texts

    # Translate class labels
    class_translation = {"suicide": "ఆత్మహత్య", "non-suicide": "ఆత్మహత్య కాదు"}
    df_subset["class"] = df_subset["class"].map(class_translation)

    # Save to CSV
    if first_chunk:
        df_subset[["text", "class"]].to_csv(output_path, index=False, encoding="utf-8-sig")
        first_chunk = False
    else:
        df_subset[["text", "class"]].to_csv(output_path, mode="a", header=False, index=False, encoding="utf-8-sig")

    total_rows_processed += len(df_subset)
    print(f"Processed {total_rows_processed}/50000 rows")

    # Clear memory
    del translated_texts, df_subset
    torch.cuda.empty_cache()

print(f"Translation complete. Output saved to: {output_path}")

# Free GPU memory
del en_indic_tokenizer, en_indic_models
torch.cuda.empty_cache()


tokenizer_config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

tokenization_indictrans.py:   0%|          | 0.00/8.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- tokenization_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


dict.SRC.json:   0%|          | 0.00/645k [00:00<?, ?B/s]

dict.TGT.json:   0%|          | 0.00/3.39M [00:00<?, ?B/s]

model.SRC:   0%|          | 0.00/759k [00:00<?, ?B/s]

model.TGT:   0%|          | 0.00/3.26M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/96.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration_indictrans.py:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- configuration_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_indictrans.py:   0%|          | 0.00/79.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/ai4bharat/indictrans2-en-indic-1B:
- modeling_indictrans.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/4.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 1000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 2000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 3000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 4000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 5000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 6000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 7000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 8000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 9000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 10000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 11000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 12000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 13000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 14000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 15000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 16000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 17000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 18000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 19000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 20000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 21000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 22000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 23000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 24000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 25000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 26000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 27000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 28000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 29000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 30000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 31000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 32000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 33000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 34000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 35000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 36000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 37000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 38000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 39000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 40000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 41000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 42000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 43000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 44000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 45000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 46000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 47000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 48000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 49000/50000 rows


  with torch.no_grad(), autocast(), concurrent.futures.ThreadPoolExecutor() as executor:


Processed 50000/50000 rows
Translation complete. Output saved to: /kaggle/working/Translated_Suicide_Detection.csv
