<a href="https://colab.research.google.com/github/pathu11/NMT-Training-Speech-to-Text-Convertion/blob/main/NMT_m2m100.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ====================================================================
# STEP 0: Setup and Installation
# ====================================================================

# Install necessary libraries (if running in a fresh environment like Colab)
# !pip install transformers datasets sentencepiece accelerate -q

import torch
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    M2M100ForConditionalGeneration,
    M2M100Tokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,

)
from torch.optim import AdamW   # ADD THIS

In [5]:
# Choose the compute device: use the CUDA GPU when PyTorch can see one,
# otherwise fall back to the CPU (Transformer fine-tuning on CPU is very slow).
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")
# torch.cuda.empty_cache()  # Uncomment if you run into GPU memory issues

# Google Drive locations (adjust to your own layout):
#   FILE_PATH  - text corpus read by the data-loading step
#   OUTPUT_DIR - directory handed to the trainer for checkpoints and the final model
FILE_PATH = "/content/drive/MyDrive/research/model/merged_f.txt"
OUTPUT_DIR = "/content/drive/MyDrive/research/model/m2m100_finetuned"


Using device: cpu


In [6]:
# ====================================================================
# STEP 1: Data Loading, Splitting, and Initial Dataset Creation
# ====================================================================

# Parallel lists: src_texts[i] is the source sentence whose target is tgt_texts[i].
src_texts = []
tgt_texts = []

print("1. Loading and parsing data...")
try:
    with open(FILE_PATH, "r", encoding="utf-8") as f:
        for line in f:
            # A usable record has exactly one '@' separator:
            #   "<source (Sinhala text)>@<target (gloss IDs/text)>"
            # Lines with no '@' or with more than one are skipped.
            parts = line.strip().split("@")
            if len(parts) == 2:
                src_texts.append(parts[0].strip())
                tgt_texts.append(parts[1].strip())
except FileNotFoundError:
    print(f"Error: File not found at {FILE_PATH}. Please check your path.")
    # Stop here instead of continuing with empty lists; otherwise the failure
    # only surfaces later as an unrelated error in the train/eval split.
    raise

print(f"Total samples loaded: {len(src_texts)}")

# An existing file that yields no pairs is just as unusable as a missing one.
if not src_texts:
    raise ValueError(
        f"No '<source>@<target>' pairs could be parsed from {FILE_PATH}."
    )

1. Loading and parsing data...
Total samples loaded: 114931


In [7]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the sentence pairs for evaluation; the fixed random_state
# keeps the split identical from run to run.
split_parts = train_test_split(src_texts, tgt_texts, test_size=0.1, random_state=42)
train_src, eval_src, train_tgt, eval_tgt = split_parts

# Wrap each split in a Hugging Face Dataset with "source" / "target" columns
train_dataset = Dataset.from_dict(dict(source=train_src, target=train_tgt))
eval_dataset = Dataset.from_dict(dict(source=eval_src, target=eval_tgt))

for split_label, split_dataset in (("Training", train_dataset), ("Evaluation", eval_dataset)):
    print(f"{split_label} samples: {len(split_dataset)}")

Training samples: 103437
Evaluation samples: 11494


In [8]:
import os
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

# Base-model cache on Google Drive: once populated, later sessions load the
# weights from here instead of downloading them again.
model_save_path = "/content/drive/MyDrive/research/model/m2m100_base_saved"

MODEL_NAME = "facebook/m2m100_418M"
print(f"\n2. Initializing model...")

# Decide where the weights come from: the Drive copy if present, otherwise the Hub.
have_drive_copy = os.path.exists(model_save_path)
if have_drive_copy:
    print(f"üìÇ Found saved model in Drive. Loading from: {model_save_path}")
    weights_source = model_save_path
else:
    print(f"‚¨áÔ∏è Model not found in Drive. Downloading from Hugging Face: {MODEL_NAME}")
    weights_source = MODEL_NAME

model = M2M100ForConditionalGeneration.from_pretrained(weights_source).to(device)
tokenizer = M2M100Tokenizer.from_pretrained(weights_source)

# A fresh download is written to Drive so the next session can skip it.
if not have_drive_copy:
    print(f"üíæ Saving model to Drive for future use...")
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

# Language codes are set on every run, whichever branch loaded the tokenizer.
# Source and target both use the Sinhala code "si".
tokenizer.src_lang = "si"
tokenizer.tgt_lang = "si"
print(f"Tokenizer set for src_lang: {tokenizer.src_lang} and tgt_lang: {tokenizer.tgt_lang}")


2. Initializing model...
üìÇ Found saved model in Drive. Loading from: /content/drive/MyDrive/research/model/m2m100_base_saved
Tokenizer set for src_lang: si and tgt_lang: si


In [10]:
# ====================================================================
# STEP 3: Vocabulary Adaptation for Target Gloss Tokens
# (Crucial for handling the SSL/Gloss output)
# ====================================================================

print("\n3. Adapting vocabulary for Gloss tokens...")

# 3.1 Extract unique Gloss Tokens from your target data
unique_gloss_tokens = set()
all_tgt_texts = train_tgt + eval_tgt

for text in all_tgt_texts:
    # Target format: 'ID:GLOSS_WORD|ID:GLOSS_WORD|...'
    for part in text.split('|'):
        if ':' in part:
            # Everything after the first ':' is the Gloss word
            gloss_token = part.split(':', 1)[1].strip()
            # An entry such as 'ID:' yields an empty string, which is not a
            # meaningful vocabulary item, so skip it.
            if gloss_token:
                unique_gloss_tokens.add(gloss_token)

# Sort before adding: set iteration order for strings can differ between
# kernel sessions, and the order in which tokens are added decides their ids.
# A stable order keeps token ids consistent across runs, which matters when
# training is resumed from a checkpoint or saved weights are reloaded.
unique_gloss_tokens_list = sorted(unique_gloss_tokens)
print(f"Identified {len(unique_gloss_tokens_list)} unique Gloss tokens.")

# 3.2 Add new tokens and resize model embeddings
# add_tokens() returns how many tokens were actually new to the tokenizer.
num_added_toks = tokenizer.add_tokens(unique_gloss_tokens_list)
print(f"Added {num_added_toks} new tokens to the tokenizer.")

# Resize the model embeddings to include the new tokens.
# The new embeddings for these tokens are randomly initialized.
model.resize_token_embeddings(len(tokenizer))
print(f"Model vocabulary size resized to: {len(tokenizer)}")


3. Adapting vocabulary for Gloss tokens...
Identified 3479 unique Gloss tokens.
Added 0 new tokens to the tokenizer.
Model vocabulary size resized to: 131514


In [11]:
# ====================================================================
# STEP 4: Tokenization Function and Dataset Mapping
# ====================================================================

# Upper bound (in tokens) for both source and target sequences; longer
# sequences are truncated.
MAX_SEQ_LENGTH = 128

def preprocess_function(examples):
    """Tokenize a batch of source/target pairs for seq2seq training.

    Args:
        examples: Batch mapping with a "source" list (Sinhala text) and a
            "target" list (gloss text), as passed by ``Dataset.map`` with
            ``batched=True``.

    Returns:
        The tokenizer output for the sources, with the tokenized targets
        stored under "labels".
    """
    inputs = examples["source"]
    targets = examples["target"]

    # `text_target=` tokenizes the targets with the tokenizer's target-language
    # settings and stores their ids under "labels". It replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=MAX_SEQ_LENGTH,
        truncation=True,
    )
    return model_inputs

# Tokenize each split in batches, then drop the raw text columns so that only
# the fields produced by preprocess_function remain.
RAW_TEXT_COLUMNS = ["source", "target"]

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True)

tokenized_train = tokenized_train.remove_columns(RAW_TEXT_COLUMNS)
tokenized_eval = tokenized_eval.remove_columns(RAW_TEXT_COLUMNS)

Map:   0%|          | 0/103437 [00:00<?, ? examples/s]



Map:   0%|          | 0/11494 [00:00<?, ? examples/s]

In [12]:
# ====================================================================
# STEP 5: Differential Learning Rate Configuration
# (Implementing the paper's strategy)
# ====================================================================

print("\n4. Configuring Differential Learning Rates...")

# Differential LRs based on the paper:
LR_NEW_WEIGHTS = 1.0e-3 # For newly initialized/resized embeddings and output head
LR_PRE_TRAINED = 2.0e-5 # For the core pre-trained layers (Encoder/Decoder)

# Name fragments that identify the vocabulary-sized weights (token embeddings
# and output head). "shared" is required for M2M100: its token-embedding matrix
# is registered as `model.shared` and tied to the encoder/decoder `embed_tokens`
# and to `lm_head`. `named_parameters()` reports a tied tensor only once, under
# the first name it meets, so matching only "embed"/"lm_head" can leave the
# high-LR group empty and silently train the embeddings at the low LR.
NEW_WEIGHT_NAME_MARKERS = ("shared", "embed", "lm_head")

# Single pass over the trainable parameters:
#   Group 1: pre-trained core Encoder/Decoder layers          -> SMALL LR
#   Group 2: resized embeddings / output head (marker match)  -> LARGER LR
pretrained_params = []
new_params = []
for param_name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    if any(marker in param_name for marker in NEW_WEIGHT_NAME_MARKERS):
        new_params.append(param)
    else:
        pretrained_params.append(param)

# Fail loudly rather than train with an empty high-LR group.
if not new_params:
    raise RuntimeError(
        "No embedding/output-head parameters matched "
        f"{NEW_WEIGHT_NAME_MARKERS}; the differential learning rate would have no effect."
    )

# Define the parameter groups for the optimizer
optimizer_grouped_parameters = [
    {
        "params": pretrained_params,
        "lr": LR_PRE_TRAINED,
        "weight_decay": 0.01
    },
    {
        "params": new_params,
        "lr": LR_NEW_WEIGHTS,
        "weight_decay": 0.01
    },
]


4. Configuring Differential Learning Rates...


In [18]:
# ====================================================================
# STEP 6: Setup Trainer Arguments and Initialization
# ====================================================================

BATCH_SIZE = 50
NUM_EPOCHS = 10
SAVE_STEPS = 500

# Pads inputs and labels per batch at collation time.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training Arguments
training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # 5/NUM_EPOCHS == 0.5 with the values above, i.e. half of all steps are warmup.
    # NOTE(review): presumably "5 warmup epochs" - confirm this is intended.
    warmup_ratio=5/NUM_EPOCHS,
    # NOTE(review): a ready-made optimizer is passed to the trainer below, so per
    # the Trainer docs this value and `learning_rate` should not affect the
    # parameter groups; the per-group settings on the optimizer apply instead.
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    report_to="none",

    # Checkpointing Configuration
    # Saving and evaluation both happen once per epoch, which
    # load_best_model_at_end needs in order to compare checkpoints.
    save_strategy="epoch",
    # NOTE(review): save_steps is documented for save_strategy="steps";
    # with "epoch" it is presumably unused.
    save_steps=SAVE_STEPS,
    save_total_limit=3,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    predict_with_generate=True,

    learning_rate=LR_PRE_TRAINED,
)

# Initialize Trainer.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    # Pass the configured tokenizer *instance*, not the M2M100Tokenizer class:
    # the trainer saves this object with each checkpoint, and only the instance
    # carries the added gloss tokens and the src/tgt language settings.
    processing_class=tokenizer,
    data_collator=data_collator,
    # Custom AdamW over the two learning-rate groups; `None` lets the trainer
    # build its default LR scheduler around this optimizer.
    optimizers=(AdamW(optimizer_grouped_parameters, eps=1e-6), None),
)

In [None]:
# ====================================================================
# STEP 7: Start Fine-Tuning (with Resume Capability)
# ====================================================================

print("\n5. Starting fine-tuning...")

import os
import re


def _find_latest_checkpoint(output_dir):
    """Return the `checkpoint-<step>` directory with the highest step, or None.

    Checkpoints are ranked by the step number in the folder name rather than by
    modification time: directory mtimes change whenever files inside are
    written or removed, and may not be preserved on a synced Drive folder, so
    the most recently modified folder is not necessarily the latest checkpoint.

    Args:
        output_dir: Directory the trainer writes its checkpoints into.

    Returns:
        Path of the newest checkpoint directory, or None when `output_dir`
        does not exist or holds no `checkpoint-<step>` directory.
    """
    if not os.path.isdir(output_dir):
        return None

    checkpoint_name = re.compile(r"^checkpoint-(\d+)$")
    newest_step, newest_path = -1, None
    for entry in os.listdir(output_dir):
        match = checkpoint_name.match(entry)
        candidate = os.path.join(output_dir, entry)
        # Ignore stray files and anything not named exactly checkpoint-<digits>
        if match is None or not os.path.isdir(candidate):
            continue
        step = int(match.group(1))
        if step > newest_step:
            newest_step, newest_path = step, candidate
    return newest_path


# Check for the latest checkpoint in the output directory
latest_checkpoint = _find_latest_checkpoint(OUTPUT_DIR)
if latest_checkpoint:
    print(f"Found existing checkpoint: {latest_checkpoint}. Resuming training...")

# Start training, resuming if a checkpoint was found (None starts from scratch)
train_result = trainer.train(resume_from_checkpoint=latest_checkpoint)

print("\nFine-tuning complete!")

# Save the final model and tokenizer
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Final model and tokenizer saved to: {OUTPUT_DIR}")


5. Starting fine-tuning...
