In [None]:
# Install required packages
!pip install transformers==4.28.1
!pip install sentencepiece
!pip install torch
!pip install datasets
!pip install evaluate
!pip install rouge_score


In [None]:
import json
from random import choice, randint

# Word banks (same as in your original dataset)
word_banks = {
    "subjects_male": ["استاد", "پائلٹ", "لڑکا", "کسان", "ڈاکٹر", "انجینئر", "طالب علم"],
    "subjects_female": ["استانی", "گھوڑی", "لڑکی", "بلی", "ڈاکٹر صاحبہ", "انجینئر خاتون", "طالبہ"],
    "subjects_plural": ["وہ لوگ", "میرے والدین", "لڑکے", "پائلٹس", "ڈاکٹرز", "انجینئرز"],
    "verbs_present_male": ["پڑھتا ہے", "لکھتا ہے", "دیکھتا ہے", "کھاتا ہے", "پیتا ہے"],
    "verbs_present_female": ["پڑھتی ہے", "لکھتی ہے", "دیکھتی ہے", "کھاتی ہے", "پیتی ہے"],
    "verbs_present_plural": ["پڑھتے ہیں", "لکھتے ہیں", "دیکھتے ہیں", "کھاتے ہیں", "پیتے ہیں"],
    "verbs_past_male": ["پڑھا", "لکھا", "دیکھا", "کھایا", "پیا"],
    "verbs_past_female": ["پڑھی", "لکھی", "دیکھی", "کھائی", "پیی"],
    "verbs_past_plural": ["پڑھے", "لکھے", "دیکھے", "کھائے", "پئے"],
    "objects": ["کمپیوٹر", "کتاب", "تحفہ", "موبائل", "ٹی وی", "خط", "کپڑے", "جوتے"],
    "time_phrases": ["کل", "آج", "صبح", "دوپہر", "شام", "رات", "پرسوں", "مہینے میں", "سال میں", "ہفتے میں"],
    "locations": ["گھر پر", "اسکول پر", "بازار پر", "ہسپتال پر", "پارک میں", "دریا پر", "ساحل پر", "مسجد میں", "دفتر میں", "کھیت پر"]
}

# Error types (same as in your original dataset)
error_types = [
    "gender_disagreement",
    "tense_error",
    "number_disagreement",
    "word_order",
    "spelling_error",
    "missing_word",
    "extra_word"
]

def introduce_error(correct_sentence, error_type):
    words = correct_sentence.split()

    if error_type == "gender_disagreement":
        # Replace verb with incorrect gender form
        for i, word in enumerate(words):
            if word in ["ہے", "ہیں"]:
                words[i] = "ہےں" if word == "ہے" else "ہے"
            elif word.endswith(('تا', 'ا')):  # Male form
                words[i] = word[:-1] + ('تی' if word.endswith('تا') else 'ی')
            elif word.endswith(('تی', 'ی')):  # Female form
                words[i] = word[:-1] + ('تا' if word.endswith('تی') else 'ا')
        return ' '.join(words)

    elif error_type == "tense_error":
        # Replace verb tense randomly
        for i, word in enumerate(words):
            if word in ["ہے", "ہیں"]:
                words[i] = choice(["تھا", "تھی", "تھے"])
            elif word.endswith(('تا', 'تی', 'تے')):
                words[i] = word[:-2] + choice(['ا', 'ی', 'ے'])
        return ' '.join(words)

    elif error_type == "number_disagreement":
        # Change singular/plural forms
        for i, word in enumerate(words):
            if word in ["ہے", "ہیں"]:
                words[i] = "ہےں" if word == "ہے" else "ہے"
            elif word.endswith(('ا', 'ی', 'ے')):
                words[i] = word + 'ں' if not word.endswith('ں') else word[:-1]
        return ' '.join(words)

    elif error_type == "word_order":
        # Swap two random words (not time phrases)
        swap_indices = [i for i in range(len(words)) if words[i] not in word_banks["time_phrases"]]
        if len(swap_indices) >= 2:
            i, j = sorted(choice(swap_indices) for _ in range(2))
            words[i], words[j] = words[j], words[i]
        return ' '.join(words)

    elif error_type == "spelling_error":
        # Common Urdu spelling mistakes
        replacements = {
            'ے': 'ی',
            'ی': 'ے',
            'ھ': 'ہ',
            'ں': 'ن',
            'ا': 'ع',
            'ک': 'ك'
        }
        word_idx = randint(0, len(words)-1)
        char_idx = randint(0, len(words[word_idx])-1)
        char = words[word_idx][char_idx]
        if char in replacements:
            words[word_idx] = words[word_idx][:char_idx] + replacements[char] + words[word_idx][char_idx+1:]
        return ' '.join(words)

    elif error_type == "missing_word":
        # Remove a random non-essential word (not subject/verb)
        removable = [i for i, word in enumerate(words)
                    if word not in word_banks["time_phrases"]
                    and not any(word in bank for bank in [word_banks["subjects_male"],
                                                         word_banks["subjects_female"],
                                                         word_banks["subjects_plural"]])]
        if removable:
            del words[choice(removable)]
        return ' '.join(words)

    elif error_type == "extra_word":
        # Add a common unnecessary word
        extra_words = ["پھر", "اور", "لیکن", "بہت", "زیادہ"]
        insert_pos = randint(1, len(words)-1)
        words.insert(insert_pos, choice(extra_words))
        return ' '.join(words)

    return correct_sentence

def generate_sentence():
    # Randomly select subject type
    subject_type = choice(["male", "female", "plural"])
    subject = choice(word_banks[f"subjects_{subject_type}"])

    # Select appropriate verb
    tense = choice(["present", "past"])
    verb = choice(word_banks[f"verbs_{tense}_{subject_type}"])

    # Build sentence
    time = choice(word_banks["time_phrases"])
    obj = choice(word_banks["objects"])
    loc = choice(word_banks["locations"])

    if randint(0, 1):  # 50% chance to include time phrase
        correct = f"{time} {subject} نے {obj} {loc} {verb}"
    else:
        correct = f"{subject} نے {obj} {loc} {verb}"

    # Clean up spaces
    correct = ' '.join(correct.split())

    # Introduce error
    error_type = choice(error_types)
    incorrect = introduce_error(correct, error_type)

    return {
        "correct": correct,
        "incorrect": incorrect,
        "error_type": error_type,
        "error_description": {
            "gender_disagreement": "Using wrong gender verb form",
            "tense_error": "Using wrong tense",
            "number_disagreement": "Singular/plural disagreement",
            "word_order": "Incorrect word order",
            "spelling_error": "Common spelling mistakes",
            "missing_word": "Omitting a word",
            "extra_word": "Adding an unnecessary word"
        }[error_type],
        "target_verb": verb
    }

# Generate dataset
dataset = {
    "metadata": {
        "total_pairs": 10000,
        "error_distribution": {et: 0 for et in error_types},
        "word_banks_used": list(word_banks.keys()),
        "error_types_available": error_types
    },
    "pairs": []
}

for _ in range(10000):
    pair = generate_sentence()
    dataset["pairs"].append(pair)
    dataset["metadata"]["error_distribution"][pair["error_type"]] += 1

# Save to file
with open('urdu_error_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, indent=2)

print("Dataset generated successfully as 'urdu_error_dataset.json'")

In [None]:


import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import json
from tqdm import tqdm
from datasets import load_metric
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Set random seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Load and preprocess the dataset
with open('urdu_error_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

pairs = data['pairs']

# Split data into train and validation sets
train_pairs, val_pairs = train_test_split(pairs, test_size=0.1, random_state=seed)

# Initialize tokenizer
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)

# Add special tokens if needed (optional)
special_tokens = ["<correct>", "<incorrect>"]
tokenizer.add_tokens(special_tokens)

# Model parameters
max_source_length = 128
max_target_length = 128
batch_size = 8
num_epochs = 10
learning_rate = 3e-4
warmup_steps = 100

# Create dataset class
class UrduGECDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_source_length, max_target_length):
        self.pairs = pairs
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        pair = self.pairs[idx]
        source = pair['incorrect']
        target = pair['correct']

        # Tokenize the source and target
        source_encoding = tokenizer(
            source,
            max_length=self.max_source_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        target_encoding = tokenizer(
            target,
            max_length=self.max_target_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        labels = target_encoding['input_ids']
        labels[labels == tokenizer.pad_token_id] = -100  # Ignore padding in loss

        return {
            'input_ids': source_encoding['input_ids'].flatten(),
            'attention_mask': source_encoding['attention_mask'].flatten(),
            'labels': labels.flatten()
        }

# Create datasets
train_dataset = UrduGECDataset(train_pairs, tokenizer, max_source_length, max_target_length)
val_dataset = UrduGECDataset(val_pairs, tokenizer, max_source_length, max_target_length)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Initialize model
model = MT5ForConditionalGeneration.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))  # Resize if we added special tokens

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

# Training function
def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    losses = []

    for batch in tqdm(data_loader, desc="Training"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        scheduler.step()

    return np.mean(losses)

# Evaluation function
def eval_epoch(model, data_loader, device):
    model.eval()
    losses = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            losses.append(loss.item())

    return np.mean(losses)

# Training loop
best_val_loss = float('inf')
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    val_loss = eval_epoch(model, val_loader, device)

    print(f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Save best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model_urdu_gec.pt')
        print("Saved best model!")

# Load best model for inference
model.load_state_dict(torch.load('best_model_urdu_gec.pt'))

# Save the full model and tokenizer for later use
model.save_pretrained("urdu_gec_mt5")
tokenizer.save_pretrained("urdu_gec_mt5")

# Inference function
def correct_sentence(model, tokenizer, sentence, device, max_length=128):
    model.eval()

    # Tokenize input
    inputs = tokenizer(
        sentence,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Generate output
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2  # To avoid repeating phrases
    )

    # Decode output
    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return corrected

# Test the model with some examples
test_examples = [
    "استاد نے کمپیوٹر سویاں۔",  # Should be "استاد نے کمپیوٹر سویا۔"
    "کیا بلی نے کتاب کھیلی؟",   # Correct sentence (should remain same)
    "گھوڑی کل گھر پر پڑھتی ہے۔"  # Correct sentence (should remain same)
]

for example in test_examples:
    corrected = correct_sentence(model, tokenizer, example, device)
    print(f"Original: {example}")
    print(f"Corrected: {corrected}")
    print("-" * 50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



Epoch 1/10


Training: 100%|██████████| 1125/1125 [08:05<00:00,  2.32it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.00it/s]


Train Loss: 5.5294, Val Loss: 1.7834
Saved best model!
Epoch 2/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.01it/s]


Train Loss: 2.1133, Val Loss: 1.2638
Saved best model!
Epoch 3/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.04it/s]


Train Loss: 1.5655, Val Loss: 1.0613
Saved best model!
Epoch 4/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.01it/s]


Train Loss: 1.2717, Val Loss: 0.7380
Saved best model!
Epoch 5/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.02it/s]


Train Loss: 0.8209, Val Loss: 0.2302
Saved best model!
Epoch 6/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.02it/s]


Train Loss: 0.3686, Val Loss: 0.1544
Saved best model!
Epoch 7/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  7.98it/s]


Train Loss: 0.2560, Val Loss: 0.1176
Saved best model!
Epoch 8/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  7.99it/s]


Train Loss: 0.2000, Val Loss: 0.1028
Saved best model!
Epoch 9/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.01it/s]


Train Loss: 0.1671, Val Loss: 0.0941
Saved best model!
Epoch 10/10


Training: 100%|██████████| 1125/1125 [08:06<00:00,  2.31it/s]
Evaluating: 100%|██████████| 125/125 [00:15<00:00,  8.00it/s]


Train Loss: 0.1514, Val Loss: 0.0878
Saved best model!
Original: استاد نے کمپیوٹر سویاں۔
Corrected: استاد نے کمپیوٹر سویا۔
--------------------------------------------------
Original: کیا بلی نے کتاب کھیلی؟
Corrected: کیا بلی نے کتاب کھیلی؟
--------------------------------------------------
Original: گھوڑی کل گھر پر پڑھتی ہے۔
Corrected: گھوڑی کل گھر پر پڑھتی ہے۔
--------------------------------------------------


In [None]:
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the fine-tuned model and tokenizer
model_dir = "/content/drive/MyDrive/urdu_gec_mt5"  # Adjust if path changes

tokenizer = MT5Tokenizer.from_pretrained(model_dir)
model = MT5ForConditionalGeneration.from_pretrained(model_dir)
model = model.to(device)

# Inference function
def correct_sentence(model, tokenizer, sentence, device, max_length=128):
    model.eval()

    inputs = tokenizer(
        sentence,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    ).to(device)

    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    corrected = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return corrected

# Test on new (unseen) data
unseen_sentences = [
    "میرا بھائی کل بازار گیا۔",
    "بچے سکول جارہیں۔",
    "وہ کتاب پڑھتا ہے۔",
    "ہم کل پارک گیا۔",
    "میرا بھائی کل بازار پر گئے",
    "میری بہن سکول پر کل گئیں",
    "علی اچھی لڑکا ہے",
    "میں اچھا بچہ تھی",
]

for sentence in unseen_sentences:
    corrected = correct_sentence(model, tokenizer, sentence, device)
    print(f"Original:  {sentence}")
    print(f"Corrected: {corrected}")
    print("-" * 50)


Original:  میرا بھائی کل بازار گیا۔
Corrected: میرا بھائی کل بازار پر گیا۔
--------------------------------------------------
Original:  بچے سکول جارہیں۔
Corrected: بلی نے پھول جاتی ہے۔
--------------------------------------------------
Original:  وہ کتاب پڑھتا ہے۔
Corrected: وہ شخص نے کتاب پڑھتا ہے۔
--------------------------------------------------
Original:  ہم کل پارک گیا۔
Corrected: وہ لوگ کل پارک پر گیا۔
--------------------------------------------------
Original:  میرا بھائی کل بازار پر گئے
Corrected: میرا بھائی کل بازار پر گئے۔
--------------------------------------------------
Original:  میری بہن سکول پر کل گئیں
Corrected: میری بہن کل کھیت پر گئی۔
--------------------------------------------------
Original:  علی اچھی لڑکا ہے
Corrected: وہ لوگ ہفتے میں لڑکا دریا پر چلتا ہے۔
--------------------------------------------------
Original:  میں اچھا بچہ تھی
Corrected: وہ لوگ اور استاد گھر میں دیکھے۔
--------------------------------------------------


In [12]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
import torch
import json
from sklearn.model_selection import train_test_split
from evaluate import load


model_path = "/content/drive/MyDrive/urdu_gec_mt5"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = MT5ForConditionalGeneration.from_pretrained(model_path).to(device)
tokenizer = MT5Tokenizer.from_pretrained(model_path)



# Load the dataset
with open('/content/drive/MyDrive/urdu_gec_mt5/urdu_error_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

pairs = data['pairs']

# Split again
val_pairs = train_test_split(pairs, test_size=0.1, random_state=42)[1]



# Generate predictions
references = []
predictions = []

for pair in val_pairs:
    incorrect = pair['incorrect']
    correct = pair['correct']

    input_ids = tokenizer(
        incorrect,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128
    ).input_ids.to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids,
            max_length=128,
            num_beams=5,
            early_stopping=True,
            no_repeat_ngram_size=2
        )

    decoded_output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    predictions.append(decoded_output)
    references.append(correct)



chrf = load("chrf")


chrf_score = chrf.compute(predictions=predictions, references=references)

print("ChrF:", chrf_score)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'T5Tokenizer'. 
The class this function is called from is 'MT5Tokenizer'.


ChrF: {'score': 92.3029063885685, 'char_order': 6, 'word_order': 0, 'beta': 2}
