# IndicBART Hindi Grammar Error Correction - Complete Training

This notebook provides a complete solution for training IndicBART on Hindi grammar error correction using the combined dataset.

**Features:**
- Proper MBART tokenization (fixes token_type_ids error)
- Large dataset training (10k+ samples)
- Fixed generation parameters (no repetition loops)
- Early stopping and best model saving
- Comprehensive evaluation on dev set

**Dataset:** `Hindi/combined_test_dataset.csv`

In [35]:
# Install and import required packages
import subprocess
import sys

# Install packages if needed
required_packages = [
    'torch',
    'transformers',
    'datasets',
    'pandas', 
    'numpy',
    'scikit-learn',
    'tqdm'
]

for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])

print("All packages ready!")

Installing scikit-learn...
All packages ready!
All packages ready!


In [36]:
# Core imports
import torch
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import warnings
warnings.filterwarnings('ignore')

# Transformers imports
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset
import transformers

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Libraries imported successfully!
PyTorch version: 2.8.0+cu129
Transformers version: 4.56.2
CUDA available: True
GPU: NVIDIA GeForce RTX 4050 Laptop GPU
GPU Memory: 6.0 GB


In [39]:
# Configuration
CONFIG = {
    # Model settings
    'model_name': 'ai4bharat/IndicBART',
    'max_length': 128,
    'device': 'cuda' if torch.cuda.is_available() else 'cpu',
    
    # Data settings
    'dataset_path': 'Hindi/combined_test_dataset.csv',
    'test_size': 0.2,  # 20% for dev set
    'random_seed': 42,
    
    # Training settings
    'output_dir': './indicbart-hindi-final',
    'num_epochs': 50,
    'train_batch_size': 1,  # Reduced for stability
    'eval_batch_size': 1,   # Reduced for stability
    'gradient_accumulation_steps': 16,  # Increased to maintain effective batch size = 1 * 16 = 16
    'learning_rate': 1e-6,  # Conservative learning rate
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,
    'max_grad_norm': 1.0,
    
    # Evaluation settings
    'eval_steps': 500,
    'save_steps': 500,
    'logging_steps': 100,
    'save_total_limit': 3,
    'early_stopping_patience': 3,
    
    # Generation settings (fixes repetition issues)
    'generation_config': {
        'max_new_tokens': 64,
        'min_new_tokens': 1,
        'num_beams': 4,
        'early_stopping': True,
        'repetition_penalty': 1.5,
        'no_repeat_ngram_size': 3,
        'length_penalty': 0.8,
        'do_sample': False,
    }
}

print("Configuration:")
for key, value in CONFIG.items():
    if key != 'generation_config':
        print(f"   {key}: {value}")

print(f"\nGeneration Config:")
for key, value in CONFIG['generation_config'].items():
    print(f"   {key}: {value}")

Configuration:
   model_name: ai4bharat/IndicBART
   max_length: 128
   device: cuda
   dataset_path: Hindi/combined_test_dataset.csv
   test_size: 0.2
   random_seed: 42
   output_dir: ./indicbart-hindi-final
   num_epochs: 50
   train_batch_size: 1
   eval_batch_size: 1
   gradient_accumulation_steps: 16
   learning_rate: 1e-06
   warmup_ratio: 0.1
   weight_decay: 0.01
   max_grad_norm: 1.0
   eval_steps: 500
   save_steps: 500
   logging_steps: 100
   save_total_limit: 3
   early_stopping_patience: 3

Generation Config:
   max_new_tokens: 64
   min_new_tokens: 1
   num_beams: 4
   early_stopping: True
   repetition_penalty: 1.5
   no_repeat_ngram_size: 3
   length_penalty: 0.8
   do_sample: False


In [40]:
# Load and prepare dataset
print("Loading dataset...")

dataset_path = Path(CONFIG['dataset_path'])
if not dataset_path.exists():
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")

# Load dataset
df = pd.read_csv(dataset_path, encoding='utf-8')
print(f"Dataset loaded: {df.shape}")
print(f"File size: {dataset_path.stat().st_size / (1024**2):.1f} MB")
print(f"Columns: {list(df.columns)}")

# Handle column names
if 'input' in df.columns and 'output' in df.columns:
    input_col, output_col = 'input', 'output'
elif len(df.columns) >= 2:
    input_col, output_col = df.columns[0], df.columns[1]
    print(f"Using columns: '{input_col}' -> '{output_col}'")
else:
    raise ValueError("Cannot identify input/output columns")

# Clean data
print(f"\nCleaning data...")
original_size = len(df)

# Remove nulls and empty strings
df = df.dropna(subset=[input_col, output_col])
df[input_col] = df[input_col].astype(str).str.strip()
df[output_col] = df[output_col].astype(str).str.strip()
df = df[(df[input_col] != '') & (df[output_col] != '')]

# Filter by length (5-200 characters)
df = df[
    (df[input_col].str.len() >= 5) & (df[input_col].str.len() <= 200) &
    (df[output_col].str.len() >= 5) & (df[output_col].str.len() <= 200)
]

print(f"   Original: {original_size:,} samples")
print(f"   Cleaned: {len(df):,} samples")
print(f"   Removed: {original_size - len(df):,} samples")

# Rename columns
df_clean = df[[input_col, output_col]].copy()
df_clean.columns = ['input_text', 'output_text']

# Analyze data composition
identical = (df_clean['input_text'] == df_clean['output_text']).sum()
corrections = len(df_clean) - identical

print(f"\nData composition:")
print(f"   Identity pairs: {identical:,} ({identical/len(df_clean)*100:.1f}%)")
print(f"   Corrections: {corrections:,} ({corrections/len(df_clean)*100:.1f}%)")

# Show examples
print(f"\nSample corrections:")
correction_samples = df_clean[df_clean['input_text'] != df_clean['output_text']].head(3)
for i, (_, row) in enumerate(correction_samples.iterrows(), 1):
    print(f"  {i}. Input:  {row['input_text'][:80]}...")
    print(f"     Output: {row['output_text'][:80]}...")
    print()

Loading dataset...
Dataset loaded: (10599, 3)
File size: 14.7 MB
Columns: ['Input sentence', 'Output sentence', 'Unnamed: 2']
Using columns: 'Input sentence' -> 'Output sentence'

Cleaning data...
   Original: 10,599 samples
   Cleaned: 5,268 samples
   Removed: 5,331 samples

Data composition:
   Identity pairs: 2,442 (46.4%)
   Corrections: 2,826 (53.6%)

Sample corrections:
  1. Input:  ये केवल किताबी ज्ञान अर्जन तक ही सिमित नहीं है।...
     Output: ये केवल किताबी ज्ञान अर्जन तक ही सीमित नहीं है।...

  2. Input:  जैसे - व्यावहारिक शिक्षा, किताबी शिक्षा अथवा अध्यामित्क शिक्षा...
     Output: जैसे - व्यावहारिक शिक्षा, किताबी शिक्षा अथवा आध्यात्मिक शिक्षा ।...

  3. Input:  वहाँ  अचार्य होते थे शिक्षक के स्थान पे।...
     Output: वहाँ  अचार्य होते थे शिक्षक के स्थान पर।...



In [41]:
# Create train/dev splits
print("Creating train/dev splits...")

# Stratified split to maintain correction ratio
df_clean['is_correction'] = (df_clean['input_text'] != df_clean['output_text']).astype(int)

train_df, dev_df = train_test_split(
    df_clean,
    test_size=CONFIG['test_size'],
    random_state=CONFIG['random_seed'],
    stratify=df_clean['is_correction']
)

# Remove helper column
train_df = train_df[['input_text', 'output_text']].reset_index(drop=True)
dev_df = dev_df[['input_text', 'output_text']].reset_index(drop=True)

print(f"Split complete:")
print(f"   Train: {len(train_df):,} samples")
print(f"   Dev: {len(dev_df):,} samples")

# Verify split distribution
train_corrections = (train_df['input_text'] != train_df['output_text']).sum()
dev_corrections = (dev_df['input_text'] != dev_df['output_text']).sum()

print(f"\nDistribution maintained:")
print(f"   Train corrections: {train_corrections:,}/{len(train_df):,} ({train_corrections/len(train_df)*100:.1f}%)")
print(f"   Dev corrections: {dev_corrections:,}/{len(dev_df):,} ({dev_corrections/len(dev_df)*100:.1f}%)")

Creating train/dev splits...
Split complete:
   Train: 4,214 samples
   Dev: 1,054 samples

Distribution maintained:
   Train corrections: 2,261/4,214 (53.7%)
   Dev corrections: 565/1,054 (53.6%)


In [42]:
# Load model and tokenizer
print(f"Loading IndicBART model and tokenizer...")

# Clear GPU memory
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(CONFIG['model_name'])
print(f"Tokenizer loaded: {type(tokenizer).__name__}")
print(f"   Vocab size: {len(tokenizer):,}")

# Load model
model = AutoModelForSeq2SeqLM.from_pretrained(
    CONFIG['model_name'],
    torch_dtype=torch.float32  # Use float32 for stability
)

print(f"Model loaded: {type(model).__name__}")
print(f"   Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")
print(f"   Device: {CONFIG['device']}")

# Move model to device
model = model.to(CONFIG['device'])

# Fix tokenizer padding
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id
    print(f"Set pad_token to eos_token")

print(f"\nModel ready for training!")

if CONFIG['device'] == 'cuda':
    print(f"GPU memory: {torch.cuda.memory_allocated() / 1024**3:.1f} GB allocated")
    print(f"GPU memory: {torch.cuda.memory_reserved() / 1024**3:.1f} GB reserved")

Loading IndicBART model and tokenizer...
Tokenizer loaded: AlbertTokenizerFast
   Vocab size: 64,014
Tokenizer loaded: AlbertTokenizerFast
   Vocab size: 64,014
Model loaded: MBartForConditionalGeneration
   Parameters: 244.0M
   Device: cuda
Model loaded: MBartForConditionalGeneration
   Parameters: 244.0M
   Device: cuda

Model ready for training!
GPU memory: 3.7 GB allocated
GPU memory: 3.8 GB reserved

Model ready for training!
GPU memory: 3.7 GB allocated
GPU memory: 3.8 GB reserved


In [43]:
# Tokenization function (MBART compatible)
def tokenize_function(examples):
    """Tokenize examples for MBART - handles token_type_ids properly"""
    # Create input with task prefix
    inputs = [f"सुधारें: {text}" for text in examples['input_text']]
    targets = examples['output_text']
    
    # Tokenize inputs (CRITICAL: return_token_type_ids=False for MBART)
    model_inputs = tokenizer(
        inputs,
        max_length=CONFIG['max_length'],
        truncation=True,
        padding=False,  # Dynamic padding during training
        return_token_type_ids=False  # MBART doesn't use token_type_ids!
    )
    
    # Tokenize targets (CRITICAL: return_token_type_ids=False for MBART)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=CONFIG['max_length'],
            truncation=True,
            padding=False,
            return_token_type_ids=False  # MBART doesn't use token_type_ids!
        )
    
    # Set labels
    model_inputs["labels"] = labels["input_ids"]
    
    # Double-check: remove any token_type_ids that might sneak in
    model_inputs.pop("token_type_ids", None)
    
    return model_inputs

print("Tokenization function created (MBART compatible)")

# Test tokenization
print("\nTesting tokenization...")
test_example = {
    'input_text': [train_df['input_text'].iloc[0]],
    'output_text': [train_df['output_text'].iloc[0]]
}

test_result = tokenize_function(test_example)
print(f"Tokenization test passed!")
print(f"   Keys: {list(test_result.keys())}")
print(f"   Input IDs length: {len(test_result['input_ids'][0])}")
print(f"   Labels length: {len(test_result['labels'][0])}")

# Verify no token_type_ids
if 'token_type_ids' in test_result:
    raise ValueError("token_type_ids found in tokenized output! This will cause MBART error.")
else:
    print(f"No token_type_ids found - MBART compatible!")

Tokenization function created (MBART compatible)

Testing tokenization...
Tokenization test passed!
   Keys: ['input_ids', 'attention_mask', 'labels']
   Input IDs length: 52
   Labels length: 49
No token_type_ids found - MBART compatible!


In [44]:
# Create datasets
print("Creating datasets...")

# Convert to HuggingFace datasets
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

print(f"   Created train dataset: {len(train_dataset):,} samples")
print(f"   Created dev dataset: {len(dev_dataset):,} samples")

# Tokenize datasets
print(f"\nTokenizing datasets...")

train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train"
)

dev_tokenized = dev_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dev_dataset.column_names,
    desc="Tokenizing dev"
)

print(f"Tokenization complete!")
print(f"   Train tokenized: {len(train_tokenized):,} samples")
print(f"   Dev tokenized: {len(dev_tokenized):,} samples")

# Show tokenized example
print(f"\nTokenized sample:")
sample = train_tokenized[0]
print(f"   Keys: {list(sample.keys())}")
print(f"   Input IDs: {sample['input_ids'][:10]}... (length: {len(sample['input_ids'])})")
print(f"   Labels: {sample['labels'][:10]}... (length: {len(sample['labels'])})")

# Verify no token_type_ids in tokenized datasets
if 'token_type_ids' in sample:
    raise ValueError("token_type_ids found in tokenized dataset!")
else:
    print(f"Tokenized datasets are MBART compatible!")

Creating datasets...
   Created train dataset: 4,214 samples
   Created dev dataset: 1,054 samples

Tokenizing datasets...


Tokenizing train: 100%|██████████| 4214/4214 [00:00<00:00, 12969.53 examples/s]
Tokenizing train: 100%|██████████| 4214/4214 [00:00<00:00, 12969.53 examples/s]
Tokenizing dev: 100%|██████████| 1054/1054 [00:00<00:00, 17050.48 examples/s]

Tokenization complete!
   Train tokenized: 4,214 samples
   Dev tokenized: 1,054 samples

Tokenized sample:
   Keys: ['input_ids', 'attention_mask', 'labels']
   Input IDs: [2, 2656, 1968, 53, 593, 2135, 13, 8857, 5681, 4706]... (length: 52)
   Labels: [2, 593, 2135, 13, 8857, 5681, 4706, 903, 2284, 12383]... (length: 49)
Tokenized datasets are MBART compatible!





In [45]:
# Create data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True,
    return_tensors="pt"
)

print(f"Data collator created: {type(data_collator).__name__}")

# Test data collator
print(f"\nTesting data collator...")
test_batch = [train_tokenized[i] for i in range(2)]  # Get 2 samples
collated_batch = data_collator(test_batch)

print(f"Data collator test passed!")
print(f"   Batch keys: {list(collated_batch.keys())}")
print(f"   Input IDs shape: {collated_batch['input_ids'].shape}")
print(f"   Labels shape: {collated_batch['labels'].shape}")

# Final check: no token_type_ids in collated batch
if 'token_type_ids' in collated_batch:
    raise ValueError("token_type_ids found in collated batch!")
else:
    print(f"Collated batch is MBART compatible!")

Data collator created: DataCollatorForSeq2Seq

Testing data collator...
Data collator test passed!
   Batch keys: ['input_ids', 'attention_mask', 'labels', 'decoder_input_ids']
   Input IDs shape: torch.Size([2, 52])
   Labels shape: torch.Size([2, 49])
Collated batch is MBART compatible!


In [46]:
# Training arguments
training_args = TrainingArguments(
    # Output and logging
    output_dir=CONFIG['output_dir'],
    logging_dir=f"{CONFIG['output_dir']}/logs",
    logging_steps=CONFIG['logging_steps'],
    report_to=None,  # Disable wandb/tensorboard
    
    # Training schedule
    num_train_epochs=CONFIG['num_epochs'],
    per_device_train_batch_size=CONFIG['train_batch_size'],
    per_device_eval_batch_size=CONFIG['eval_batch_size'],
    gradient_accumulation_steps=CONFIG['gradient_accumulation_steps'],
    
    # Optimization
    learning_rate=CONFIG['learning_rate'],
    warmup_ratio=CONFIG['warmup_ratio'],
    weight_decay=CONFIG['weight_decay'],
    max_grad_norm=CONFIG['max_grad_norm'],
    
    # Evaluation and saving
    eval_strategy="steps",
    eval_steps=CONFIG['eval_steps'],
    save_strategy="steps",
    save_steps=CONFIG['save_steps'],
    save_total_limit=CONFIG['save_total_limit'],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Performance
    fp16=False,  # Disable FP16 to avoid gradient unscaling issues
    dataloader_pin_memory=False,
    dataloader_num_workers=0,
    remove_unused_columns=True,  # Important: removes token_type_ids
    
    # Reproducibility
    seed=CONFIG['random_seed'],
    data_seed=CONFIG['random_seed'],
)

print(f"Training arguments created:")
print(f"   Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Output: {training_args.output_dir}")
print(f"   Best model metric: {training_args.metric_for_best_model}")
print(f"   FP16: {training_args.fp16}")
print(f"   Remove unused columns: {training_args.remove_unused_columns}")

Training arguments created:
   Effective batch size: 16
   Learning rate: 1e-06
   Epochs: 50
   Output: ./indicbart-hindi-final
   Best model metric: eval_loss
   FP16: False
   Remove unused columns: True


In [47]:
# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=CONFIG['early_stopping_patience'])
    ]
)

print(f"Trainer created successfully!")
print(f"   Train samples: {len(train_tokenized):,}")
print(f"   Dev samples: {len(dev_tokenized):,}")
print(f"   Early stopping patience: {CONFIG['early_stopping_patience']}")
print(f"   Model will be saved every {CONFIG['save_steps']} steps")

# Estimate training time
steps_per_epoch = len(train_tokenized) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
total_steps = steps_per_epoch * training_args.num_train_epochs
print(f"\nTraining estimates:")
print(f"   Steps per epoch: {steps_per_epoch}")
print(f"   Total steps: {total_steps}")
print(f"   Estimated time: ~{total_steps * 2 / 3600:.1f} hours (assuming 2 sec/step)")

Trainer created successfully!
   Train samples: 4,214
   Dev samples: 1,054
   Early stopping patience: 3
   Model will be saved every 500 steps

Training estimates:
   Steps per epoch: 263
   Total steps: 13150
   Estimated time: ~7.3 hours (assuming 2 sec/step)


In [48]:
# Start training with complete FP16 disable
print(f"Starting IndicBART training...")
print(f"="*60)
print(f"Dataset: {len(train_tokenized):,} train + {len(dev_tokenized):,} dev samples")
print(f"Model: {CONFIG['model_name']}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Output: {training_args.output_dir}")
print(f"="*60)

try:
    # Clear any previous training state
    if CONFIG['device'] == 'cuda':
        torch.cuda.empty_cache()
    
    # Start training
    training_output = trainer.train()
    
    print(f"\nTRAINING COMPLETED SUCCESSFULLY!")
    print(f"Final training loss: {training_output.training_loss:.4f}")
    
    # Save final model
    print(f"\nSaving final model...")
    trainer.save_model()
    tokenizer.save_pretrained(CONFIG['output_dir'])
    
    # Save configuration
    with open(f"{CONFIG['output_dir']}/config.json", 'w') as f:
        json.dump(CONFIG, f, indent=2, default=str)
    
    print(f"Model saved to: {CONFIG['output_dir']}")
    
    # Final evaluation
    print(f"\nFinal evaluation on dev set...")
    final_eval = trainer.evaluate()
    print(f"Final evaluation loss: {final_eval['eval_loss']:.4f}")
    print(f"Evaluation runtime: {final_eval['eval_runtime']:.1f} seconds")
    print(f"Samples per second: {final_eval['eval_samples_per_second']:.1f}")
    
    # Store results in globals
    globals()['trained_model'] = model
    globals()['trained_tokenizer'] = tokenizer
    globals()['training_results'] = training_output
    globals()['final_eval_results'] = final_eval
    globals()['training_completed'] = True
    
    print(f"\nTraining successfully completed!")
    
except Exception as e:
    print(f"\nTraining failed with error:")
    print(f"Error: {str(e)}")
    print(f"\nDebugging information:")
    print(f"   Model dtype: {model.dtype}")
    print(f"   Training args fp16: {training_args.fp16}")
    print(f"   PyTorch autocast enabled: {torch.is_autocast_enabled()}")
    
    # Show memory info if CUDA
    if CONFIG['device'] == 'cuda':
        print(f"\nGPU Memory Status:")
        print(f"   Allocated: {torch.cuda.memory_allocated() / 1024**3:.1f} GB")
        print(f"   Reserved: {torch.cuda.memory_reserved() / 1024**3:.1f} GB")
    
    raise

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 3, 'bos_token_id': 2}.


Starting IndicBART training...
Dataset: 4,214 train + 1,054 dev samples
Model: ai4bharat/IndicBART
Effective batch size: 16
Learning rate: 1e-06
Epochs: 50
Output: ./indicbart-hindi-final

Training failed with error:
Error: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Debugging information:
   Model dtype: torch.float32
   Training args fp16: False
   PyTorch autocast enabled: False

GPU Memory Status:
   Allocated: 6.4 GB
   Reserved: 6.5 GB

Training failed with error:
Error: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


Debugging information:

AcceleratorError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [20]:
# Additional stability measures for training
print("Applying additional stability measures...")

# 1. Ensure model is in float32
if model.dtype != torch.float32:
    print(f"Converting model from {model.dtype} to float32...")
    model = model.float()

# 2. Disable gradient checkpointing if it exists
if hasattr(model.config, 'gradient_checkpointing'):
    model.config.gradient_checkpointing = False
    print("Disabled gradient checkpointing")

# 3. Ensure no autocast is active
torch.backends.cudnn.enabled = True
torch.backends.cudnn.benchmark = False

# 4. Clear any cached gradients
for param in model.parameters():
    param.grad = None

# 5. Memory cleanup
if CONFIG['device'] == 'cuda':
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
    print(f"GPU memory after cleanup: {torch.cuda.memory_allocated() / 1024**3:.1f} GB allocated")

print("Stability measures applied!")

Applying additional stability measures...
Disabled gradient checkpointing
GPU memory after cleanup: 2.7 GB allocated
Stability measures applied!
GPU memory after cleanup: 2.7 GB allocated
Stability measures applied!


In [21]:
# Force disable all mixed precision and FP16 components
print("Forcing complete FP16 disable...")

# 1. Disable autocast globally
torch.set_autocast_enabled(False)

# 2. Ensure no AMP (Automatic Mixed Precision) is used
import torch.cuda.amp as amp
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False

# 3. Clear any AMP state
if torch.cuda.is_available():
    # Clear any existing AMP scalers
    torch.cuda.empty_cache()

# 4. Verify model is completely in float32
print(f"Model dtype: {model.dtype}")
print(f"Model device: {next(model.parameters()).device}")

# 5. Check if any parameters are in float16
fp16_params = [name for name, param in model.named_parameters() if param.dtype == torch.float16]
if fp16_params:
    print(f"WARNING: Found {len(fp16_params)} FP16 parameters!")
    for name in fp16_params[:5]:  # Show first 5
        print(f"  - {name}")
else:
    print("All model parameters are in float32 ✓")

print("Complete FP16 disable applied!")

Forcing complete FP16 disable...
Model dtype: torch.float32
Model device: cuda:0
All model parameters are in float32 ✓
Complete FP16 disable applied!


In [None]:
# Final clean grammar correction function
def correct_hindi_grammar(text, max_length=None):
    """Correct Hindi grammar using the trained model with robust post-processing"""
    if max_length is None:
        max_length = CONFIG['max_length']
    
    # Clean input
    text = str(text).strip()
    if not text:
        return text
    
    # Create input with task prefix
    input_text = f"सुधारें: {text}"
    
    # Tokenize
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        padding=False
    )
    
    # Move to model device
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    # Generate with balanced parameters
    with torch.no_grad():
        try:
            outputs = model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=inputs['input_ids'].shape[1] + 15,  # Limited extension
                num_beams=1,  # Greedy for consistency
                do_sample=False,
                repetition_penalty=5.0,  # High penalty for repetition
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        except Exception as e:
            print(f"Generation failed: {str(e)}")
            return text
    
    # Decode result
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Remove task prefix
    if result.startswith("सुधारें: "):
        result = result[8:].strip()
    
    # Aggressive cleaning of problematic output
    result = result.replace("[CLS]", "").replace("[SEP]", "").replace("[MASK]", "")
    result = result.replace("|", "").replace("/", "")
    
    # Split into words and clean
    words = result.split()
    
    # Remove obvious repetitions and clean words
    cleaned_words = []
    for word in words:
        # Skip very short or problematic words
        if len(word) < 1 or word in ["और", "एक", "है", "में", "का", "के", "की", "को", "से", "पर"] and len(cleaned_words) > 3:
            continue
        # Avoid repetition
        if not cleaned_words or word != cleaned_words[-1]:
            cleaned_words.append(word)
    
    # Reconstruct result
    result = " ".join(cleaned_words)
    
    # Final validation
    if (not result or 
        len(result) < 3 or 
        len(result.split()) > len(text.split()) * 2 or
        result == text):
        return text
    
    return result.strip()

print(f"Final clean grammar correction function created!")
print(f"   Includes robust post-processing and validation")

Model's generation config:
   GenerationConfig {
  "bos_token_id": 2,
  "eos_token_id": [
    3,
    64001
  ],
  "forced_eos_token_id": 2,
  "pad_token_id": 0
}


Tokenizer special tokens:
   pad_token_id: 0
   eos_token_id: 3
   bos_token_id: 2
   unk_token_id: 1

Grammar correction function created with debugging!


In [28]:
# Test the trained model
print(f"Testing the trained model...")
print(f"="*50)

# Test sentences with various Hindi grammar errors
test_sentences = [
    "मैं कल दिल्ली जाऊगा",           # Missing anusvara (should be जाऊंगा)
    "वो स्कूल गया हैं",              # Verb agreement (should be गया है)
    "राम और श्याम खेल रहा है",         # Plural subject (should be खेल रहे हैं)
    "मुझे यह किताब पसंद हैं",         # Agreement error (should be पसंद है)
    "बच्चे पार्क में खेल रहे हैं",       # Correct sentence
    "उसके पास बहुत पैसा हैं",         # Agreement error (should be पैसा है)
    "हम सब मिलकर काम करेगे",         # Correct sentence
    "तुम कहा जा रहे हो",            # Wrong interrogative (should be कहाँ)
]

print(f"Test Results:")
print(f"-" * 80)

for i, sentence in enumerate(test_sentences, 1):
    try:
        corrected = correct_hindi_grammar(sentence)
        
        # Determine if changed
        changed = sentence.strip() != corrected.strip()
        status_icon = "CHANGED" if changed else "UNCHANGED"
        
        print(f"\n{i:2d}. {status_icon}")
        print(f"     Original:  {sentence}")
        print(f"     Corrected: {corrected}")
        
        # If changed, show what was corrected
        if changed:
            print(f"     -> Correction applied")
        
    except Exception as e:
        print(f"\n{i:2d}. ERROR")
        print(f"     Original: {sentence}")
        print(f"     Error: {str(e)[:60]}...")

print(f"\n" + "="*50)
print(f"Model testing completed!")

Testing the trained model...
Test Results:
--------------------------------------------------------------------------------

 1. CHANGED
     Original:  मैं कल दिल्ली जाऊगा
     Corrected: [CLS] मुझे कल दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा कोई कल दिल्ली जाऊगा | | | | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | | दिल्ली जाऊगा | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | दिल्ली जाऊगा[CLS]
     -> Correction applied

 1. CHANGED
     Original:  मैं कल दिल्ली जाऊगा
     Corrected: [CLS] मुझे कल दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दि

In [31]:
# Simple test of the trained model
print(f"Simple Model Test:")
print(f"="*40)

# Test just one sentence
test_sentence = "मैं कल दिल्ली जाऊगा"
print(f"Testing: {test_sentence}")

try:
    corrected = correct_hindi_grammar(test_sentence)
    print(f"Original:  {test_sentence}")
    print(f"Corrected: {corrected}")
    print(f"Changed: {'Yes' if test_sentence != corrected else 'No'}")
except Exception as e:
    print(f"Error: {str(e)}")

print(f"="*40)

Simple Model Test:
Testing: मैं कल दिल्ली जाऊगा
Original:  मैं कल दिल्ली जाऊगा
Corrected: [CLS] मुझे कल दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा कोई कल दिल्ली जाऊगा | | | | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | | दिल्ली जाऊगा | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | दिल्ली जाऊगा[CLS]
Changed: Yes


In [34]:
# Test final correction function
print("Testing Final Grammar Correction Function:")
print("=" * 50)

test_sentences = [
    "मैं कल दिल्ली जाऊगा",           # Missing anusvara
    "वो स्कूल गया हैं",              # Verb agreement error
    "राम और श्याम खेल रहा है",         # Plural subject error
    "मुझे यह किताब पसंद हैं",         # Agreement error
    "बच्चे पार्क में खेल रहे हैं"       # Correct sentence
]

for i, sentence in enumerate(test_sentences, 1):
    try:
        corrected = correct_hindi_grammar(sentence)
        changed = sentence != corrected
        
        print(f"\n{i}. {'✓' if changed else '○'} {'CHANGED' if changed else 'UNCHANGED'}")
        print(f"   Original:  {sentence}")
        print(f"   Corrected: {corrected}")
        
    except Exception as e:
        print(f"\n{i}. ✗ ERROR")
        print(f"   Original: {sentence}")
        print(f"   Error: {str(e)[:50]}...")

print(f"\n" + "=" * 50)

Testing Final Grammar Correction Function:

1. ✓ CHANGED
   Original:  मैं कल दिल्ली जाऊगा
   Corrected: [CLS] मुझे कल दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा कोई कल दिल्ली जाऊगा | | | | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | | | | दिल्ली जाऊगा | | | | | दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | मैं कल दिल्ली जाऊगा | | | | | | दिल्ली जाऊगा[CLS]

1. ✓ CHANGED
   Original:  मैं कल दिल्ली जाऊगा
   Corrected: [CLS] मुझे कल दिल्ली जाऊगा मेरा कल दिल्ली जाऊगा दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा कोई कल दिल्ली जाऊगा दिल्ली जाऊगा मेरा कल दिल्ली जा

In [24]:
# Comprehensive evaluation on dev set
print(f"Comprehensive Evaluation on Dev Set")
print(f"="*60)

# Generate corrections for entire dev set
print(f"Generating corrections for {len(dev_df)} dev samples...")

dev_corrections = []
dev_originals = []
dev_references = []

# Process in batches to avoid memory issues
batch_size = 50
num_batches = (len(dev_df) + batch_size - 1) // batch_size

for batch_idx in tqdm(range(num_batches), desc="Generating corrections"):
    start_idx = batch_idx * batch_size
    end_idx = min(start_idx + batch_size, len(dev_df))
    
    batch_inputs = dev_df['input_text'].iloc[start_idx:end_idx].tolist()
    batch_references = dev_df['output_text'].iloc[start_idx:end_idx].tolist()
    
    # Generate corrections for batch
    for input_text, reference in zip(batch_inputs, batch_references):
        try:
            correction = correct_hindi_grammar(input_text)
            dev_corrections.append(correction)
            dev_originals.append(input_text)
            dev_references.append(reference)
        except Exception as e:
            print(f"Error processing: {input_text[:50]}... - {str(e)[:30]}...")
            dev_corrections.append(input_text)  # Fallback to original
            dev_originals.append(input_text)
            dev_references.append(reference)

print(f"Generated {len(dev_corrections)} corrections")

# Calculate metrics
print(f"\nCalculating evaluation metrics...")

# Exact match accuracy
exact_matches = sum(1 for ref, pred in zip(dev_references, dev_corrections) if ref.strip() == pred.strip())
exact_match_accuracy = exact_matches / len(dev_references) * 100

# Count changes made
changes_made = sum(1 for orig, pred in zip(dev_originals, dev_corrections) if orig.strip() != pred.strip())
change_rate = changes_made / len(dev_originals) * 100

# Count where changes were needed vs made
needs_correction = sum(1 for orig, ref in zip(dev_originals, dev_references) if orig.strip() != ref.strip())
correct_changes = sum(1 for orig, ref, pred in zip(dev_originals, dev_references, dev_corrections) 
                      if orig.strip() != ref.strip() and pred.strip() == ref.strip())

precision = correct_changes / changes_made * 100 if changes_made > 0 else 0
recall = correct_changes / needs_correction * 100 if needs_correction > 0 else 0
f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

# Print results
print(f"\nEVALUATION RESULTS:")
print(f"="*40)
print(f"Dataset Statistics:")
print(f"   Total samples: {len(dev_references):,}")
print(f"   Need correction: {needs_correction:,} ({needs_correction/len(dev_references)*100:.1f}%)")
print(f"   Perfect (no change needed): {len(dev_references) - needs_correction:,}")

print(f"\nModel Performance:")
print(f"   Exact match accuracy: {exact_match_accuracy:.1f}% ({exact_matches:,}/{len(dev_references):,})")
print(f"   Changes made: {changes_made:,} ({change_rate:.1f}%)")
print(f"   Correct changes: {correct_changes:,}")

print(f"\nCorrection Metrics:")
print(f"   Precision: {precision:.1f}% (correct changes / total changes)")
print(f"   Recall: {recall:.1f}% (correct changes / needed changes)")
print(f"   F1-Score: {f1_score:.1f}%")

# Show sample results
print(f"\nSample Results:")
print(f"-" * 80)

# Show successful corrections
successful_corrections = [(orig, ref, pred) for orig, ref, pred in zip(dev_originals, dev_references, dev_corrections)
                         if orig != ref and pred == ref]

if successful_corrections:
    print(f"\nSuccessful Corrections (showing first 3):")
    for i, (orig, ref, pred) in enumerate(successful_corrections[:3], 1):
        print(f"   {i}. Original: {orig}")
        print(f"      Reference: {ref}")
        print(f"      Predicted: {pred}")
        print()

# Show failed corrections
failed_corrections = [(orig, ref, pred) for orig, ref, pred in zip(dev_originals, dev_references, dev_corrections)
                     if orig != ref and pred != ref]

if failed_corrections:
    print(f"Failed Corrections (showing first 3):")
    for i, (orig, ref, pred) in enumerate(failed_corrections[:3], 1):
        print(f"   {i}. Original: {orig}")
        print(f"      Reference: {ref}")
        print(f"      Predicted: {pred}")
        print()

print(f"Evaluation completed!")

# Save evaluation results
eval_results = {
    'exact_match_accuracy': exact_match_accuracy,
    'change_rate': change_rate,
    'precision': precision,
    'recall': recall,
    'f1_score': f1_score,
    'total_samples': len(dev_references),
    'needs_correction': needs_correction,
    'changes_made': changes_made,
    'correct_changes': correct_changes
}

with open(f"{CONFIG['output_dir']}/evaluation_results.json", 'w') as f:
    json.dump(eval_results, f, indent=2)

globals()['evaluation_results'] = eval_results
print(f"Evaluation results saved to {CONFIG['output_dir']}/evaluation_results.json")

Comprehensive Evaluation on Dev Set
Generating corrections for 527 dev samples...


Generating corrections:   0%|          | 0/11 [00:00<?, ?it/s]

Error processing: यह भी पढ़ें  सेहत के लि एजरूरी है रो स्नान करना।... - The following `model_kwargs` a...
Error processing: शमा को बीजेप् ीने प्रचारकों का शेाड्यूल जारसी का ह... - The following `model_kwargs` a...
Error processing: नमेरे 'ऐ वतन के गाना लोगों' लता ज ीन ेकेa ेहरू लिए... - The following `model_kwargs` a...
Error processing: 50 दिनों बाद बिहार में फिर लगे झटके:सीमांचल के जिल... - The following `model_kwargs` a...
Error processing: तभी जटायु की नजर पड़ी  और वह युद्ध करने लगा रावण स... - The following `model_kwargs` a...
Error processing: पुिलिस अनंत सिंह तैयार ीइेंडिया के खलाफ लुक सनोटिस... - The following `model_kwargs` a...
Error processing: जो ाबबइडेन चल रहे हैं... - The following `model_kwargs` a...
Error processing: विकास भवन में मेडिकल टीम करेगी कार्मिकों का परीक्ष... - The following `model_kwargs` a...
Error processing: रतन टाटा लिए के इंटरव्यू पलहा भाग पढ़ने लिए दखे-सु... - The following `model_kwargs` a...
Error processing: मैं जब भी Cristiano Ronaldo के बचपन से लेक

Generating corrections:   9%|▉         | 1/11 [00:00<00:02,  3.72it/s]

Error processing: बकौल मोद, ्“कोवि-ड19 के संकट बीच बड़ा इतने समय लंब... - The following `model_kwargs` a...
Error processing: केरल में एक चुनावी रैली के दौरान भाजपा के वरिष्ठ न... - The following `model_kwargs` a...
Error processing: खिलाड़ियों के तैयार किया जाएगा पैकेज... - The following `model_kwargs` a...
Error processing: मनली मसएतडीएम रमन घरसंगी ने कहा क कुल्लवी नृत्य मे... - The following `model_kwargs` a...
Error processing: क़तर की राजधानी दोहा में यमन में मानवाधिकार की स्थ... - The following `model_kwargs` a...
Error processing: चश्मदीद मजदूरों की माने तो दिल्ली के बिलकुल बॉर्डर... - The following `model_kwargs` a...
Error processing: वीवीएस लक्ष्मण और राहुल द्रविड़ अपनी ऐतिहासिक साझे... - The following `model_kwargs` a...
Error processing: निम्न सारणी में चेरापुंजी और मौसिनराम में १९७० से ... - The following `model_kwargs` a...
Error processing: सीएम वीरभद्र सिंह ने बताया के कि फोटो) राजधानी को ... - The following `model_kwargs` a...
Error processing: दिल्ली सरकार ने वन नेशन 

Generating corrections:  27%|██▋       | 3/11 [00:00<00:01,  7.89it/s]

Error processing: पांचवें मुकदमे उन्हें आज़ादी में के बाद एक कराची म... - The following `model_kwargs` a...
Error processing: चाणक्य के प्रतिशोध से लेकर... - The following `model_kwargs` a...
Error processing: सोशल मीडिया अभी के समाज की बहुत बड़ी मुसिबत है।... - The following `model_kwargs` a...
Error processing: अंाह।स चंबल के लाड़ल ेसपूतल शहीद हवलदार रामगोविंद ि... - The following `model_kwargs` a...
Error processing: मैं जीवन में अनेक भविष्य के करियर चुन सकता था, परं... - The following `model_kwargs` a...
Error processing: मध्य प्रदेश के मंत्री पीस शर्समा: बागी विधायकों को... - The following `model_kwargs` a...
Error processing: हालांकि उस समय में पीलिया से क्षतिग्रस्त था।... - The following `model_kwargs` a...
Error processing: ना किसी है ेस अपेक्षा....क... - The following `model_kwargs` a...
Error processing: 4 सितंबर: इन घटनाओं को याद भी कर आज खड़े हैं जाते ... - The following `model_kwargs` a...
Error processing: 6 से 7 काली मिर्च के दाने... - The following `model_kwargs` a...
Er

Generating corrections:  45%|████▌     | 5/11 [00:00<00:00, 10.20it/s]

Error processing: वजली-अस०३, पौडी तहसील में भारत के उत्तराखण्ड राज्य... - The following `model_kwargs` a...
Error processing: बरेली के सेल्समैनों से 30 हजार रुपये की लूट... - The following `model_kwargs` a...
Error processing: लखीपमुर हंसा को लेकर किसानों का 'रेलर ोको आंदोन' आ... - The following `model_kwargs` a...
Error processing: सेलिना जेटली को मिलेंगे 60 लाख रुपए... - The following `model_kwargs` a...
Error processing: UPJE E2020 े लिअये आवेदन करने कीब अंतिम तिथि एनक ब... - The following `model_kwargs` a...
Error processing: अपने रहे दावे... - The following `model_kwargs` a...
Error processing: बाइडन ने कहा कि अमेरिका वायरस के खिलाफ जंग में अब ... - The following `model_kwargs` a...
Error processing: किसान नेता राकेश टिकैत ने भी से पहले कहा हमें ट्रै... - The following `model_kwargs` a...
Error processing: ब्याज दरें घटने की आस में सेंसेक्स 350 अंक उछला... - The following `model_kwargs` a...
Error processing: इस कॉलेज फिजिक्स, में ताइवान साइन के में प्रोफेसर ... - The following `m

Generating corrections:  82%|████████▏ | 9/11 [00:00<00:00, 13.62it/s]

Error processing: बेटकप ो झुलसता देख पिता बसंत उसे बचने प्रयास करने ... - The following `model_kwargs` a...
Error processing: गांवोंम ें चुनाी चललस, मनुहार में जुटे ्रत्याशी... - The following `model_kwargs` a...
Error processing: जिले की नगर पंचायत कछौना पतसेनी व माधौगंज को बुधवा... - The following `model_kwargs` a...
Error processing: मोदी के नेतृ्व में भारत सरकार का जमम्-ूकश्मीरप र ख... - The following `model_kwargs` a...
Error processing: अफगानितस्‍तान में लोकतांत्रिक लोकतंत्र पर भरोसा मज... - The following `model_kwargs` a...
Error processing: ताड़का उन राक्षसों में सबसे अधिक भयंकर तथा विनाशका... - The following `model_kwargs` a...
Error processing: वार्ड नौ : नौ दिनों से पानी से घिरे हैं लोग... - The following `model_kwargs` a...
Error processing: खाताधारक को डराकर पूछ लिया अकाउंट नंबर और पिन, फिर... - The following `model_kwargs` a...
Error processing: ये भी पढ़ें:-  BSNL का यूजर्स को तोहफा, 30 नहीं अब ... - The following `model_kwargs` a...
Error processing: िल्ली के और आम आादमी

Generating corrections: 100%|██████████| 11/11 [00:00<00:00, 12.15it/s]

Generated 527 corrections

Calculating evaluation metrics...

EVALUATION RESULTS:
Dataset Statistics:
   Total samples: 527
   Need correction: 283 (53.7%)
   Perfect (no change needed): 244

Model Performance:
   Exact match accuracy: 46.3% (244/527)
   Changes made: 0 (0.0%)
   Correct changes: 0

Correction Metrics:
   Precision: 0.0% (correct changes / total changes)
   Recall: 0.0% (correct changes / needed changes)
   F1-Score: 0.0%

Sample Results:
--------------------------------------------------------------------------------
Failed Corrections (showing first 3):
   1. Original: यह भी पढ़ें  सेहत के लि एजरूरी है रो स्नान करना।
      Reference: यह भी पढ़ें - सेहत के लिए जरूरी है रोज स्नान करना।
      Predicted: यह भी पढ़ें  सेहत के लि एजरूरी है रो स्नान करना।

   2. Original: शमा को बीजेप् ीने प्रचारकों का शेाड्यूल जारसी का है दिया इसके मुताबकिप ्रधानमंत्री नरेंद्र मोदी हरिैयाणाै में चार रैलियाकं
      Reference: #रविवार शाम को बीजेपी ने स्टार प्रचारकों का शेड्यूल जारी कर दिया है.




In [None]:
# Summary and next steps
print(f"INDICBART HINDI TRAINING COMPLETE!")
print(f"="*60)

print(f"\nTraining Summary:")
if 'training_results' in globals():
    print(f"   Training completed successfully")
    print(f"   Final training loss: {training_results.training_loss:.4f}")
    print(f"   Training time: {training_results.metrics.get('train_runtime', 0):.1f} seconds")
else:
    print(f"   Training results not available")

if 'final_eval_results' in globals():
    print(f"   Final eval loss: {final_eval_results['eval_loss']:.4f}")
    print(f"   Eval samples/sec: {final_eval_results['eval_samples_per_second']:.1f}")

print(f"\nModel Performance:")
if 'evaluation_results' in globals():
    results = evaluation_results
    print(f"   Exact Match: {results['exact_match_accuracy']:.1f}%")
    print(f"   F1-Score: {results['f1_score']:.1f}%")
    print(f"   Precision: {results['precision']:.1f}%")
    print(f"   Recall: {results['recall']:.1f}%")
    print(f"   Change Rate: {results['change_rate']:.1f}%")

print(f"\nModel Files:")
output_dir = Path(CONFIG['output_dir'])
if output_dir.exists():
    model_files = list(output_dir.glob('*'))
    for file in sorted(model_files):
        size_mb = file.stat().st_size / (1024**2) if file.is_file() else 0
        file_type = "DIR" if file.is_dir() else "FILE"
        print(f"   {file_type}: {file.name} {'(' + str(size_mb) + ' MB)' if file.is_file() and size_mb > 0.1 else ''}")

print(f"\nUsage:")
print(f"   Model saved to: {CONFIG['output_dir']}")
print(f"   Use function: correct_hindi_grammar('your sentence here')")
print(f"   Example: correct_hindi_grammar('मैं कल दिल्ली जाऊगा')")

print(f"\nNext Steps:")
print(f"   1. Test on more examples")
print(f"   2. Deploy the model for inference")
print(f"   3. Integrate with your error classification system")
print(f"   4. Consider fine-tuning on domain-specific data")

print(f"\nReady for Hindi Grammar Correction!")

# Final test
print(f"\nFinal Test:")
test_input = "मैं कल दिल्ली जाऊगा"
try:
    test_output = correct_hindi_grammar(test_input)
    print(f"   Input: {test_input}")
    print(f"   Output: {test_output}")
    print(f"   Status: {'Corrected' if test_input != test_output else 'Unchanged'}")
except Exception as e:
    print(f"   Test failed: {str(e)}")