# 📊 Hindi Dataset Integration for IndicBART

This notebook integrates the new dataset of 10k Hindi sentences into the existing IndicBART training pipeline.

**Tasks:**
1. Load and analyze the combined dataset
2. Prepare proper train/dev splits
3. Update IndicBART training configuration
4. Implement better generation parameters
5. Generate updated training code for retraining on the new data

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
import os
from datetime import datetime

# Startup banner: announce the run, stamp it with the current local time,
# and restate what this notebook is meant to achieve.
started_at = datetime.now()
print("📊 Dataset Integration Started")
print(f"⏰ Time: {started_at:%Y-%m-%d %H:%M:%S}")
print("🎯 Target: Integrate 10k Hindi sentences for better IndicBART training")

In [None]:
# Load and analyze the combined dataset
print("📥 Loading combined dataset...")

dataset_path = Path("Hindi/combined_test_dataset.csv")

if not dataset_path.exists():
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")

# Decode strictly as UTF-8 first. If that fails, stay with UTF-8 and replace
# only the undecodable bytes: a latin-1 fallback accepts every byte sequence,
# so it would "succeed" while turning Devanagari text into mojibake.
try:
    df = pd.read_csv(dataset_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(dataset_path, encoding='utf-8', encoding_errors='replace')
    print("⚠️ Dataset contains invalid UTF-8 bytes; loaded with those bytes replaced by U+FFFD")
else:
    print("✅ Dataset loaded successfully")

print("📊 Dataset Info:")
print(f"   Shape: {df.shape}")
print(f"   Columns: {list(df.columns)}")
# This is the size of the CSV on disk, not the in-memory size of the DataFrame.
print(f"   File size: {dataset_path.stat().st_size / (1024**2):.1f} MB")

# Show sample data
print("\n🔍 Sample data:")
print(df.head())

print("\n📋 Data types:")
print(df.dtypes)

print("\n🧹 Missing values:")
print(df.isnull().sum())

In [None]:
# Clean and prepare the data
print("🧹 Cleaning and preparing data...")

# Pick the input/output columns: prefer the explicit names, otherwise fall
# back to the first two columns of the file.
if 'input' in df.columns and 'output' in df.columns:
    input_col, output_col = 'input', 'output'
elif len(df.columns) >= 2:
    input_col, output_col = df.columns[0], df.columns[1]
    print(f"Using columns: '{input_col}' → '{output_col}'")
else:
    raise ValueError("Cannot identify input/output columns")

# Clean the data
print(f"Original size: {len(df)}")

# Remove null values. The explicit copy gives the column assignments below an
# independent frame to write into.
df = df.dropna(subset=[input_col, output_col]).copy()
print(f"After removing nulls: {len(df)}")

# Convert to string and strip whitespace
df[input_col] = df[input_col].astype(str).str.strip()
df[output_col] = df[output_col].astype(str).str.strip()

# Remove empty strings
df = df[(df[input_col] != '') & (df[output_col] != '')]
print(f"After removing empty strings: {len(df)}")

# Remove extremely short or long sentences (bounds are inclusive, in characters)
min_length, max_length = 5, 200
input_lengths = df[input_col].str.len()
output_lengths = df[output_col].str.len()
df = df[
    input_lengths.between(min_length, max_length)
    & output_lengths.between(min_length, max_length)
]
print(f"After length filtering ({min_length}-{max_length} chars): {len(df)}")

# Remove exact duplicate pairs so the same example cannot end up in both the
# train and the dev split later on.
df = df.drop_duplicates(subset=[input_col, output_col])
print(f"After removing duplicate pairs: {len(df)}")

# Rename columns to standard format
df_clean = df[[input_col, output_col]].copy()
df_clean.columns = ['Input sentence', 'Output sentence']

# Stop here with a clear message instead of failing with ZeroDivisionError in
# the percentage report below.
if df_clean.empty:
    raise ValueError(
        "No samples left after cleaning; check the dataset contents and the length filter"
    )

print("\n✅ Data cleaning complete!")
print(f"📊 Final dataset: {len(df_clean)} samples")

# Show data distribution
identical_count = (df_clean['Input sentence'] == df_clean['Output sentence']).sum()
different_count = len(df_clean) - identical_count

print("\n📈 Data composition:")
print(f"   Identical (input = output): {identical_count} ({identical_count/len(df_clean)*100:.1f}%)")
print(f"   Different (corrections): {different_count} ({different_count/len(df_clean)*100:.1f}%)")

# Show sample corrections (first three pairs whose output differs from the input)
print("\n🔍 Sample corrections:")
corrections = df_clean[df_clean['Input sentence'] != df_clean['Output sentence']].head(3)
for position, (_, row) in enumerate(corrections.iterrows(), start=1):
    print(f"  {position}. Input:  {row['Input sentence'][:100]}...")
    print(f"     Output: {row['Output sentence'][:100]}...")
    print()

In [None]:
# Create train/dev splits
print("🔄 Creating train/dev splits...")

# Helper label for stratification: 1 when the output differs from the input
# (a correction), 0 when the pair is identical.
df_clean['is_correction'] = (df_clean['Input sentence'] != df_clean['Output sentence']).astype(int)

# Split 90% train, 10% dev, keeping the correction/identical ratio when possible.
# Stratification raises ValueError when a class is too small to be represented
# in both splits; in that case fall back to a plain random split with the same
# seed rather than aborting the notebook.
try:
    train_df, dev_df = train_test_split(
        df_clean,
        test_size=0.1,
        random_state=42,
        stratify=df_clean['is_correction']
    )
except ValueError as split_error:
    print(f"⚠️ Stratified split failed ({split_error}); falling back to an unstratified split")
    train_df, dev_df = train_test_split(
        df_clean,
        test_size=0.1,
        random_state=42
    )

# Remove the helper column
train_df = train_df[['Input sentence', 'Output sentence']]
dev_df = dev_df[['Input sentence', 'Output sentence']]

print("✅ Split complete:")
print(f"   Train: {len(train_df)} samples")
print(f"   Dev:   {len(dev_df)} samples")

# Report the share of corrections in each split so a skewed split is visible
train_corrections = (train_df['Input sentence'] != train_df['Output sentence']).sum()
dev_corrections = (dev_df['Input sentence'] != dev_df['Output sentence']).sum()

print("\n📊 Split distribution:")
print(f"   Train corrections: {train_corrections}/{len(train_df)} ({train_corrections/len(train_df)*100:.1f}%)")
print(f"   Dev corrections:   {dev_corrections}/{len(dev_df)} ({dev_corrections/len(dev_df)*100:.1f}%)")

In [None]:
# Backup existing files and save new splits
print("💾 Saving train/dev splits...")

hindi_dir = Path('Hindi')
hindi_dir.mkdir(exist_ok=True)

train_path = hindi_dir / 'train.csv'
dev_path = hindi_dir / 'dev.csv'

# Backup existing files under a timestamped name so earlier splits are not lost
backup_suffix = datetime.now().strftime('%Y%m%d_%H%M%S')

if train_path.exists():
    backup_train = hindi_dir / f'train_backup_{backup_suffix}.csv'
    shutil.copy2(train_path, backup_train)
    print(f"📁 Backed up existing train.csv to {backup_train.name}")

if dev_path.exists():
    backup_dev = hindi_dir / f'dev_backup_{backup_suffix}.csv'
    shutil.copy2(dev_path, backup_dev)
    print(f"📁 Backed up existing dev.csv to {backup_dev.name}")

# Save new files
train_df.to_csv(train_path, index=False, encoding='utf-8')
dev_df.to_csv(dev_path, index=False, encoding='utf-8')

print("✅ New datasets saved:")
print(f"   📄 {train_path}: {len(train_df)} samples")
print(f"   📄 {dev_path}: {len(dev_df)} samples")

# Report on-disk sizes
train_size = train_path.stat().st_size / 1024
dev_size = dev_path.stat().st_size / 1024

print("\n📊 File sizes:")
print(f"   Train: {train_size:.1f} KB")
print(f"   Dev:   {dev_size:.1f} KB")

# Read both files back and compare them with the in-memory frames, so a bad
# write is reported here instead of surfacing later during training.
train_verify = pd.read_csv(train_path, encoding='utf-8')
dev_verify = pd.read_csv(dev_path, encoding='utf-8')

for split_name, expected, reloaded in (
    ('train', train_df, train_verify),
    ('dev', dev_df, dev_verify),
):
    if len(reloaded) != len(expected) or list(reloaded.columns) != list(expected.columns):
        raise RuntimeError(
            f"Verification failed for {split_name}.csv: wrote {len(expected)} rows with columns "
            f"{list(expected.columns)}, read back {len(reloaded)} rows with columns "
            f"{list(reloaded.columns)}"
        )
    # Text that looks like a missing-value marker is parsed as NaN on reload;
    # surface it because the training script reads these files the same way.
    null_cells = int(reloaded.isna().sum().sum())
    if null_cells:
        print(f"⚠️ {split_name}.csv: {null_cells} cell(s) read back as missing values")

print("\n✅ Verification:")
print(f"   Train loaded: {len(train_verify)} rows, {len(train_verify.columns)} columns")
print(f"   Dev loaded:   {len(dev_verify)} rows, {len(dev_verify.columns)} columns")
print(f"   Columns: {list(train_verify.columns)}")

In [None]:
# Create improved IndicBART training configuration
print("⚙️ Creating improved training configuration...")

# Training hyperparameters. The trailing notes are the author's rationale and
# compare against an earlier configuration that is not part of this notebook.
improved_config = {
    "MODEL_NAME": "ai4bharat/IndicBART",
    "LANGUAGE": "hindi",
    "MAX_INPUT_LENGTH": 128,            # author's note: reduced for faster training
    "MAX_TARGET_LENGTH": 128,
    "TRAIN_BATCH_SIZE": 2,              # author's note: slightly larger batches
    "EVAL_BATCH_SIZE": 4,
    "GRADIENT_ACCUMULATION_STEPS": 8,   # author's note: reduced accumulation
    "LEARNING_RATE": 5e-6,              # author's note: lower learning rate
    "NUM_EPOCHS": 5,                    # author's note: fewer epochs against overfitting
    "WARMUP_RATIO": 0.1,
    "WEIGHT_DECAY": 0.01,               # author's note: more regularization
    "MAX_GRAD_NORM": 1.0,               # author's note: relaxed gradient clipping
    "SAVE_STEPS": 500,
    "EVAL_STEPS": 500,
    "LOGGING_STEPS": 100,
    "OUTPUT_DIR": "./indicbart-hindi-improved",
    "SEED": 42,
}

# Generation settings intended to curb repeated output.
generation_config = {
    "max_new_tokens": 50,          # cap on generated length
    "min_new_tokens": 1,           # always emit at least one token
    "num_beams": 3,                # moderate beam search
    "early_stopping": True,
    "repetition_penalty": 1.5,     # strong repetition penalty
    "no_repeat_ngram_size": 3,     # block repeated 3-grams
    "length_penalty": 0.8,         # author's note: encourage shorter outputs
    "do_sample": False,            # deterministic decoding
    "temperature": 1.0,
    "top_p": 0.9,
    "top_k": 50,
}

# Echo both dictionaries, one "key: value" line per entry.
print("✅ Configuration created:")
for heading, settings in (
    ("\n🎯 Training Config:", improved_config),
    ("\n🔧 Generation Config:", generation_config),
):
    print(heading)
    for option, setting in settings.items():
        print(f"   {option}: {setting}")

# Render the two dictionaries as an importable Python module and write it out.
config_path = Path('improved_training_config.py')
generated_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

config_code = "\n".join([
    "",
    "# Improved IndicBART Training Configuration",
    f"# Generated on {generated_at}",
    "",
    "# Training parameters",
    f"IMPROVED_CONFIG = {improved_config}",
    "",
    "# Generation parameters to fix repetition issues",
    f"GENERATION_CONFIG = {generation_config}",
    "",
    "# Usage:",
    "# from improved_training_config import IMPROVED_CONFIG, GENERATION_CONFIG",
    "",
])

config_path.write_text(config_code, encoding='utf-8')

print(f"\n💾 Configuration saved to: {config_path}")

In [None]:
# Create updated training code for IndicBART notebook
print("📝 Creating updated training code...")

training_code = '''
# Updated IndicBART Training with Improved Parameters
# Use this code in your indicBART.ipynb notebook

import torch
from transformers import (
    AutoModelForSeq2SeqLM, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback
)
from datasets import Dataset
import pandas as pd
import numpy as np

print("🚀 Starting improved IndicBART training...")

# Load the new large dataset
train_df = pd.read_csv('Hindi/train.csv')
dev_df = pd.read_csv('Hindi/dev.csv')

print(f"📊 Dataset loaded:")
print(f"   Train: {len(train_df)} samples")
print(f"   Dev: {len(dev_df)} samples")

# Improved configuration
MODEL_NAME = 'ai4bharat/IndicBART'
OUTPUT_DIR = './indicbart-hindi-improved'
MAX_LENGTH = 128

# Load model and tokenizer
print("🔄 Loading model and tokenizer...")
model = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Ensure pad token is set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

# Tokenization function with simple prompting
def tokenize_function(examples):
    # Simple prompting - just add "सुधारें: " prefix
    inputs = [f"सुधारें: {text}" for text in examples['Input sentence']]
    targets = examples['Output sentence']
    
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )
    
    labels = tokenizer(
        targets,
        max_length=MAX_LENGTH,
        truncation=True,
        padding=False
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Create datasets
print("🔄 Creating datasets...")
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

# Tokenize datasets
train_tokenized = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names
)

dev_tokenized = dev_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dev_dataset.column_names
)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Training arguments with improved parameters
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=5e-6,
    warmup_ratio=0.1,
    weight_decay=0.01,
    max_grad_norm=1.0,
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    dataloader_pin_memory=False,
    report_to=None,
    seed=42
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# Start training
print("🚀 Starting training...")
trainer.train()

# Save final model
print("💾 Saving final model...")
trainer.save_model()
tokenizer.save_pretrained(OUTPUT_DIR)

print("✅ Training completed!")

# Test function with improved generation
def test_correction(text, max_length=128):
    """Test grammar correction with improved parameters"""
    input_text = f"सुधारें: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_length, truncation=True)
    
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=50,
            min_new_tokens=1,
            num_beams=3,
            early_stopping=True,
            repetition_penalty=1.5,
            no_repeat_ngram_size=3,
            length_penalty=0.8,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Clean output
    if result.startswith("सुधारें: "):
        result = result[8:].strip()
    
    return result

# Test examples
test_sentences = [
    "मैं कल दिल्ली जाऊगा",
    "वो स्कूल गया हैं",
    "राम और श्याम खेल रहा है"
]

print("\n🧪 Testing corrections:")
for sentence in test_sentences:
    corrected = test_correction(sentence)
    print(f"  Original: {sentence}")
    print(f"  Corrected: {corrected}")
    print()
'''

# Save the training code
training_code_path = Path('improved_indicbart_training.py')
with open(training_code_path, 'w', encoding='utf-8') as f:
    f.write(training_code)

print(f"✅ Updated training code saved to: {training_code_path}")
print(f"\n📋 Next steps:")
print(f"   1. Copy the training code to your indicBART.ipynb notebook")
print(f"   2. Run the improved training with the new large dataset")
print(f"   3. Test with the improved generation parameters")
print(f"   4. The repetition issues should be resolved")

In [None]:
# Summary and next steps
# Prints a closing report built from the cleaned frame, the two splits and the
# correction count computed in earlier cells.
print("🎉 Dataset Integration Complete!")
print("=" * 50)

total_samples = len(df_clean)
train_count = len(train_df)
dev_count = len(dev_df)

# Each section is a heading followed by its bullet lines, printed in order.
report_sections = (
    ("\n📊 What was accomplished:", (
        f"   ✅ Loaded {total_samples} samples from combined dataset",
        f"   ✅ Created train/dev splits ({train_count}/{dev_count})",
        "   ✅ Saved to Hindi/train.csv and Hindi/dev.csv",
        "   ✅ Created improved training configuration",
        "   ✅ Generated fixed training code",
    )),
    ("\n🎯 Key improvements:", (
        f"   🔢 Dataset size: {total_samples:,} samples (vs ~600 before)",
        f"   📈 More corrections: {different_count:,} error-correction pairs",
        "   ⚙️ Better hyperparameters: Lower LR, more regularization",
        "   🛠️ Fixed generation: Repetition penalty, ngram blocking",
        "   💾 Proper checkpointing: Early stopping, best model saving",
    )),
    ("\n🚀 Next steps:", (
        "   1. 📖 Open your indicBART.ipynb notebook",
        "   2. 🔄 Replace training cells with improved code",
        f"   3. ▶️ Run training with new {total_samples:,} sample dataset",
        "   4. 🧪 Test improved generation (no more repetition!)",
        "   5. 📊 Evaluate using your Indic-BERT error classifier",
    )),
    ("\n💡 Expected results:", (
        "   📉 Better loss convergence (more data)",
        "   🎯 Meaningful corrections (fixed generation)",
        "   🚫 No repetition loops (improved parameters)",
        "   ⚡ Faster training (optimized batch sizes)",
    )),
    ("\n📁 Files created:", (
        f"   📄 Hindi/train.csv - {train_count:,} training samples",
        f"   📄 Hindi/dev.csv - {dev_count:,} development samples",
        "   ⚙️ improved_training_config.py - Configuration",
        "   📝 improved_indicbart_training.py - Training code",
    )),
)

for section_heading, bullet_lines in report_sections:
    print(section_heading)
    for bullet in bullet_lines:
        print(bullet)

print("\n🎯 Ready to proceed with improved IndicBART training!")