# Grammar Correction Model Training
## Based on ChaosLingua System Architecture
### Implements Structured Chaos philosophy with error harvesting

In [None]:
# Install PyTorch with CUDA support (Kaggle GPU)
%pip install torch torchvision torchaudio
%pip install transformers datasets evaluate accelerate
%pip install protobuf sentencepiece tiktoken --quiet

In [None]:
# Setup HuggingFace API access
import os

from huggingface_hub import login

# SECURITY: never hard-code an access token in a notebook. A token that was
# committed in this cell earlier must be treated as leaked and revoked in the
# Hugging Face account settings. Provide a fresh one through the HF_TOKEN
# environment variable instead (for example via the platform's secret store).
hf_api_key = os.environ.get("HF_TOKEN")

if hf_api_key:
    login(token=hf_api_key)
else:
    # NOTE(review): the datasets and checkpoint used below look public, so an
    # anonymous session is presumably enough - confirm before relying on it.
    print("HF_TOKEN is not set - skipping Hugging Face login")

In [None]:
import pandas as pd
import requests

def load_hf_dataset_simple(dataset_name, split='train'):
    """Download one split of a Hub dataset and return it as one DataFrame.

    The Hub parquet-listing endpoint is queried for ``dataset_name`` /
    ``split`` (config ``default``); the response is treated as a list of
    parquet URLs.  Each URL is read with pandas and the pieces are
    concatenated with a fresh index.  Progress is printed as it goes.

    Returns ``None`` when the listing request does not answer with status
    200, when none of the files could be read, or when any other exception
    is raised along the way.  A file that fails to load is reported and
    skipped without aborting the others.
    """
    listing_endpoint = (
        "https://huggingface.co/api/datasets/"
        f"{dataset_name}/parquet/default/{split}"
    )
    print(f"üîó Loading: {dataset_name} ({split})")

    try:
        # Ask the Hub which parquet files make up this split
        listing = requests.get(listing_endpoint, timeout=30)
        if listing.status_code != 200:
            print(f"‚ùå API failed: {listing.status_code}")
            return None

        file_urls = listing.json()
        print(f"üìÅ Found {len(file_urls)} parquet file(s)")

        # Read every file; a broken one is reported and skipped
        frames = []
        for position, file_url in enumerate(file_urls, start=1):
            print(f"  Loading file {position}: {file_url}")
            try:
                frame = pd.read_parquet(file_url)
                frames.append(frame)
                print(f"    ‚úÖ {len(frame)} rows")
            except Exception as read_error:
                print(f"    ‚ùå Failed: {str(read_error)}")

        if len(frames) == 0:
            print("‚ùå No files loaded successfully")
            return None

        # Stitch the pieces together under a single fresh index
        combined = pd.concat(frames, ignore_index=True)
        print(f"üéâ SUCCESS: {len(combined)} rows, {len(combined.columns)} columns")
        return combined

    except Exception as error:
        print(f"‚ùå Error: {str(error)}")
        return None

# Datasets to fetch, each paired with the splits requested for it
datasets_config = [
    ('upb-nlp/gec-ro-texts', ['train', 'validation']),
    ('upb-nlp/gec_ro_cna', ['train', 'test']),
    ('upb-nlp/gec-ro-comments', ['train', 'validation', 'test'])
]

# dataset name -> {split name -> DataFrame}; only successful loads are stored
loaded_datasets = {}
banner = "=" * 60

for dataset_name, splits in datasets_config:
    print(f"\n{banner}")
    print(f"üì¶ Dataset: {dataset_name}")

    dataset_splits = {}
    for split in splits:
        df = load_hf_dataset_simple(dataset_name, split)
        if df is None:
            # The loader has already printed why this split is missing
            continue

        dataset_splits[split] = df

        # Short report on what was just loaded
        print(f"\nüìä {split.upper()} split:")
        print(f"   Shape: {df.shape}")
        print(f"   Columns: {df.columns.tolist()}")
        size_mb = df.memory_usage(deep=True).sum() / 1024**2
        print(f"   Memory: {size_mb:.1f} MB")

        print("\nüìã Sample Data:")
        print(df.head(2))

    # A dataset with no usable split is left out entirely
    if dataset_splits:
        loaded_datasets[dataset_name] = dataset_splits

    print(banner)

print("\nüèÜ RESULTS:")
print(f"   Successfully loaded: {len(loaded_datasets)} datasets")

# Per-split row counts, summed into a grand total on the way
total_rows = 0
for name, splits_dict in loaded_datasets.items():
    print(f"\n   {name}:")
    for split, df in splits_dict.items():
        row_count = len(df)
        total_rows += row_count
        print(f"      {split}: {row_count:,} rows")

print(f"\n   TOTAL: {total_rows:,} grammar correction examples! üî•")

In [None]:
# Convert DataFrames to HuggingFace Dataset format and preprocess
from transformers import AutoTokenizer
from datasets import Dataset

# Tokenizer of the google/mt5-small checkpoint; preprocess_function below uses
# it to encode both the model inputs and the target sentences.
tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')

# First, create pairs from the alternating data
def create_pairs_from_dataset(df):
    """Pair up consecutive rows of ``df['text']`` as (original, corrected).

    Rows are consumed two at a time: the text at an even position becomes
    ``original`` and the text right after it becomes ``corrected``.  A
    trailing text without a partner is dropped.

    NOTE(review): this presumes the dataset stores the incorrect and the
    corrected sentence on alternating rows - confirm against the dataset card.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        DataFrame with the columns ``original`` and ``corrected``.  Both
        columns are present even when no pair could be formed, so callers
        can rely on the schema for empty or single-row input.
    """
    texts = df['text'].tolist()

    # zip stops at the shorter slice, which discards an unpaired last text
    pairs = [
        {'original': incorrect, 'corrected': correct}
        for incorrect, correct in zip(texts[::2], texts[1::2])
    ]

    # Explicit columns keep the schema stable when `pairs` is empty
    return pd.DataFrame(pairs, columns=['original', 'corrected'])

# Build sentence pairs for every loaded dataset/split.
# Keys have the form "<dataset_name>_<split>".
all_pairs = {}

for dataset_name, splits_dict in loaded_datasets.items():
    print(f"\n{'='*60}")
    print(f"üì¶ Processing: {dataset_name}")

    for split, df in splits_dict.items():
        pairs_df = create_pairs_from_dataset(df)
        key = f"{dataset_name}_{split}"
        all_pairs[key] = pairs_df

        print(f"   {split}: {len(df)} texts ‚Üí {len(pairs_df)} pairs")

    print("="*60)


def _pairs_for_split(split_name):
    """Return the pair frames stored in ``all_pairs`` for ``split_name``.

    The split is matched against the key suffix rather than as a substring,
    so a dataset whose name merely contains a split word (for example
    "test") is not pulled into the wrong set.
    """
    suffix = f"_{split_name}"
    return [pairs for key, pairs in all_pairs.items() if key.endswith(suffix)]


# Combine train splits for training
train_dfs = _pairs_for_split('train')
if not train_dfs:
    # pd.concat would raise a less helpful ValueError on an empty list
    raise ValueError("No train splits were loaded; cannot build the training set")
combined_train = pd.concat(train_dfs, ignore_index=True)
print(f"\nüìä Combined training pairs: {len(combined_train)}")

# Combine validation splits for validation
val_dfs = _pairs_for_split('validation')
if val_dfs:
    combined_val = pd.concat(val_dfs, ignore_index=True)
    print(f"üìä Combined validation pairs: {len(combined_val)}")
else:
    combined_val = None
    print(f"‚ö†Ô∏è  No validation splits available")

# Combine test splits for testing
test_dfs = _pairs_for_split('test')
if test_dfs:
    combined_test = pd.concat(test_dfs, ignore_index=True)
    print(f"üìä Combined test pairs: {len(combined_test)}")
else:
    combined_test = None
    print(f"‚ö†Ô∏è  No test splits available")

# Convert to HuggingFace Dataset (None is kept for splits that do not exist)
train_dataset = Dataset.from_pandas(combined_train)
val_dataset = Dataset.from_pandas(combined_val) if combined_val is not None else None
test_dataset = Dataset.from_pandas(combined_test) if combined_test is not None else None

def preprocess_function(examples):
    """Tokenize a batch of sentence pairs for seq2seq training.

    Every ``original`` sentence gets the prefix ``"correct: "`` and is
    encoded as model input; every ``corrected`` sentence is encoded as the
    target.  Both sides are truncated and padded to 128 tokens.  Pad token
    ids in the targets are replaced by -100 so that they are ignored in the
    loss calculation.

    Returns the input encoding with a ``labels`` entry added.
    """
    # Same length handling for both sides of the pair
    encode_options = dict(max_length=128, truncation=True, padding="max_length")

    prompts = ["correct: " + text for text in examples["original"]]
    model_inputs = tokenizer(prompts, **encode_options)

    target_encoding = tokenizer(text_target=examples["corrected"], **encode_options)

    # Mask padding positions in the targets with -100
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in target_encoding["input_ids"]
    ]
    return model_inputs

# Run preprocess_function over every available split, dropping the raw text columns
def _tokenize_split(dataset):
    """Map preprocess_function over ``dataset`` in batches, keeping only its outputs."""
    return dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset.column_names,
    )


tokenized_train = _tokenize_split(train_dataset)
tokenized_val = _tokenize_split(val_dataset) if val_dataset else None
tokenized_test = _tokenize_split(test_dataset) if test_dataset else None

# Size report; optional splits are listed only when they are present and non-empty
print("\n‚úÖ Tokenized datasets:")
print(f"   Train: {len(tokenized_train)} examples")
for split_label, tokenized_split in (("Validation", tokenized_val), ("Test", tokenized_test)):
    if tokenized_split:
        print(f"   {split_label}: {len(tokenized_split)} examples")

# Show the first training pair as a sanity check
first_pair = combined_train.iloc[0]
print("\nüìù Sample pair:")
print(f"   Original: {first_pair['original']}")
print(f"   Corrected: {first_pair['corrected']}")

In [None]:
# Load the seq2seq checkpoint and build the batch collator
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_pretrained('google/mt5-small', use_safetensors=True)

# The collator assembles batches for the trainer; label padding it adds uses
# -100, the same value preprocess_function writes over pad tokens.
collator_options = {
    "tokenizer": tokenizer,
    "model": model,
    "label_pad_token_id": -100,
}
data_collator = DataCollatorForSeq2Seq(**collator_options)

print("‚úÖ Model and data collator initialized")
print("   Model: google/mt5-small")
print("   Data collator: DataCollatorForSeq2Seq with label_pad_token_id=-100")

In [None]:
import torch

# Report which accelerator (if any) this session can use
gpu_present = torch.cuda.is_available()

if not gpu_present:
    print("‚ö†Ô∏è No GPU detected - check your Kaggle accelerator settings!")
else:
    print(f"‚úÖ GPU available: {torch.cuda.get_device_name(0)}")
    print(f"   CUDA version: {torch.version.cuda}")
    # total_memory is reported in bytes; show it in GiB
    total_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"   GPU memory: {total_gib:.1f} GB")

print(f"PyTorch version: {torch.__version__}")

In [None]:
# Per-epoch evaluation needs an eval dataset. tokenized_val is None when no
# validation split was loaded, so evaluation is switched off in that case
# instead of letting the trainer setup fail.
evaluation_schedule = "epoch" if tokenized_val is not None else "no"

training_args = Seq2SeqTrainingArguments(
    output_dir="./grammar_correction_model",
    eval_strategy=evaluation_schedule,
    save_strategy="epoch",
    learning_rate=3e-5,                 # lowered from an earlier 5e-5
    per_device_train_batch_size=8,      # reduced from an earlier 16
    per_device_eval_batch_size=8,       # reduced from an earlier 16
    gradient_accumulation_steps=2,      # 8 x 2 = effective batch size of 16 per device
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=False,                         # kept off: T5 has NaN issues with fp16
    max_grad_norm=1.0,                  # gradient clipping
    report_to="none",
    logging_steps=100,
)

# Summary of the key settings (values mirror the arguments above)
print("‚úÖ Training arguments configured:")
print(f"   Learning rate: 3e-5")
print(f"   Batch size: 8 (per device)")
print(f"   Gradient accumulation: 2 steps")
print(f"   FP16: False (disabled to prevent NaN)")
print(f"   Max grad norm: 1.0")

In [None]:
# Wire model, arguments, datasets, tokenizer and collator into the trainer
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_val,
    "processing_class": tokenizer,
    # The collator is passed explicitly so batches are built by DataCollatorForSeq2Seq
    "data_collator": data_collator,
}
trainer = Seq2SeqTrainer(**trainer_options)

print("‚úÖ Trainer initialized with data_collator")

In [None]:
# VALIDATION: Verify preprocessing and model setup before training
import torch

print("üîç Pre-training validation checks:\n")

# Names of the checks that failed, so the closing message reflects the
# actual outcome instead of always announcing success.
failed_checks = []

# Probe with up to four examples; fewer are used when the training set is smaller
probe_size = min(4, len(tokenized_train))

# Check 1: Verify labels contain -100 for padding
print("1Ô∏è‚É£ Checking label masking:")
sample_batch = tokenized_train[:probe_size]
sample_labels = sample_batch['labels']
has_minus_100 = any(-100 in labels for labels in sample_labels)
if has_minus_100:
    print(f"   Labels contain -100 for padding: {has_minus_100} ‚úÖ")
else:
    print(f"   ‚ùå ERROR: Labels missing -100 masking!")
    failed_checks.append("label masking")

# Count -100 tokens in first sample
minus_100_count = sum(1 for label in sample_labels[0] if label == -100)
print(f"   Sample label has {minus_100_count} masked tokens")

# Check 2: Test forward pass doesn't produce NaN
print("\n2Ô∏è‚É£ Testing forward pass:")
model.eval()
# `device` is also used by the later evaluation cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Get a small batch using data collator
test_batch = data_collator([tokenized_train[i] for i in range(probe_size)])
test_batch = {k: v.to(device) for k, v in test_batch.items()}

with torch.no_grad():
    outputs = model(**test_batch)
    loss = outputs.loss

print(f"   Forward pass loss: {loss.item():.4f}")
if torch.isnan(loss):
    print("   ‚ùå ERROR: Loss is NaN!")
    failed_checks.append("forward pass")
else:
    print("   Loss is valid (not NaN) ‚úÖ")

# Check 3: Test that generation returns non-empty text
# (only emptiness is checked; the language of the output is not inspected)
print("\n3Ô∏è‚É£ Testing generation:")
test_input = tokenized_train[0]
input_ids = torch.tensor([test_input['input_ids']]).to(device)
attention_mask = torch.tensor([test_input['attention_mask']]).to(device)

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=128,
        num_beams=2,
    )

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(f"   Input: {combined_train.iloc[0]['original'][:80]}...")
print(f"   Generated: {generated_text[:80]}...")

if generated_text and not generated_text.isspace():
    print("   Generation produces text ‚úÖ")
else:
    print("   ‚ùå ERROR: Generation produces empty/invalid text!")
    failed_checks.append("generation")

# Announce success only when every check actually passed
if failed_checks:
    print(f"\nERROR: {len(failed_checks)} validation check(s) failed: {', '.join(failed_checks)}")
else:
    print("\n‚úÖ All validation checks passed! Ready to train.")

In [None]:
# Kick off fine-tuning; the call is left as the cell's last expression so the
# notebook displays what trainer.train() returns
print("üöÄ Starting training...", end="\n\n")
trainer.train()

In [None]:
import evaluate
import numpy as np
from tqdm import tqdm

# Score the model on the held-out test pairs with sacreBLEU and exact match.
# Relies on `torch` and `device` from the earlier validation cell.
bleu = evaluate.load("sacrebleu")

# An empty test set is treated like a missing one: there is nothing to score,
# and the percentage below would otherwise divide by zero.
if tokenized_test is not None and len(tokenized_test) > 0:
    print("üîç Evaluating on test set...")

    model.eval()
    predictions = []
    references = []

    # Process in batches
    batch_size = 8

    # tokenized_test was mapped from combined_test without reordering, so the
    # two line up by position; pull the reference column once up front.
    reference_texts = combined_test['corrected'].tolist()

    for i in tqdm(range(0, len(tokenized_test), batch_size), desc="Generating corrections"):
        # Slicing a HuggingFace dataset returns a dict of lists
        batch_end = min(i + batch_size, len(tokenized_test))
        batch = tokenized_test[i:batch_end]

        # Rows were padded to a fixed length during preprocessing, so the
        # lists convert directly to rectangular tensors.
        input_ids = torch.tensor(batch['input_ids']).to(device)
        attention_mask = torch.tensor(batch['attention_mask']).to(device)

        # NOTE(review): no_repeat_ngram_size=2 forbids any repeated bigram in
        # the output, which may alter sentences that legitimately repeat
        # words - confirm this is intended for grammar correction.
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=128,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=2,
            )

        # Decode predictions
        batch_predictions = tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )
        predictions.extend(batch_predictions)

        # References for the same positions as this batch
        references.extend(reference_texts[i:batch_end])

    # sacrebleu expects a list of reference lists (one list per prediction)
    references_formatted = [[ref] for ref in references]

    # Calculate BLEU
    bleu_score = bleu.compute(
        predictions=predictions,
        references=references_formatted
    )

    print(f"\nüìä Test Results:")
    print(f"   BLEU Score: {bleu_score['score']:.2f}")
    print(f"   Precision scores: {bleu_score['precisions']}")

    # Exact match after trimming surrounding whitespace
    exact_matches = sum(1 for p, r in zip(predictions, references) if p.strip() == r.strip())
    print(f"   Exact matches: {exact_matches}/{len(predictions)} ({exact_matches/len(predictions)*100:.1f}%)")

    # Show some examples
    print(f"\nüìù Sample Corrections:")
    for i in range(min(5, len(predictions))):
        print(f"   Original:  {combined_test.iloc[i]['original']}")
        print(f"   Reference: {references[i]}")
        print(f"   Predicted: {predictions[i]}")
        print()
else:
    print("‚ö†Ô∏è No test dataset available for evaluation")

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side in one directory
export_dir = "grammar_correction_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)