# Hindi Dataset Generator for Grammar Error Correction

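This notebook:
1. Fetches 10,000 Hindi sentences from the Hugging Face IndicCorpV2 dataset (`ai4bharat/IndicCorpV2`, `hin_Deva` split)
2. Creates 5,000 identity pairs (input = output)
3. Creates 5,000 corrupted pairs containing word-level and/or character-level errors
4. Saves the generated dataset as `generated_hindi_dataset.csv` and, if a `test.csv` already exists, also writes the two combined as `combined_test_dataset.csv`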

In [None]:
# Install required packages
import subprocess
import sys

def install_package(package):
    """
    Install `package` with pip, using the interpreter that runs this notebook.

    Invoking pip as `<python> -m pip` targets the environment of the running
    kernel rather than whichever `pip` happens to be first on PATH.
    Raises subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

# Third-party packages this notebook needs. For every entry the pip
# distribution name and the importable module name are the same.
required_packages = [
    'requests',
    'pandas',
    'tqdm',
    'datasets',  # Hugging Face datasets library
]

import importlib

# Probe each dependency on its own so that *every* missing package gets
# installed, not only the first one whose import happens to fail.
for package in required_packages:
    try:
        importlib.import_module(package)
    except ImportError:
        print(f"Installing {package}...")
        install_package(package)
        # Make the import system notice packages installed after startup.
        importlib.invalidate_caches()

# All dependencies are present now; bind the names used by the later cells.
import requests
import pandas as pd
import random
import re
import json
from tqdm import tqdm
import datasets

print("All packages imported successfully!")

All packages imported successfully!


In [2]:
# Configuration
# Number of sentences to fetch, and how many of them become identity pairs
# (input == output) versus corrupted pairs (input is a noised copy of output).
# The pair-building cell indexes the first IDENTITY_PAIRS + CORRUPTED_PAIRS
# fetched sentences, so that sum must not exceed the number actually fetched.
TARGET_SENTENCES = 10000
IDENTITY_PAIRS = 5000
CORRUPTED_PAIRS = 5000

# API endpoint for Hugging Face datasets-server
# NOTE(review): BASE_URL, DATASET, CONFIG and SPLIT are not referenced by any
# code visible in this notebook; fetch_hindi_sentences passes equivalent
# string literals straight to load_dataset.
BASE_URL = "https://datasets-server.huggingface.co/rows"
DATASET = "ai4bharat/IndicCorpV2"
CONFIG = "indiccorp_v2"
SPLIT = "hin_Deva"

# Set random seed for reproducibility
# (seeds the global `random` module used for shuffling and error injection)
random.seed(42)

In [None]:
def fetch_hindi_sentences(target_count=10000, batch_size=100):
    """
    Stream Hindi sentences from the IndicCorpV2 dataset via the datasets library.

    Args:
        target_count: Maximum number of sentences to collect. A value <= 0
            returns an empty list without loading anything.
        batch_size: Unused. Retained only so existing callers that pass it
            keep working.

    Returns:
        A list of whitespace-stripped strings, each longer than 10 characters.
        The list holds at most `target_count` items and may be shorter if the
        stream ends first (a warning is printed in that case).

    Raises:
        RuntimeError: If the dataset cannot be imported, loaded or iterated.
            The original error is attached as `__cause__`.
    """
    # Nothing requested: avoid loading the dataset (and avoid returning one
    # sentence, which the append-then-check loop below would otherwise do).
    if target_count <= 0:
        return []

    try:
        from datasets import load_dataset
        print(f"Fetching {target_count} Hindi sentences using datasets library...")

        # streaming=True iterates over the corpus lazily instead of
        # downloading the full split up front.
        print("Loading IndicCorpV2 dataset...")
        dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="hin_Deva", streaming=True)

        sentences = []

        with tqdm(total=target_count, desc="Fetching sentences") as pbar:
            for example in dataset:
                raw_text = example['text'] if 'text' in example else None
                # Skip records without usable text instead of aborting the run.
                if not isinstance(raw_text, str):
                    continue

                text = raw_text.strip()
                if len(text) > 10:  # Filter out very short texts
                    sentences.append(text)
                    pbar.update(1)

                    if len(sentences) >= target_count:
                        break

        if len(sentences) < target_count:
            print(f"Warning: dataset ended after {len(sentences)} of {target_count} requested sentences")

        print(f"Successfully fetched {len(sentences)} sentences")
        return sentences

    except Exception as e:
        print(f"Error loading dataset: {e}")
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Failed to load Hindi sentences from IndicCorpV2 dataset: {e}") from e

In [4]:
def introduce_word_level_errors(sentence, error_rate=0.3):
    """
    Return a copy of `sentence` with random word-level noise applied.

    The sentence is split on whitespace and at least one operation (roughly
    `error_rate` operations per word) is attempted. Each operation is picked
    at random from: delete a word, insert a duplicate of an existing word, or
    swap two adjacent words. Delete and swap are skipped when only one word
    is left. Sentences with fewer than two words are returned unchanged.
    Words are re-joined with single spaces.
    """
    tokens = sentence.split()
    if len(tokens) < 2:
        return sentence

    noisy = list(tokens)

    # Always attempt at least one operation, even for tiny error rates.
    error_budget = max(1, int(len(tokens) * error_rate))

    for _ in range(error_budget):
        operation = random.choice(['delete', 'insert', 'swap'])

        if operation == 'insert':
            # Duplicate an existing word at any position, including the end.
            if noisy:
                duplicate = random.choice(noisy)
                position = random.randint(0, len(noisy))
                noisy.insert(position, duplicate)
        elif len(noisy) <= 1:
            # Delete and swap need at least two words; this attempt is a no-op.
            continue
        elif operation == 'delete':
            position = random.randint(0, len(noisy) - 1)
            del noisy[position]
        else:
            # Swap a word with its right-hand neighbour.
            left = random.randint(0, len(noisy) - 2)
            right = left + 1
            noisy[left], noisy[right] = noisy[right], noisy[left]

    return ' '.join(noisy)

In [5]:
def introduce_character_level_errors(sentence, error_rate=0.1):
    """
    Return a copy of `sentence` with random character-level noise applied.

    At least one operation (roughly `error_rate` operations per character) is
    attempted. Each operation is picked at random from: delete a non-space
    character, insert a duplicate of an existing non-space character at a
    random position, or swap two adjacent characters. Operations work on
    individual code points of the string.

    Strings shorter than two characters are returned unchanged. If the text
    contains no non-space character, delete and insert are skipped rather than
    raising, so whitespace-only input comes back unchanged.
    """
    if len(sentence) < 2:
        return sentence

    chars = list(sentence)

    # Always attempt at least one operation, even for tiny error rates.
    num_errors = max(1, int(len(chars) * error_rate))

    for _ in range(num_errors):
        # Deletions can shrink the text; stop once nothing can be swapped.
        if len(chars) < 2:
            break

        error_type = random.choice(['delete', 'insert', 'swap'])

        if error_type == 'delete':
            # Only non-space characters are deleted, so word boundaries stay.
            non_space_indices = [i for i, c in enumerate(chars) if c != ' ']
            if non_space_indices:
                idx = random.choice(non_space_indices)
                chars.pop(idx)

        elif error_type == 'insert':
            # Duplicate an existing non-space character at a random position.
            # Guard against an empty candidate list: random.choice([]) raises.
            candidates = [c for c in chars if c != ' ']
            if candidates:
                char_to_duplicate = random.choice(candidates)
                idx = random.randint(0, len(chars))
                chars.insert(idx, char_to_duplicate)

        else:
            # Swap two adjacent characters (may involve spaces).
            idx = random.randint(0, len(chars) - 2)
            chars[idx], chars[idx + 1] = chars[idx + 1], chars[idx]

    return ''.join(chars)

In [6]:
def create_corrupted_sentence(sentence):
    """
    Corrupt a sentence with word-level errors, character-level errors, or both.

    One of the three strategies is chosen at random. Because an individual
    corruption can be a no-op (for example, word-level errors on a one-word
    sentence, or a swap of two identical neighbours), the result is compared
    with the input and the corruption is retried a bounded number of times
    so that "corrupted" pairs are not silently identical to their target.

    Returns the corrupted sentence, or the input unchanged if every attempt
    left it untouched (e.g. an empty or single-character string).
    """
    max_attempts = 5
    corrupted = sentence

    for _ in range(max_attempts):
        # Randomly choose to apply word-level or character-level errors (or both)
        error_types = random.choice([
            ['word'],
            ['character'],
            ['word', 'character']
        ])

        # Every attempt starts again from the clean sentence.
        corrupted = sentence

        if 'word' in error_types:
            corrupted = introduce_word_level_errors(corrupted)

        if 'character' in error_types:
            corrupted = introduce_character_level_errors(corrupted)

        if corrupted != sentence:
            break

    return corrupted

In [7]:
# Fetch Hindi sentences
# This is the raw sentence pool; the pair-building cell shuffles it in place
# and splits it into identity and corrupted pairs.
hindi_sentences = fetch_hindi_sentences(TARGET_SENTENCES)

Fetching 10000 Hindi sentences...


Fetching sentences:   0%|          | 0/10000 [00:02<?, ?it/s]

Error fetching data at offset 0: 501 Server Error: Not Implemented for url: https://datasets-server.huggingface.co/rows?dataset=ai4bharat%2FIndicCorpV2&config=indiccorp_v2&split=hin_Deva&offset=0&limit=100





HTTPError: 501 Server Error: Not Implemented for url: https://datasets-server.huggingface.co/rows?dataset=ai4bharat%2FIndicCorpV2&config=indiccorp_v2&split=hin_Deva&offset=0&limit=100

In [None]:
# Create training data pairs
print("Creating training data pairs...")

# Fail early with a clear message if the fetch returned fewer sentences than
# the two groups need; otherwise the loops below would stop part-way with an
# IndexError.
required_sentences = IDENTITY_PAIRS + CORRUPTED_PAIRS
if len(hindi_sentences) < required_sentences:
    raise ValueError(
        f"Need {required_sentences} sentences "
        f"({IDENTITY_PAIRS} identity + {CORRUPTED_PAIRS} corrupted) "
        f"but only {len(hindi_sentences)} were fetched"
    )

# Shuffle sentences for random selection (in place)
random.shuffle(hindi_sentences)

data_pairs = []

# Identity pairs: the first IDENTITY_PAIRS sentences, input == output
print(f"Creating {IDENTITY_PAIRS} identity pairs...")
for sentence in tqdm(hindi_sentences[:IDENTITY_PAIRS], desc="Identity pairs"):
    data_pairs.append({
        'input': sentence,
        'output': sentence,
        'type': 'identity'
    })

# Corrupted pairs: the next CORRUPTED_PAIRS sentences, so that no sentence is
# used in both groups; input is the noised text, output the clean original
print(f"Creating {CORRUPTED_PAIRS} corrupted pairs...")
for original_sentence in tqdm(hindi_sentences[IDENTITY_PAIRS:required_sentences], desc="Corrupted pairs"):
    corrupted_sentence = create_corrupted_sentence(original_sentence)

    data_pairs.append({
        'input': corrupted_sentence,
        'output': original_sentence,
        'type': 'corrupted'
    })

print(f"Total pairs created: {len(data_pairs)}")

In [None]:
# Build a DataFrame from the generated pairs
df = pd.DataFrame(data_pairs)

# Preview the first three rows of each pair type, truncated to 100 characters
identity_samples = df[df['type'] == 'identity'].head(3)
corrupted_samples = df[df['type'] == 'corrupted'].head(3)

for heading, samples in (
    ("\nSample identity pairs:", identity_samples),
    ("\nSample corrupted pairs:", corrupted_samples),
):
    print(heading)
    for _, row in samples.iterrows():
        source_preview = row['input'][:100]
        target_preview = row['output'][:100]
        print(f"Input:  {source_preview}...")
        print(f"Output: {target_preview}...")
        print()

# The 'type' column is only bookkeeping for the preview above; the exported
# dataset keeps just the input/output columns
df_final = df[['input', 'output']]

column_names = list(df_final.columns)
print(f"\nDataFrame shape: {df_final.shape}")
print(f"Columns: {column_names}")

In [None]:
# Save the generated dataset
generated_filename = 'generated_hindi_dataset.csv'
df_final.to_csv(generated_filename, index=False, encoding='utf-8')
print(f"Generated dataset saved as: {generated_filename}")

# Check if test.csv exists and combine if it does
import os

test_csv_path = 'test.csv'
combined_filename = 'combined_test_dataset.csv'

if os.path.exists(test_csv_path):
    print(f"\nFound existing {test_csv_path}. Combining datasets...")

    # Read existing test.csv
    existing_df = pd.read_csv(test_csv_path, encoding='utf-8')
    print(f"Existing dataset shape: {existing_df.shape}")
    print(f"Existing columns: {list(existing_df.columns)}")

    existing_columns = list(existing_df.columns)
    generated_columns = list(df_final.columns)

    # Work on a separate frame so df_final (already saved above and reported
    # in the summary) keeps its own column names.
    generated_for_merge = df_final

    # Ensure column names match
    if existing_columns != generated_columns:
        print("Warning: Column names don't match!")
        print(f"Existing: {existing_columns}")
        print(f"Generated: {generated_columns}")

        if set(existing_columns) == set(generated_columns):
            # Same names, different order: select by name. A positional rename
            # here would swap the input and output data.
            generated_for_merge = df_final[existing_columns]
        elif len(existing_columns) == 2 and len(generated_columns) == 2:
            # Different names but the same width: align by position.
            print("Assuming both datasets have input/output columns in the same order...")
            generated_for_merge = df_final.rename(
                columns=dict(zip(generated_columns, existing_columns))
            )
        else:
            # pd.concat aligns by column name, so unmatched columns are kept
            # and padded with missing values.
            print("Could not align columns; the combined file will contain the union of both column sets.")

    # Combine datasets
    combined_df = pd.concat([existing_df, generated_for_merge], ignore_index=True)

    # Save combined dataset
    combined_df.to_csv(combined_filename, index=False, encoding='utf-8')
    print(f"Combined dataset saved as: {combined_filename}")
    print(f"Combined dataset shape: {combined_df.shape}")

else:
    print(f"\nNo existing {test_csv_path} found.")
    print(f"You can rename {generated_filename} to {test_csv_path} if needed.")

print("\nDataset generation completed successfully!")

In [None]:
# Summary statistics: collect the report lines first, then print them in order
banner = "=" * 50
summary_lines = [
    "\n" + banner,
    "SUMMARY",
    banner,
    f"Total Hindi sentences fetched: {len(hindi_sentences)}",
    f"Identity pairs created: {IDENTITY_PAIRS}",
    f"Corrupted pairs created: {CORRUPTED_PAIRS}",
    f"Total training pairs: {len(df_final)}",
    "\nFiles created:",
    f"- {generated_filename}",
]

# The combined file is only written when an existing test.csv was found
if os.path.exists(test_csv_path):
    summary_lines.append(f"- {combined_filename}")

summary_lines.append("\nDataset is ready for training!")

for line in summary_lines:
    print(line)