In [1]:
from datasets import load_dataset
import re
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from collections import Counter
import os


In [2]:
dataset = load_dataset("ai4bharat/IndicCorpV2", "indiccorp_v2", split="guj_Gujr", streaming=True)


In [3]:
sentence_end = re.compile(r'([\.!?])\s+')
def tokenize_sentences(paragraph: str) -> list:
    # Normalize whitespace
    text = paragraph.strip().replace("\n", " ")
    
    # Split on '.', '!' or '?' plus following space
    parts = sentence_end.split(text)
    
    sentences = []
    for i in range(0, len(parts) - 1, 2):
        sent = parts[i] + parts[i+1]
        sentences.append(sent.strip())
    
    if len(parts) % 2 == 1 and parts[-1].strip():
        sentences.append(parts[-1].strip())
    
    return sentences


In [4]:
# Word tokenization
# Unicode-aware regex patterns
URL_PATTERN    = r'https?://[^\s]+\.\w{2,}'
EMAIL_PATTERN  = r'[\w\.-]+@[\w\.-]+\.\w+'
DATE_PATTERN   = r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b'
DECIMAL_PATTERN = r'\d+\.\d+'
NUMBER_PATTERN = r'\d+'
# Gujarati words range: \u0A80–\u0AFF;
GUJ_WORD       = r'[\u0A80-\u0AFF]+'
PUNCT_PATTERN  = r'[^\w\s\u0A80-\u0AFF]'  # punctuation not part of Gujarati

# Combine all token rules
token_re = re.compile(
    f"{URL_PATTERN}|{EMAIL_PATTERN}|{DATE_PATTERN}|{DECIMAL_PATTERN}|{NUMBER_PATTERN}|{GUJ_WORD}|{PUNCT_PATTERN}"
)

def tokenize_words(sentence):
    return token_re.findall(sentence)


In [5]:
max_sentences = 100000  # Increased for better corpus size
batch_size = 10000     # Process in batches for memory efficiency
output_dir = "tokenized_gujarati_corpus"
os.makedirs(output_dir, exist_ok=True)


In [6]:
def process_and_save_batch(sentences_batch, batch_num):
    """Process a batch of sentences and save to parquet"""
    tokenized_sentences = []
    
    for sent in sentences_batch:
        tokens = tokenize_words(sent)
        if tokens:  # Only include sentences with tokens
            # Join tokens with spaces to create tokenized sentence
            tokenized_sentence = ' '.join(tokens)
            tokenized_sentences.append({
                'original_sentence': sent,
                'tokenized_sentence': tokenized_sentence,
                'token_count': len(tokens)
            })
    
    if tokenized_sentences:
        # Create DataFrame
        df = pd.DataFrame(tokenized_sentences)
        
        # Save as parquet with compression
        output_file = os.path.join(output_dir, f"gujarati_tokenized_batch_{batch_num:04d}.parquet")
        df.to_parquet(
            output_file, 
            compression='snappy',  # Fast compression with good ratio
            engine='pyarrow'
        )
        
        print(f"Saved batch {batch_num} with {len(tokenized_sentences)} sentences to {output_file}")
        return len(tokenized_sentences), df['tokenized_sentence'].tolist()
    
    return 0, []


In [7]:
# Process dataset in batches
sentence_count = 0
batch_num = 0
current_batch = []
all_tokens = []
batch_stats = []

print("Starting corpus processing...")


Starting corpus processing...


In [8]:
for data in dataset:
    if max_sentences is not None and sentence_count >= max_sentences:
        break
    
    text = data.get("text", "").strip()
    if not text:
        continue
    
    sentences = tokenize_sentences(text)
    for sent in sentences:
        if max_sentences is not None and sentence_count >= max_sentences:
            break
        
        current_batch.append(sent)
        sentence_count += 1
        
        # Process batch when it reaches batch_size
        if len(current_batch) >= batch_size:
            batch_sentences, batch_tokenized = process_and_save_batch(current_batch, batch_num)
            if batch_sentences > 0:
                # Collect tokens for overall statistics
                for tokenized_sent in batch_tokenized:
                    all_tokens.extend(tokenized_sent.split())
                
                batch_stats.append({
                    'batch_num': batch_num,
                    'sentences': batch_sentences
                })
            
            current_batch = []
            batch_num += 1


Saved batch 0 with 9999 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0000.parquet
Saved batch 1 with 9998 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0001.parquet
Saved batch 2 with 9998 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0002.parquet
Saved batch 3 with 9998 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0003.parquet
Saved batch 4 with 10000 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0004.parquet
Saved batch 5 with 9998 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0005.parquet
Saved batch 6 with 9998 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0006.parquet
Saved batch 7 with 9999 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0007.parquet
Saved batch 8 with 9999 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_0008.parquet
Saved batch 9 with 9999 sentences to tokenized_gujarati_corpus\gujarati_tokenized_batch_00

In [9]:
# Process remaining sentences in the last batch
if current_batch:
    batch_sentences, batch_tokenized = process_and_save_batch(current_batch, batch_num)
    if batch_sentences > 0:
        for tokenized_sent in batch_tokenized:
            all_tokens.extend(tokenized_sent.split())
        
        batch_stats.append({
            'batch_num': batch_num,
            'sentences': batch_sentences
        })


In [10]:
total_sentences = sum(stat['sentences'] for stat in batch_stats)
total_words = len(all_tokens)
total_characters = sum(len(token) for token in all_tokens)
unique_tokens = set(all_tokens)


In [11]:
avg_sentence_length = total_words / total_sentences if total_sentences else 0
avg_word_length = total_characters / total_words if total_words else 0
type_token_ratio = len(unique_tokens) / total_words if total_words else 0


In [12]:
print("\n" + "="*50)
print("📊 FINAL CORPUS STATISTICS")
print("="*50)
print(f"i.   Total number of sentences         : {total_sentences:,}")
print(f"ii.  Total number of words             : {total_words:,}")
print(f"iii. Total number of characters        : {total_characters:,}")
print(f"iv.  Average sentence length (words)   : {avg_sentence_length:.2f}")
print(f"v.   Average word length (characters)  : {avg_word_length:.2f}")
print(f"vi.  Type/Token Ratio (TTR)            : {type_token_ratio:.4f}")
print(f"vii. Number of batches created         : {len(batch_stats)}")



📊 FINAL CORPUS STATISTICS
i.   Total number of sentences         : 99,986
ii.  Total number of words             : 1,678,173
iii. Total number of characters        : 7,360,779
iv.  Average sentence length (words)   : 16.78
v.   Average word length (characters)  : 4.39
vi.  Type/Token Ratio (TTR)            : 0.0802
vii. Number of batches created         : 10


In [None]:
token_counts = Counter(all_tokens)
most_common = token_counts.most_common(100)


In [14]:
print("\n🔢 Top 100 Most Frequent Words:")
for i, (word, freq) in enumerate(most_common, 1):
    print(f"{i:>2}. {word:<15}  —  {freq:,} times")



🔢 Top 100 Most Frequent Words:
 1. .                —  96,827 times
 2. છે               —  63,146 times
 3. ,                —  46,822 times
 4. અને              —  28,565 times
 5. આ                —  18,318 times
 6. કે               —  16,257 times
 7. પણ               —  13,070 times
 8. માટે             —  12,940 times
 9. -                —  10,646 times
10. એક               —  9,998 times
11. કરી              —  9,814 times
12. પર               —  9,669 times
13. તે               —  8,503 times
14. જ                —  8,414 times
15. સાથે             —  8,322 times
16. હતી              —  8,270 times
17. તો               —  6,216 times
18. હતો              —  5,776 times
19. હતા              —  5,490 times
20. નથી              —  5,162 times


In [15]:
most_common

[('.', 96827),
 ('છે', 63146),
 (',', 46822),
 ('અને', 28565),
 ('આ', 18318),
 ('કે', 16257),
 ('પણ', 13070),
 ('માટે', 12940),
 ('-', 10646),
 ('એક', 9998),
 ('કરી', 9814),
 ('પર', 9669),
 ('તે', 8503),
 ('જ', 8414),
 ('સાથે', 8322),
 ('હતી', 8270),
 ('તો', 6216),
 ('હતો', 5776),
 ('હતા', 5490),
 ('નથી', 5162)]

In [16]:
total_unique_tokens = len(token_counts)
hapax_legomena = sum(1 for count in token_counts.values() if count == 1)


In [17]:
print(f"\n📈 Frequency Distribution:")
print(f"Total unique tokens      : {total_unique_tokens:,}")
print(f"Hapax legomena (freq=1)  : {hapax_legomena:,}")
print(f"Hapax percentage         : {(hapax_legomena/total_unique_tokens)*100:.2f}%")



📈 Frequency Distribution:
Total unique tokens      : 134,613
Hapax legomena (freq=1)  : 76,792
Hapax percentage         : 57.05%


In [18]:
# Save overall statistics to a separate file
stats_data = {
    'metric': [
        'Total Sentences', 'Total Words', 'Total Characters', 
        'Average Sentence Length', 'Average Word Length', 'Type-Token Ratio',
        'Unique Tokens', 'Hapax Legomena'
    ],
    'value': [
        total_sentences, total_words, total_characters,
        avg_sentence_length, avg_word_length, type_token_ratio,
        total_unique_tokens, hapax_legomena
    ]
}

stats_df = pd.DataFrame(stats_data)
stats_file = os.path.join(output_dir, "corpus_statistics.parquet")
stats_df.to_parquet(stats_file, compression='snappy')

# Save top words frequency
freq_data = pd.DataFrame(most_common, columns=['word', 'frequency'])
freq_file = os.path.join(output_dir, "word_frequencies_top20.parquet")
freq_data.to_parquet(freq_file, compression='snappy')

print(f"\n💾 Files saved in directory: {output_dir}/")
print("   - gujarati_tokenized_batch_XXXX.parquet (tokenized sentences)")
print("   - corpus_statistics.parquet (overall statistics)")
print("   - word_frequencies_top20.parquet (most frequent words)")



💾 Files saved in directory: tokenized_gujarati_corpus/
   - gujarati_tokenized_batch_XXXX.parquet (tokenized sentences)
   - corpus_statistics.parquet (overall statistics)
   - word_frequencies_top20.parquet (most frequent words)


In [19]:
print("\n📖 Example: Reading saved data")
print("-" * 30)

# Read first batch
first_batch_file = os.path.join(output_dir, "gujarati_tokenized_batch_0000.parquet")
if os.path.exists(first_batch_file):
    sample_df = pd.read_parquet(first_batch_file)
    print(f"Sample from first batch (showing first 3 rows):")
    for idx in range(min(3, len(sample_df))):
        print(f"Original: {sample_df.iloc[idx]['original_sentence'][:50]}...")
        print(f"Tokenized: {sample_df.iloc[idx]['tokenized_sentence'][:50]}...")
        print(f"Tokens: {sample_df.iloc[idx]['token_count']}")
        print()



📖 Example: Reading saved data
------------------------------
Sample from first batch (showing first 3 rows):
Original: આ વીડિયો જુઓ: ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધી...
Tokenized: આ વીડિયો જુઓ : ઊંઝા માર્કેટયાર્ડ આજથી 25 જુલાઈ સુધ...
Tokens: 11

Original: મિથેનોલ આવ્યો ક્યાંથી?...
Tokenized: મિથેનોલ આવ્યો ક્યાંથી ?...
Tokens: 4

Original: આખરે ત્રણ રાજ્યોમાં મળેલ હાર પર કોંગ્રેસ અધ્યક્ષ ર...
Tokenized: આખરે ત્રણ રાજ્યોમાં મળેલ હાર પર કોંગ્રેસ અધ્યક્ષ ર...
Tokens: 17



In [20]:
def load_all_tokenized_sentences(directory):
    """Load all tokenized sentences from parquet files"""
    all_sentences = []
    
    for filename in sorted(os.listdir(directory)):
        if filename.startswith("gujarati_tokenized_batch_") and filename.endswith(".parquet"):
            filepath = os.path.join(directory, filename)
            df = pd.read_parquet(filepath)
            all_sentences.extend(df['tokenized_sentence'].tolist())
    
    return all_sentences


In [21]:
all_tokenized = load_all_tokenized_sentences(output_dir)
print(f"Total loaded sentences: {len(all_tokenized)}")


Total loaded sentences: 99986
