In [7]:
import glob
import os
from collections import Counter
from itertools import tee

import nltk
from nltk.util import ngrams

# Ensure the NLTK 'punkt' tokenizer resource is available locally,
# downloading it only when the lookup fails.
print("Setting up NLTK...")
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    # Resource is missing: fetch it and report progress around the download
    print("Downloading NLTK's 'punkt' model...")
    nltk.download('punkt')
    print("✓ punkt tokenizer installed")
else:
    # Lookup succeeded, nothing to download
    print("✓ punkt tokenizer already installed")

def load_tokens_from_folder(folder_path):
    """
    Read every ``*.txt`` file in a folder and return all of their tokens.

    Files are processed in sorted path order so that the resulting token
    sequence is the same on every run and platform. Each line is split on
    whitespace; no NLTK tokenizer is involved.

    Args:
        folder_path: Directory that is searched (non-recursively) for
            ``*.txt`` files.

    Returns:
        list[str] | None: The tokens of all files concatenated into one flat
        list, or ``None`` when the folder contains no ``.txt`` file. A folder
        whose files are all empty yields an empty list.
    """
    # glob returns entries in arbitrary OS order; sort for reproducible output
    file_paths = sorted(glob.glob(os.path.join(folder_path, '*.txt')))

    if not file_paths:
        print(f"Error: No .txt files found in the folder '{folder_path}'.")
        return None

    print(f"Found {len(file_paths)} files to analyze:")
    for file_path in file_paths:
        print(f"  - {os.path.basename(file_path)}")

    all_tokens = []
    for file_path in file_paths:
        print(f"Processing: {os.path.basename(file_path)}")
        # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise stick to the first token and be counted as a new word
        with open(file_path, 'r', encoding='utf-8-sig') as f:
            for line in f:
                # split() without arguments already ignores surrounding whitespace
                all_tokens.extend(line.split())

    return all_tokens

def analyze_unigrams(tokens, top_n=50):
    """
    Build a plain-text report of the most frequent single tokens.

    Args:
        tokens: Iterable of tokens (normally a list of word strings). It is
            traversed exactly once, so a generator is accepted as well.
        top_n: How many of the most frequent tokens to list.

    Returns:
        str: Newline-joined report with the total token count, the vocabulary
        size, and one row per listed token showing rank, token, count and
        relative frequency (count / total tokens, as a percentage).
    """
    # One counting pass provides the total, the vocabulary size and the ranking
    counts = Counter(tokens)
    total = sum(counts.values())

    report = [
        "=" * 20 + " UNIGRAM ANALYSIS " + "=" * 20,
        f"Total words (tokens): {total}",
        f"Unique words (vocabulary): {len(counts)}\n",
        f"--- Top {top_n} Most Common Words ---\n",
        f"{'Rank':<5} {'Word':<20} {'Count':<10} {'Probability (%)':<15}",
        "-" * 60,
    ]

    # most_common() yields nothing for empty input, so total is never 0 here
    for rank, (word, count) in enumerate(counts.most_common(top_n), 1):
        probability = (count / total) * 100
        report.append(f"{rank:<5} {word:<20} {count:<10} {probability:.4f}%")

    return "\n".join(report)

def analyze_bigrams(tokens, top_n=50):
    """
    Build a plain-text report of the most frequent adjacent token pairs.

    The percentage column is the pair's relative frequency among all bigrams
    (count / total bigrams). It is not a conditional probability
    P(second | first).

    Args:
        tokens: Iterable of string tokens (normally a list). It is traversed
            once; pairs are formed from consecutive items of the flat sequence.
        top_n: How many of the most frequent bigrams to list.

    Returns:
        str: Newline-joined report with the total number of bigrams and one
        row per listed bigram showing rank, the space-joined pair, count and
        relative frequency as a percentage.
    """
    # Two lock-step iterators, the second one token ahead, stream the pairs
    # straight into the counter without building a list of every bigram
    left, right = tee(tokens)
    next(right, None)
    counts = Counter(zip(left, right))
    total = sum(counts.values())

    report = [
        "\n" + "=" * 20 + " BIGRAM ANALYSIS " + "=" * 20,
        f"Total bigrams: {total}\n",
        f"--- Top {top_n} Most Common Word Pairs ---\n",
        f"{'Rank':<5} {'Bigram':<30} {'Count':<10} {'Probability (%)':<15}",
        "-" * 70,
    ]

    # No rows are produced when there are fewer than two tokens, so total != 0
    for rank, (bigram, count) in enumerate(counts.most_common(top_n), 1):
        probability = (count / total) * 100
        bigram_str = ' '.join(bigram)
        report.append(f"{rank:<5} {bigram_str:<30} {count:<10} {probability:.4f}%")

    return "\n".join(report)

def analyze_trigrams(tokens, top_n=50):
    """
    Build a plain-text report of the most frequent three-token sequences.

    The percentage column is the sequence's relative frequency among all
    trigrams (count / total trigrams). It is not a conditional probability
    of the third token given the first two.

    Args:
        tokens: Iterable of string tokens (normally a list). It is traversed
            once; triples are formed from consecutive items of the flat
            sequence.
        top_n: How many of the most frequent trigrams to list.

    Returns:
        str: Newline-joined report with the total number of trigrams and one
        row per listed trigram showing rank, the space-joined triple, count
        and relative frequency as a percentage.
    """
    # Three lock-step iterators offset by 0, 1 and 2 tokens stream the triples
    # straight into the counter without building a list of every trigram
    first, second, third = tee(tokens, 3)
    next(second, None)
    next(third, None)
    next(third, None)
    counts = Counter(zip(first, second, third))
    total = sum(counts.values())

    report = [
        "\n" + "=" * 20 + " TRIGRAM ANALYSIS " + "=" * 20,
        f"Total trigrams: {total}\n",
        f"--- Top {top_n} Most Common Word Sequences ---\n",
        f"{'Rank':<5} {'Trigram':<40} {'Count':<10} {'Probability (%)':<15}",
        "-" * 80,
    ]

    # No rows are produced when there are fewer than three tokens, so total != 0
    for rank, (trigram, count) in enumerate(counts.most_common(top_n), 1):
        probability = (count / total) * 100
        trigram_str = ' '.join(trigram)
        report.append(f"{rank:<5} {trigram_str:<40} {count:<10} {probability:.4f}%")

    return "\n".join(report)

Setting up NLTK...
✓ punkt tokenizer already installed


In [8]:
# --- Main Execution ---
print("Starting N-gram Analysis...")

# Folder holding the PREPROCESSED text files, and where the report is written
input_folder = '../../data/processed'
output_report_file = '../../data/ngram_analysis_report_ocr.txt'

# Step 1: collect every token from the preprocessed files
all_tokens = load_tokens_from_folder(input_folder)

if not all_tokens:
    # Reached for a None result as well as for an empty token list
    print("❌ Failed to load tokens. Please check the folder path.")
else:
    print(f"Successfully loaded {len(all_tokens)} tokens.")

    # Step 2: build one report section per n-gram size
    print("Analyzing unigrams...")
    unigram_report = analyze_unigrams(all_tokens)

    print("Analyzing bigrams...")
    bigram_report = analyze_bigrams(all_tokens)

    print("Analyzing trigrams...")
    trigram_report = analyze_trigrams(all_tokens)

    # Step 3: stamp the report with the run time, assemble it and write it out
    import datetime
    current_time = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"

    final_report = f"""
N-GRAM ANALYSIS REPORT
=======================
Dataset Folder: {os.path.abspath(input_folder)}
Analysis Date: {current_time}

{unigram_report}

{bigram_report}

{trigram_report}
    """

    with open(output_report_file, 'w', encoding='utf-8') as f:
        f.write(final_report)

    print(f"\n✅ Analysis complete. Report saved to '{output_report_file}'.")

    # Short on-screen summary of what went into the report
    print("\n--- Quick Preview ---")
    print(f"Total tokens: {len(all_tokens)}")
    print(f"Unique words: {len(set(all_tokens))}")

    # Five most frequent tokens, counted directly from the token list
    word_freq = Counter(all_tokens)
    print("\nTop 5 most common words:")
    for word, count in word_freq.most_common(5):
        print(f"  '{word}': {count} times")

Starting N-gram Analysis...
Found 8 files to analyze:
  - class11_biology.txt
  - class11_chemistry.txt
  - class11_maths.txt
  - class11_physics.txt
  - class12_biology.txt
  - class12_chemistry.txt
  - class12_maths.txt
  - class12_physics.txt
Processing: class11_biology.txt
Processing: class11_chemistry.txt
Processing: class11_maths.txt
Processing: class11_physics.txt
Processing: class12_biology.txt
Processing: class12_chemistry.txt
Processing: class12_maths.txt
Processing: class12_physics.txt
Successfully loaded 867503 tokens.
Analyzing unigrams...
Analyzing bigrams...
Analyzing trigrams...

✅ Analysis complete. Report saved to '../../data/ngram_analysis_report_ocr.txt'.

--- Quick Preview ---
Total tokens: 867503
Unique words: 119901

Top 5 most common words:
  'છે.': 25673 times
  'અને': 18401 times
  '-': 12092 times
  'છે': 10840 times
  'કે': 8001 times
