In [2]:
from collections import defaultdict, Counter
import math
import sentencepiece as spm
import random

In [3]:
from collections import defaultdict, Counter
import math
import pickle
import sentencepiece as spm

### Train, Test, Val Split

In [27]:
def split_data(input_file, train_file, val_file, test_file, 
               train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=42):
    """
    Split data into train, validation, and test sets.
    
    Args:
        input_file: Path to input text file (one sentence per line)
        train_file: Output path for training data
        val_file: Output path for validation data
        test_file: Output path for test data
        train_ratio: Fraction for training (default 0.8 = 80%)
        val_ratio: Fraction for validation (default 0.1 = 10%)
        test_ratio: Fraction for test (default 0.1 = 10%)
        seed: Random seed for reproducibility
    """
    # Check ratios sum to 1
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 0.001, \
        "Ratios must sum to 1.0"
    
    # Set random seed for reproducibility
    random.seed(seed)
    
    # Read all lines
    print(f"Reading data from {input_file}...")
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]
    
    total_lines = len(lines)
    print(f"Total sentences: {total_lines}")
    
    # Shuffle the data
    random.shuffle(lines)
    
    # Calculate split points
    train_end = int(total_lines * train_ratio)
    val_end = train_end + int(total_lines * val_ratio)
    
    # Split the data
    train_data = lines[:train_end]
    val_data = lines[train_end:val_end]
    test_data = lines[val_end:]
    
    # Write to files
    print(f"\nWriting splits...")
    with open(train_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(train_data))
    print(f"  Train: {len(train_data)} sentences -> {train_file}")
    
    with open(val_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(val_data))
    print(f"  Validation: {len(val_data)} sentences -> {val_file}")
    
    with open(test_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(test_data))
    print(f"  Test: {len(test_data)} sentences -> {test_file}")
    
    print(f"\nSplit complete!")
    print(f"  Train: {len(train_data)/total_lines*100:.1f}%")
    print(f"  Val: {len(val_data)/total_lines*100:.1f}%")
    print(f"  Test: {len(test_data)/total_lines*100:.1f}%")


# Example usage
if __name__ == "__main__":
    # Split your data
    split_data(
        input_file="cleaned-text.txt",
        train_file="train.txt",
        val_file="val.txt",
        test_file="test.txt",
        train_ratio=0.8,
        val_ratio=0.0,
        test_ratio=0.2,
        seed=42  # Change this for different splits
    )

Reading data from cleaned-text.txt...
Total sentences: 204627

Writing splits...
  Train: 163701 sentences -> train.txt
  Validation: 0 sentences -> val.txt
  Test: 40926 sentences -> test.txt

Split complete!
  Train: 80.0%
  Val: 0.0%
  Test: 20.0%


In [4]:
class TrigramModel:
    def __init__(self, sp_model_path):
        # Load the SentencePiece model
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)
        
        # Get special token IDs
        self.bos_id = self.sp.bos_id()  # Should be 1
        self.eos_id = self.sp.eos_id()  # Should be 2
        self.unk_id = self.sp.unk_id()  # Should be 0
        self.pad_id = self.sp.pad_id()  # Should be 3
        
        # Count how many times we see each trigram (id1, id2, id3)
        self.trigram_counts = defaultdict(int)
        # Count how many times we see each bigram (id1, id2)
        self.bigram_counts = defaultdict(int)
        # Vocabulary size from SentencePiece
        self.vocab_size = self.sp.vocab_size()
        
        print(f"Loaded SentencePiece model: {sp_model_path}")
        print(f"Vocabulary size: {self.vocab_size}")
        print(f"BOS ID: {self.bos_id}, EOS ID: {self.eos_id}")
        
    def train(self, text_file):
        """
        Train the model on a text file.
        The file should contain one sentence per line in Khmer.
        """
        print(f"\nTraining on {text_file}...")
        sentence_count = 0
        
        with open(text_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:  # Skip empty lines
                    continue
                
                # Tokenize the sentence into token IDs
                # add_bos=True adds one BOS token, add_eos=True adds EOS token
                token_ids = self.sp.encode(line, add_bos=True, add_eos=True)
                
                # Add one more BOS at the beginning (so we have 2 total for trigrams)
                token_ids = [self.bos_id] + token_ids
                
                # Count trigrams and bigrams
                for i in range(len(token_ids) - 2):
                    id1, id2, id3 = token_ids[i], token_ids[i+1], token_ids[i+2]
                    
                    # Count this specific trigram
                    self.trigram_counts[(id1, id2, id3)] += 1
                    
                    # Count the bigram context
                    self.bigram_counts[(id1, id2)] += 1
                
                sentence_count += 1
                if sentence_count % 10000 == 0:
                    print(f"  Processed {sentence_count} sentences...")
        
        print(f"Training complete! Processed {sentence_count} sentences")
        print(f"Unique trigrams: {len(self.trigram_counts)}")
        print(f"Unique bigrams: {len(self.bigram_counts)}")
    
    def get_probability(self, id1, id2, id3):
        """
        Calculate P(id3 | id1, id2) with Laplacian smoothing.
        
        Formula: P(id3|id1,id2) = (count(id1,id2,id3) + 1) / (count(id1,id2) + V)
        where V is the vocabulary size
        """
        # Get counts (will be 0 if not seen)
        trigram_count = self.trigram_counts[(id1, id2, id3)]
        bigram_count = self.bigram_counts[(id1, id2)]
        
        # Apply Laplacian smoothing
        probability = (trigram_count + 1) / (bigram_count + self.vocab_size)
        
        return probability
    
    def calculate_perplexity(self, test_file):
        """
        Calculate perplexity on a test file.
        Lower perplexity = better model.
        """
        print(f"\nCalculating perplexity on {test_file}...")
        total_log_prob = 0
        total_tokens = 0
        sentence_count = 0
        
        with open(test_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                
                # Tokenize the sentence
                token_ids = self.sp.encode(line, add_bos=True, add_eos=True)
                token_ids = [self.bos_id] + token_ids
                
                # Calculate probability for each token given its context
                for i in range(2, len(token_ids)):
                    id1, id2, id3 = token_ids[i-2], token_ids[i-1], token_ids[i]
                    
                    prob = self.get_probability(id1, id2, id3)
                    
                    # Add log probability
                    total_log_prob += math.log(prob)
                    total_tokens += 1
                
                sentence_count += 1
                if sentence_count % 10000 == 0:
                    print(f"  Processed {sentence_count} sentences...")
        
        # Calculate perplexity
        avg_log_prob = total_log_prob / total_tokens
        perplexity = math.exp(-avg_log_prob)
        
        print(f"Perplexity calculation complete!")
        print(f"Total tokens evaluated: {total_tokens}")
        
        return perplexity
    
    def generate_text(self, max_length=20, seed_text=None):
        """
        Generate text using the trigram model (optional - for fun!).
        """
        if seed_text:
            token_ids = [self.bos_id] + self.sp.encode(seed_text, add_bos=False, add_eos=False)
        else:
            token_ids = [self.bos_id, self.bos_id]
        
        for _ in range(max_length):
            if len(token_ids) < 2:
                break
                
            # Get the last two tokens as context
            id1, id2 = token_ids[-2], token_ids[-1]
            
            # Find the most probable next token
            best_id = None
            best_prob = 0
            
            for id3 in range(self.vocab_size):
                prob = self.get_probability(id1, id2, id3)
                if prob > best_prob:
                    best_prob = prob
                    best_id = id3
            
            # Stop if we generated EOS
            if best_id == self.eos_id:
                break
                
            token_ids.append(best_id)
        
        # Decode back to text
        generated_text = self.sp.decode(token_ids[2:])  # Skip the two BOS tokens
        return generated_text

In [10]:
# Path to SentencePiece model
sp_model_path = "khmer_sp.model"
    
# Path to training and test data
train_file = "train.txt"
test_file = "test.txt"

In [11]:
# Create and train model
model = TrigramModel(sp_model_path)
model.train(train_file)

Loaded SentencePiece model: khmer_sp.model
Vocabulary size: 8000
BOS ID: 1, EOS ID: 2

Training on train.txt...
  Processed 10000 sentences...
  Processed 20000 sentences...
  Processed 30000 sentences...
  Processed 40000 sentences...
  Processed 50000 sentences...
  Processed 60000 sentences...
  Processed 70000 sentences...
  Processed 80000 sentences...
  Processed 90000 sentences...
  Processed 100000 sentences...
  Processed 110000 sentences...
  Processed 120000 sentences...
  Processed 130000 sentences...
  Processed 140000 sentences...
  Processed 150000 sentences...
  Processed 160000 sentences...
Training complete! Processed 163701 sentences
Unique trigrams: 3125694
Unique bigrams: 1221720


In [12]:
# Calculate perplexity
perplexity = model.calculate_perplexity(test_file)
print(f"\nFinal Perplexity: {perplexity:.2f}")
print("(Lower perplexity = better model)")


Calculating perplexity on test.txt...
  Processed 10000 sentences...
  Processed 20000 sentences...
Perplexity calculation complete!
Total tokens evaluated: 642369

Final Perplexity: 3044.87
(Lower perplexity = better model)


In [16]:
# Optional: Generate some text (just for fun!)
print("\n--- Generating sample text ---")
generated = model.generate_text(max_length=20)
print(f"Generated: {generated}")


--- Generating sample text ---
Generated: ជាកិច្ចចាប់ផ្តើមឯកឧត្តមបានមកប្រមូលផ្តុំគ្នានៅទីលានធាន៤ផងកាន់អនិមិត្តវិមោក្ខផង


### Testing with khmer word Tokenizer

In [17]:
import json

In [18]:
class TrigramModel:
    def __init__(self, use_unk=True):
        # Count how many times we see each trigram (w1, w2, w3)
        self.trigram_counts = defaultdict(int)
        # Count how many times we see each bigram (w1, w2)
        self.bigram_counts = defaultdict(int)
        # Store all unique words we've seen
        self.vocab = set()
        
        # Special tokens
        self.BOS = "<BOS>"  # Beginning of sentence
        self.EOS = "<EOS>"  # End of sentence
        self.UNK = "<UNK>"  # Unknown word (for unseen words in test)
        self.use_unk = use_unk
        
    def train(self, sentences):
        """
        Train the model on pre-tokenized sentences.
        
        Args:
            sentences: List of lists, where each inner list contains Khmer words
                      Example: [["សាលា", "រាជធានី", "ថា"], ["ខ្ញុំ", "ស្រលាញ់"]]
        """
        print(f"Training on {len(sentences)} sentences...")
        
        for sentence in sentences:
            # Add start and end tokens (we need 2 BOS for trigram context)
            words = [self.BOS, self.BOS] + sentence + [self.EOS]
            
            # Add all words to vocabulary (including special tokens)
            self.vocab.update(words)
            
            # Count trigrams and bigrams
            for i in range(len(words) - 2):
                w1, w2, w3 = words[i], words[i+1], words[i+2]
                
                # Count this specific trigram
                self.trigram_counts[(w1, w2, w3)] += 1
                
                # Count the bigram context (first 2 words)
                self.bigram_counts[(w1, w2)] += 1
        
        # Add UNK to vocabulary if enabled
        if self.use_unk:
            self.vocab.add(self.UNK)
        
        print(f"Training complete!")
        print(f"  Vocabulary size: {len(self.vocab)}")
        print(f"  Unique trigrams: {len(self.trigram_counts)}")
        print(f"  Unique bigrams: {len(self.bigram_counts)}")
    
    def _handle_unk(self, word):
        """Replace unseen words with UNK token if enabled."""
        if self.use_unk and word not in self.vocab:
            return self.UNK
        return word
    
    def get_probability(self, w1, w2, w3):
        """
        Calculate P(w3 | w1, w2) with Laplacian smoothing.
        
        Formula: P(w3|w1,w2) = (count(w1,w2,w3) + 1) / (count(w1,w2) + V)
        where V is the vocabulary size
        """
        # Handle unknown words
        w1 = self._handle_unk(w1)
        w2 = self._handle_unk(w2)
        w3 = self._handle_unk(w3)
        
        vocab_size = len(self.vocab)
        
        # Get counts (will be 0 if not seen)
        trigram_count = self.trigram_counts[(w1, w2, w3)]
        bigram_count = self.bigram_counts[(w1, w2)]
        
        # Apply Laplacian smoothing (add 1 to numerator, add V to denominator)
        probability = (trigram_count + 1) / (bigram_count + vocab_size)
        
        return probability
    
    def calculate_perplexity(self, test_sentences):
        """
        Calculate perplexity on test sentences.
        Lower perplexity = better model.
        
        Args:
            test_sentences: List of lists of pre-tokenized words
        
        Returns:
            perplexity: Float value
        """
        print(f"\nCalculating perplexity on {len(test_sentences)} test sentences...")
        total_log_prob = 0
        total_words = 0
        
        for sentence in test_sentences:
            # Add two BOS at start and EOS at end
            words = [self.BOS, self.BOS] + sentence + [self.EOS]
            
            # Calculate probability for each word given its context
            for i in range(2, len(words)):
                w1 = words[i-2]  # Two words back
                w2 = words[i-1]  # Previous word
                w3 = words[i]     # Current word
                
                prob = self.get_probability(w1, w2, w3)
                
                # Add log probability
                total_log_prob += math.log(prob)
                total_words += 1
        
        # Calculate perplexity
        avg_log_prob = total_log_prob / total_words
        perplexity = math.exp(-avg_log_prob)
        
        print(f"Words evaluated: {total_words}")
        print(f"Perplexity: {perplexity:.2f}")
        
        return perplexity
    
    def save_model(self, filepath):
        """Save the trained model to a file."""
        model_data = {
            'trigram_counts': dict(self.trigram_counts),
            'bigram_counts': dict(self.bigram_counts),
            'vocab': list(self.vocab),
            'BOS': self.BOS,
            'EOS': self.EOS,
            'UNK': self.UNK,
            'use_unk': self.use_unk
        }
        
        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, ensure_ascii=False, indent=2)
        
        print(f"Model saved to {filepath}")
    
    def load_model(self, filepath):
        """Load a trained model from a file."""
        with open(filepath, 'r', encoding='utf-8') as f:
            model_data = json.load(f)
        
        # Convert string keys back to tuples for trigrams and bigrams
        self.trigram_counts = defaultdict(int)
        for key, value in model_data['trigram_counts'].items():
            tuple_key = eval(key)
            self.trigram_counts[tuple_key] = value
        
        self.bigram_counts = defaultdict(int)
        for key, value in model_data['bigram_counts'].items():
            tuple_key = eval(key)
            self.bigram_counts[tuple_key] = value
        
        self.vocab = set(model_data['vocab'])
        self.BOS = model_data['BOS']
        self.EOS = model_data['EOS']
        self.UNK = model_data['UNK']
        self.use_unk = model_data['use_unk']
        
        print(f"Model loaded from {filepath}")
        print(f"  Vocabulary size: {len(self.vocab)}")


# Helper function for loading data
def load_tokenized_data(filepath):
    """
    Load pre-tokenized data from a JSON file.
    
    Expected format: [["word1", "word2"], ["word3", "word4"], ...]
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


In [19]:
with open('train_set.json', 'r', encoding='utf-8') as f:
    train_set = json.load(f)

with open('test_set.json', 'r', encoding='utf-8') as f:
    test_set = json.load(f)

In [20]:
# Create and train model (with UNK handling)
model = TrigramModel(use_unk=True)
model.train(train_set)

Training on 160442 sentences...
Training complete!
  Vocabulary size: 85031
  Unique trigrams: 2627579
  Unique bigrams: 999442


In [21]:
# Test some probabilities
print("\nExample probabilities:")
print(f"P('ថា' | 'សាលា', 'រាជធានី') = {model.get_probability('សាលា', 'រាជធានី', 'ថា'):.6f}")
print(f"P('មិន' | 'រាជធានី', 'ថា') = {model.get_probability('រាជធានី', 'ថា', 'មិន'):.6f}")
print(f"P('unseen_word' | 'ខ្ញុំ', 'ស្រលាញ់') = {model.get_probability('ខ្ញុំ', 'ស្រលាញ់', 'unseen_word'):.6f}")


Example probabilities:
P('ថា' | 'សាលា', 'រាជធានី') = 0.000024
P('មិន' | 'រាជធានី', 'ថា') = 0.000024
P('unseen_word' | 'ខ្ញុំ', 'ស្រលាញ់') = 0.000012


In [22]:
# Calculate perplexity on test data
perplexity = model.calculate_perplexity(test_set)


Calculating perplexity on 40111 test sentences...
Words evaluated: 1307684
Perplexity: 19671.06


In [None]:
# Example usage
if __name__ == "__main__":
    
    # Create and train model (with UNK handling)
    model = TrigramModel(use_unk=True)
    model.train(train_data)
    
    # Test some probabilities
    print("\nExample probabilities:")
    print(f"P('ថា' | 'សាលា', 'រាជធានី') = {model.get_probability('សាលា', 'រាជធានី', 'ថា'):.6f}")
    print(f"P('មិន' | 'រាជធានី', 'ថា') = {model.get_probability('រាជធានី', 'ថា', 'មិន'):.6f}")
    print(f"P('unseen_word' | 'ខ្ញុំ', 'ស្រលាញ់') = {model.get_probability('ខ្ញុំ', 'ស្រលាញ់', 'unseen_word'):.6f}")
    
    # Calculate perplexity on test data
    perplexity = model.calculate_perplexity(test_data)
    
    # If you have your data in JSON files:
    # train_data = load_tokenized_data("train_tokenized.json")
    # test_data = load_tokenized_data("test_tokenized.json")
    # model.train(train_data)
    # perplexity = model.calculate_perplexity(test_data)
    
    # Save/load model
    # model.save_model("trigram_model.json")
    # model2 = TrigramModel()
    # model2.load_model("trigram_model.json")

### Interpolated N-Gram

In [23]:
from collections import defaultdict, Counter
import math
import pickle
import sentencepiece as spm

In [28]:
class InterpolatedNgramLM:
    def __init__(self, sp_model_path, n=3, lambdas=None, smoothing='none', k=1.0):
        """
        Initialize N-gram Language Model with Interpolation and optional Smoothing
        
        Args:
            sp_model_path: Path to trained SentencePiece model (.model file)
            n: Maximum n-gram order (default: 3 for trigrams)
            lambdas: Interpolation weights for each n-gram order (should sum to 1.0)
            smoothing: Smoothing method - 'none', 'laplace', or 'add-k' (default: 'none')
            k: Smoothing parameter (default: 1.0 for Laplace, can use smaller values like 0.1)
        """
        # Load SentencePiece model
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(sp_model_path)
        
        # Get special token IDs
        self.bos_id = self.sp.bos_id()
        self.eos_id = self.sp.eos_id()
        self.unk_id = self.sp.unk_id()
        self.pad_id = self.sp.pad_id()
        
        self.n = n
        # Lambdas should sum to 1.0 for proper interpolation
        self.lambdas = lambdas or [0.1, 0.3, 0.6]  # unigram, bigram, trigram
        assert abs(sum(self.lambdas) - 1.0) < 1e-6, "Lambdas should sum to 1.0"
        
        self.smoothing = smoothing
        self.k = k
        
        # N-gram storage: ngrams[k-1] stores k-grams
        self.ngrams = [defaultdict(Counter) for _ in range(n)]
        self.token_freq = Counter()  # Frequency of each token ID
        self.total_tokens = 0
        self.vocab_size = self.sp.vocab_size()
        
        print(f"Loaded SentencePiece model: {sp_model_path}")
        print(f"Vocabulary size: {self.vocab_size}")

    # -----------------------------
    # Training
    # -----------------------------
    def train(self, text_file):
        """
        Train the model on a text file.
        The file should contain one sentence per line.
        
        Args:
            text_file: Path to text file with one sentence per line
        """
        print(f"\nTraining on {text_file}...")
        sentence_count = 0
        
        with open(text_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                
                # Tokenize the sentence into token IDs
                token_ids = self.sp.encode(line, add_bos=True, add_eos=True)
                
                # Update token frequencies
                self.token_freq.update(token_ids)
                self.total_tokens += len(token_ids)
                
                # Build n-grams of all orders
                for i in range(len(token_ids)):
                    for order in range(1, self.n + 1):
                        if i - order + 1 >= 0:
                            # Context: previous (order-1) tokens
                            context = tuple(token_ids[i - order + 1:i])
                            # Current token
                            token = token_ids[i]
                            # Store in ngrams[order-1]
                            self.ngrams[order - 1][context][token] += 1
                
                sentence_count += 1
                if sentence_count % 10000 == 0:
                    print(f"  Processed {sentence_count} sentences...")
        
        print(f"Training complete! Processed {sentence_count} sentences")
        print(f"Total tokens: {self.total_tokens}")
        for order in range(1, self.n + 1):
            print(f"  Unique {order}-grams: {len(self.ngrams[order - 1])}")

    # -----------------------------
    # Probability Calculation
    # -----------------------------
    def prob(self, token_id, context):
        """
        Calculate interpolated probability P(token_id|context)
        Uses linear interpolation across all n-gram orders
        Supports Laplace/Add-k smoothing
        
        Args:
            token_id: Token ID to calculate probability for
            context: Tuple of previous token IDs
        
        Returns:
            Probability value
        """
        context = tuple(context)
        prob = 0.0

        for order in range(1, self.n + 1):
            lambda_k = self.lambdas[order - 1]
            
            # Get the appropriate context length for this n-gram order
            if order == 1:
                # Unigram with smoothing
                p_k = self._unigram_prob(token_id)
            else:
                # Higher order n-grams with smoothing
                ctx_len = min(order - 1, len(context))
                ctx = context[-ctx_len:] if ctx_len > 0 else ()
                p_k = self._ngram_prob(token_id, ctx, order)
            
            prob += lambda_k * p_k

        return prob
    
    def _unigram_prob(self, token_id):
        """Calculate unigram probability with optional smoothing"""
        if self.smoothing in ['laplace', 'add-k']:
            # P(token) = (count(token) + k) / (total_tokens + k * vocab_size)
            numerator = self.token_freq.get(token_id, 0) + self.k
            denominator = self.total_tokens + self.k * self.vocab_size
            return numerator / denominator
        else:
            # No smoothing
            return self.token_freq.get(token_id, 0) / max(self.total_tokens, 1)
    
    def _ngram_prob(self, token_id, context, order):
        """Calculate n-gram probability with optional smoothing"""
        counter = self.ngrams[order - 1].get(context)
        
        if self.smoothing in ['laplace', 'add-k']:
            # P(token|context) = (count(context, token) + k) / (count(context) + k * vocab_size)
            token_count = counter[token_id] if counter else 0
            context_count = sum(counter.values()) if counter else 0
            
            numerator = token_count + self.k
            denominator = context_count + self.k * self.vocab_size
            return numerator / denominator
        else:
            # No smoothing
            if counter and token_id in counter:
                return counter[token_id] / sum(counter.values())
            else:
                return 0.0

    # -----------------------------
    # Text Prediction
    # -----------------------------
    def predict_next_token(self, text, top_k=5):
        """
        Predict next token given input text.
        
        Args:
            text: Input text string
            top_k: Number of predictions to return
        
        Returns:
            List of (token_string, probability) tuples
        """
        # Encode the input text
        token_ids = self.sp.encode(text, add_bos=False, add_eos=False)
        
        # Get context (last n-1 tokens)
        context = tuple(token_ids[-(self.n - 1):]) if len(token_ids) >= self.n - 1 else tuple(token_ids)
        
        # Calculate probability for all tokens in vocabulary
        token_probs = []
        for token_id in range(self.vocab_size):
            prob = self.prob(token_id, context)
            if prob > 0:  # Only consider tokens with non-zero probability
                token_probs.append((token_id, prob))
        
        # Sort by probability and get top_k
        token_probs.sort(key=lambda x: x[1], reverse=True)
        top_predictions = token_probs[:top_k]
        
        # Decode token IDs to strings
        results = []
        for token_id, prob in top_predictions:
            token_string = self.sp.id_to_piece(token_id)
            results.append((token_string, prob))
        
        return results

    # -----------------------------
    # Perplexity Evaluation
    # -----------------------------
    def perplexity(self, test_file):
        """
        Calculate perplexity on test data
        Lower perplexity = better model
        
        Args:
            test_file: Path to test text file (one sentence per line)
        
        Returns:
            Perplexity value
        """
        print(f"\nCalculating perplexity on {test_file}...")
        log_prob = 0.0
        count = 0
        sentence_count = 0

        with open(test_file, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                
                # Tokenize the sentence
                token_ids = self.sp.encode(line, add_bos=True, add_eos=True)
                
                # Calculate probability for each token
                for i in range(len(token_ids)):
                    # Get context (previous n-1 tokens)
                    context = tuple(token_ids[max(0, i - self.n + 1):i])
                    
                    # Get probability of current token
                    p = self.prob(token_ids[i], context)
                    
                    # Add log probability (with smoothing to avoid log(0))
                    log_prob += math.log(p + 1e-10)
                    count += 1
                
                sentence_count += 1
                if sentence_count % 10000 == 0:
                    print(f"  Processed {sentence_count} sentences...")

        perplexity_value = math.exp(-log_prob / count)
        print(f"Perplexity calculation complete!")
        print(f"Total tokens evaluated: {count}")
        print(f"Perplexity: {perplexity_value:.2f}")
        
        return perplexity_value

    # -----------------------------
    # Save and Load Model
    # -----------------------------
    def save(self, filepath):
        """
        Save the trained model to a file
        
        Args:
            filepath: path where to save the model (e.g., 'khmer_lm.pkl')
        """
        model_data = {
            'sp_model_path': self.sp.model_file(),
            'n': self.n,
            'lambdas': self.lambdas,
            'smoothing': self.smoothing,
            'k': self.k,
            'ngrams': self.ngrams,
            'token_freq': self.token_freq,
            'total_tokens': self.total_tokens,
            'vocab_size': self.vocab_size
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {filepath}")
    
    @classmethod
    def load(cls, filepath):
        """
        Load a trained model from a file
        
        Args:
            filepath: path to the saved model file
        
        Returns:
            InterpolatedNgramLM: loaded model instance
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        # Create new instance
        model = cls(
            sp_model_path=model_data['sp_model_path'],
            n=model_data['n'], 
            lambdas=model_data['lambdas'],
            smoothing=model_data.get('smoothing', 'none'),
            k=model_data.get('k', 1.0)
        )
        
        # Restore all attributes
        model.ngrams = model_data['ngrams']
        model.token_freq = model_data['token_freq']
        model.total_tokens = model_data['total_tokens']
        model.vocab_size = model_data.get('vocab_size', model.sp.vocab_size())
        
        print(f"Model loaded from {filepath}")
        return model


In [29]:
model = InterpolatedNgramLM(
        sp_model_path="khmer_sp.model",
        n=3,  # Trigram model
        lambdas=[0.1, 0.3, 0.6],  # Weights for unigram, bigram, trigram
        smoothing='laplace',  # Use Laplace smoothing
        k=1.0  # Smoothing parameter
    )

Loaded SentencePiece model: khmer_sp.model
Vocabulary size: 8000


In [30]:
# Train on your data
model.train("train.txt")

# Calculate perplexity on test set
perplexity = model.perplexity("test.txt")


Training on train.txt...
  Processed 10000 sentences...
  Processed 20000 sentences...
  Processed 30000 sentences...
  Processed 40000 sentences...
  Processed 50000 sentences...
  Processed 60000 sentences...
  Processed 70000 sentences...
  Processed 80000 sentences...
  Processed 90000 sentences...
  Processed 100000 sentences...
  Processed 110000 sentences...
  Processed 120000 sentences...
  Processed 130000 sentences...
  Processed 140000 sentences...
  Processed 150000 sentences...
  Processed 160000 sentences...
Training complete! Processed 163701 sentences
Total tokens: 5282449
  Unique 1-grams: 1
  Unique 2-grams: 7997
  Unique 3-grams: 1221719

Calculating perplexity on test.txt...
  Processed 10000 sentences...
  Processed 20000 sentences...
  Processed 30000 sentences...
  Processed 40000 sentences...
Perplexity calculation complete!
Total tokens evaluated: 1325577
Perplexity: 1296.73


In [None]:
# Example usage
if __name__ == "__main__":
    # Initialize model with SentencePiece
    
    
    
    
    # Predict next token
    predictions = model.predict_next_token("ខ្ញុំ ស្រលាញ់", top_k=5)
    print("\nTop 5 predictions:")
    for token, prob in predictions:
        print(f"  {token}: {prob:.6f}")
    
    # Save the model
    model.save("khmer_interpolated_lm.pkl")
    
    # Load the model later
    # loaded_model = InterpolatedNgramLM.load("khmer_interpolated_lm.pkl")

## Testing with word tokens

In [1]:
class InterpolatedNgramLM:
    def __init__(self, n=3, lambdas=None, smoothing='none', k=1.0):
        """
        Initialize N-gram Language Model with Interpolation and optional Smoothing
        
        Args:
            n: Maximum n-gram order (default: 3 for trigrams)
            lambdas: Interpolation weights for each n-gram order (should sum to 1.0)
            smoothing: Smoothing method - 'none', 'laplace', or 'add-k' (default: 'none')
            k: Smoothing parameter (default: 1.0 for Laplace, can use smaller values like 0.1)
        """
        self.n = n
        # Lambdas should sum to 1.0 for proper interpolation
        self.lambdas = lambdas or [0.1, 0.3, 0.6]  # unigram, bigram, trigram
        assert abs(sum(self.lambdas) - 1.0) < 1e-6, "Lambdas should sum to 1.0"
        
        self.smoothing = smoothing
        self.k = k
        
        self.ngrams = [defaultdict(Counter) for _ in range(n)]
        self.word_freq = Counter()
        self.vocabulary = set()
        self.prefix_set = set()
        self.total_tokens = 0
        self.vocab_size = 0

    # -----------------------------
    # Training
    # -----------------------------
    def train(self, tokenized_sentences):
        for sent in tokenized_sentences:
            self.word_freq.update(sent)
            self.vocabulary.update(sent)
            self.total_tokens += len(sent)

            for i in range(len(sent)):
                # Build n-grams of all orders
                for k in range(1, self.n + 1):
                    if i - k + 1 >= 0:
                        context = tuple(sent[i - k + 1:i])
                        word = sent[i]
                        self.ngrams[k - 1][context][word] += 1

        self.vocab_size = len(self.vocabulary)
        self._build_prefix_set()

    def _build_prefix_set(self):
        """Build set of all possible prefixes from vocabulary"""
        for word in self.vocabulary:
            # Add all prefixes of each word (except the full word itself)
            for i in range(1, len(word)):
                self.prefix_set.add(word[:i])

    # -----------------------------
    # Probability Calculation
    # -----------------------------
    def prob(self, word, context):
        """
        Calculate interpolated probability P(word|context)
        Uses linear interpolation across all n-gram orders
        Supports Laplace/Add-k smoothing
        """
        context = tuple(context)
        prob = 0.0

        for k in range(1, self.n + 1):
            lambda_k = self.lambdas[k - 1]
            
            # Get the appropriate context length for this n-gram order
            if k == 1:
                # Unigram with smoothing
                ctx = ()
                p_k = self._unigram_prob(word)
            else:
                # Higher order n-grams with smoothing
                ctx_len = min(k - 1, len(context))
                ctx = context[-ctx_len:] if ctx_len > 0 else ()
                p_k = self._ngram_prob(word, ctx, k)
            
            prob += lambda_k * p_k

        return prob
    
    def _unigram_prob(self, word):
        """Calculate unigram probability with optional smoothing"""
        if self.smoothing in ['laplace', 'add-k']:
            # P(w) = (count(w) + k) / (total_tokens + k * vocab_size)
            numerator = self.word_freq.get(word, 0) + self.k
            denominator = self.total_tokens + self.k * self.vocab_size
            return numerator / denominator
        else:
            # No smoothing
            return self.word_freq.get(word, 0) / max(self.total_tokens, 1)
    
    def _ngram_prob(self, word, context, k):
        """Calculate n-gram probability with optional smoothing"""
        counter = self.ngrams[k - 1].get(context)
        
        if self.smoothing in ['laplace', 'add-k']:
            # P(w|context) = (count(context, w) + k) / (count(context) + k * vocab_size)
            word_count = counter[word] if counter else 0
            context_count = sum(counter.values()) if counter else 0
            
            numerator = word_count + self.k
            denominator = context_count + self.k * self.vocab_size
            return numerator / denominator
        else:
            # No smoothing
            if counter and word in counter:
                return counter[word] / sum(counter.values())
            else:
                return 0.0

    def get_distribution(self, context, vocab_subset=None):
        """
        Get probability distribution over vocabulary (or a subset)
        """
        if vocab_subset is None:
            vocab_subset = self.vocabulary
        
        return {
            word: self.prob(word, context)
            for word in vocab_subset
        }

    # -----------------------------
    # Prefix-aware prediction
    # -----------------------------
    def predict(self, tokens, top_k=5):
        """
        Predict next word given context tokens.
        If last token is a prefix, restrict predictions to words starting with that prefix.
        
        Args:
            tokens: list of words/tokens representing the context
            top_k: number of predictions to return
        
        Returns:
            list of (word, probability) tuples
        """
        prefix = ""
        context = tuple(tokens)

        # Check if last token is an incomplete word (prefix)
        if tokens:
            last = tokens[-1]
            # It's a prefix if it's in prefix_set OR not in vocabulary
            if last in self.prefix_set or (last not in self.vocabulary and len(last) > 0):
                prefix = last
                context = tuple(tokens[:-1])

        # Get probability distribution over all words
        dist = self.get_distribution(context)

        # Filter by prefix if applicable
        if prefix:
            candidates = {
                w: p for w, p in dist.items()
                if w.startswith(prefix)
            }
        else:
            candidates = dist

        # Renormalize probabilities after filtering
        Z = sum(candidates.values())
        if Z > 0:
            candidates = {w: p / Z for w, p in candidates.items()}
        else:
            # Fallback: if no candidates, return empty or most common words
            return []

        # Sort by probability and return top_k with probabilities
        sorted_predictions = sorted(candidates.items(), key=lambda x: x[1], reverse=True)[:top_k]
        
        return sorted_predictions

    def predict_words_only(self, tokens, top_k=5):
        """Convenience method that returns only words without probabilities"""
        predictions = self.predict(tokens, top_k)
        return [word for word, prob in predictions]

    # -----------------------------
    # Perplexity Evaluation
    # -----------------------------
    def perplexity(self, tokenized_sentences):
        """
        Calculate perplexity on test data
        Lower perplexity = better model
        """
        log_prob = 0.0
        count = 0

        for sent in tokenized_sentences:
            for i in range(len(sent)):
                # Get context (previous n-1 words)
                context = tuple(sent[max(0, i - self.n + 1):i])
                
                # Get probability of current word
                p = self.prob(sent[i], context)
                
                # Add log probability (with smoothing to avoid log(0))
                log_prob += math.log(p + 1e-10)
                count += 1

        return math.exp(-log_prob / count)

    # -----------------------------
    # Save and Load Model
    # -----------------------------
    def save(self, filepath):
        """
        Save the trained model to a file
        
        Args:
            filepath: path where to save the model (e.g., 'khmer_lm.pkl')
        """
        model_data = {
            'n': self.n,
            'lambdas': self.lambdas,
            'smoothing': self.smoothing,
            'k': self.k,
            'ngrams': self.ngrams,
            'word_freq': self.word_freq,
            'vocabulary': self.vocabulary,
            'prefix_set': self.prefix_set,
            'total_tokens': self.total_tokens,
            'vocab_size': self.vocab_size
        }
        
        with open(filepath, 'wb') as f:
            pickle.dump(model_data, f)
        
        print(f"Model saved to {filepath}")
    
    @classmethod
    def load(cls, filepath):
        """
        Load a trained model from a file
        
        Args:
            filepath: path to the saved model file
        
        Returns:
            InterpolatedNgramLM: loaded model instance
        """
        with open(filepath, 'rb') as f:
            model_data = pickle.load(f)
        
        # Create new instance
        model = cls(
            n=model_data['n'], 
            lambdas=model_data['lambdas'],
            smoothing=model_data.get('smoothing', 'none'),
            k=model_data.get('k', 1.0)
        )
        
        # Restore all attributes
        model.ngrams = model_data['ngrams']
        model.word_freq = model_data['word_freq']
        model.vocabulary = model_data['vocabulary']
        model.prefix_set = model_data['prefix_set']
        model.total_tokens = model_data['total_tokens']
        model.vocab_size = model_data.get('vocab_size', len(model_data['vocabulary']))
        
        print(f"Model loaded from {filepath}")
        return model

In [5]:
def calculate_top_k_accuracy(model, tokenized_sentences, k=5):
    """
    Calculate top-k accuracy on test data.
    
    Top-k accuracy measures how often the actual next word appears 
    in the model's top-k predictions.
    
    Args:
        model: Trained InterpolatedNgramLM instance
        tokenized_sentences: List of tokenized sentences to evaluate on
        k: Number of top predictions to consider (default: 5)
    
    Returns:
        float: Top-k accuracy (between 0 and 1)
    """
    correct = 0
    total = 0
    
    for sent in tokenized_sentences:
        for i in range(len(sent)):
            # Get context (previous words)
            context = sent[max(0, i - model.n + 1):i]
            
            # Get actual next word
            actual_word = sent[i]
            
            # Get top-k predictions (returns list of words only)
            predictions = model.predict_words_only(context, top_k=k)
            
            # Check if actual word is in top-k predictions
            if actual_word in predictions:
                correct += 1
            
            total += 1
    
    accuracy = correct / total if total > 0 else 0.0
    return accuracy


In [4]:
import json

In [7]:
with open('train_set.json', 'r', encoding='utf-8') as f:
    train_set = json.load(f)

with open('test_set.json', 'r', encoding='utf-8') as f:
    test_set = json.load(f)

In [9]:
topk_test = test_set[:50]

In [10]:
# Train model
model = InterpolatedNgramLM(n=3, lambdas=[0.6, 0.35, 0.05], smoothing="laplace")
model.train(train_set)

In [11]:
accuracy_top1 = calculate_top_k_accuracy(model, topk_test, k=1)
accuracy_top5 = calculate_top_k_accuracy(model, topk_test, k=5)
accuracy_top10 = calculate_top_k_accuracy(model, topk_test, k=10)

print(f"Top-1 Accuracy: {accuracy_top1:.4f}")
print(f"Top-5 Accuracy: {accuracy_top5:.4f}")
print(f"Top-10 Accuracy: {accuracy_top10:.4f}")

Top-1 Accuracy: 0.0059
Top-5 Accuracy: 0.0236
Top-10 Accuracy: 0.0361


In [24]:
# Calculate perplexity on training data (lambdas=[0.8, 0.15, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1495.0354525241103


In [22]:
# Calculate perplexity on training data (lambdas=[0.7, 0.25, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1450.129298427265


In [20]:
# Calculate perplexity on training data (lambdas=[0.6, 0.35, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1444.8114943683643


In [None]:
# Calculate perplexity on training data (lambdas=[0.45, 0.5, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1495.3259092829674


In [16]:
# Calculate perplexity on training data (lambdas=[0.35, 0.6, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1573.1683602674264


In [14]:
# Calculate perplexity on training data (lambdas=[0.25, 0.7, 0.05])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1852.5274411902947


In [13]:
# Calculate perplexity on training data (lambdas=[0.1, 0.7, 0.2])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1852.5274411902947


In [None]:
# Calculate perplexity on training data (lambdas=[0.1, 0.6, 0.3])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 2485.2679134048326


## Bi-Gram

In [17]:
# Train model
model = InterpolatedNgramLM(n=2, lambdas=[0.8, 0.2], smoothing="laplace")
model.train(train_set)

In [7]:
# Calculate perplexity on training data (lambdas=[0.2, 0.8])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1758.8460583745057


In [9]:
# Calculate perplexity on training data (lambdas=[0.4, 0.6])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1487.708893326101


In [12]:
# Calculate perplexity on training data (lambdas=[0.0, 1.0])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 3590.6880641120074


In [14]:
# Calculate perplexity on training data (lambdas=[0.7, 0.3])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1396.5005677310155


In [16]:
# Calculate perplexity on training data (lambdas=[0.9, 0.1])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1512.084487718711


In [18]:
# Calculate perplexity on training data (lambdas=[0.8, 0.2])
ppl = model.perplexity(test_set)
print(f"\nPerplexity: {ppl}")


Perplexity: 1427.3634439645875
