In [1]:
import pandas as pd
import string

# Read the corpus one line per row, dropping the newline from every line.
# The encoding is pinned to UTF-8 so the Khmer text is decoded the same way on
# every platform instead of depending on the locale's default encoding.
with open("/kaggle/input/khmer-text/general-text.txt", encoding="utf-8") as f:
    data = [{'text': line.replace('\n', '')} for line in f]

df = pd.DataFrame(data)
print(df.shape)

(582511, 1)


In [2]:
import numpy as np
# Blank source lines were read as empty strings: turn them into NaN so that
# dropna() removes those rows. The original row labels are kept (no re-index).
df['text'] = df['text'].replace({'': np.nan})
df = df.dropna()
print(df.shape)
df.head(5)

(471632, 1)


Unnamed: 0,text
0,សាលារាជធានីថា មិនទាន់ទទួលបាន លិខិតសុំធ្វើបាតុក...
2,"យប់នេះប៉ូលិសដាក់ប៉ុស្តិ៍រហូតដល់ ៧កន្លែង, បងប្អ..."
4,លោកស្រី ឃួន សុដារី អនុប្រធានកាកបាទក្រហមកម្ពុជា...
6,គ្រោះថ្នាក់ចរាចរណ៍ទូទាំងប្រទេសថ្ងៃ១៥ ខែកុម្ភៈម...
8,លោក ហ៊ុន ម៉ានី ជួបប្រជុំជាមួយអភិបាល​ខេត្តសៀមរា...


In [3]:
import re
import string

# Characters that are always deleted: zero-width / BOM code points plus a few
# Khmer signs that live inside the Khmer block and would otherwise be kept.
unwanted_chars = ['\u200b','\u200c','\u200d','\ufeff','៙','៚','៖','ៗ','៛']

# Sentence terminators that must survive cleaning (they are split on later).
khmer_punct = '។៕'

# Built once instead of on every call. Everything outside the Khmer block
# (U+1780-U+17FF), whitespace and `khmer_punct` is deleted; this single
# whitelist already covers ASCII letters, digits and punctuation, so no
# separate ASCII-stripping passes are needed.
_UNWANTED_SET = frozenset(unwanted_chars)
_NOT_ALLOWED_PATTERN = r'[^\u1780-\u17FF\s' + khmer_punct + ']'


def clean_text(text):
    """Reduce ``text`` to Khmer characters separated by single spaces.

    Steps:
      1. delete every character listed in ``unwanted_chars``;
      2. delete every character that is not in the Khmer Unicode block,
         not whitespace and not in ``khmer_punct``;
      3. collapse whitespace runs to one space and strip both ends.

    Args:
        text: input string.

    Returns:
        The cleaned string (possibly empty).
    """
    text = ''.join(c for c in text if c not in _UNWANTED_SET)
    text = re.sub(_NOT_ALLOWED_PATTERN, '', text)
    return re.sub(r'\s+', ' ', text).strip()

# Clean every row; the 'text' column is overwritten with the cleaned strings.
cleaned_text = df['text'].apply(clean_text)
df['text'] = cleaned_text


In [4]:
# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,text
0,សាលារាជធានីថា មិនទាន់ទទួលបាន លិខិតសុំធ្វើបាតុក...
2,យប់នេះប៉ូលិសដាក់ប៉ុស្តិ៍រហូតដល់ ៧កន្លែង បងប្អូ...
4,លោកស្រី ឃួន សុដារី អនុប្រធានកាកបាទក្រហមកម្ពុជា...
6,គ្រោះថ្នាក់ចរាចរណ៍ទូទាំងប្រទេសថ្ងៃ១៥ ខែកុម្ភៈម...
8,លោក ហ៊ុន ម៉ានី ជួបប្រជុំជាមួយអភិបាលខេត្តសៀមរាប...


In [5]:
def split_sentences(text):
    """Split ``text`` on the Khmer terminators ។ and ៕.

    Each piece is stripped of surrounding whitespace; pieces that end up empty
    are discarded. The terminators themselves are not kept.
    """
    stripped_parts = (part.strip() for part in re.split(r'[។៕]', text))
    return [part for part in stripped_parts if part]

# One list of sentences per row of cleaned text.
sentence_lists = df['text'].apply(split_sentences)
df['sentences'] = sentence_lists

def chunk_text(sentence, chunk_size=120):
    """Cut ``sentence`` into consecutive slices of at most ``chunk_size`` items.

    Slicing is by code point, so a cut can land inside a multi-code-point
    Khmer cluster. The last slice may be shorter; an empty input gives ``[]``.
    """
    pieces = []
    for start in range(0, len(sentence), chunk_size):
        pieces.append(sentence[start:start + chunk_size])
    return pieces

def _chunk_all(sents):
    """Chunk every sentence in ``sents`` and return the chunks as one flat list."""
    flat = []
    for sent in sents:
        flat.extend(chunk_text(sent))
    return flat


df['chunks'] = df['sentences'].apply(_chunk_all)

# One row per chunk. Rows whose chunk list was empty explode to NaN and are
# removed together with empty-string chunks.
df_exploded = df.explode('chunks', ignore_index=True)
has_chunk = df_exploded['chunks'].notna() & (df_exploded['chunks'] != '')
df_exploded = df_exploded[has_chunk]

In [6]:
# 'sentence' and 'target' hold the same chunk text; only the first 100000
# rows are kept.
chunk_series = df_exploded['chunks']
df_new = pd.DataFrame({'sentence': chunk_series, 'target': chunk_series}).iloc[:100000, :]

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter
import numpy as np

In [12]:
def tokenize(text):
    """Character-level tokenizer: return the items of ``text`` as a list."""
    return [ch for ch in text]

special_tokens = ["<pad>", "<unk>", "<sos>", "<eos>"]

# Gather every character of every sentence, then append the special tokens.
all_text = df_new['sentence'].tolist()
tokens = []
for sentence in all_text:
    tokens.extend(tokenize(sentence))
tokens = tokens + special_tokens

# Ids follow sorted order of the distinct tokens, so the special tokens get
# whatever positions their strings sort to (they are not forced to 0..3).
vocab = sorted(set(tokens))
stoi = dict(zip(vocab, range(len(vocab))))
itos = dict(enumerate(vocab))

vocab_size = len(vocab)
print(vocab_size)
print(stoi)

95
{' ': 0, '<eos>': 1, '<pad>': 2, '<sos>': 3, '<unk>': 4, 'ក': 5, 'ខ': 6, 'គ': 7, 'ឃ': 8, 'ង': 9, 'ច': 10, 'ឆ': 11, 'ជ': 12, 'ឈ': 13, 'ញ': 14, 'ដ': 15, 'ឋ': 16, 'ឌ': 17, 'ឍ': 18, 'ណ': 19, 'ត': 20, 'ថ': 21, 'ទ': 22, 'ធ': 23, 'ន': 24, 'ប': 25, 'ផ': 26, 'ព': 27, 'ភ': 28, 'ម': 29, 'យ': 30, 'រ': 31, 'ល': 32, 'វ': 33, 'ឝ': 34, 'ឞ': 35, 'ស': 36, 'ហ': 37, 'ឡ': 38, 'អ': 39, 'ឣ': 40, 'ឤ': 41, 'ឥ': 42, 'ឦ': 43, 'ឧ': 44, 'ឩ': 45, 'ឪ': 46, 'ឫ': 47, 'ឬ': 48, 'ឭ': 49, 'ឮ': 50, 'ឯ': 51, 'ឰ': 52, 'ឱ': 53, 'ឲ': 54, 'ឳ': 55, 'ា': 56, 'ិ': 57, 'ី': 58, 'ឹ': 59, 'ឺ': 60, 'ុ': 61, 'ូ': 62, 'ួ': 63, 'ើ': 64, 'ឿ': 65, 'ៀ': 66, 'េ': 67, 'ែ': 68, 'ៃ': 69, 'ោ': 70, 'ៅ': 71, 'ំ': 72, 'ះ': 73, 'ៈ': 74, '៉': 75, '៊': 76, '់': 77, '៌': 78, '៍': 79, '៏': 80, '័': 81, '៑': 82, '្': 83, '៝': 84, '០': 85, '១': 86, '២': 87, '៣': 88, '៤': 89, '៥': 90, '៦': 91, '៧': 92, '៨': 93, '៩': 94}


In [None]:
def sentence_to_char_indices(sentence, vocab):
    """Encode ``sentence`` as ids wrapped in ``<sos>`` ... ``<eos>``.

    Args:
        sentence: iterable of characters (typically a string).
        vocab: mapping from token to id; it is indexed by key and queried with
            ``.get``, and must contain "<sos>" and "<eos>" ("<unk>" is looked
            up once per character as the fallback id).

    Returns:
        A list of ids: start id, one id per character, end id.
    """
    ids = [vocab["<sos>"]]
    for ch in sentence:
        ids.append(vocab.get(ch, vocab["<unk>"]))
    ids.append(vocab["<eos>"])
    return ids

# Encode the model inputs and targets as id sequences.
# The text columns built in this notebook are df_new['sentence'] and
# df_new['target'] (no 'normal'/'royal' columns are ever created), and the
# token -> id mapping is the dict `stoi`; `vocab` is a sorted list and cannot
# be indexed by token. assign() returns a new frame, so nothing is written
# into a slice of an earlier frame.
df_new = df_new.assign(
    input_ids=df_new['sentence'].apply(lambda s: sentence_to_char_indices(s, stoi)),
    target_ids=df_new['target'].apply(lambda s: sentence_to_char_indices(s, stoi)),
)

In [None]:
import torch.nn as nn

class LSTMTST(nn.Module):
    """Embedding -> multi-layer LSTM -> linear projection to vocabulary logits.

    Args:
        vocab_size: number of token ids (embedding rows and output classes).
        embed_dim: embedding width.
        hidden_dim: LSTM hidden width.
        num_layers: number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,  # tensors are laid out batch-first
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        ``hidden`` is handed to the LSTM unchanged (``None`` lets the LSTM use
        its default initial state); the LSTM's returned state is passed back
        so the caller can continue the sequence.
        """
        lstm_out, next_hidden = self.lstm(self.embedding(x), hidden)
        return self.fc(lstm_out), next_hidden


In [None]:
import torch
import math
from tqdm import tqdm

# Train with early stopping on validation loss.
# NOTE(review): `train_loader` and `val_loader` are not created in this cell;
# they must already exist in the kernel and yield (x_batch, y_batch) pairs.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = LSTMTST(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 20
patience = 3  # stop after 3 epochs with no improvement
best_val_loss = float('inf')
wait = 0  # consecutive epochs without a validation improvement
checkpoint_path = "best_model.pt"
checkpoint_saved = False  # becomes True once a best checkpoint has been written

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    total_train_loss = 0

    train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Train]", leave=False)
    for x_batch, y_batch in train_bar:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        logits, _ = model(x_batch)
        # Flatten all positions so every token is one classification example.
        loss = criterion(logits.reshape(-1, vocab_size), y_batch.reshape(-1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()
        train_bar.set_postfix(loss=f"{loss.item():.4f}")

    # Mean of the per-batch losses; perplexity is exp of that mean.
    avg_train_loss = total_train_loss / len(train_loader)
    train_ppl = math.exp(avg_train_loss)

    # ---- validation pass (no gradients) ----
    model.eval()
    total_val_loss = 0
    val_bar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Val]", leave=False)
    with torch.no_grad():
        for x_batch, y_batch in val_bar:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            logits, _ = model(x_batch)
            loss = criterion(logits.reshape(-1, vocab_size), y_batch.reshape(-1))
            total_val_loss += loss.item()
            val_bar.set_postfix(loss=f"{loss.item():.4f}")

    avg_val_loss = total_val_loss / len(val_loader)
    val_ppl = math.exp(avg_val_loss)

    print(
        f"Epoch {epoch+1}/{epochs} | "
        f"Train Loss: {avg_train_loss:.4f} (PPL {train_ppl:.2f}) | "
        f"Val Loss: {avg_val_loss:.4f} (PPL {val_ppl:.2f})"
    )

    # ---- checkpoint / early stopping ----
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        wait = 0
        torch.save(model.state_dict(), checkpoint_path)
        checkpoint_saved = True
        print("  ** Validation improved, model saved.")
    else:
        wait += 1
        print(f"  ** No improvement ({wait}/{patience})")

        if wait >= patience:
            print("Early stopping triggered.")
            break

# The in-memory model holds the weights of the LAST epoch, which is worse than
# the best one whenever the final epochs did not improve. Reload the best
# checkpoint so later cells use the weights that earned `best_val_loss`.
if checkpoint_saved:
    model.load_state_dict(torch.load(checkpoint_path, map_location=device))

print(f"Training finished. Best Validation Loss: {best_val_loss:.4f}")


In [None]:
import math

def evaluate_model(model, dataloader, criterion, device):
    """Compute average loss, perplexity and token accuracy over ``dataloader``.

    Args:
        model: callable returning ``(logits, hidden)`` for a batch of inputs;
            it is switched to evaluation mode and left in that mode.
        dataloader: iterable of ``(x_batch, y_batch)`` tensor pairs.
        criterion: loss called on ``(N, num_classes)`` logits and ``(N,)``
            targets. Assumed to return a batch-mean scalar, as
            ``nn.CrossEntropyLoss`` does by default.
        device: device the batches are moved to.

    Returns:
        ``(avg_loss, perplexity, accuracy)`` where ``avg_loss`` is the batch
        losses weighted by batch size, ``perplexity`` is ``exp(avg_loss)`` and
        ``accuracy`` is the fraction of target positions whose argmax
        prediction matches (every position is counted).

    Raises:
        ValueError: if the dataloader yields no samples.
    """
    model.eval()  # set model to evaluation mode
    total_loss = 0.0
    total_correct = 0
    total_tokens = 0
    total_samples = 0

    with torch.no_grad():  # no gradients needed for evaluation
        for x_batch, y_batch in dataloader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            logits, _ = model(x_batch)

            # reshape (not view) so non-contiguous tensors are accepted too
            loss = criterion(logits.reshape(-1, logits.size(-1)), y_batch.reshape(-1))
            batch_size = x_batch.size(0)
            total_loss += loss.item() * batch_size  # weight by batch size
            total_samples += batch_size

            # compute token-level accuracy
            predictions = logits.argmax(dim=-1)
            total_correct += (predictions == y_batch).sum().item()
            total_tokens += y_batch.numel()

    if total_samples == 0 or total_tokens == 0:
        raise ValueError("dataloader yielded no samples to evaluate")

    # Normalise by the samples actually seen, not len(dataloader.dataset):
    # the two differ when the loader drops the last batch or uses a sampler.
    avg_loss = total_loss / total_samples
    perplexity = math.exp(avg_loss)
    accuracy = total_correct / total_tokens

    return avg_loss, perplexity, accuracy

# Evaluate on the held-out loader and report the three metrics.
# NOTE(review): `test_loader` is not created in this cell; it must already exist.
avg_loss, perplexity, accuracy = evaluate_model(model, test_loader, criterion, device)
for report_line in (
    f"Test Loss: {avg_loss:.4f}",
    f"Test Perplexity: {perplexity:.2f}",
    f"Token-level Accuracy: {accuracy:.4f}",
):
    print(report_line)

In [None]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# # Recreate the model architecture
# model = LSTMTST(vocab_size).to(device)

# # Load the saved weights
# model.load_state_dict(torch.load("/kaggle/input/pretrained-lstm/pytorch/default/1/best_model.pt", map_location=device))

# # Set model to evaluation mode
# model.eval()


In [None]:
seed_text = "គ្រោះថ្នាក់ចរាចរណ៍ទូទាំងប្រទេសថ្ងៃ"
# Characters missing from the vocabulary are skipped.
tokens = [stoi[ch] for ch in seed_text if ch in stoi]
if not tokens:
    # An empty seed would build an empty float tensor and fail inside the
    # embedding layer with an unrelated-looking error.
    raise ValueError("seed_text contains no characters from the vocabulary")

# generate up to 100 more characters (greedy: always take the argmax)
generated = tokens.copy()
model.eval()  # inference mode for the model's layers
with torch.no_grad():  # no autograd graph is needed while sampling
    for _ in range(100):
        # NOTE(review): `seq_len` is not defined in this cell; it must already
        # exist in the kernel. Only the last seq_len ids are fed to the model.
        input_seq = torch.tensor([generated[-seq_len:]]).to(device)
        logits, _ = model(input_seq)
        next_token = logits[:, -1, :].argmax(dim=-1).item()
        if next_token == stoi["<eos>"]:
            # End of sequence: stop instead of emitting the literal "<eos>"
            # string and continuing past it.
            break
        generated.append(next_token)

# convert back to characters
generated_text = "".join([itos[t] for t in generated])
print(generated_text)
