In [1]:
import pandas as pd
import string

# Read the raw corpus: one record per line of the input file, with the
# trailing newline removed. Blank lines are kept here and filtered later.
# The encoding is given explicitly so decoding the Khmer text does not depend
# on the locale of the machine running the notebook.
with open("/kaggle/input/khmer-text/general-text.txt", encoding="utf-8") as f:
    data = [{'text': line.rstrip('\n')} for line in f]

df = pd.DataFrame(data)
print(df.shape)

(582511, 1)


In [2]:
import numpy as np
# Turn empty strings into missing values so that dropna() can remove them.
is_blank = df['text'] == ''
df['text'] = df['text'].mask(is_blank)
# Drop the blank rows in place; the original row labels are preserved.
df.dropna(inplace=True)
print(df.shape)
df.head()

(471632, 1)


Unnamed: 0,text
0,សាលារាជធានីថា មិនទាន់ទទួលបាន លិខិតសុំធ្វើបាតុក...
2,"យប់នេះប៉ូលិសដាក់ប៉ុស្តិ៍រហូតដល់ ៧កន្លែង, បងប្អ..."
4,លោកស្រី ឃួន សុដារី អនុប្រធានកាកបាទក្រហមកម្ពុជា...
6,គ្រោះថ្នាក់ចរាចរណ៍ទូទាំងប្រទេសថ្ងៃ១៥ ខែកុម្ភៈម...
8,លោក ហ៊ុន ម៉ានី ជួបប្រជុំជាមួយអភិបាល​ខេត្តសៀមរា...


In [3]:
import re

# Build the ASCII-punctuation deletion table once instead of once per row.
punct_table = str.maketrans('', '', string.punctuation)
df['text'] = df['text'].apply(lambda x: x.translate(punct_table))

# Anything that is not in the Khmer Unicode block (U+1780-U+17FF) and is not
# whitespace gets removed. The extra digit range and the explicitly listed
# signs appear to lie inside that block already, so they are redundant but
# harmless; the pattern is kept unchanged.
pattern = r'[^\u1780-\u17FF\u17E0-\u17E9\s។៕៚៙៛ៜ៖]+'
non_khmer_re = re.compile(pattern)  # compiled once, reused for every row
# Apply regex to clean each row
df['text'] = df['text'].apply(lambda x: non_khmer_re.sub('', x))
# Collapse runs of whitespace into a single space and trim both ends.
df['text'] = df['text'].str.replace(r'\s+', ' ', regex=True).str.strip()

In [4]:
# Preview the first rows after the cleaning step above to eyeball the result.
df.head()

Unnamed: 0,text
0,សាលារាជធានីថា មិនទាន់ទទួលបាន លិខិតសុំធ្វើបាតុក...
2,យប់នេះប៉ូលិសដាក់ប៉ុស្តិ៍រហូតដល់ ៧កន្លែង បងប្អូ...
4,លោកស្រី ឃួន សុដារី អនុប្រធានកាកបាទក្រហមកម្ពុជា...
6,គ្រោះថ្នាក់ចរាចរណ៍ទូទាំងប្រទេសថ្ងៃ១៥ ខែកុម្ភៈម...
8,លោក ហ៊ុន ម៉ានី ជួបប្រជុំជាមួយអភិបាលខេត្តសៀមរាប...


In [5]:
def split_sentences(text):
    """Split ``text`` on the Khmer sentence terminators ។ and ៕.

    Returns the pieces with surrounding whitespace stripped; pieces that are
    empty after stripping are omitted.
    """
    stripped_pieces = (piece.strip() for piece in re.split(r'[។៕]', text))
    return [piece for piece in stripped_pieces if piece]

# One list of sentences per row of cleaned text.
df['sentences'] = df['text'].map(split_sentences)

def chunk_text(sentence, chunk_size=120):
    """Cut ``sentence`` into consecutive slices of at most ``chunk_size`` items.

    Args:
        sentence: A string (or any sliceable sequence) to split.
        chunk_size: Maximum length of each chunk; must be at least 1.

    Returns:
        A list of slices in original order. The last one may be shorter than
        ``chunk_size``. An empty ``sentence`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1. Without this check a
            negative size would silently return ``[]`` and drop the text.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")
    # Slicing is by index, so a cut can fall anywhere inside the sentence.
    return [sentence[i:i + chunk_size] for i in range(0, len(sentence), chunk_size)]

# Flatten each row's sentences into one list of fixed-width chunks.
df['chunks'] = df['sentences'].apply(
    lambda sentence_list: [
        piece
        for one_sentence in sentence_list
        for piece in chunk_text(one_sentence)
    ]
)

# One row per chunk; rows whose chunk list was empty come out as missing.
df_exploded = df.explode('chunks', ignore_index=True)
# Discard rows whose chunk is missing or an empty string.
unusable = df_exploded['chunks'].isna() | (df_exploded['chunks'] == '')
df_exploded = df_exploded[~unusable]

In [6]:
# Both columns hold the same chunk text.
df_new = pd.DataFrame(
    {column: df_exploded['chunks'] for column in ('sentence', 'target')}
)

# Keep only the first 50,000 rows.
df_new = df_new.iloc[:50000]

In [7]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from collections import Counter
import numpy as np

In [8]:
def tokenize(text):
    """Return the items of ``text`` as a list (one token per character for a str)."""
    return [ch for ch in text]

# Character-level vocabulary built from every chunk in df_new.
all_text = df_new['sentence'].tolist()
tokens = [ch for chars in map(tokenize, all_text) for ch in chars]

# A sorted vocabulary gives the same ids for the same set of characters.
vocab = sorted(set(tokens))
stoi = dict(zip(vocab, range(len(vocab))))  # character -> integer id
itos = dict(enumerate(vocab))               # integer id -> character

vocab_size = len(vocab)

In [9]:
class TSTDataset(Dataset):
    """Sliding-window next-token dataset over character ids.

    Each sample is a pair ``(x, y)`` of ``seq_len`` token ids where ``y`` is
    ``x`` shifted one position to the right within the same text. Texts with
    ``seq_len`` or fewer in-vocabulary tokens contribute no samples.

    Windows are not stored as separate lists. Only the encoded sequences and a
    ``(sequence index, start offset)`` pair per window are kept, and the slices
    are made in ``__getitem__``. This avoids holding roughly ``2 * seq_len``
    integers per sample in memory.

    Relies on the module-level ``stoi`` mapping and ``tokenize`` function.
    """

    def __init__(self, texts, seq_len=50):
        """Encode ``texts`` and index every full window of ``seq_len`` tokens.

        Args:
            texts: Iterable of strings to encode.
            seq_len: Number of tokens in each input/target window (>= 1).

        Raises:
            ValueError: If ``seq_len`` is smaller than 1.
        """
        if seq_len < 1:
            raise ValueError(f"seq_len must be a positive integer, got {seq_len!r}")
        self.seq_len = seq_len
        self._sequences = []  # encoded texts that yield at least one window
        self._windows = []    # (index into _sequences, start offset) per sample
        for sentence in texts:
            # Characters missing from the vocabulary are skipped.
            token_ids = [stoi[ch] for ch in tokenize(sentence) if ch in stoi]
            n_windows = len(token_ids) - seq_len
            if n_windows <= 0:
                continue
            seq_index = len(self._sequences)
            self._sequences.append(token_ids)
            self._windows.extend((seq_index, start) for start in range(n_windows))

    def _window(self, idx):
        """Return the ``(input ids, target ids)`` lists for sample ``idx``."""
        seq_index, start = self._windows[idx]
        token_ids = self._sequences[seq_index]
        stop = start + self.seq_len
        return token_ids[start:stop], token_ids[start + 1:stop + 1]

    @property
    def data(self):
        """All ``(input ids, target ids)`` pairs as a list.

        Kept for backward compatibility with code that read the old eagerly
        built ``data`` attribute; the list is rebuilt on every access.
        """
        return [self._window(i) for i in range(len(self._windows))]

    def __len__(self):
        """Number of windows (samples) in the dataset."""
        return len(self._windows)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a pair of integer tensors ``(x, y)``."""
        x, y = self._window(idx)
        return torch.tensor(x), torch.tensor(y)

seq_len = 50
batch_size = 64

train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1  # must sum to 1.0
# The test split is whatever remains after train and val, so test_ratio is
# only honoured when the three ratios really add up to one. Check it.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "train_ratio + val_ratio + test_ratio must sum to 1.0"

# Sequential (unshuffled) split over the list of chunks.
n_total = len(all_text)
train_end = int(n_total * train_ratio)
val_end = train_end + int(n_total * val_ratio)

train_texts = all_text[:train_end]
val_texts = all_text[train_end:val_end]
test_texts = all_text[val_end:]

# Create datasets
train_dataset = TSTDataset(train_texts, seq_len=seq_len)
val_dataset = TSTDataset(val_texts, seq_len=seq_len)
test_dataset = TSTDataset(test_texts, seq_len=seq_len)

# Create dataloaders (only the training loader is shuffled)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"Train samples: {len(train_dataset)}")
print(f"Val samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")


Train samples: 1821741
Val samples: 239909
Test samples: 245100


In [10]:
import torch.nn as nn

class LSTMTST(nn.Module):
    """Token-level language model: embedding -> stacked LSTM -> linear head.

    The submodule names (``embedding``, ``lstm``, ``fc``) determine the keys
    of the state dict, so they must stay as they are for saved weights to load.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, num_layers=2):
        """Create the layers.

        Args:
            vocab_size: Number of distinct token ids (embedding rows and
                output logits).
            embed_dim: Size of each token embedding.
            hidden_dim: Hidden size of every LSTM layer.
            num_layers: Number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token ids.

        ``x`` is batch-first (the LSTM is built with ``batch_first=True``);
        ``hidden`` is an optional LSTM state to continue from. The logits have
        one vocabulary-sized vector per input position.
        """
        embedded = self.embedding(x)
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        return self.fc(lstm_out), next_hidden


In [11]:
# import torch
# import math

# device = 'cuda' if torch.cuda.is_available() else 'cpu'

# model = LSTMTST(vocab_size).to(device)
# criterion = nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# epochs = 20
# patience = 3  # stop after 3 epochs with no improvement
# best_val_loss = float('inf')
# wait = 0  

# for epoch in range(epochs):
#     model.train()
#     total_train_loss = 0

#     for x_batch, y_batch in train_loader:
#         x_batch, y_batch = x_batch.to(device), y_batch.to(device)

#         optimizer.zero_grad()
#         logits, _ = model(x_batch)
#         loss = criterion(logits.view(-1, vocab_size), y_batch.view(-1))
#         loss.backward()
#         optimizer.step()

#         total_train_loss += loss.item()

#     avg_train_loss = total_train_loss / len(train_loader)
#     train_ppl = math.exp(avg_train_loss)

#     model.eval()
#     total_val_loss = 0
#     with torch.no_grad():
#         for x_batch, y_batch in val_loader:
#             x_batch, y_batch = x_batch.to(device), y_batch.to(device)
#             logits, _ = model(x_batch)
#             loss = criterion(logits.view(-1, vocab_size), y_batch.view(-1))
#             total_val_loss += loss.item()

#     avg_val_loss = total_val_loss / len(val_loader)
#     val_ppl = math.exp(avg_val_loss)

#     print(
#         f"Epoch {epoch+1}/{epochs} | "
#         f"Train Loss: {avg_train_loss:.4f} (PPL {train_ppl:.2f}) | "
#         f"Val Loss: {avg_val_loss:.4f} (PPL {val_ppl:.2f})"
#     )

#     if avg_val_loss < best_val_loss:
#         best_val_loss = avg_val_loss
#         wait = 0
#         torch.save(model.state_dict(), "best_model.pt")
#         print("  ** Validation improved, model saved.")
#     else:
#         wait += 1
#         print(f"  ** No improvement ({wait}/{patience})")

#         if wait >= patience:
#             print("Early stopping triggered.")
#             break

# print(f"Training finished. Best Validation Loss: {best_val_loss:.4f}")


In [23]:
# import math

# def evaluate_model(model, dataloader, criterion, device):
#     model.eval()  # set model to evaluation mode
#     total_loss = 0
#     total_correct = 0
#     total_tokens = 0

#     with torch.no_grad():  # no gradients needed for evaluation
#         for x_batch, y_batch in dataloader:
#             x_batch, y_batch = x_batch.to(device), y_batch.to(device)
#             logits, _ = model(x_batch)

#             # compute loss
#             loss = criterion(logits.view(-1, logits.size(-1)), y_batch.view(-1))
#             total_loss += loss.item() * x_batch.size(0)  # multiply by batch size

#             # compute token-level accuracy
#             predictions = logits.argmax(dim=-1)
#             total_correct += (predictions == y_batch).sum().item()
#             total_tokens += y_batch.numel()

#     avg_loss = total_loss / len(dataloader.dataset)
#     perplexity = math.exp(avg_loss)
#     accuracy = total_correct / total_tokens

#     return avg_loss, perplexity, accuracy

# # Usage
# avg_loss, perplexity, accuracy = evaluate_model(model, test_loader, criterion, device)
# print(f"Test Loss: {avg_loss:.4f}")
# print(f"Test Perplexity: {perplexity:.2f}")
# print(f"Token-level Accuracy: {accuracy:.4f}")

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Recreate the model architecture. vocab_size must match the checkpoint,
# otherwise load_state_dict() below fails on a shape mismatch.
model = LSTMTST(vocab_size).to(device)

# Load the saved weights. weights_only=True restricts unpickling to tensors
# and plain containers, so a tampered checkpoint file cannot run arbitrary
# code while it is being loaded.
state_dict = torch.load(
    "/kaggle/input/pretrained-lstm/pytorch/default/1/best_model.pt",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)

# Set model to evaluation mode
model.eval()


LSTMTST(
  (embedding): Embedding(95, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=95, bias=True)
)

In [22]:
seed_text = "ជ្រើសរើសជាតំណាងសាជីវកម្មហិរញ្ញវត្ថុអន្តរជាតិ ប្រចាំ កម្ពុជា"
# Encode the seed; characters missing from the vocabulary are skipped.
tokens = [stoi[ch] for ch in seed_text if ch in stoi]
if not tokens:
    # An empty id list would produce an empty tensor and an opaque error
    # inside the embedding layer, so fail early with a clear message.
    raise ValueError("seed_text contains no characters from the vocabulary")

# generate next 100 characters
generated = tokens.copy()
# Inference only: disable autograd so no graph is built on each step.
with torch.no_grad():
    for _ in range(100):
        input_seq = torch.tensor([generated[-seq_len:]]).to(device)  # last seq_len tokens
        logits, _ = model(input_seq)
        # Greedy decoding: take the most likely id at the last position.
        next_token = logits[:, -1, :].argmax(dim=-1).item()
        generated.append(next_token)

# convert back to characters
generated_text = "".join([itos[t] for t in generated])
print(generated_text)


ជ្រើសរើសជាតំណាងសាជីវកម្មហិរញ្ញវត្ថុអន្តរជាតិ ប្រចាំ កម្ពុជា និងថៃ បានប្រាប់ ថា នៅពេលនេះ គឺជាការប្រមូលផលដើមរបស់ សម្តេចមហាបវរធិបតី ហ៊ុន ម៉ាណែត នាយករដ្ឋមន្ត្រី នៃ
