In [1]:
import pandas as pd
import numpy as np
import re
import os

In [2]:
# Marks that terminate a sentence. This tuple is the single source of truth for
# both the split pattern and the membership test, so the two cannot drift apart
# (the previous membership list also named ";" and ",", which the pattern never
# split on).
SENTENCE_END_MARKS = ("៘", "។", "៕", "?")

# The capturing group makes re.split return every mark as its own list element,
# which lets each mark be re-attached to the sentence it terminates.
SENTENCE_SPLIT_RE = re.compile(
    "([" + "".join(re.escape(mark) for mark in SENTENCE_END_MARKS) + "])"
)


def split_khmer_sentences(line):
    """
    Split one line of text into sentences, keeping the closing mark.

    Args:
        line: A single line of raw text (a trailing newline is fine).

    Returns:
        A list of stripped sentence strings. Each sentence ends with the mark
        from SENTENCE_END_MARKS that closed it; text left over after the last
        mark is returned as a final sentence if it is not blank.
    """
    sentences = []
    current_sentence = ""
    for part in SENTENCE_SPLIT_RE.split(line):
        # Blank / whitespace-only pieces carry no content and are skipped.
        if not part.strip():
            continue
        current_sentence += part
        if part in SENTENCE_END_MARKS:
            # A mark closes the sentence accumulated so far.
            sentences.append(current_sentence.strip())
            current_sentence = ""

    # Don't forget text that was not followed by a closing mark.
    if current_sentence.strip():
        sentences.append(current_sentence.strip())
    return sentences


data = []
with open("general-text.txt", encoding="utf-8") as f:
    # Iterating by line means a newline always ends the current sentence too.
    for line in f:
        data.extend({'text': sentence} for sentence in split_khmer_sentences(line))

# Naming the column explicitly keeps df['text'] available even when the input
# file produced no sentences at all.
df = pd.DataFrame(data, columns=['text'])

In [3]:
# Characters deleted from every row by the second pass of clean_khmer_text.
unwanted_tokens = ["៘","។", "៕", "឴", "'", "ឝ", ":", "៙", ";", "៚", '"', "៛", "៖", " ", "ឞ", "\t"]

# Extra characters admitted by the first (allow-list) pass in addition to the
# Khmer block. The Khmer digits listed here already fall inside U+1780-U+17FF.
_ALLOWED_EXTRAS = '០១២៣៤៥៦៧៨៩"\' :;\n\t'

# Built once at definition time rather than on every call; the function is
# applied to every row of the dataframe.
_KEEP_CHAR_RE = re.compile(f'[\u1780-\u17FF{re.escape(_ALLOWED_EXTRAS)}]')


def clean_khmer_text(text):
    """
    Reduce ``text`` to Khmer-block characters minus the unwanted tokens.

    Two passes are applied:

    1. Keep only characters in the Khmer Unicode block (U+1780-U+17FF) or in
       ``_ALLOWED_EXTRAS``; everything else (e.g. Latin letters, ASCII digits)
       is dropped.
    2. Delete every entry of the module-level ``unwanted_tokens`` list. Several
       of the extras admitted in pass 1 (quotes, colon, semicolon, tab) are in
       that list, so they do not survive into the result.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Non-string input (``None``, ``NaN``, ...) yields
        ``""`` instead of raising.
    """
    if not isinstance(text, str):
        return ''

    # First: keep only Khmer + allowed extras.
    result = ''.join(_KEEP_CHAR_RE.findall(text))

    # Second: remove the specific unwanted tokens.
    for token in unwanted_tokens:
        result = result.replace(token, '')

    return result

# Run the cleaner over every row, replacing the 'text' column with the result.
df['text'] = df['text'].map(clean_khmer_text)

In [4]:
# Length profile of the cleaned rows: how many are at least
# LONG_TEXT_MIN_CHARS characters long, plus the shortest and longest row.
LONG_TEXT_MIN_CHARS = 70

# The loop variable is deliberately not called `str`: that would rebind the
# builtin for every later cell in the kernel.
string_length = [len(text) for text in df["text"]]
long_text_count = sum(1 for length in string_length if length >= LONG_TEXT_MIN_CHARS)

print(f"Total Rows : {len(df['text'])}")
# The label is built from the threshold so the two cannot disagree.
print(f"Number of string >= {LONG_TEXT_MIN_CHARS} : {long_text_count} ")
# default=0 keeps the cell runnable when the dataframe has no rows.
print(f"Min : {min(string_length, default=0)}")
print(f"Max : {max(string_length, default=0)}")

Total Rows : 556974
Number of string >= 70 : 215981 
Min : 0
Max : 3375


In [5]:
# Length profile of the cleaned rows: how many are shorter than
# SHORT_TEXT_MAX_CHARS characters, plus the shortest and longest row.
SHORT_TEXT_MAX_CHARS = 700

# The loop variable is deliberately not called `str`: that would rebind the
# builtin for every later cell in the kernel.
string_length = [len(text) for text in df["text"]]
short_text_count = sum(1 for length in string_length if length < SHORT_TEXT_MAX_CHARS)

print(f"Total Rows : {len(df['text'])}")
# The label is built from the threshold; it previously read "< 1000" while the
# comparison actually used 700.
print(f"Number of string < {SHORT_TEXT_MAX_CHARS} : {short_text_count} ")
# default=0 keeps the cell runnable when the dataframe has no rows.
print(f"Min : {min(string_length, default=0)}")
print(f"Max : {max(string_length, default=0)}")

Total Rows : 556974
Number of string < 1000 : 556486 
Min : 0
Max : 3375


In [6]:
# Keep only rows whose text is 75 to 700 characters long (both ends inclusive).
text_len = df['text'].str.len()
keep_mask = text_len.between(75, 700)
df_filtered = df[keep_mask].copy()

print(f"Filtered {len(df_filtered)} rows with length between 75 and 700")

Filtered 204627 rows with length between 75 and 700


In [7]:
out_path = "../data/cleaned-general-text.txt"

# Make sure the target directory exists so the export also works on a fresh
# checkout where ../data has not been created yet.
out_dir = os.path.dirname(out_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): this exports every row of `df`; the length-filtered
# `df_filtered` frame is not used here -- confirm which one is intended.
with open(out_path, "w", encoding="utf-8") as f:
    for line in df["text"]:
        # Skip missing / non-text values instead of failing on .strip().
        if not isinstance(line, str):
            continue
        text = line.strip()
        # Rows that cleaned down to nothing are not written.
        if text:
            f.write(text + "\n")

print(f"Saved cleaned text to {out_path}")

Saved cleaned text to ../data/cleaned-general-text.txt
