# Raw Text Pre-LLM Preformatter

This notebook preformats raw OCR output text: it cleans the text and annotates each token with SymSpell word candidates.

In [None]:
# Install SymSpell
!pip install symspellpy

import os
import re
from pathlib import Path
from symspellpy import SymSpell, Verbosity
from google.colab import drive

# Mount Google Drive at /content/drive; every corpus, dictionary and output
# path used below lives under /content/drive/MyDrive
drive.mount('/content/drive')

Collecting symspellpy
  Downloading symspellpy-6.9.0-py3-none-any.whl.metadata (3.9 kB)
Collecting editdistpy>=0.1.3 (from symspellpy)
  Downloading editdistpy-0.1.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading symspellpy-6.9.0-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading editdistpy-0.1.6-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (158 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.4/158.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: editdistpy, symspellpy
Successfully installed editdistpy-0.1.6 symspellpy-6.9.0
Mounted at /content/drive


In [None]:
# Root folder of the raw corpus; the final cell walks it recursively with os.walk
CORPUS_ROOT = '/content/drive/MyDrive/tugas-akhir/korpus-teks/korpus-mentah/korpus-mentah-omdta-5'
# Root folder for the processed files; the CORPUS_ROOT sub-folder layout is recreated under it
OUTPUT_ROOT = '/content/drive/MyDrive/tugas-akhir/korpus-teks/korpus-terproses/korpus-symspell-only'

In [None]:
# Initialize SymSpell
# max_dictionary_edit_distance=2 matches the default max_edit_distance=2 that
# correct_text() passes to sym_spell.lookup()
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Every word already added to sym_spell; load_dict_from_txt() checks it to skip
# duplicates across dictionary files, and its size is printed after loading
unique_words = set()

# Helper to load words from a txt file into SymSpell (if not already added)
def load_dict_from_txt(filepath):
    """Add every new word of a UTF-8 word-list file to the global ``sym_spell``.

    Each line is stripped and lowercased. Blank lines and words already present
    in the global ``unique_words`` set are skipped; every other word is added
    to ``sym_spell`` with a count of 1 and recorded in ``unique_words``.

    Args:
        filepath: Path of the text file to read, one entry per line.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            entry = raw_line.strip().lower()
            # Guard clause: nothing to do for blank lines or known words
            if not entry or entry in unique_words:
                continue
            sym_spell.create_dictionary_entry(entry, 1)
            unique_words.add(entry)

# === Load dictionary from multiple sources ===

# Folders of .txt word lists, loaded in this order:
#   1. Orthographically-reversed Indonesian Wikipedia based dictionary
#   2. KBBI dictionary
main_dict_folder = "/content/drive/MyDrive/tugas-akhir/dicts/tokenized-idwiki-dict-reversal-4"
additional_dict_folder = "/content/drive/MyDrive/tugas-akhir/dicts/dict-kbbi-iv-approx-extract"

for dict_folder in (main_dict_folder, additional_dict_folder):
    # Sorted so the files of a folder are always loaded in the same order
    for filename in sorted(os.listdir(dict_folder)):
        if not filename.endswith(".txt"):
            continue
        filepath = os.path.join(dict_folder, filename)
        load_dict_from_txt(filepath)

# Single-file word lists: Dutch first, then English
dutch_dict = "/content/drive/MyDrive/tugas-akhir/dicts/opentaal-dutch-wordlist.txt"
english_dict = "/content/drive/MyDrive/tugas-akhir/dicts/english-words.txt"

for single_dict in (dutch_dict, english_dict):
    load_dict_from_txt(single_dict)

print(f"Total unique words added to SymSpell: {len(unique_words)}")


Total unique words added to SymSpell: 2755620


TODO

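The handling of the digit '2' in `correct_text()` needs further improvement. For example, a trailing '2' as in `matjam2` is currently dropped, so the token is emitted as `[ORI: matjam, ...]`.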

In [None]:
def clean_word(w):
    """Return ``w`` lowercased with everything except ASCII letters removed."""
    # Splitting on each non-letter character and re-joining the pieces drops
    # exactly those characters
    letter_runs = re.split(r'[^a-zA-Z]', w)
    return ''.join(letter_runs).lower()

def preprocess_text(text):
    """Undo line-end hyphenation and flatten the text onto a single line.

    A line whose last non-whitespace character is ``-`` is merged with the
    following line: the dash (and any whitespace after it) is removed and the
    next line is appended without its leading whitespace. Merging repeats
    while the merged line still ends with a dash and more lines remain. A dash
    on the very last line is kept because there is nothing to merge it with.

    Args:
        text: Raw text, possibly spanning several lines.

    Returns:
        All resulting lines joined with single spaces (no newlines).
    """
    lines = text.splitlines()
    merged_lines = []
    i = 0
    while i < len(lines):
        line = lines[i]

        # Trailing whitespace after the dash must not hide the hyphenation,
        # so the check and the cut are both done on the right-stripped line.
        while i + 1 < len(lines) and line.rstrip().endswith('-'):
            line = line.rstrip()[:-1] + lines[i + 1].lstrip()
            i += 1  # The next line has been consumed by the merge

        merged_lines.append(line)
        i += 1

    # Joining lines with space, not newline
    return ' '.join(merged_lines)

def correct_text(text, max_candidates=10, max_edit_distance=2):
    """Annotate the tokens of ``text`` with SymSpell word candidates.

    The text is split on whitespace and each token is handled as follows:

    * A token of the form ``<letters>2<letters>`` (exactly one ``2``, not at
      either end) becomes ``[ORI: <token>, CAND: <left>2<right>]``, where each
      side is the first candidate found for that side, or the lowercased side
      itself when it has no candidate. If neither side has a candidate the
      token is kept unchanged.
    * Any other token is split into leading non-word characters, a core, and
      trailing non-word characters. Everything except ASCII letters is removed
      from the core. If letters remain and the lookup of the lowercased core
      yields candidates, the token becomes
      ``<prefix>[ORI: <core>, CAND: c1, c2, ...]<suffix>``; otherwise the
      token is kept unchanged.

    Known limitation: digits count as word characters, so a digit at the edge
    of a token (e.g. the trailing ``2`` of ``matjam2``) is part of the core and
    is dropped from the annotated output.

    Uses the module-level ``sym_spell`` instance and ``Verbosity`` enum.

    Args:
        text: Text to annotate; tokens are separated by whitespace.
        max_candidates: Maximum number of candidates listed per token.
        max_edit_distance: Edit distance passed to ``sym_spell.lookup``.
            NOTE(review): presumably must not exceed the distance ``sym_spell``
            was built with (2 in this notebook) — confirm against symspellpy.

    Returns:
        The annotated tokens joined with single spaces.
    """
    def lookup_candidates(term):
        # Keep at most max_candidates terms, in the order SymSpell returns them
        suggestions = sym_spell.lookup(term, Verbosity.ALL, max_edit_distance=max_edit_distance)
        return [s.term for s in suggestions if s.distance <= max_edit_distance][:max_candidates]

    def split_token(word):
        # One pattern yields prefix, core and suffix together, so they can
        # never overlap: a token without any word character ends up entirely
        # in the (greedy) prefix instead of being counted twice.
        prefix, core, suffix = re.fullmatch(r'(\W*)(.*?)(\W*)', word, re.DOTALL).groups()
        # Only ASCII letters of the core are looked up and reported
        return prefix, re.sub(r'[^a-zA-Z]', '', core), suffix

    corrected_words = []

    for word in text.split():
        # Special handling for digit '2' in the middle
        if '2' in word[1:-1]:
            parts = word.split('2')
            if len(parts) == 2 and parts[0].isalpha() and parts[1].isalpha():
                left, right = parts[0].lower(), parts[1].lower()
                left_candidates = lookup_candidates(left)
                right_candidates = lookup_candidates(right)

                if left_candidates or right_candidates:
                    best_left = left_candidates[0] if left_candidates else left
                    best_right = right_candidates[0] if right_candidates else right
                    corrected_words.append(f"[ORI: {word}, CAND: {best_left}2{best_right}]")
                else:
                    corrected_words.append(word)
                continue
            # Any other shape (several '2's, non-letters around it) falls
            # through to the generic handling below

        prefix, base_word, suffix = split_token(word)
        # No lookup at all when the token contains no ASCII letter
        candidates = lookup_candidates(base_word.lower()) if base_word else []

        if candidates:
            replacement = f"[ORI: {base_word}, CAND: {', '.join(candidates)}]"
            corrected_words.append(f"{prefix}{replacement}{suffix}")
        else:
            corrected_words.append(word)  # Leave as-is

    return ' '.join(corrected_words)


In [None]:
# Multi-line sample used by the next cells to try out preprocess_text() and
# correct_text(); several of its lines end with a hyphen
sample_text = """pemerintah
nenghac api matjam2 soal. Po litiek Belanca. Maaspode. BeJanda. idjalankan. BRAHIM. BARA. Tjikeong. Tjilamaja. eliau. Andjeéran. Mempoenjai djakarta jang pacific. P.K.R.I . lap!op. ADA tg. 24 Djan. C. H. T.-H
Tegal. mengirim kawat pro-
tes kepada Belanda dengan peran-
taraan Menteri Kemakmoeran
dan Menteri Loear Negeri Repoc-
blik Indonesia dan Konsol Djen-
deral Tiongkok serta Thpa Siang
Iwee Djakarta atas perlakoean
marine Belanda atas kapal2 jang
memoeat dan barang import dan
export saudagar Tionghoa dilacet
antara Tegal dan Tjirebon, Kepa-
da Thoa Siang Hwee diminta me-
neroeskan protes ini pada Chine-
se Chamber of Commerce di Si-
ngapofra."""

In [None]:
# Merge the hyphenated line breaks of the sample and flatten it to one line
cleaned_text = preprocess_text(sample_text)
print(cleaned_text)

pemerintah nenghac api matjam2 soal. Po litiek Belanca. Maaspode. BeJanda. idjalankan. BRAHIM. BARA. Tjikeong. Tjilamaja. eliau. Andjeéran. Mempoenjai djakarta jang pacific. P.K.R.I . lap!op. ADA tg. 24 Djan. C. H. T.-H Tegal. mengirim kawat protes kepada Belanda dengan perantaraan Menteri Kemakmoeran dan Menteri Loear Negeri Repocblik Indonesia dan Konsol Djenderal Tiongkok serta Thpa Siang Iwee Djakarta atas perlakoean marine Belanda atas kapal2 jang memoeat dan barang import dan export saudagar Tionghoa dilacet antara Tegal dan Tjirebon, Kepada Thoa Siang Hwee diminta meneroeskan protes ini pada Chinese Chamber of Commerce di Singapofra.


In [None]:
# Annotate the flattened sample with SymSpell candidates (default limits)
corrected_test_text = correct_text(cleaned_text)
print(corrected_test_text)

[ORI: pemerintah, CAND: pemerintah, pemerinah, pemerinatah, pemerindah, pemerinmtah, pemerinntah, pemerinrtah, pemerinta, pemerintaha, pemerintahg] [ORI: nenghac, CAND: nengahan, benghal, benghar, denghan, enghag, fenghai, fenghao, menghai, menghan, mengha] [ORI: api, CAND: api, aapi, abpi, adpi, aepi, afpi, aipi, akpi, alpi, ampi] [ORI: matjam, CAND: matjam, ematjam, matjama, matjamo, matdjam, matjram, matjham, matajam, majtjam, nmatjam] [ORI: soal, CAND: soal, somal, sowal, soeal, soala, soale, soali, sobal, sohal, sokal]. [ORI: Po, CAND: po, apo, bpo, dpo, epo, fpo, gpo, hpo, ipo, kpo] [ORI: litiek, CAND: titiek, liliek, litjek, litik, itjiek, kritiek, vaitiek, diëtiek, diptiek, ritmiek] [ORI: Belanca, CAND: blanca, belanda, belana, belanfa, belanga, belanta, belanja, bellanca, melancar, pelancar]. [ORI: Maaspode, CAND: maasbode, baasrode, haasrode, manopode, maampoe, maasde, maaskade]. [ORI: BeJanda, CAND: belanda, beranda, bekanda, benanda, bevanda, bejana, betjanda, berjanda, baj

In [None]:
# Recreate the CORPUS_ROOT folder tree under OUTPUT_ROOT:
#   - files starting with ".partition" are copied unchanged
#   - files starting with "block" are preprocessed and annotated
#   - every other file is ignored
for root, dirs, files in os.walk(CORPUS_ROOT):
    # Same relative location under the output root
    output_dir = os.path.join(OUTPUT_ROOT, os.path.relpath(root, CORPUS_ROOT))
    os.makedirs(output_dir, exist_ok=True)

    for filename in files:
        is_partition = filename.startswith(".partition")
        is_block = filename.startswith("block")
        if not (is_partition or is_block):
            continue

        with open(os.path.join(root, filename), 'r', encoding='utf-8') as src:
            content = src.read()

        # Only block files go through the preformatting pipeline
        if is_block:
            content = correct_text(preprocess_text(content))

        with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as dst:
            dst.write(content)