# Rutooro Text Cleaning Pipeline
This notebook batch-processes raw Rutooro text files stored on Google Drive: it cleans them, removes duplicate sentences, and writes one sentence per line, ready for language-model training.

## Setup Google Drive and Paths
Mount your Google Drive and set where raw and processed data should live. Edit the paths below if needed.

In [None]:
from google.colab import drive
from pathlib import Path

# Attach Google Drive so the project folders below are reachable.
drive.mount('/content/drive')

# Input (raw) and output (processed) locations for the pipeline.
RAW_DATA_DIR = Path('/content/drive/MyDrive/rutooro-mlm/data/raw')
PROCESSED_DATA_DIR = Path('/content/drive/MyDrive/rutooro-mlm/data/processed')

# Create both folders first (no error if they already exist), then report them.
for data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    data_dir.mkdir(parents=True, exist_ok=True)
for label, data_dir in (('Raw data dir:', RAW_DATA_DIR),
                        ('Processed data dir:', PROCESSED_DATA_DIR)):
    print(label, data_dir)


## Preview a Raw Text File
Preview the first 20 lines of the first raw file (in sorted filename order) to understand its formatting.

In [None]:
import itertools
# Show the head of the first raw file (sorted by name) to inspect its layout.
raw_files = sorted(RAW_DATA_DIR.glob('*.txt'))
if raw_files:
    example_file = raw_files[0]
    print('Example file:', example_file)
    # Read only the first 20 lines; undecodable bytes are skipped.
    with example_file.open('r', encoding='utf-8', errors='ignore') as f:
        text_sample = ''.join(itertools.islice(f, 20))
    print(text_sample)
else:
    print('No raw text files found in', RAW_DATA_DIR)


## Define Cleaning Functions
Remove page numbers, stray numbers, and extra whitespace, then split the text into sentences using NLTK (with a simple regex splitter as a fallback if NLTK tokenization fails).

In [None]:
import re
import nltk
from nltk.tokenize import sent_tokenize
# NLTK >= 3.8.2 loads the sentence tokenizer from the 'punkt_tab' resource,
# while older releases use 'punkt'. Fetch both so sent_tokenize works either
# way instead of raising LookupError at tokenization time.
nltk.download('punkt')
nltk.download('punkt_tab')

def clean_text(text: str) -> str:
    """Normalize raw text before sentence splitting.

    Steps, in order: replace form feeds with spaces, collapse runs of
    newlines into a single space, drop page markers such as ``Page 12``
    (case-insensitive), drop standalone digit runs, then collapse all
    remaining whitespace and strip both ends.

    Args:
        text: Raw text as read from a source file.

    Returns:
        The cleaned text as a single line.
    """
    text = re.sub(r'\f', ' ', text)           # form feeds become spaces
    text = re.sub(r'\n+', ' ', text)          # collapse newlines
    # The leading \b restricts the match to the standalone word "page";
    # without it, words that merely end in "page" were truncated
    # (e.g. "rampage 5" -> "ram").
    text = re.sub(r'\bpage\s*\d+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b\d+\b', '', text)       # remove stray numbers
    text = re.sub(r'\s+', ' ', text)          # squeeze whitespace left behind
    return text.strip()

def split_sentences(text: str):
    """Split cleaned text into a list of non-empty, stripped sentences.

    NLTK's ``sent_tokenize`` is tried first. If it raises (for example a
    ``LookupError`` when the tokenizer data is not installed), a
    ``RuntimeWarning`` describing the failure is issued and a regex
    splitter is used instead; it breaks after ``.``, ``!`` or ``?`` when
    followed by whitespace.

    Args:
        text: Text to split, normally the output of ``clean_text``.

    Returns:
        A list of sentences with surrounding whitespace removed and empty
        entries dropped.
    """
    import warnings  # local import keeps this cell self-contained

    try:
        sentences = sent_tokenize(text)
    except Exception as exc:
        # Keep the best-effort fallback, but make the degradation visible
        # instead of silently changing how sentences are split.
        warnings.warn(
            f"sent_tokenize failed ({type(exc).__name__}: {exc}); "
            "falling back to regex sentence splitting",
            RuntimeWarning,
            stacklevel=2,
        )
        sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]


## Clean All Files and Collect Sentences

In [None]:
# Gather cleaned sentences from every raw file into one list.
files = sorted(RAW_DATA_DIR.glob('*.txt'))
print('Found', len(files), 'raw text files')

all_sentences = []
for path in files:
    # Undecodable bytes are dropped rather than raising.
    raw_text = path.read_text(encoding='utf-8', errors='ignore')
    sentences = [
        s for s in split_sentences(clean_text(raw_text))
        if len(s.split()) >= 3  # keep only sentences with at least three tokens
    ]
    all_sentences += sentences
    print(f"{path.name}: {len(sentences)} sentences")


## Remove Duplicates and Save

In [None]:
# Drop repeated sentences; dict keys keep first-seen order.
unique_sentences = [*dict.fromkeys(all_sentences)]

# Write one sentence per line to the processed-data folder.
output_path = PROCESSED_DATA_DIR / 'rutooro_sentences_clean.txt'
with output_path.open('w', encoding='utf-8') as f:
    f.writelines(s + "\n" for s in unique_sentences)

print('Total sentences:', len(all_sentences))
print('Unique sentences:', len(unique_sentences))
print('Saved to', output_path)

## Random Sample of Cleaned Sentences

In [None]:
import random
# Use a private, seeded RNG so this spot-check prints the same sentences on
# every Restart & Run All (and leaves the global random state untouched).
# Change SAMPLE_SEED to inspect a different sample.
SAMPLE_SEED = 42
sample = random.Random(SAMPLE_SEED).sample(
    unique_sentences, min(10, len(unique_sentences))
)
for s in sample:
    print('-', s)
