In [2]:
import os
import glob

def get_txt_files(root_folder):
    """Recursively find all .txt files in root_folder and its subfolders.

    Args:
        root_folder: Directory to search. It is treated as a literal path,
            so glob metacharacters in the folder name (``[``, ``]``, ``*``,
            ``?``) are matched verbatim rather than expanded.

    Returns:
        A sorted list of paths to regular files whose names end in ``.txt``.
        The list is empty when nothing matches or the folder does not exist.
    """
    # Escape the folder part so that only the "**/*.txt" suffix acts as a
    # pattern; an unescaped "[" in a directory name would otherwise be parsed
    # as a character class and silently match nothing.
    literal_root = glob.escape(os.fspath(root_folder))
    pattern = os.path.join(literal_root, "**", "*.txt")
    matches = glob.glob(pattern, recursive=True)

    # "*.txt" also matches directories with that name; keep real files only,
    # and sort because glob makes no promise about ordering.
    return sorted(path for path in matches if os.path.isfile(path))

def clean_text(text):
    """Collapse every run of whitespace in text into a single space.

    Leading and trailing whitespace is dropped; the remaining tokens keep
    their original order and content.
    """
    tokens = text.split()
    separator = " "
    return separator.join(tokens)

def process_text_file(file_path, output_folder, slice_ratio=0.2):
    """Load a text file, normalise its whitespace, keep its leading words, and save it.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_folder: Directory the result is written to; created if it
            does not exist yet.
        slice_ratio: Fraction of the words (counted from the start of the
            file) to keep. The word count is truncated towards zero, and
            values of 1 or more keep every word.

    Returns:
        The path of the written file: the input's base name with its
        extension replaced by ``_cleaned_restricted.txt``, inside
        output_folder.

    Raises:
        ValueError: If slice_ratio is negative. A negative ratio would turn
            the slice below into "drop the last N words" instead of
            "keep the first N words".
    """
    if slice_ratio < 0:
        raise ValueError(f"slice_ratio must be non-negative, got {slice_ratio!r}")

    with open(file_path, "r", encoding="utf-8") as source:
        content = source.read()

    # Splitting on any whitespace removes extra spaces and line breaks in one
    # pass; re-joining with single spaces below yields the cleaned text.
    words = content.split()
    keep_count = int(len(words) * slice_ratio)
    restricted_text = " ".join(words[:keep_count])

    # Derive the output name from the input's base name (extension dropped).
    stem = os.path.splitext(os.path.basename(file_path))[0]
    output_path = os.path.join(output_folder, stem + "_cleaned_restricted.txt")

    # An empty output_folder means "current directory"; makedirs("") would fail.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as target:
        target.write(restricted_text)

    print(f"Processed and saved: {output_path}")
    return output_path



In [None]:
# Example usage:
root_folder = "../data/04_transcripts_corrected_selected/"  # Change to your input folder
output_folder = "../data/04_transcripts_corrected_selected/"  # Change to where you want to save results
os.makedirs(output_folder, exist_ok=True)

# The input and output folders are the same here, so files written by an
# earlier run would be picked up again and yield
# "*_cleaned_restricted_cleaned_restricted.txt". Skip anything that already
# carries the output suffix.
processed_suffix = "_cleaned_restricted.txt"
txt_files = [
    path
    for path in get_txt_files(root_folder)
    if not path.endswith(processed_suffix)
]

for file_path in txt_files:
    process_text_file(file_path, output_folder)