## **Preprocess .txt file**

- Run this before datasetGenerator
- Remove symbols that are not in the vocabulary (punctuation, symbols, digits)
- Remove repeated lines
- Strip leading/trailing whitespace
- Remove the letter q (not in Copiale)
- ...

In [None]:
import unicodedata

def clean_text_file(input_path, output_path):
    """Clean a UTF-8 text file line by line and write the unique results.

    Each input line is transformed as follows:

    1. Normalised to Unicode NFC so accented letters are single code points.
    2. 'q', 'Q' and 'ß' are removed; acute-accented vowels and 'ñ'/'Ñ' are
       replaced by their unaccented counterparts.
    3. Every character whose Unicode category starts with 'P' (punctuation)
       or 'S' (symbol), and every character for which ``str.isdigit()`` is
       true, is removed.
    4. Leading and trailing whitespace is stripped (inner whitespace is
       left untouched).

    Lines that end up empty, or that equal an already written line, are
    skipped. Surviving lines are written in input order, each followed by
    a newline.

    Args:
        input_path: Path of the UTF-8 text file to read.
        output_path: Path of the UTF-8 text file to create or overwrite.
            Must not be the same file as ``input_path``: the output is
            opened in 'w' mode, which truncates it before anything is read.

    Raises:
        OSError: If either file cannot be opened.
    """
    # Single translation table for every one-character substitution, built
    # once instead of re-running a chain of str.replace() calls per line.
    # None means "delete the character".
    char_table = str.maketrans({
        # Spanish accented vowels -> non-accented counterparts
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U',
        # 'ñ' is mapped to 'n' rather than removed
        'ñ': 'n', 'Ñ': 'N',
        # Letters removed outright
        'q': None, 'Q': None, 'ß': None,
    })
    seen_lines = set()

    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        for line in infile:
            # Normalise first so decomposed accents (letter + combining
            # mark) become the precomposed characters the table expects.
            line = unicodedata.normalize('NFC', line).translate(char_table)

            # Drop punctuation, symbols (including Unicode ones) and digits.
            line = ''.join(
                char for char in line
                if not unicodedata.category(char).startswith(('P', 'S'))
                and not char.isdigit()
            )

            # Strip LAST: removing characters above can expose new leading
            # or trailing whitespace (e.g. "foo ß" -> "foo "), which would
            # otherwise be written out and defeat the duplicate check.
            cleaned = line.strip()

            # Skip if empty or already seen
            if cleaned and cleaned not in seen_lines:
                seen_lines.add(cleaned)
                outfile.write(cleaned + '\n')

### Main

In [None]:

# Entry point: runs the cleaning step with (input_path, output_path).
# NOTE: both paths are empty-string placeholders; replace them with real,
# distinct file paths before running (opening '' fails with
# FileNotFoundError).
clean_text_file('', '')