### **Text-cleaning pipeline**

In [None]:
import os
import re

from google.colab import drive

# Make Google Drive reachable under /content/drive before any file access.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Heading names (in several languages) that introduce trailing material such
# as references or external links; the text is cut at the first one found.
_TRAILING_SECTION_NAMES = (
    "Quelle", "Weblinks", "Literatur", "Film", "Einzelnachweise",
    "Siehe auch", "Video", "Note", "Voci correlate", "Altri progetti",
    "Collegamenti esterni", "Bibliografia", "Véase también", "Referencias",
    "Enlaces externos", "Bibliografía", "Notes et références", "Annexes",
    "Articles connexes", "Voir aussi", "Bibliographie", "Références",
    "Littérature", "Cinéma", "See also", "References", "Further reading",
    "External links",
)

# "==+" on both sides so that deeper headings ("=== References ===") are
# matched from their first "=" and no stray "=" is left behind after the cut.
_SECTION_PATTERN = re.compile(
    r"==+\s*(?:" + "|".join(map(re.escape, _TRAILING_SECTION_NAMES)) + r")\s*==+",
    re.IGNORECASE,
)

# A heading lives on a single line, so "." must not match newlines; otherwise
# a lone "==" in the body would pair with the "==" of a later heading and
# swallow every line in between.
_TITLE_PATTERN = re.compile(r"==+.*?==+")


def remove_sections_and_titles(text):
    """
    Strip trailing sections, heading markup and blank lines from ``text``.

    Everything from the first heading whose name is listed in
    ``_TRAILING_SECTION_NAMES`` (matched case-insensitively, at any heading
    depth) to the end of the text is dropped. In what remains, every
    single-line ``== ... ==`` style heading is deleted, and lines that are
    empty or contain only whitespace are removed.

    Returns the remaining lines joined with ``"\\n"``.
    """
    # Cut the text at the first unwanted section, if there is one.
    match = _SECTION_PATTERN.search(text)
    body = text[:match.start()] if match else text
    # Delete the remaining headings.
    body = _TITLE_PATTERN.sub("", body)
    # Drop empty / whitespace-only lines.
    return '\n'.join(line for line in body.splitlines() if line.strip())

def create_directory_if_not_exists(directory):
    """
    Create ``directory``, including missing parent directories, if needed.

    An empty path (what ``os.path.dirname`` returns for a bare file name) is
    taken to mean the current directory and nothing is created.

    Raises ``FileExistsError`` if the path exists but is not a directory.
    """
    if not directory:
        return
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: a directory that appears in between is fine.
    os.makedirs(directory, exist_ok=True)

def process_file(input_file, output_folder, root_folder):
    """
    Clean one text file and store the result under ``output_folder``.

    The file is read as UTF-8, passed through ``remove_sections_and_titles``
    and written, again as UTF-8, to the same path relative to ``root_folder``
    but rooted at ``output_folder``. Missing output subdirectories are
    created first.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    cleaned_text = remove_sections_and_titles(raw_text)

    # Mirror the input's position inside root_folder beneath output_folder.
    destination = os.path.join(
        output_folder,
        os.path.relpath(input_file, root_folder),
    )
    create_directory_if_not_exists(os.path.dirname(destination))

    with open(destination, 'w', encoding='utf-8') as target:
        target.write(cleaned_text)

def process_folder(root_folder, output_folder):
    """
    Walk ``root_folder`` recursively and process every ``.txt`` file found.

    Each matching file is handed to ``process_file`` and a confirmation line
    is printed afterwards; files with any other extension are skipped.
    """
    for current_dir, _subdirs, filenames in os.walk(root_folder):
        for filename in filenames:
            # Only plain-text files are of interest.
            if not filename.endswith('.txt'):
                continue
            process_file(
                os.path.join(current_dir, filename),
                output_folder,
                root_folder,
            )
            print(f"File '{filename}' processed successfully.")

# Input tree that is searched for .txt files
root_folder = "/content/drive/MyDrive/Wiki download/GastronomiaL_en_it_es_de_fr"
# Output tree that receives the processed copies
output_folder = "/content/drive/MyDrive/Wiki download/Gastr_L_puliti"

# Run the pipeline over the whole input tree
process_folder(root_folder=root_folder, output_folder=output_folder)

File 'Zuccotto_de.txt' processed successfully.
File 'Zuccotto_en.txt' processed successfully.
File 'Zuccotto_es.txt' processed successfully.
File 'Zuccotto_fr.txt' processed successfully.
File 'Zuccotto_it.txt' processed successfully.
File 'Cappelletti_en.txt' processed successfully.
File 'Cappelletti_de.txt' processed successfully.
File 'Cappelletti_es.txt' processed successfully.
File 'Cappelletti_fr.txt' processed successfully.
File 'Cappelletti_it.txt' processed successfully.
File 'Farinata_de.txt' processed successfully.
File 'Farinata_en.txt' processed successfully.
File 'Farinata_es.txt' processed successfully.
File 'Farinata_fr.txt' processed successfully.
File 'Farinata_it.txt' processed successfully.
File 'Focaccia_de.txt' processed successfully.
File 'Focaccia_en.txt' processed successfully.
File 'Focaccia_es.txt' processed successfully.
File 'Focaccia_fr.txt' processed successfully.
File 'Focaccia_it.txt' processed successfully.
File 'Fregula_de.txt' processed successfully.