In [1]:
import os
import xml.etree.ElementTree as ET
import re

In [2]:
def extract_metadata(root):
    """Extract metadata from the XML header section."""
    metadata = {}
    header = root.find('.//HEADER')  # Find the HEADER section in the XML
    if header is not None:
        for elem in header.iter():  # Iterate through all elements in the HEADER
            tag = elem.tag.lower()
            if tag not in metadata:
                metadata[tag] = elem.text.strip() if elem.text else ''
            else:
                # Append additional text if the tag already exists
                additional_text = elem.text.strip() if elem.text else ''
                if additional_text:
                    metadata[tag] += f"; {additional_text}"
    return metadata

In [3]:
def clean_text(text):
    """Clean and format text content."""
    text = text.replace('∣', '').replace('▪', '')  # Remove specific unwanted characters
    text = re.sub(r'<[^>]+>', '', text)  # Remove other HTML-like tags
    text = re.sub(r'[ \t]+', ' ', text)  # Replace multiple spaces/tabs with a single space
    return text.strip()

In [4]:
def process_body(body):
    """Process the body content and clean it."""
    paragraphs = []

    for elem in body.iter():
        if elem.tag == 'P':  # Process paragraphs
            paragraph_text = ""
            if elem.text:
                paragraph_text += elem.text + " "
            for sub_elem in elem:
                # Handle the specific tag <ABBR><HI>l</HI></ABBR>
                if sub_elem.tag == 'ABBR' and sub_elem.find('HI') is not None and sub_elem.find('HI').text == 'l':
                    paragraph_text += "(Pounds)"
                if sub_elem.text:
                    paragraph_text += sub_elem.text + " "
                if sub_elem.tail:
                    paragraph_text += sub_elem.tail + " "
            paragraphs.append(paragraph_text.strip())

    # Clean and format each paragraph separately
    clean_paragraphs = [clean_text(paragraph) for paragraph in paragraphs]

    # Join paragraphs with double newlines
    clean_content = "\n\n".join(clean_paragraphs)

    return clean_content

In [5]:
def process_xml_file(xml_file_path):
    """Parse the XML file and process its content."""
    tree = ET.parse(xml_file_path)  # Parse the XML file
    root = tree.getroot()  # Get the root element of the XML

    # Extract metadata
    metadata = extract_metadata(root)

    # Extract and clean body text
    body = root.find('.//BODY')  # Find the BODY section in the XML
    if body is None:
        return metadata, ""

    clean_content = process_body(body)
    return metadata, clean_content

In [6]:
def format_metadata(metadata):
    """Format metadata for display."""
    return "\n".join([f"{key.upper()}: {value}" for key, value in metadata.items() if value])

In [7]:
def process_directory(directory_path, output_directory):
    """Process all XML files in a directory and save output to a specified directory."""
    for filename in os.listdir(directory_path):
        if filename.endswith('.xml'):
            xml_file_path = os.path.join(directory_path, filename)
            metadata, clean_content = process_xml_file(xml_file_path)
            formatted_metadata = format_metadata(metadata)
            full_content = f"{formatted_metadata}\n\n{clean_content}"

            # Ensure the output directory exists
            os.makedirs(output_directory, exist_ok=True)

            # Save combined content for each file in the output directory
            output_file_path = os.path.join(output_directory, f"{os.path.splitext(filename)[0]}_output.txt")
            with open(output_file_path, 'w', encoding='utf-8') as f:
                f.write(full_content)

            print(f"Processed and saved: {output_file_path}")

In [10]:
directory_path = '../A0_Ph1/A0'
output_directory = '../XML to Text Code/Output'
process_directory(directory_path, output_directory)

Processed and saved: ../XML to Text Code/Output/A06191.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A03992.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A08935.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A07471.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A01404.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A03164.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A02571.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A02606.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A00596.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A08255.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A02951.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A08063.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A05047.P4_output.txt
Processed and saved: ../XML to Text Code/Output/A06978.P4_output.txt
Processed and saved: ../XML to Tex