# In [None]:  (notebook cell marker left over from export)
import os
import logging
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
from datetime import datetime

# Configure the root logger: emit everything from DEBUG upward, with each
# record showing timestamp, logger name, level and message.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)

# Logger shared by every function in this script.
logger = logging.getLogger('requirements_processor')

# Typographic punctuation mapped to plain ASCII equivalents.
_PUNCTUATION_TABLE = str.maketrans({
    '\u2018': "'",    # Left single quote
    '\u2019': "'",    # Right single quote
    '\u201C': '"',    # Left double quote
    '\u201D': '"',    # Right double quote
    '\u2013': '-',    # En dash
    '\u2014': '--',   # Em dash
    '\u2026': '...',  # Ellipsis
})

# Matches every character that must NOT be kept. The kept set follows the
# XML 1.0 "Char" production:
#   tab, newline, carriage return (0x9, 0xA, 0xD)
#   0x20-0xD7FF      (everything below the surrogate block)
#   0xE000-0xFFFD    (minus the noncharacters 0xFDD0-0xFDEF)
#   0x10000-0x10FFFF (supplementary planes, e.g. emoji)
# The surrogate block 0xD800-0xDFFF and 0xFFFE/0xFFFF are therefore rejected.
_INVALID_XML_CHAR_RE = re.compile(
    r'[^\t\n\r\x20-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFD\U00010000-\U0010FFFF]'
)


def sanitize_content(content):
    """
    Sanitize text so it can be embedded in an XML 1.0 document.

    Steps, in order:
      1. Replace typographic quotes, dashes and the ellipsis with ASCII.
      2. Replace every character that is not a legal XML 1.0 character
         (plus the noncharacters U+FDD0-U+FDEF) with a single space.
      3. Collapse runs of spaces, drop spaces that follow a newline, and
         squeeze three or more consecutive line breaks into one blank line.
      4. Strip leading and trailing whitespace.

    Args:
        content: The text to clean (a ``str``).

    Returns:
        The sanitized string.
    """
    logger.debug("Sanitizing content")

    # One pass over the string instead of one .replace() per character.
    content = content.translate(_PUNCTUATION_TABLE)

    # Replace invalid characters with space
    sanitized = _INVALID_XML_CHAR_RE.sub(' ', content)

    # Remove multiple spaces while preserving newlines
    sanitized = re.sub(r' +', ' ', sanitized)
    sanitized = re.sub(r'\n +', '\n', sanitized)

    # Clean up empty lines while preserving paragraph structure
    sanitized = re.sub(r'\n\s*\n\s*\n+', '\n\n', sanitized)

    logger.debug("Content sanitization completed")
    return sanitized.strip()

def create_xml_structure(collection_id, name, description, version="1.32"):
    """Create the basic XML structure for an artifacts collection.

    The resulting tree looks like::

        <artifacts_collection>
          <collection_info>
            <id/> <name/> <version/> <description/>
          </collection_info>
          <artifacts/>
        </artifacts_collection>

    Args:
        collection_id: Text for the ``<id>`` element.
        name: Text for the ``<name>`` element.
        description: Text for the ``<description>`` element.
        version: Text for the ``<version>`` element. Defaults to ``"1.32"``,
            the value that was previously hard-coded.

    Returns:
        The root ``artifacts_collection`` element, containing an empty
        ``<artifacts>`` child.
    """
    logger.debug(f"Creating XML structure for collection: {collection_id}")

    root = ET.Element("artifacts_collection")

    # Add collection info; child order is part of the output format.
    collection_info = ET.SubElement(root, "collection_info")
    for tag, text in (
        ("id", collection_id),
        ("name", name),
        ("version", version),
        ("description", description),
    ):
        ET.SubElement(collection_info, tag).text = text

    # Add the (initially empty) artifacts container.
    ET.SubElement(root, "artifacts")

    return root

def _read_sanitized(filepath, encoding):
    """Read *filepath* with *encoding*, strip it, and return the sanitized text."""
    with open(filepath, 'r', encoding=encoding) as file:
        return sanitize_content(file.read().strip())


def process_requirement_file(filepath):
    """Process a single requirement file and extract its content.

    The file is decoded as UTF-8 first (a leading byte-order mark, if any, is
    discarded). If the bytes are not valid UTF-8, the file is re-read as
    latin-1. The decoded text is stripped and passed through
    ``sanitize_content``.

    Args:
        filepath: Path of the file to read.

    Returns:
        The sanitized file content, or ``None`` if the file could not be
        read or processed (the error is logged rather than raised).
    """
    logger.debug(f"Processing requirement file: {filepath}")

    try:
        # 'utf-8-sig' decodes plain UTF-8 as well, but also drops a leading
        # BOM, which would otherwise end up as U+FEFF at the start of the
        # content (str.strip() does not remove it).
        return _read_sanitized(filepath, 'utf-8-sig')
    except UnicodeDecodeError:
        logger.warning(f"UTF-8 decode error for {filepath}, trying with latin-1 encoding")
        try:
            return _read_sanitized(filepath, 'latin-1')
        except Exception as e:
            logger.error(f"Error reading file {filepath} with latin-1 encoding: {e}")
            return None
    except Exception as e:
        # Best-effort contract: the caller skips files for which None is returned.
        logger.error(f"Error reading file {filepath}: {e}")
        return None

def add_artifact(artifacts_elem, req_id, content):
    """Append one ``<artifact>`` entry to the artifacts container.

    The new element gets three children, in this order: ``<id>`` holding
    *req_id*, ``<content>`` holding *content*, and an empty ``<parent_id>``.

    Args:
        artifacts_elem: Element that receives the new ``<artifact>`` child.
        req_id: Text for the ``<id>`` child.
        content: Text for the ``<content>`` child.

    Raises:
        Exception: Any error raised while building the element is logged
            and then re-raised unchanged.
    """
    logger.debug(f"Adding artifact with ID: {req_id}")

    try:
        entry = ET.SubElement(artifacts_elem, "artifact")
        ET.SubElement(entry, "id").text = req_id
        ET.SubElement(entry, "content").text = content
        # parent_id is emitted without any text.
        ET.SubElement(entry, "parent_id")
        logger.debug(f"Successfully added artifact: {req_id}")
    except Exception as e:
        logger.error(f"Error adding artifact {req_id}: {e}")
        raise

def validate_xml(xml_string):
    """Check whether *xml_string* parses as well-formed XML.

    Args:
        xml_string: The XML document, as accepted by ``ET.fromstring``.

    Returns:
        ``True`` if parsing succeeds; ``False`` (after logging the parser's
        message) if it raises ``ET.ParseError``.
    """
    try:
        ET.fromstring(xml_string)
    except ET.ParseError as e:
        logger.error(f"XML validation failed: {e}")
        return False
    else:
        return True

def prettify_xml(elem):
    """Serialize *elem* as indented XML text without the XML declaration.

    The element is serialized with ElementTree, re-parsed with minidom, and
    pretty-printed using a two-space indent. The first output line (the XML
    declaration minidom emits) is dropped so the caller can write its own.

    Args:
        elem: The ElementTree element to serialize.

    Returns:
        The pretty-printed XML as a string, starting at the root element.
    """
    serialized = ET.tostring(elem, encoding='utf-8')
    document = minidom.parseString(serialized)
    indented = document.toprettyxml(indent="  ")
    # Everything after the first newline, i.e. all lines but the declaration.
    _declaration, _newline, body = indented.partition('\n')
    return body

def process_requirements(source_dir, prefix, output_file, collection_id, collection_name, collection_desc):
    """Process all requirement files in a directory and create XML output.

    Every entry in *source_dir* whose name starts with *prefix* is read via
    ``process_requirement_file`` and added as an artifact whose ID is the
    file name. Entries are visited in sorted name order so the generated
    document is the same on every run and platform. Files that yield no
    content are skipped with a warning.

    The resulting tree is pretty-printed, checked for well-formedness, and
    written to *output_file* only if that check passes.

    Args:
        source_dir: Directory to scan for requirement files.
        prefix: File-name prefix selecting which entries to process.
        output_file: Path of the XML file to write.
        collection_id: Value for the collection's ``<id>`` element.
        collection_name: Value for the collection's ``<name>`` element.
        collection_desc: Value for the collection's ``<description>`` element.

    Raises:
        Exception: Any error raised while listing, processing, or writing is
            logged and then re-raised unchanged.
    """
    logger.info(f"Processing requirements from directory: {source_dir}")
    logger.info(f"Creating output file: {output_file}")

    # Create XML structure
    root = create_xml_structure(collection_id, collection_name, collection_desc)
    artifacts_elem = root.find("artifacts")

    # prettify_xml() strips minidom's declaration; this one is written instead.
    xml_declaration = '<?xml version="1.0" encoding="utf-8"?>\n'

    try:
        # os.listdir() returns entries in arbitrary order; sort them so the
        # artifact order in the output is deterministic.
        for filename in sorted(os.listdir(source_dir)):
            if not filename.startswith(prefix):
                continue

            filepath = os.path.join(source_dir, filename)
            logger.debug(f"Found matching file: {filepath}")

            # The file name itself serves as the requirement ID.
            req_id = filename

            content = process_requirement_file(filepath)
            if content:
                add_artifact(artifacts_elem, req_id, content)
            else:
                logger.warning(f"Skipping file due to processing error: {filepath}")

        # Generate XML string
        xml_string = prettify_xml(root)

        # Validate exactly what will be written (declaration + body).
        if validate_xml((xml_declaration + xml_string).encode('utf-8')):
            # Write the XML file with a single XML declaration
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(xml_declaration)
                f.write(xml_string)

            logger.info(f"Successfully created XML file: {output_file}")
        else:
            logger.error("XML validation failed, file not written")

    except Exception as e:
        logger.error(f"Error processing requirements: {e}")
        raise

def main():
    """Convert both requirement sets into their XML artifact collections.

    Runs ``process_requirements`` twice, in order: first for the high-level
    requirements (``./high``, files prefixed ``SRS``), then for the low-level
    requirements (``./low``, files prefixed ``DPU``).
    """
    logger.info("Starting requirements processing")

    jobs = (
        # High-level requirements
        {
            "source_dir": "./high",
            "prefix": "SRS",
            "output_file": "CM1-sourceArtifacts.xml",
            "collection_id": "cm1-high",
            "collection_name": "CM 1 Source Artifacts",
            "collection_desc": "Collection of Source Artifacts for the CM1 dataset (high)",
        },
        # Low-level requirements
        {
            "source_dir": "./low",
            "prefix": "DPU",
            "output_file": "CM1-targetArtifacts.xml",
            "collection_id": "cm1-low",
            "collection_name": "CM 1 Target Artifacts",
            "collection_desc": "Collection of Target Artifacts for the CM1 dataset (low)",
        },
    )
    for job in jobs:
        process_requirements(**job)

    logger.info("Requirements processing completed")

# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()