In [1]:
# Cell 1: Imports and Logging Setup

import os
import logging
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
from datetime import datetime

# Set up logging configuration.
# DEBUG is the most verbose level: every logger.debug() call in the cells
# below (one per processed file / artifact) will be emitted.
# NOTE(review): per the logging docs, basicConfig() does nothing if the root
# logger already has handlers — confirm the format is actually applied when
# this cell is re-run in an already-configured kernel.
logging.basicConfig(
    level=logging.DEBUG,
    # timestamp - logger name - level - message
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)

# Named logger shared by every function defined in the following cells.
logger = logging.getLogger('requirements_processor')

In [2]:
# Cell 2: Text-Level Character Sanitization for XML/JSON Compatibility

# Typographic characters mapped to plain-ASCII stand-ins. Every replacement is
# pure ASCII and no replacement is itself a key, so a single translate() pass
# gives the same result as applying the substitutions one after another.
_TYPOGRAPHIC_REPLACEMENTS = str.maketrans({
    '\u2018': "'",    # Left single quote
    '\u2019': "'",    # Right single quote
    '\u201C': '"',    # Left double quote
    '\u201D': '"',    # Right double quote
    '\u2013': '-',    # En dash
    '\u2014': '--',   # Em dash
    '\u2026': '...',  # Ellipsis
    '\u2022': '-',    # Bullet
    '\u00B7': '-',    # Middle dot
    '\u00A0': ' ',    # Non-breaking space
    '\u00B0': '-',    # Degree sign
})


def _is_valid_xml_char(char):
    """Return True if `char` may appear in an XML 1.0 document and is not a noncharacter."""
    codepoint = ord(char)

    # The only control characters XML 1.0 allows: tab, newline, carriage return.
    if codepoint in (0x9, 0xA, 0xD):
        return True

    # XML 1.0 "Char" production. The gap 0xD800-0xDFFF is the surrogate block
    # (illegal in XML, and not encodable as UTF-8); 0xFFFE/0xFFFF are excluded;
    # the supplementary planes 0x10000-0x10FFFF are legal.
    in_char_production = (
        0x20 <= codepoint <= 0xD7FF
        or 0xE000 <= codepoint <= 0xFFFD
        or 0x10000 <= codepoint <= 0x10FFFF
    )
    if not in_char_production:
        return False

    # Unicode noncharacters: legal per the grammar but explicitly discouraged
    # by the XML spec, so they are rejected as well.
    if 0xFDD0 <= codepoint <= 0xFDEF:
        return False
    if (codepoint & 0xFFFE) == 0xFFFE:  # last two code points of every plane
        return False

    return True


def sanitize_content(content):
    """
    Sanitize content to ensure XML/JSON compatibility.

    Steps, in order:
      1. Replace common typographic characters (smart quotes, dashes,
         ellipsis, bullet, non-breaking space, ...) with ASCII equivalents.
      2. Replace every character that is not allowed in XML 1.0 (control
         characters, lone surrogates, noncharacters) with '-'.
         Supplementary-plane characters (U+10000 and above) are valid XML
         and are preserved.
      3. Collapse runs of spaces, drop spaces at the start of a line, and
         reduce three or more consecutive line breaks to one blank line.
      4. Strip leading/trailing whitespace.

    Args:
        content (str): The raw text content to sanitize

    Returns:
        str: Sanitized content with problematic characters replaced
    """
    logger.debug("Sanitizing content")

    content = content.translate(_TYPOGRAPHIC_REPLACEMENTS)

    # Replace invalid characters with a dash instead of a space so that the
    # position of the removed character stays visible in the output.
    sanitized = ''.join(char if _is_valid_xml_char(char) else '-' for char in content)

    # Remove multiple spaces while preserving newlines
    sanitized = re.sub(r' +', ' ', sanitized)
    sanitized = re.sub(r'\n +', '\n', sanitized)

    # Clean up empty lines while preserving paragraph structure
    sanitized = re.sub(r'\n\s*\n\s*\n+', '\n\n', sanitized)

    logger.debug("Content sanitization completed")
    return sanitized.strip()

In [3]:
# Cell 3: XML Structure Functions

def create_xml_structure(collection_id, name, description):
    """
    Build the skeleton of an artifacts-collection document.

    The returned tree has the shape::

        <artifacts_collection>
          <collection_info>
            <id/> <name/> <version>1.1</version> <description/>
          </collection_info>
          <artifacts/>
        </artifacts_collection>

    Args:
        collection_id (str): Unique identifier for the collection
        name (str): Name of the collection
        description (str): Description of the collection

    Returns:
        Element: The <artifacts_collection> root; its empty <artifacts>
        child is where individual artifacts are appended later.
    """
    logger.debug(f"Creating XML structure for collection: {collection_id}")

    root = ET.Element("artifacts_collection")
    info = ET.SubElement(root, "collection_info")

    # Children of <collection_info>, in document order.
    info_fields = (
        ("id", collection_id),
        ("name", name),
        ("version", "1.1"),
        ("description", description),
    )
    for tag, value in info_fields:
        ET.SubElement(info, tag).text = value

    # Empty container for the artifacts.
    ET.SubElement(root, "artifacts")

    return root

def add_artifact(artifacts_elem, req_id, content):
    """
    Append one <artifact> entry to the given container element.

    The new entry holds three children, in this order: <id> (set to
    `req_id`), <content> (set to `content`) and an empty <parent_id>.

    Args:
        artifacts_elem (Element): The artifacts XML element to add to
        req_id (str): Unique identifier for the artifact
        content (str): Content of the artifact

    Raises:
        Exception: Any error raised while building the elements is logged
            and then re-raised unchanged.
    """
    logger.debug(f"Adding artifact with ID: {req_id}")

    try:
        entry = ET.SubElement(artifacts_elem, "artifact")
        ET.SubElement(entry, "id").text = req_id
        ET.SubElement(entry, "content").text = content
        # Always present, always empty.
        ET.SubElement(entry, "parent_id")

        logger.debug(f"Successfully added artifact: {req_id}")
    except Exception as e:
        logger.error(f"Error adding artifact {req_id}: {e}")
        raise

def prettify_xml(elem):
    """
    Serialize an Element as indented XML text without the XML declaration.

    The element is serialized to UTF-8 bytes, re-parsed with minidom and
    pretty-printed with a two-space indent. minidom puts its own XML
    declaration on the first line; that line is dropped because callers
    write their own declaration.

    Args:
        elem (Element): XML element to prettify

    Returns:
        str: Pretty-printed XML string (everything after the first line)
    """
    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    pretty_xml = document.toprettyxml(indent="  ")

    # Everything after the first newline, i.e. the text minus its first line.
    _declaration, _newline, body = pretty_xml.partition('\n')
    return body

def validate_xml(xml_string):
    """
    Check that the given XML parses, i.e. that it is well-formed.

    Only well-formedness is checked (the input is handed to
    ET.fromstring); no schema or content validation takes place. A parse
    failure is logged at ERROR level.

    Args:
        xml_string (str): XML string to validate

    Returns:
        bool: True if XML is valid, False otherwise
    """
    try:
        ET.fromstring(xml_string)
    except ET.ParseError as e:
        logger.error(f"XML validation failed: {e}")
        return False
    return True

In [4]:
# Cell 4: Binary-Level File Processing and Requirement ID Extraction

def process_requirement_file(filepath):
    """
    Process a single requirement file and extract its content.

    The file is read as raw bytes and decoded in two stages:

      1. Complete multi-byte UTF-8 sequences for typographic characters are
         replaced with ASCII stand-ins, then the bytes are decoded as UTF-8
         (a leading byte-order mark is dropped).
      2. Only if that decode fails is the file treated as a legacy
         single-byte encoding: individual Latin-1 / Windows-1252 bytes are
         replaced and the result is decoded as latin-1.

    The single-byte replacements must NOT be applied to UTF-8 data: bytes
    such as A0, A9 or BB also occur as the second byte of ordinary UTF-8
    characters (for example C3 A0 is the letter a-grave and C3 A9 is
    e-acute) and inside the UTF-8 byte-order mark (EF BB BF). Replacing
    them breaks the sequence and forces the whole file into the latin-1
    fallback, garbling every accented letter.

    Args:
        filepath (str): Path to the requirement file

    Returns:
        str: Sanitized content of the file or None if error
    """
    logger.debug(f"Processing requirement file: {filepath}")

    # Whole UTF-8 sequences. Each starts with a lead byte, so in valid UTF-8
    # a match always lines up with exactly one character.
    utf8_replacements = [
        (b'\xc2\xb7', b'-'),        # middle dot
        (b'\xc2\xb0', b'-'),        # degree sign
        (b'\xc2\xa0', b' '),        # non-breaking space
        (b'\xc2\xbb', b'>'),        # right-pointing double angle quotation mark
        (b'\xc2\xab', b'<'),        # left-pointing double angle quotation mark
        (b'\xc2\xb6', b'-'),        # pilcrow sign
        (b'\xc2\xa7', b'-'),        # section sign
        (b'\xc2\xa9', b'(c)'),      # copyright sign
        (b'\xc2\xae', b'(r)'),      # registered sign
        (b'\xe2\x80\xa2', b'-'),    # bullet
        (b'\xe2\x80\x93', b'-'),    # en dash
        (b'\xe2\x80\x94', b'--'),   # em dash
        (b'\xe2\x80\x98', b"'"),    # left single quote
        (b'\xe2\x80\x99', b"'"),    # right single quote
        (b'\xe2\x80\x9c', b'"'),    # left double quote
        (b'\xe2\x80\x9d', b'"'),    # right double quote
        (b'\xe2\x80\xa6', b'...'),  # ellipsis
        (b'\xef\xbf\xbd', b'-'),    # U+FFFD replacement character
    ]

    # Single bytes; only meaningful when the file is NOT UTF-8.
    legacy_replacements = [
        # Latin-1
        (b'\xb7', b'-'),    # middle dot
        (b'\xb0', b'-'),    # degree sign
        (b'\xa0', b' '),    # non-breaking space
        (b'\xbb', b'>'),    # right-pointing double angle quotation mark
        (b'\xab', b'<'),    # left-pointing double angle quotation mark
        (b'\xb6', b'-'),    # pilcrow sign
        (b'\xa7', b'-'),    # section sign
        (b'\xa9', b'(c)'),  # copyright sign
        (b'\xae', b'(r)'),  # registered sign
        # Windows-1252 specific
        (b'\x95', b'-'),    # bullet
        (b'\x96', b'-'),    # en dash
        (b'\x97', b'--'),   # em dash
        (b'\x91', b"'"),    # left single quote
        (b'\x92', b"'"),    # right single quote
        (b'\x93', b'"'),    # left double quote
        (b'\x94', b'"'),    # right double quote
        (b'\x85', b'...'),  # ellipsis
    ]

    try:
        # Read the file as binary
        with open(filepath, 'rb') as file:
            binary_content = file.read()

        for old, new in utf8_replacements:
            binary_content = binary_content.replace(old, new)

        try:
            # utf-8-sig behaves like utf-8 but drops a leading BOM if present.
            content = binary_content.decode('utf-8-sig')
        except UnicodeDecodeError:
            # Not UTF-8: handle the legacy single-byte punctuation, then fall
            # back to latin-1, which can decode any byte value.
            logger.warning(f"UTF-8 decode error for {filepath}, falling back to latin-1")
            for old, new in legacy_replacements:
                binary_content = binary_content.replace(old, new)
            content = binary_content.decode('latin-1')

        # Apply sanitization to the decoded content
        return sanitize_content(content)

    except Exception as e:
        # Best effort: a file that cannot be processed is reported (message
        # plus traceback) and skipped by the caller rather than aborting the run.
        logger.exception(f"Error processing file {filepath}: {e}")
        return None

def extract_requirement_id_uc(filename, prefix):
    """
    Build a use-case ID from the digits found in a filename.

    The extension is removed, every digit character in the remaining name
    is concatenated in order of appearance, and `prefix` is put in front.
    Non-digit characters in the name are ignored.

    Args:
        filename (str): The filename to process (e.g. 'UC140.txt' or 'UC1.txt')
        prefix (str): The prefix to add ('UC')

    Returns:
        str: Formatted ID (e.g. 'UC140' or 'UC1'), or None when the name
        contains no digits or an error occurs.
    """
    logger.debug(f"Extracting requirement ID from UC filename: {filename}")

    try:
        stem, _extension = os.path.splitext(filename)
        number = ''.join(ch for ch in stem if ch.isdigit())

        if number:
            formatted_id = f"{prefix}{number}"
            logger.debug(f"Extracted ID: {formatted_id}")
            return formatted_id

        logger.warning(f"No numeric ID found in filename: {filename}")
        return None

    except Exception as e:
        logger.error(f"Error extracting requirement ID from {filename}: {e}")
        return None

def extract_requirement_id_cc(filename):
    """
    Build a class-code ID from a filename.

    The ID is the filename with its extension removed and 'CC_' put in
    front; the rest of the name is kept exactly as it is.

    Args:
        filename (str): The filename to process (e.g. 'Autenticazione.txt')

    Returns:
        str: Formatted ID (e.g. 'CC_Autenticazione'), or None if an error
        occurs.
    """
    logger.debug(f"Extracting requirement ID from CC filename: {filename}")

    try:
        # splitext only removes the final extension: 'a.b.txt' -> 'a.b'
        base_name, _extension = os.path.splitext(filename)
        formatted_id = f"CC_{base_name}"
        logger.debug(f"Extracted ID: {formatted_id}")
        return formatted_id

    except Exception as e:
        logger.error(f"Error extracting requirement ID from {filename}: {e}")
        return None

In [5]:
# Cell 5: Main Processing Functions

def _process_requirements(source_dir, output_file, collection_id, collection_name,
                          collection_desc, kind, extract_id):
    """
    Shared pipeline behind process_uc_requirements / process_cc_requirements.

    Every '*.txt' entry of `source_dir` (case-insensitive, in sorted name
    order) is turned into one artifact. The resulting document is checked
    for well-formedness and only written to `output_file` if that check
    passes.

    Args:
        source_dir (str): Directory containing the requirement files
        output_file (str): Path to output XML file
        collection_id (str): ID for the collection
        collection_name (str): Name for the collection
        collection_desc (str): Description for the collection
        kind (str): Short label used in log messages ('UC' or 'CC')
        extract_id (callable): Maps a filename to an artifact ID; a falsy
            result means the file is skipped.

    Raises:
        Exception: Any error while listing, building or writing is logged
            and re-raised.
    """
    logger.info(f"Processing {kind} requirements from directory: {source_dir}")
    logger.info(f"Creating output file: {output_file}")

    # Create XML structure
    root = create_xml_structure(collection_id, collection_name, collection_desc)
    artifacts_elem = root.find("artifacts")

    try:
        # Process each file in the directory
        for filename in sorted(os.listdir(source_dir)):
            if not filename.lower().endswith('.txt'):
                continue

            filepath = os.path.join(source_dir, filename)
            logger.debug(f"Processing {kind} file: {filepath}")

            req_id = extract_id(filename)
            if not req_id:
                logger.warning(f"Skipping file due to invalid ID format: {filepath}")
                continue

            content = process_requirement_file(filepath)
            if content is None:
                # process_requirement_file returns None only when it failed.
                logger.warning(f"Skipping file due to processing error: {filepath}")
            elif not content:
                # Read fine, but nothing is left after sanitization.
                logger.warning(f"Skipping file with no content: {filepath}")
            else:
                add_artifact(artifacts_elem, req_id, content)

        # Generate and validate XML. prettify_xml omits the declaration, so
        # the same one is used for the validation and for the written file.
        xml_declaration = '<?xml version="1.0" encoding="utf-8"?>\n'
        xml_string = prettify_xml(root)

        if validate_xml((xml_declaration + xml_string).encode('utf-8')):
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(xml_declaration)
                f.write(xml_string)

            logger.info(f"Successfully created XML file: {output_file}")
        else:
            logger.error("XML validation failed, file not written")

    except Exception as e:
        logger.error(f"Error processing {kind} requirements: {e}")
        raise


def process_uc_requirements(source_dir, output_file, collection_id, collection_name, collection_desc):
    """
    Process all UC requirement files in a directory and create XML output.

    Artifact IDs are 'UC' followed by the digits of each filename
    (see extract_requirement_id_uc).

    Args:
        source_dir (str): Directory containing UC files
        output_file (str): Path to output XML file
        collection_id (str): ID for the collection
        collection_name (str): Name for the collection
        collection_desc (str): Description for the collection
    """
    _process_requirements(
        source_dir, output_file, collection_id, collection_name, collection_desc,
        kind="UC",
        extract_id=lambda filename: extract_requirement_id_uc(filename, "UC"),
    )


def process_cc_requirements(source_dir, output_file, collection_id, collection_name, collection_desc):
    """
    Process all CC requirement files in a directory and create XML output.

    Artifact IDs are 'CC_' followed by each filename without its extension
    (see extract_requirement_id_cc).

    Args:
        source_dir (str): Directory containing CC files
        output_file (str): Path to output XML file
        collection_id (str): ID for the collection
        collection_name (str): Name for the collection
        collection_desc (str): Description for the collection
    """
    _process_requirements(
        source_dir, output_file, collection_id, collection_name, collection_desc,
        kind="CC",
        extract_id=lambda filename: extract_requirement_id_cc(filename),
    )

In [6]:
# Cell 6: Main Function

def main():
    """
    Convert both requirement sets to XML, use cases first and class code second.

    Input directories and output filenames are fixed relative paths, so the
    result depends on the current working directory.
    """
    logger.info("Starting requirements processing")

    # Use cases (UC) -> source artifacts
    uc_settings = {
        "source_dir": "./UC",
        "output_file": "eTOUR-sourceArtifacts.xml",
        "collection_id": "UC",
        "collection_name": "eTOUR Source Artifacts",
        "collection_desc": "Use cases",
    }
    process_uc_requirements(**uc_settings)

    # Class code (CC) -> target artifacts
    cc_settings = {
        "source_dir": "./CC",
        "output_file": "eTOUR-targetArtifacts.xml",
        "collection_id": "CC",
        "collection_name": "eTOUR Target Artifacts",
        "collection_desc": "Class code",
    }
    process_cc_requirements(**cc_settings)

    logger.info("Requirements processing completed")

In [None]:
# Cell 7: Execution Cell

# Run the whole pipeline when this code is executed as the main module.
# Importing the code from elsewhere does not trigger processing.
if __name__ == "__main__":
    main()