In [1]:
# Cell 1: Imports and Setup

import os
import logging
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
import unicodedata
from datetime import datetime
from pathlib import Path
import sys
import traceback

# Locate the project's 'src' directory relative to the current working directory
# and make it importable.
# NOTE(review): this assumes the kernel's working directory is the notebook's own
# folder and that the project root is exactly three levels above it - confirm.
notebook_path = Path.cwd()  # Working directory of the running kernel
project_root = notebook_path.parent.parent.parent  # Three directory levels up
src_path = project_root / 'src'
# Append only when missing so re-running this cell does not add duplicate entries.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Configure root logging at DEBUG level with a timestamp/name/level/message format.
# logging.basicConfig does nothing when the root logger already has handlers,
# so re-running this cell does not reconfigure logging.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
# Module-wide logger used by the functions defined in this notebook.
logger = logging.getLogger("requirements_processor")

In [2]:
# Cell 2: Utility Classes and Functions

class DebugTimer:
    """
    Small stopwatch that reports elapsed wall-clock time through a logger.

    A "Started" message is logged on construction; ``checkpoint`` reports the
    time since the previous checkpoint (or since construction) and ``end``
    reports the time since construction. All messages are logged at DEBUG level.

    Args:
        name: Label included in every log message
        logger: Object exposing a ``debug(message)`` method
    """
    def __init__(self, name, logger):
        self.name = name
        self.logger = logger
        # Both markers start from the same instant.
        started = datetime.now()
        self.start_time = started
        self.last_checkpoint = started
        self.logger.debug(f"Started timer '{self.name}'")

    def checkpoint(self, message):
        """Log the seconds elapsed since the previous checkpoint, then reset it"""
        stamp = datetime.now()
        seconds = (stamp - self.last_checkpoint).total_seconds()
        self.logger.debug(f"[{self.name}] {message} - {seconds:.3f}s")
        self.last_checkpoint = stamp

    def end(self):
        """Log the seconds elapsed since the timer was created"""
        seconds = (datetime.now() - self.start_time).total_seconds()
        self.logger.debug(f"[{self.name}] Total execution time: {seconds:.3f}s")

def handle_exception(func):
    """
    Decorator that logs any exception raised by ``func`` and re-raises it.

    The error message is logged at ERROR level and the full traceback at DEBUG
    level through the module-level ``logger``. The exception itself is never
    swallowed, so callers see exactly what ``func`` raised.

    Args:
        func: Function to wrap with exception logging

    Returns:
        Wrapped function that preserves the name, docstring and other metadata
        of ``func``
    """
    # Local import keeps this cell self-contained.
    import functools

    # functools.wraps copies __name__, __doc__, etc. onto the wrapper so that
    # decorated functions remain identifiable in logs, help() and tracebacks.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            logger.error(f"Exception in {func.__name__}: {str(e)}")
            logger.debug(traceback.format_exc())
            # Re-raise the exception after logging
            raise
    return wrapper

In [3]:
# Cell 3: Content Processing Functions

def sanitize_content(content):
    """
    Sanitize content to ensure XML/JSON compatibility.

    Processing steps, in order:
      1. Well-known typographic characters (curly quotes, dashes, ellipsis,
         currency signs, ...) are replaced by ASCII equivalents.
      2. ASCII control characters that XML 1.0 forbids (everything below
         U+0020 except tab, line feed and carriage return) become a space.
      3. Any remaining non-ASCII character becomes a space when it is a Unicode
         separator (category Z*) and a hyphen otherwise.
      4. Runs of spaces and of hyphens are collapsed, spaces following a
         newline are dropped, and three or more consecutive (blank) lines are
         reduced to a single empty line.

    Args:
        content: String to sanitize

    Returns:
        Sanitized ASCII string with leading/trailing whitespace removed
    """
    logger.debug("Sanitizing content")

    # Targeted substitutions for characters that have an obvious ASCII spelling
    replacements = {
        '\u2018': "'",  # Left single quote
        '\u2019': "'",  # Right single quote
        '\u201C': '"',  # Left double quote
        '\u201D': '"',  # Right double quote
        '\u2013': '-',  # En dash
        '\u2014': '--', # Em dash
        '\u2026': '...', # Ellipsis
        '\u00A0': ' ',  # Non-breaking space
        '\u00B7': '-',  # Middle dot - changed to hyphen
        '\u2022': '-',  # Bullet - changed to hyphen
        '\u2212': '-',  # Minus sign
        '\u00AD': '-',  # Soft hyphen
        '\u2010': '-',  # Hyphen
        '\u2011': '-',  # Non-breaking hyphen
        '\u00B0': ' degrees', # Degree sign
        '\u00AE': '(R)', # Registered trademark
        '\u00A9': '(C)', # Copyright
        '\u00A7': 'Section ', # Section sign
        '\u00A6': '|',   # Broken bar
        '\u00A5': 'Yen', # Yen sign
        '\u00A4': '$',   # Currency sign
        '\u00A3': 'GBP', # Pound sign
        '\u00A2': 'c',   # Cent sign
        '\u00A1': '!',   # Inverted exclamation mark
        '\u00BF': '?'    # Inverted question mark
    }

    # Apply specific replacements
    for old, new in replacements.items():
        content = content.replace(old, new)

    # The only characters below U+0020 that XML 1.0 allows in a document.
    xml_safe_controls = '\t\n\r'

    # Character-by-character pass. Pieces are collected in a list and joined
    # once, which avoids repeated string concatenation on large inputs.
    # Bullet-like glyphs, the U+FFFD replacement character, punctuation and
    # symbols need no special casing: none of them is a separator, so they all
    # fall through to the generic hyphen rule.
    pieces = []
    for char in content:
        codepoint = ord(char)
        if codepoint < 128:
            if codepoint < 0x20 and char not in xml_safe_controls:
                # Form feed, NUL, etc. would make the generated XML unparseable
                pieces.append(' ')
            else:
                pieces.append(char)
        elif unicodedata.category(char).startswith('Z'):  # Separators
            pieces.append(' ')
        else:
            pieces.append('-')  # Replace all other non-ASCII with hyphen
    sanitized = ''.join(pieces)

    # Remove multiple spaces while preserving newlines
    sanitized = re.sub(r' +', ' ', sanitized)
    sanitized = re.sub(r'\n +', '\n', sanitized)

    # Clean up multiple hyphens
    sanitized = re.sub(r'-+', '-', sanitized)

    # Clean up empty lines while preserving paragraph structure
    sanitized = re.sub(r'\n\s*\n\s*\n+', '\n\n', sanitized)

    logger.debug("Content sanitization completed")
    return sanitized.strip()

def strip_rtf(content):
    """
    Strip RTF formatting from text while preserving the actual content.

    Content that does not start with the RTF signature ``{\\rtf`` is returned
    unchanged. Otherwise the input is tokenized into control words, control
    symbols, group braces and plain characters, and only the document text is
    kept:

      * groups for non-body destinations (font/colour tables, style sheets,
        document info, pictures, headers/footers, ...) and any group marked
        with the ``\\*`` ignorable-destination symbol are skipped entirely;
      * paragraph/line/page breaks and tabs become whitespace, so adjacent
        paragraphs are not glued together;
      * ``\\'hh`` hex escapes and ``\\uN`` Unicode escapes are decoded rather
        than dropped, and escaped ``\\{``, ``\\}`` and ``\\\\`` are unescaped.

    Args:
        content: String potentially containing RTF formatting

    Returns:
        Plain text with RTF formatting removed and all whitespace runs
        collapsed to single spaces
    """
    logger.debug("Stripping RTF formatting")

    # Check if content appears to be RTF
    if not content.startswith('{\\rtf'):
        return content

    # Destinations whose group content is metadata rather than document text.
    skipped_destinations = frozenset((
        'fonttbl', 'colortbl', 'stylesheet', 'info', 'pict', 'object',
        'header', 'headerl', 'headerr', 'headerf',
        'footer', 'footerl', 'footerr', 'footerf',
        'footnote', 'annotation', 'fldinst', 'generator',
        'listtable', 'listoverridetable', 'rsidtbl', 'revtbl', 'filetbl',
        'themedata', 'colorschememapping', 'latentstyles', 'datastore',
        'xmlnstbl', 'bkmkstart', 'bkmkend',
    ))

    # Control words that stand for a piece of text.
    control_text = {
        'par': '\n', 'line': '\n', 'sect': '\n\n', 'page': '\n\n',
        'tab': '\t',
        'emdash': '\u2014', 'endash': '\u2013',
        'emspace': '\u2003', 'enspace': '\u2002', 'qmspace': '\u2005',
        'bullet': '\u2022',
        'lquote': '\u2018', 'rquote': '\u2019',
        'ldblquote': '\u201C', 'rdblquote': '\u201D',
    }

    token_pattern = re.compile(
        r"\\([a-zA-Z]{1,32})(-?\d{1,10})?[ ]?"  # control word, optional numeric arg, one delimiter space
        r"|\\'([0-9a-fA-F]{2})"                 # hex-escaped byte
        r"|\\([^a-zA-Z])"                       # control symbol
        r"|([{}])"                              # group open / close
        r"|[\r\n]+"                             # raw line breaks carry no meaning in RTF
        r"|(.)"                                 # any other character is document text
    )

    group_stack = []   # (ucskip, ignoring) saved at every '{'
    ignoring = False   # True while inside a skipped destination
    ucskip = 1         # number of fallback characters that follow a \uN escape
    pending_skip = 0   # fallback characters still to be discarded
    pieces = []

    for token in token_pattern.finditer(content):
        word, arg, hex_code, symbol, brace, char = token.groups()

        if brace is not None:
            pending_skip = 0
            if brace == '{':
                group_stack.append((ucskip, ignoring))
            elif group_stack:
                # Leaving a group restores the state of the enclosing group;
                # an unbalanced '}' is tolerated and simply ignored.
                ucskip, ignoring = group_stack.pop()
        elif symbol is not None:
            pending_skip = 0
            if symbol == '*':
                # "\*" marks a destination that readers may skip if unknown
                ignoring = True
            elif ignoring:
                pass
            elif symbol in '{}\\':
                pieces.append(symbol)      # escaped literal brace/backslash
            elif symbol == '~':
                pieces.append(' ')         # non-breaking space
            elif symbol == '_':
                pieces.append('-')         # non-breaking hyphen
            elif symbol in '\r\n':
                pieces.append('\n')        # backslash-newline equals \par
            # Other symbols (e.g. the optional hyphen "\-") produce no text.
        elif word is not None:
            pending_skip = 0
            if word in skipped_destinations:
                ignoring = True
            elif ignoring:
                pass
            elif word in control_text:
                pieces.append(control_text[word])
            elif word == 'uc' and arg is not None:
                ucskip = int(arg)
            elif word == 'u' and arg is not None:
                codepoint = int(arg)
                if codepoint < 0:
                    # \uN carries a signed 16-bit value
                    codepoint += 0x10000
                if 0 <= codepoint <= 0x10FFFF:
                    pieces.append(chr(codepoint))
                pending_skip = ucskip
            # Every other control word is pure formatting and yields no text.
        elif hex_code is not None:
            if pending_skip > 0:
                pending_skip -= 1
            elif not ignoring:
                # NOTE(review): assumes the Windows-1252 code page, the usual
                # \ansi default; documents declaring another \ansicpg will
                # decode differently - confirm against the input corpus.
                pieces.append(bytes.fromhex(hex_code).decode('cp1252', errors='replace'))
        elif char is not None:
            if pending_skip > 0:
                pending_skip -= 1
            elif not ignoring:
                pieces.append(char)

    # Clean up the resulting text
    plain_text = re.sub(r'\s+', ' ', ''.join(pieces))  # Normalize whitespace
    plain_text = plain_text.strip()

    logger.debug("RTF formatting stripped successfully")
    return plain_text

In [4]:
# Cell 4: XML Structure Functions

def create_xml_structure(collection_id, name, description):
    """
    Build the skeleton of an artifacts-collection XML document.

    The returned root ``artifacts_collection`` element has two children: a
    ``collection_info`` element holding ``id``, ``name``, ``version`` (always
    "1.1") and ``description``, followed by an empty ``artifacts`` container.

    Args:
        collection_id: ID for the collection
        name: Name of the collection
        description: Description of the collection

    Returns:
        XML Element representing the root of the structure
    """
    logger.debug(f"Creating XML structure for collection: {collection_id}")

    root = ET.Element("artifacts_collection")

    # Metadata children, in document order
    info_node = ET.SubElement(root, "collection_info")
    info_fields = (
        ("id", collection_id),
        ("name", name),
        ("version", "1.1"),
        ("description", description),
    )
    for tag, text in info_fields:
        ET.SubElement(info_node, tag).text = text

    # Empty container that artifacts are appended to later
    ET.SubElement(root, "artifacts")

    return root

def add_artifact(artifacts_elem, req_id, content):
    """
    Append one ``artifact`` element to the given container.

    The new element receives three children in this order: ``id`` (text set
    to ``req_id``), ``content`` (text set to ``content``) and an empty
    ``parent_id``. Any failure is logged at ERROR level and re-raised.

    Args:
        artifacts_elem: XML element to add artifact to
        req_id: ID for the artifact
        content: Content of the artifact
    """
    logger.debug(f"Adding artifact with ID: {req_id}")

    try:
        node = ET.SubElement(artifacts_elem, "artifact")
        ET.SubElement(node, "id").text = req_id
        ET.SubElement(node, "content").text = content
        ET.SubElement(node, "parent_id")  # left empty

        logger.debug(f"Successfully added artifact: {req_id}")
    except Exception as err:
        logger.error(f"Error adding artifact {req_id}: {err}")
        raise

def validate_xml(xml_string):
    """
    Check whether the given XML text parses as a well-formed document.

    Only well-formedness is checked (via ``ET.fromstring``); no schema is
    involved. A parse failure is logged at ERROR level.

    Args:
        xml_string: XML string to validate

    Returns:
        True when parsing succeeds, False when it raises ``ET.ParseError``
    """
    try:
        ET.fromstring(xml_string)
    except ET.ParseError as parse_error:
        logger.error(f"XML validation failed: {parse_error}")
        return False
    else:
        return True

def prettify_xml(elem):
    """
    Render an Element as indented XML text without an XML declaration.

    The element is serialized, re-parsed with minidom and pretty-printed with
    a two-space indent. Callers are expected to prepend their own declaration.

    Args:
        elem: XML Element to prettify

    Returns:
        Pretty-printed XML string (no declaration line, trailing newline kept)
    """
    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    # Pretty-printing the root element instead of the whole document yields
    # the same text minus the leading '<?xml ...?>' line.
    return document.documentElement.toprettyxml(indent="  ")

In [5]:
# Cell 5: Java Code Extraction Function

@handle_exception
def extract_java_headers_and_comments(content):
    """
    Extract only imports, comments, and method/class declarations from Java-like content
    without including any implementation details.

    The result is assembled from these parts, in order, separated by blank
    lines: the first package statement, all import statements, the first class
    declaration (or, if there is none, the first interface declaration), every
    method signature that starts with an access modifier (preceded by its
    Javadoc block when one sits directly in front of it), the Javadoc blocks
    not attached to a method, and all ``//`` line comments.

    Extraction is purely regex based; no Java parsing takes place.

    Args:
        content: String containing Java code

    Returns:
        String containing extracted imports, comments, and declarations only
    """
    timer = DebugTimer("java_extraction", logger)
    logger.debug("Starting Java headers and comments extraction")

    # Normalize comment spacing issues (fix spaces in comment markers)
    content = re.sub(r'/\s*\*\s*\*', '/**', content)
    content = re.sub(r'\*\s*/', '*/', content)
    content = re.sub(r'/\s*/', '//', content)

    # Extract package declaration
    package_match = re.search(r'package\s+[^;]+;', content)
    package_line = package_match.group(0) if package_match else ""

    # Extract import statements
    import_pattern = re.compile(r'import\s+[^;]+;', re.MULTILINE)
    imports = import_pattern.findall(content)

    # Extract class declaration with inheritance (without implementation)
    class_pattern = re.compile(r'(public|private|protected)?\s*class\s+\w+(\s+extends\s+\w+)?(\s+implements\s+[^{(]+)?', re.MULTILINE)
    class_match = class_pattern.search(content)
    class_declaration = class_match.group(0) if class_match else ""

    # Extract interface declaration (without implementation)
    interface_pattern = re.compile(r'(public|private|protected)?\s*interface\s+\w+(\s+extends\s+[^{(]+)?', re.MULTILINE)
    interface_match = interface_pattern.search(content)
    interface_declaration = interface_match.group(0) if interface_match else ""

    # Extract method signatures with their comments, but without implementation.
    # The comment body is matched with a tempered pattern, (?:(?!\*/)[\s\S])*,
    # so it can never run past its own closing "*/". A plain lazy "[\s\S]*?"
    # would, on backtracking, stretch from an earlier Javadoc block across
    # class headers and field code up to the "*/" directly before a method,
    # leaking implementation text into the extracted "comment".
    method_pattern = re.compile(
        r'(/\*\*(?:(?!\*/)[\s\S])*\*/\s*)?'  # Optional block comment (single block only)
        r'(public|private|protected)(\s+static)?\s+'  # Access modifier
        r'[\w<>\[\],\s]+\s+'  # Return type
        r'(\w+)\s*'  # Method name
        r'\([^)]*\)'  # Parameters
        r'(\s+throws\s+[\w,\s]+)?',  # Optional throws clause
        re.DOTALL
    )

    methods = []
    for method_match in method_pattern.finditer(content):
        # If there's a comment, include it
        comment = method_match.group(1) or ""

        # Get just the method signature (no implementation)
        signature = method_match.group(0)[len(comment):].strip()

        # Add to methods list
        if comment:
            methods.append(f"{comment}\n{signature}")
        else:
            methods.append(signature)

    # Extract all block comments that might not be associated with methods
    block_comment_pattern = re.compile(r'/\*\*[\s\S]*?\*/', re.DOTALL)
    block_comments = []

    for comment_match in block_comment_pattern.finditer(content):
        comment = comment_match.group(0)
        # Only add if not already part of a method
        if not any(comment in method for method in methods):
            block_comments.append(comment)

    # Extract all line comments
    line_comment_pattern = re.compile(r'//.*$', re.MULTILINE)
    line_comments = [line_match.group(0) for line_match in line_comment_pattern.finditer(content)]

    # Combine all extracted elements
    extracted_parts = []

    if package_line:
        extracted_parts.append(package_line)

    if imports:
        extracted_parts.append("\n".join(imports))

    # Add class or interface declaration (without implementation)
    if class_declaration:
        extracted_parts.append(class_declaration)
    elif interface_declaration:
        extracted_parts.append(interface_declaration)

    # Add methods (signatures only, no implementation)
    if methods:
        extracted_parts.append("\n\n".join(methods))

    # Add block comments that weren't already included
    if block_comments:
        extracted_parts.append("\n\n".join(block_comments))

    # Add all line comments (these are not de-duplicated against other parts)
    if line_comments:
        extracted_parts.append("\n".join(line_comments))

    timer.checkpoint("Finished extraction")
    result = "\n\n".join(extracted_parts)
    timer.end()

    return result

In [6]:
# Cell 6: File Processing Functions

def _convert_requirement_text(content, filepath):
    """Turn decoded file text into sanitized artifact content (Java extraction or RTF stripping, then sanitizing)."""
    # NOTE(review): these are plain substring checks, so prose that happens to
    # contain e.g. "package " or "import " is also treated as Java - confirm
    # that this cannot occur in the use-case files.
    java_indicators = ['package ', 'import ', 'public class', 'private class', 'protected class']
    is_java = any(indicator in content for indicator in java_indicators)

    if is_java:
        logger.debug(f"Detected Java-like content in {filepath}")
        content = extract_java_headers_and_comments(content)
    else:
        # Check for and strip RTF formatting
        content = strip_rtf(content)

    return sanitize_content(content)


@handle_exception
def process_requirement_file(filepath):
    """
    Process a single requirement file and extract its content.
    For Java-like content (in CC directory), only extract comments, headers, and dependencies.
    For RTF content, strip the formatting.

    The file is decoded strictly as UTF-8 first (a leading byte-order mark is
    dropped). Only when the bytes are not valid UTF-8 is the file re-read as
    latin-1; a warning about this is logged once per directory.

    Args:
        filepath: Path to the requirement file

    Returns:
        Sanitized content from the file, or None when the file could not be
        read or processed (the error is logged)
    """
    logger.debug(f"Processing requirement file: {filepath}")

    try:
        # Strict decoding raises UnicodeDecodeError on invalid bytes, so a
        # file that legitimately contains U+FFFD is not mistaken for a
        # mis-encoded one. 'utf-8-sig' behaves like UTF-8 but also removes a
        # BOM, which would otherwise hide the '{\rtf' prefix from strip_rtf.
        with open(filepath, 'r', encoding='utf-8-sig') as file:
            content = file.read().strip()

        return _convert_requirement_text(content, filepath)

    except UnicodeDecodeError:
        # Only log warning once per directory
        dir_path = os.path.dirname(filepath)
        if not hasattr(process_requirement_file, '_warned_dirs'):
            process_requirement_file._warned_dirs = set()
        if dir_path not in process_requirement_file._warned_dirs:
            logger.warning(f"Files in {dir_path} appear to use latin-1 encoding instead of UTF-8")
            process_requirement_file._warned_dirs.add(dir_path)

        try:
            # latin-1 maps every byte to a character, so decoding cannot fail
            with open(filepath, 'r', encoding='latin-1') as file:
                content = file.read().strip()

            return _convert_requirement_text(content, filepath)

        except Exception as e:
            logger.error(f"Error reading file {filepath} with latin-1 encoding: {str(e)}")
            return None

    except Exception as e:
        logger.error(f"Error reading file {filepath}: {str(e)}")
        return None

def extract_requirement_id(filename, prefix):
    """
    Derive an artifact ID from a file name.

    With prefix "UC" the ID is the prefix followed by every digit found in the
    file name's stem; any other prefix is joined to the whole stem with an
    underscore.

    Args:
        filename (str): The filename to process (e.g. 'UC1.txt' or 'Banner.txt')
        prefix (str): The prefix to add ('UC' or 'CC')

    Returns:
        str: Formatted ID (e.g. 'UC1' or 'CC_Banner'), or None when a UC file
        name contains no digit or an unexpected error occurs
    """
    logger.debug(f"Extracting requirement ID from filename: {filename} with prefix: {prefix}")

    try:
        # Drop the extension, keep the stem
        stem, _extension = os.path.splitext(filename)

        if prefix != "UC":
            # Non-UC (e.g. CC) files keep the whole stem
            formatted_id = f"{prefix}_{stem}"
        else:
            # UC files are identified by their digits only
            digits = ''.join(ch for ch in stem if str.isdigit(ch))
            if not digits:
                logger.warning(f"No numeric ID found in filename: {filename}")
                return None
            formatted_id = f"{prefix}{digits}"

        logger.debug(f"Extracted ID: {formatted_id}")
        return formatted_id

    except Exception as err:
        logger.error(f"Error extracting requirement ID from {filename}: {err}")
        return None

In [7]:
# Cell 7: Main Processing Functions

@handle_exception
def process_requirements(source_dir, prefix, output_file, collection_id, collection_name, collection_desc):
    """
    Process all requirement files in a directory and create XML output.

    Every file in ``source_dir`` whose name ends in ``.txt`` (in any letter
    case) is processed in sorted name order and added as an artifact. Files
    without a usable ID, and files that yield no content, are skipped with a
    warning. The document is written to ``output_file`` only if it parses as
    well-formed XML; otherwise an error is logged and nothing is written.

    Args:
        source_dir: Directory containing requirement files
        prefix: Prefix for requirement IDs
        output_file: Output file path
        collection_id: ID for the collection
        collection_name: Name of the collection
        collection_desc: Description of the collection

    Raises:
        Exception: Any error raised while listing, processing or writing is
            logged and re-raised unchanged
    """
    logger.info(f"Processing requirements from directory: {source_dir}")
    logger.info(f"Creating output file: {output_file}")

    # Create XML structure
    root = create_xml_structure(collection_id, collection_name, collection_desc)
    artifacts_elem = root.find("artifacts")

    # prettify_xml returns the document without a declaration; the same text
    # is used for validation and for the written file.
    xml_declaration = '<?xml version="1.0" encoding="utf-8"?>\n'

    try:
        # Process each file in the directory
        for filename in sorted(os.listdir(source_dir)):
            # Case-insensitive match so '.Txt' and similar are not skipped
            if not filename.lower().endswith('.txt'):
                continue

            filepath = os.path.join(source_dir, filename)
            logger.debug(f"Processing file: {filepath}")

            # Extract requirement ID with prefix
            req_id = extract_requirement_id(filename, prefix)
            if not req_id:
                logger.warning(f"Skipping file due to invalid ID format: {filepath}")
                continue

            # Process the file; None or empty content means nothing to add
            content = process_requirement_file(filepath)
            if content:
                add_artifact(artifacts_elem, req_id, content)
            else:
                logger.warning(f"Skipping file due to processing error: {filepath}")

        # Generate and validate XML
        xml_string = prettify_xml(root)

        if validate_xml((xml_declaration + xml_string).encode('utf-8')):
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(xml_declaration)
                f.write(xml_string)

            logger.info(f"Successfully created XML file: {output_file}")
        else:
            logger.error("XML validation failed, file not written")

    except Exception as e:
        logger.error(f"Error processing requirements: {e}")
        raise

def main():
    """
    Generate the two eTOUR artifact XML files.

    Runs ``process_requirements`` first for the use cases in ``./UC`` and then
    for the class code in ``./CC``. An exception from the first run prevents
    the second from starting.
    """
    logger.info("Starting requirements processing")

    conversion_jobs = (
        # Use cases (UC)
        dict(
            source_dir="./UC",
            prefix="UC",
            output_file="eTOUR-comments-functions-sourceArtifacts.xml",
            collection_id="UC",
            collection_name="eTOUR Source Artifacts",
            collection_desc="Use cases",
        ),
        # Class codes (CC)
        dict(
            source_dir="./CC",
            prefix="CC",
            output_file="eTOUR-comments-functions-targetArtifacts.xml",
            collection_id="CC",
            collection_name="eTOUR Target Artifacts",
            collection_desc="Class code headers and comments",
        ),
    )

    for job in conversion_jobs:
        process_requirements(**job)

    logger.info("Requirements processing completed")

In [None]:
# Cell 8: Execution Cell

# Run the full conversion only when this code executes as the top-level module
# (its __name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()