In [None]:
import os
import logging
import xml.etree.ElementTree as ET
from xml.dom import minidom
import re
from datetime import datetime
from pathlib import Path
import sys

# Locate the project root relative to the current working directory so that the
# project's `src` directory can be added to the import path.
# NOTE(review): this assumes the notebook runs from a directory exactly three
# levels below the project root — confirm if the notebook is moved.
notebook_path = Path.cwd()  # Current working directory (assumed to be the notebook location)
project_root = notebook_path.parent.parent.parent  # Go up 3 levels to reach project root
src_path = project_root / 'src'
# Only append once so re-running the cell does not grow sys.path.
if str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# Project-local logging helpers; presumably resolved through the `src` path added above.
from thesis_sentence_transformer.logger import setup_logging, handle_exception, DebugTimer

# Module-wide logger shared by the functions defined below.
logger = setup_logging("requirements_processor")

def sanitize_content(content):
    """
    Sanitize content to ensure XML/JSON compatibility.

    Typographic punctuation is mapped to ASCII equivalents, every character
    that is not allowed in an XML 1.0 document is replaced by a space, and
    whitespace is normalized while preserving paragraph breaks.

    Args:
        content: String to sanitize

    Returns:
        The sanitized string with leading/trailing whitespace removed
    """
    logger.debug("Sanitizing content")

    # First, replace common problematic characters with ASCII equivalents.
    replacements = str.maketrans({
        '\u2018': "'",   # Left single quote
        '\u2019': "'",   # Right single quote
        '\u201C': '"',   # Left double quote
        '\u201D': '"',   # Right double quote
        '\u2013': '-',   # En dash
        '\u2014': '--',  # Em dash
        '\u2026': '...'  # Ellipsis
    })
    content = content.translate(replacements)

    # XML 1.0 "Char" production:
    #   #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    # The surrogate block (U+D800-U+DFFF) is therefore invalid and would also
    # make UTF-8 serialization fail, while supplementary-plane characters are
    # valid and must be kept. The noncharacters U+FDD0-U+FDEF are technically
    # permitted but discouraged, so they are still replaced.
    invalid_xml_chars = (
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFD'
        r'\U00010000-\U0010FFFF]'
    )

    # Replace invalid characters with space
    sanitized = re.sub(invalid_xml_chars, ' ', content)

    # Remove multiple spaces while preserving newlines
    sanitized = re.sub(r' +', ' ', sanitized)
    sanitized = re.sub(r'\n +', '\n', sanitized)

    # Clean up empty lines while preserving paragraph structure
    sanitized = re.sub(r'\n\s*\n\s*\n+', '\n\n', sanitized)

    logger.debug("Content sanitization completed")
    return sanitized.strip()

def create_xml_structure(collection_id, name, description):
    """
    Build the skeleton of an artifacts collection document.

    Args:
        collection_id: Text for the collection's <id> element
        name: Text for the collection's <name> element
        description: Text for the collection's <description> element

    Returns:
        The <artifacts_collection> root element containing a populated
        <collection_info> and an empty <artifacts> container
    """
    logger.debug(f"Creating XML structure for collection: {collection_id}")

    root = ET.Element("artifacts_collection")

    # Collection metadata, emitted in this fixed order.
    info_fields = (
        ("id", collection_id),
        ("name", name),
        ("version", "1.1"),
        ("description", description),
    )
    collection_info = ET.SubElement(root, "collection_info")
    for tag, text in info_fields:
        ET.SubElement(collection_info, tag).text = text

    # Empty container that callers fill with <artifact> elements.
    ET.SubElement(root, "artifacts")

    return root

@handle_exception
def extract_java_comments(content):
    """
    Extract comments from Java-like content.

    Only Javadoc-style block comments (opened by ``/**``) and ``//`` line
    comments are recognized. Comments are returned in the order they appear,
    joined by newlines. Scanning stops at the first unclosed block comment.

    Args:
        content: String containing Java code
    Returns:
        String containing the extracted comments
    """
    timer = DebugTimer("java_comment_extraction", logger)
    logger.debug("Starting Java comment extraction")

    extracted_content = []

    # Track position in content
    pos = 0
    content_length = len(content)
    logger.debug(f"Processing content of length: {content_length}")

    while pos < content_length:
        timer.checkpoint(f"Processing position {pos}")
        iteration_start = pos

        # Look for comment markers
        block_comment_start = content.find("/**", pos)
        line_comment_start = content.find("//", pos)

        logger.debug(f"Found block comment at: {block_comment_start}, line comment at: {line_comment_start}")

        # No more comments found
        if block_comment_start == -1 and line_comment_start == -1:
            logger.debug("No more comments found, breaking loop")
            break

        block_comes_first = block_comment_start != -1 and (
            line_comment_start == -1 or block_comment_start < line_comment_start
        )

        if block_comes_first:
            logger.debug(f"Processing block comment at position {block_comment_start}")
            block_comment_end = content.find("*/", block_comment_start)

            if block_comment_end == -1:
                logger.warning("Unclosed block comment found, breaking")
                break

            extracted_content.append(content[block_comment_start:block_comment_end + 2])
            pos = block_comment_end + 2
            logger.debug(f"Advanced position to: {pos} after block comment")
        else:
            logger.debug(f"Processing line comment at position {line_comment_start}")
            line_end = content.find("\n", line_comment_start)

            # A line comment on the last line has no trailing newline.
            if line_end == -1:
                line_end = content_length

            extracted_content.append(content[line_comment_start:line_end])
            pos = line_end + 1
            logger.debug(f"Advanced position to: {pos} after line comment")

        # Failsafe against an infinite loop. The comparison is against the
        # position at the start of this iteration: comparing against
        # block_comment_start would fire whenever a block comment begins right
        # after a line comment's newline and would skip that block comment.
        if pos <= iteration_start:
            logger.warning(f"Position not advancing at {pos}, forcing advancement")
            pos = iteration_start + 1

    timer.checkpoint("Finished comment extraction")
    result = "\n".join(extracted_content)
    timer.end()

    return result

# One RTF token per match: control word, hex-escaped byte, control symbol,
# group brace, raw line break (not text in RTF), or a plain text character.
_RTF_TOKEN = re.compile(
    r"\\([a-zA-Z]{1,32})(-?\d{1,10})?[ ]?"
    r"|\\'([0-9a-fA-F]{2})"
    r"|\\(.)"
    r"|([{}])"
    r"|[\r\n]+"
    r"|(.)",
    re.DOTALL,
)

# Destinations whose group content is document metadata rather than body text.
_RTF_SKIPPED_DESTINATIONS = frozenset({
    'fonttbl', 'colortbl', 'stylesheet', 'info', 'pict', 'object',
    'header', 'headerl', 'headerr', 'headerf',
    'footer', 'footerl', 'footerr', 'footerf',
    'generator', 'listtable', 'listoverridetable', 'revtbl', 'rsidtbl',
    'themedata', 'colorschememapping', 'latentstyles', 'datastore',
    'filetbl', 'fldinst', 'xmlnstbl',
})

# Control words that stand for actual text or whitespace.
_RTF_TEXT_CONTROL_WORDS = {
    'par': '\n', 'line': '\n', 'sect': '\n', 'page': '\n', 'row': '\n',
    'tab': '\t', 'cell': ' ', 'emspace': ' ', 'enspace': ' ',
    'emdash': '\u2014', 'endash': '\u2013', 'bullet': '\u2022',
    'lquote': '\u2018', 'rquote': '\u2019',
    'ldblquote': '\u201C', 'rdblquote': '\u201D',
}


def strip_rtf(content):
    """
    Strip RTF formatting from text while preserving the actual content.

    The document is tokenized instead of being regex-stripped, so that
    nested groups are balanced, metadata destinations (font table, colour
    table, ``\\*`` groups, ...) are skipped, paragraph/tab control words
    become whitespace instead of gluing words together, and escaped
    characters (``\\'xx``, ``\\uN``, ``\\{``, ``\\}``, ``\\\\``) are decoded
    rather than dropped.

    Args:
        content: String potentially containing RTF formatting

    Returns:
        ``content`` unchanged when it does not start with ``{\\rtf``;
        otherwise the plain text with all whitespace runs collapsed to a
        single space and the ends trimmed
    """
    logger.debug("Stripping RTF formatting")

    # Check if content appears to be RTF
    if not content.startswith('{\\rtf'):
        return content

    pieces = []
    group_stack = []   # (ignoring, unicode_skip) saved for each open group
    ignoring = False   # True while inside a skipped destination
    unicode_skip = 1   # fallback characters following each \uN (set by \ucN)
    pending_skip = 0   # fallback characters still to be discarded

    for token in _RTF_TOKEN.finditer(content):
        word, arg, hex_byte, symbol, brace, char = token.groups()

        if brace:
            pending_skip = 0
            if brace == '{':
                group_stack.append((ignoring, unicode_skip))
            elif group_stack:
                # Tolerate unbalanced closing braces instead of failing.
                ignoring, unicode_skip = group_stack.pop()
        elif symbol is not None:
            pending_skip = 0
            if symbol == '*':
                # "\*" marks a destination that readers may ignore.
                ignoring = True
            elif ignoring:
                continue
            elif symbol in '{}\\':
                pieces.append(symbol)      # escaped literal character
            elif symbol in '~\r\n':
                pieces.append(' ')         # non-breaking space / "\<newline>" paragraph break
            elif symbol == '_':
                pieces.append('-')         # non-breaking hyphen
        elif word:
            pending_skip = 0
            if word in _RTF_SKIPPED_DESTINATIONS:
                ignoring = True
            elif ignoring:
                continue
            elif word in _RTF_TEXT_CONTROL_WORDS:
                pieces.append(_RTF_TEXT_CONTROL_WORDS[word])
            elif word == 'uc' and arg:
                unicode_skip = max(0, int(arg))
            elif word == 'u' and arg:
                code = int(arg)
                if code < 0:
                    code += 0x10000        # \uN is a signed 16-bit value
                if 0 <= code <= 0x10FFFF:
                    pieces.append(chr(code))
                pending_skip = unicode_skip
        elif hex_byte:
            if pending_skip > 0:
                pending_skip -= 1
            elif not ignoring:
                # NOTE(review): assumes Windows-1252, the usual \ansi code
                # page; documents declaring another \ansicpgN are not handled.
                pieces.append(
                    bytes([int(hex_byte, 16)]).decode('cp1252', errors='replace')
                )
        elif char:
            if pending_skip > 0:
                pending_skip -= 1
            elif not ignoring:
                pieces.append(char)

    plain_text = ''.join(pieces)
    # \uN encodes supplementary characters as UTF-16 surrogate pairs;
    # recombine pairs and neutralize any unpaired surrogate.
    plain_text = plain_text.encode('utf-16-le', 'surrogatepass').decode('utf-16-le', 'replace')

    # Clean up the resulting text
    plain_text = re.sub(r'\s+', ' ', plain_text)  # Normalize whitespace
    plain_text = plain_text.strip()

    logger.debug("RTF formatting stripped successfully")
    return plain_text

def process_requirement_file(filepath):
    """
    Process a single requirement file and extract its content.
    For Java-like content (in CC directory), only extract comments.
    For RTF content, strip the formatting.

    The file is decoded strictly as UTF-8; only a real decoding failure
    triggers the latin-1 fallback, so a valid UTF-8 file that happens to
    contain U+FFFD is no longer re-read with the wrong encoding.

    Args:
        filepath: Path to the requirement file

    Returns:
        Sanitized content from the file, or None if the file could not be
        read or processed (the error is logged)
    """
    logger.debug(f"Processing requirement file: {filepath}")

    encoding = 'utf-8'
    try:
        try:
            with open(filepath, 'r', encoding='utf-8') as file:
                content = file.read().strip()
        except UnicodeDecodeError:
            encoding = 'latin-1'

            # Only log warning once per directory
            dir_path = os.path.dirname(filepath)
            if not hasattr(process_requirement_file, '_warned_dirs'):
                process_requirement_file._warned_dirs = set()
            if dir_path not in process_requirement_file._warned_dirs:
                logger.warning(f"Files in {dir_path} appear to use latin-1 encoding instead of UTF-8")
                process_requirement_file._warned_dirs.add(dir_path)

            # latin-1 maps every byte to a character, so this decode cannot fail.
            with open(filepath, 'r', encoding='latin-1') as file:
                content = file.read().strip()

        # Check if this is likely Java code (substring heuristic)
        java_indicators = ('package ', 'import ', 'public class', 'private class', 'protected class')
        if any(indicator in content for indicator in java_indicators):
            logger.debug(f"Detected Java-like content in {filepath}")
            content = extract_java_comments(content)
        else:
            # Check for and strip RTF formatting
            content = strip_rtf(content)

        return sanitize_content(content)

    except Exception as e:
        # Best-effort: a failing file is reported and skipped by the caller.
        if encoding == 'latin-1':
            logger.error(f"Error reading file {filepath} with latin-1 encoding: {str(e)}")
        else:
            logger.error(f"Error reading file {filepath}: {str(e)}")
        return None

def add_artifact(artifacts_elem, req_id, content):
    """
    Append one <artifact> element to the artifacts container.

    Args:
        artifacts_elem: Parent element that receives the new <artifact>
        req_id: Text for the artifact's <id> element
        content: Text for the artifact's <content> element

    Raises:
        Exception: Any error raised while building the element is logged
            and re-raised
    """
    logger.debug(f"Adding artifact with ID: {req_id}")

    # <parent_id> is always emitted, but without text.
    children = (("id", req_id), ("content", content), ("parent_id", None))

    try:
        artifact = ET.SubElement(artifacts_elem, "artifact")
        for tag, text in children:
            ET.SubElement(artifact, tag).text = text
    except Exception as e:
        logger.error(f"Error adding artifact {req_id}: {e}")
        raise
    else:
        logger.debug(f"Successfully added artifact: {req_id}")

def validate_xml(xml_string):
    """
    Check that the given XML parses as well-formed.

    Args:
        xml_string: XML document as str or bytes

    Returns:
        True if parsing succeeds, False (after logging the parse error) otherwise
    """
    try:
        ET.fromstring(xml_string)
    except ET.ParseError as parse_error:
        logger.error(f"XML validation failed: {parse_error}")
        return False
    else:
        return True

def prettify_xml(elem):
    """
    Render an Element as indented XML without the XML declaration.

    Args:
        elem: Element to serialize

    Returns:
        Pretty-printed XML (two-space indent) with minidom's leading
        declaration line removed, since callers add their own declaration
    """
    serialized = ET.tostring(elem, 'utf-8')
    document = minidom.parseString(serialized)
    pretty_xml = document.toprettyxml(indent="  ")
    # Everything after the first newline, i.e. without the declaration line.
    _declaration, _newline, body = pretty_xml.partition('\n')
    return body

def extract_requirement_id(filename, prefix):
    """
    Build a requirement ID from the digits of a filename plus a prefix.

    All digit characters of the filename (extension removed) are
    concatenated in order and appended to the prefix.

    Args:
        filename (str): The filename to process (e.g. 'EA140.txt' or 'EA1.txt')
        prefix (str): The prefix to add ('UC' or 'CC')

    Returns:
        str: Formatted ID (e.g. 'UC140' or 'CC1'), or None when the filename
        contains no digits or cannot be processed
    """
    logger.debug(f"Extracting requirement ID from filename: {filename} with prefix: {prefix}")

    try:
        # Drop the extension, then keep only the digit characters.
        stem, _extension = os.path.splitext(filename)
        digits = ''.join([ch for ch in stem if str.isdigit(ch)])

        if digits:
            formatted_id = f"{prefix}{digits}"
            logger.debug(f"Extracted ID: {formatted_id}")
            return formatted_id

        logger.warning(f"No numeric ID found in filename: {filename}")
        return None

    except Exception as e:
        logger.error(f"Error extracting requirement ID from {filename}: {e}")
        return None

def process_requirements(source_dir, prefix, output_file, collection_id, collection_name, collection_desc):
    """
    Convert every ``.txt`` requirement file of a directory into one XML file.

    Files are visited in sorted name order. A file is skipped (with a
    warning) when no ID can be derived from its name or when its processed
    content is empty/None. The output is written only if the generated
    document validates as well-formed XML.

    Args:
        source_dir: Directory containing the requirement files
        prefix: ID prefix passed to extract_requirement_id
        output_file: Path of the XML file to create
        collection_id: Collection <id> text
        collection_name: Collection <name> text
        collection_desc: Collection <description> text

    Raises:
        Exception: Any error during processing is logged and re-raised
    """
    logger.info(f"Processing requirements from directory: {source_dir}")
    logger.info(f"Creating output file: {output_file}")

    xml_declaration = '<?xml version="1.0" encoding="utf-8"?>\n'

    # Create XML structure
    root = create_xml_structure(collection_id, collection_name, collection_desc)
    artifacts_elem = root.find("artifacts")

    try:
        for filename in sorted(os.listdir(source_dir)):
            if not filename.endswith('.txt'):
                continue

            filepath = os.path.join(source_dir, filename)
            logger.debug(f"Processing file: {filepath}")

            # Derive the prefixed requirement ID from the file name.
            req_id = extract_requirement_id(filename, prefix)
            if not req_id:
                logger.warning(f"Skipping file due to invalid ID format: {filepath}")
                continue

            # Read and clean the file content.
            content = process_requirement_file(filepath)
            if not content:
                logger.warning(f"Skipping file due to processing error: {filepath}")
                continue

            add_artifact(artifacts_elem, req_id, content)

        # Serialize, then validate the exact document that would be written.
        xml_string = prettify_xml(root)
        document = xml_declaration + xml_string

        if not validate_xml(document.encode('utf-8')):
            logger.error("XML validation failed, file not written")
        else:
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(xml_declaration)
                f.write(xml_string)

            logger.info(f"Successfully created XML file: {output_file}")

    except Exception as e:
        logger.error(f"Error processing requirements: {e}")
        raise

def main():
    """Generate the UC (source) and CC (target) artifact XML files."""
    logger.info("Starting requirements processing")

    collections = (
        # Use cases (UC)
        {
            "source_dir": "./uc",
            "prefix": "UC",
            "output_file": "eANCI-comments-functions-sourceArtifacts.xml",
            "collection_id": "UC",
            "collection_name": "eANCI Source Artifacts",
            "collection_desc": "Use cases",
        },
        # Class codes (CC)
        {
            "source_dir": "./cc",
            "prefix": "CC",
            "output_file": "eANCI-comments-functions-targetArtifacts.xml",
            "collection_id": "CC",
            "collection_name": "eANCI Target Artifacts",
            "collection_desc": "Class code",
        },
    )

    # Processed in order: use cases first, then class code.
    for settings in collections:
        process_requirements(**settings)

    logger.info("Requirements processing completed")

# Run the conversion when executed as a script or notebook cell.
if __name__ == "__main__":
    main()