In [1]:
# Cell 1: eTOUR/eTOUR/00_answerSet_update.ipynb
# Imports and Setup

import functools
import logging
import os
import re
import xml.etree.ElementTree as ET

# Configure logging once for the whole notebook: DEBUG and above, with the
# timestamp, logger name and level prepended to every message.
LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.DEBUG)

# Named logger used by all functions defined in the cells below.
logger = logging.getLogger('answer_set_updater')

In [2]:
# Cell 2: Helper Functions

def handle_exception(func):
    """
    Decorator that logs any exception raised by ``func`` and re-raises it.

    The wrapper is created with ``functools.wraps`` so the decorated function
    keeps its own ``__name__``, ``__doc__`` and other metadata instead of
    appearing as ``wrapper`` in tracebacks, ``help()`` and introspection.

    Args:
        func: The function to wrap with exception handling

    Returns:
        The wrapped function with exception handling

    Raises:
        Exception: Whatever ``func`` raises; the error is logged and then
            propagated unchanged to the caller.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Log for visibility, then re-raise so the caller still sees the failure.
            logger.error(f"Error in {func.__name__}: {str(e)}")
            raise
    return wrapper

def get_cc_mapping(cc_dir="./CC"):
    """
    Create a mapping from numeric CC IDs to filename-based CC IDs.

    The ``.txt`` files (case-insensitive extension) found directly in
    ``cc_dir`` are sorted by name and numbered from 1, so the i-th file
    ``Name.txt`` yields the entry ``"CC<i>" -> "CC_Name"``.

    Args:
        cc_dir (str): Directory to scan. Defaults to ``"./CC"``, the location
            that was previously hard-coded.

    Returns:
        dict: Mapping from numeric IDs to filename-based IDs. Empty when
        ``cc_dir`` is not an existing directory or cannot be listed.
    """
    logger.info("Building CC ID mapping")

    # isdir (rather than exists) so that a regular file with this name is
    # reported as a missing directory instead of failing inside listdir.
    if not os.path.isdir(cc_dir):
        logger.warning(f"CC directory not found: {cc_dir}")
        return {}

    try:
        # Sorting fixes the numbering, so the result does not depend on the
        # order in which the OS happens to return directory entries.
        cc_files = sorted(f for f in os.listdir(cc_dir) if f.lower().endswith('.txt'))
    except OSError as e:
        # Best effort: an unreadable directory yields an empty mapping.
        logger.error(f"Error building CC mapping: {str(e)}")
        return {}

    cc_mapping = {}
    for i, filename in enumerate(cc_files, 1):
        base_name = os.path.splitext(filename)[0]
        cc_mapping[f"CC{i}"] = f"CC_{base_name}"
        logger.debug(f"Mapped CC{i} to CC_{base_name}")

    logger.info(f"Created mapping for {len(cc_mapping)} CC files")
    return cc_mapping

In [3]:
# Cell 3: Updated Content Sanitization Functions with Special Character Fix

# Single-character substitutions applied before the XML validity check.
# Each character is listed exactly once; none of the replacement strings
# contains a character that is itself a key, so a single translate() pass
# gives the same result as applying the replacements one after another.
_CHAR_REPLACEMENTS = str.maketrans({
    '\u2018': "'",    # Left single quote
    '\u2019': "'",    # Right single quote
    '\u201C': '"',    # Left double quote
    '\u201D': '"',    # Right double quote
    '\u2013': '-',    # En dash
    '\u2014': '--',   # Em dash
    '\u2026': '...',  # Ellipsis
    '\u00A0': ' ',    # Non-breaking space
    '\u00BA': ' ',    # Masculine ordinal indicator
    '\u00E8': 'e',    # e with grave
    '\u00E9': 'e',    # e with acute
    '\u00EC': 'i',    # i with grave
    '\u00F2': 'o',    # o with grave
    '\u00F9': 'u',    # u with grave
    '\u00BF': '?',    # Inverted question mark
    '\u00A1': '!',    # Inverted exclamation mark
    # Bullet-like symbols, all normalised to a hyphen
    '\u00B0': '-',    # Degree sign
    '\u00B7': '-',    # Middle dot
    '\u2022': '-',    # Bullet
    '\u2023': '-',    # Triangular bullet
    '\u25E6': '-',    # White bullet
    '\u2043': '-',    # Hyphen bullet
    '\u204C': '-',    # Black leftwards bullet
    '\u204D': '-',    # Black rightwards bullet
    '\u2219': '-',    # Bullet operator
    '\u25D8': '-',    # Inverse bullet
    '\u25D9': '-',    # Inverse white circle
    '\u25AA': '-',    # Black small square
    '\u25AB': '-',    # White small square
    '\u25FE': '-',    # Black medium small square
    '\u25FD': '-',    # White medium small square
    # Catch-all for text that was already mis-decoded upstream
    '\uFFFD': '-',    # Unicode replacement character
})

# Matches every character that is NOT allowed by the XML 1.0 "Char" production
# (#x9 | #xA | #xD | #x20-#xD7FF | #xE000-#xFFFD | #x10000-#x10FFFF).
# The noncharacter range U+FDD0-U+FDEF is additionally treated as invalid.
_INVALID_XML_CHARS = re.compile(
    r'[^\t\n\r\x20-\uD7FF\uE000-\uFDCF\uFDF0-\uFFFD\U00010000-\U0010FFFF]'
)


def sanitize_content(content):
    """
    Sanitize content to ensure XML/JSON compatibility.

    Steps, in order:
      1. Replace typographic quotes, dashes, bullets, a few accented letters
         and the Unicode replacement character with plain ASCII equivalents.
      2. Replace every character that is not valid in XML 1.0 (control
         characters, lone surrogates, U+FDD0-U+FDEF, U+FFFE/U+FFFF) with a
         space.
      3. Collapse runs of spaces, drop spaces at the start of a line and
         reduce three or more consecutive line breaks to a single blank line.

    Args:
        content (str): The raw text content to sanitize. ``None`` or an empty
            string yields an empty string.

    Returns:
        str: Sanitized content with problematic characters replaced and
        leading/trailing whitespace stripped
    """
    logger.debug("Sanitizing content")

    if not content:
        return ""

    # Step 1: known problematic characters -> ASCII equivalents
    sanitized = content.translate(_CHAR_REPLACEMENTS)

    # Step 2: anything still not representable in XML 1.0 becomes a space
    sanitized = _INVALID_XML_CHARS.sub(' ', sanitized)

    # Step 3: remove multiple spaces while preserving newlines
    sanitized = re.sub(r' +', ' ', sanitized)
    sanitized = re.sub(r'\n +', '\n', sanitized)

    # Clean up empty lines while preserving paragraph structure
    sanitized = re.sub(r'\n\s*\n\s*\n+', '\n\n', sanitized)

    logger.debug("Content sanitization completed")
    return sanitized.strip()

In [4]:
# Cell 4: Transformation Function

@handle_exception
def transform_answer_set(input_file, output_file):
    """
    Rewrite the artifact IDs of every ``link`` in an answer set XML file.

    For each link:
    - a ``source_artifact_id`` starting with ``EA`` gets that prefix replaced
      by ``UC``;
    - a ``target_artifact_id`` starting with ``EA`` is turned into ``CC<rest>``
      and, when that key exists in the mapping from ``get_cc_mapping()``,
      replaced by the mapped ``CC_<filename>`` value;
    - any other non-empty target that does not already start with ``CC_`` gets
      the ``CC_`` prefix prepended;
    - a ``confidence_score`` child with text ``1`` is appended when the link
      has none.

    The result is written to ``output_file`` as UTF-8 with an XML declaration,
    and counts of the changes are logged.

    Args:
        input_file (str): Path to input XML file
        output_file (str): Path to output XML file
    """
    logger.info(f"Transforming answer set from {input_file} to {output_file}")

    # Numeric ID ("CC<n>") -> filename-based ID ("CC_<name>")
    id_lookup = get_cc_mapping()

    document = ET.parse(input_file)
    all_links = document.getroot().findall('.//link')

    # Change counters reported at the end
    n_sources = 0
    n_targets = 0
    n_confidence = 0

    for entry in all_links:
        src_elem = entry.find('source_artifact_id')
        tgt_elem = entry.find('target_artifact_id')

        # Source: EA<rest> -> UC<rest>
        src_before = src_elem.text if src_elem is not None else None
        if src_before and src_before.startswith('EA'):
            src_elem.text = 'UC' + src_before[2:]
            logger.debug(f"Updated source: {src_before} -> {src_elem.text}")
            n_sources += 1

        # Target: EA<rest> -> mapped CC_<name> (or CC<rest>), otherwise ensure CC_ prefix
        tgt_before = tgt_elem.text if tgt_elem is not None else None
        if tgt_before:
            if tgt_before.startswith('EA'):
                numeric_key = 'CC' + tgt_before[2:]
                if numeric_key in id_lookup:
                    tgt_elem.text = id_lookup[numeric_key]
                    logger.debug(f"Updated target with mapping: {tgt_before} -> {tgt_elem.text}")
                else:
                    # No file-based name known: fall back to the numeric form
                    tgt_elem.text = numeric_key
                    logger.debug(f"Updated target without mapping: {tgt_before} -> {tgt_elem.text}")
                n_targets += 1
            elif not tgt_before.startswith('CC_'):
                tgt_elem.text = f"CC_{tgt_before}"
                logger.debug(f"Added CC_ prefix to target: {tgt_before} -> {tgt_elem.text}")
                n_targets += 1

        # Every link ends up with a confidence score; existing ones are kept
        if entry.find('confidence_score') is None:
            ET.SubElement(entry, 'confidence_score').text = '1'
            n_confidence += 1

    document.write(output_file, encoding='utf-8', xml_declaration=True)

    logger.info(f"Processed {len(all_links)} links")
    logger.info(f"Updated {n_sources} source IDs")
    logger.info(f"Updated {n_targets} target IDs")
    logger.info(f"Added confidence score to {n_confidence} links")
    logger.info(f"Successfully transformed XML and saved to {output_file}")

In [None]:
# Cell 5: Execution and Verification

@handle_exception
def verify_transformation(output_file, sample_size=5):
    """
    Verify the transformation by reading and displaying parts of the output file.

    Logs the total number of ``link`` elements and prints the source ID,
    target ID and confidence score of the first ``sample_size`` links. A link
    that lacks one of these child elements is shown with the placeholder
    ``<missing>`` instead of aborting the whole report.

    Args:
        output_file (str): Path to the transformed XML file
        sample_size (int): Number of links to print. Defaults to 5.

    Returns:
        None. When ``output_file`` does not exist an error is logged and
        nothing is printed.
    """
    logger.info(f"Verifying transformation in {output_file}")

    # Check if file exists
    if not os.path.exists(output_file):
        logger.error(f"Output file not found: {output_file}")
        return

    root = ET.parse(output_file).getroot()
    links = root.findall('.//link')

    # Display statistics
    logger.info(f"Total links in transformed file: {len(links)}")

    def child_text(link, tag):
        # find() returns None for an absent child; avoid AttributeError on .text
        child = link.find(tag)
        return child.text if child is not None else '<missing>'

    print("\nSample of transformed links:")
    for position, link in enumerate(links[:sample_size], 1):
        source = child_text(link, 'source_artifact_id')
        target = child_text(link, 'target_artifact_id')
        confidence = child_text(link, 'confidence_score')
        print(f"Link {position}: {source} -> {target} (confidence: {confidence})")

# Answer set to read and transformed file to write (relative paths)
input_file, output_file = 'answer_req_code.xml', 'eTOUR-answerSet.xml'

# Transform first, then print a sample of what was written
transform_answer_set(input_file, output_file)
verify_transformation(output_file)

In [None]:
# Cell 6: Additional Validation (Optional)

@handle_exception
def validate_against_artifacts(answer_set_file, source_artifacts_file, target_artifacts_file):
    """
    Validate that all IDs in the answer set exist in the source and target artifact files.

    IDs are read from ``source_artifact_id`` / ``target_artifact_id`` elements
    of the answer set and from ``artifact/id`` elements of the artifact files.
    Elements with no text are ignored, so an empty element can neither be
    reported as missing nor break the sorting of the report. Results are
    logged; at most the first 10 missing IDs (in sorted order) are listed per
    side.

    Args:
        answer_set_file (str): Path to the answer set XML file
        source_artifacts_file (str): Path to the source artifacts XML file
        target_artifacts_file (str): Path to the target artifacts XML file

    Returns:
        None. When any of the three files does not exist an error is logged
        and no validation is performed.
    """
    logger.info("Validating answer set against artifact files")

    # Check if files exist
    for file_path in [answer_set_file, source_artifacts_file, target_artifacts_file]:
        if not os.path.exists(file_path):
            logger.error(f"File not found: {file_path}")
            return

    # Parse XML files
    answer_tree = ET.parse(answer_set_file)
    source_tree = ET.parse(source_artifacts_file)
    target_tree = ET.parse(target_artifacts_file)

    def collect_ids(tree, path):
        # Skip elements without text: None cannot be ordered against str in sorted()
        return {elem.text for elem in tree.findall(path) if elem.text}

    # IDs referenced by the answer set that are absent from the artifact files
    missing_by_kind = {
        'source': collect_ids(answer_tree, './/source_artifact_id')
                  - collect_ids(source_tree, './/artifact/id'),
        'target': collect_ids(answer_tree, './/target_artifact_id')
                  - collect_ids(target_tree, './/artifact/id'),
    }

    # Report results, source side first
    for kind, missing_ids in missing_by_kind.items():
        if missing_ids:
            logger.warning(f"Found {len(missing_ids)} {kind} IDs in answer set that don't exist in {kind} artifacts")
            for artifact_id in sorted(missing_ids)[:10]:  # Show first 10
                logger.warning(f"Missing {kind} ID: {artifact_id}")
        else:
            logger.info(f"All {kind} IDs in answer set exist in {kind} artifacts")

    # Overall validation result
    if any(missing_by_kind.values()):
        logger.warning("Validation found issues with IDs in answer set")
    else:
        logger.info("Validation successful: All IDs in answer set exist in artifact files")

# Validate only when both artifact files are present
artifact_files = ('eTOUR-sourceArtifacts.xml', 'eTOUR-targetArtifacts.xml')
if all(os.path.exists(artifact_path) for artifact_path in artifact_files):
    validate_against_artifacts('eTOUR-answerSet.xml', *artifact_files)
else:
    logger.warning("Skipping validation: Artifact files not found")

In [None]:
# Cell 7: Additional Fix for Target Artifacts (if needed)

@handle_exception
def fix_target_artifacts_format(target_artifacts_file):
    """
    Ensure every ``artifact/id`` in the target artifacts XML starts with ``CC_``.

    IDs that have text and lack the prefix get ``CC_`` prepended. The file is
    rewritten in place (UTF-8, with XML declaration) only when at least one ID
    was changed; otherwise it is left untouched.

    Args:
        target_artifacts_file (str): Path to the target artifacts XML file

    Returns:
        None. When the file does not exist an error is logged and nothing
        else happens.
    """
    logger.info(f"Checking and fixing target artifacts format in {target_artifacts_file}")

    # Nothing to do without the file
    if not os.path.exists(target_artifacts_file):
        logger.error(f"Target artifacts file not found: {target_artifacts_file}")
        return

    document = ET.parse(target_artifacts_file)

    fixed_count = 0
    for node in document.getroot().findall('.//artifact/id'):
        current = node.text
        # Skip empty IDs and IDs that already carry the prefix
        if not current or current.startswith('CC_'):
            continue
        node.text = f"CC_{current}"
        logger.debug(f"Fixed artifact ID: {current} -> {node.text}")
        fixed_count += 1

    # Avoid rewriting the file when nothing changed
    if fixed_count == 0:
        logger.info("No fixes needed in target artifacts file")
        return

    logger.info(f"Fixed {fixed_count} artifact IDs in target artifacts file")
    document.write(target_artifacts_file, encoding='utf-8', xml_declaration=True)

# Apply the prefix fix only when the target artifacts file is present
if not os.path.exists('eTOUR-targetArtifacts.xml'):
    logger.warning("Target artifacts file not found, skipping fix")
else:
    fix_target_artifacts_format('eTOUR-targetArtifacts.xml')