# XML XPath Extractor

This notebook processes the XML files found in an input directory, extracts their XPaths (either unique paths only or every occurrence), and saves the results to an output directory, optionally together with a detailed statistical analysis.

In [None]:
from __future__ import annotations

import xml.etree.ElementTree as ET
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# --- Notebook configuration ---------------------------------------------------
# Folder handed to XPathExtractor as the location of the XML files to analyse.
INPUT_FOLDER_PATH: Path = Path("../../../../tests/test_data/test_eform_notices/")
# Root folder under which the timestamped results directories are created.
OUTPUT_FOLDER_PATH: Path = Path("../../../../output")
# True: each XPath is recorded once; False: every occurrence is recorded.
EXTRACT_UNIQUE_XPATHS: bool = True
# True: save_results() also writes the statistics CSVs and the summary report.
GENERATE_STATISTICS: bool = True

# Fail fast, before any output directory is created, when the input is missing.
if not INPUT_FOLDER_PATH.exists():
    raise FileNotFoundError(f"Input folder does not exist: {INPUT_FOLDER_PATH}")

OUTPUT_FOLDER_PATH.mkdir(parents=True, exist_ok=True)
STATS_FOLDER_PATH = OUTPUT_FOLDER_PATH / 'statistics'
STATS_FOLDER_PATH.mkdir(parents=True, exist_ok=True)

# Names of every entry (files and sub-folders alike) found in the input folder,
# printed as a quick sanity check of the configured paths.
input_folders = [item.name for item in INPUT_FOLDER_PATH.iterdir()]
print(input_folders)
print(STATS_FOLDER_PATH)

In [None]:
def get_all_paths(element: ET.Element,
                  current_path: str = "",
                  paths: Optional[List[str]] = None,
                  *,
                  unique: Optional[bool] = None) -> List[str]:
    """Collect the slash-separated paths of an element tree in document order.

    Every element contributes ``parent/child`` and every attribute contributes
    ``parent/child/@attr``.  Namespace prefixes in ElementTree's ``{uri}local``
    notation are removed from element tags *and* attribute names, so that a
    namespace URI (which usually contains ``/``) can never leak into a path
    and confuse code that later splits paths on ``/``.

    Args:
        element: Root of the (sub)tree to walk.
        current_path: Path of the parent of ``element``; empty for a root.
        paths: Optional list to append to.  It is mutated in place and
            returned; a new list is created when omitted.
        unique: ``True`` records each distinct path once, ``False`` records
            every occurrence.  ``None`` (default) uses the module-level
            ``EXTRACT_UNIQUE_XPATHS`` setting.

    Returns:
        The list of collected paths (the same object as ``paths`` if given).
    """
    if paths is None:
        paths = []
    if unique is None:
        # Fall back to the notebook-wide switch when the caller does not choose.
        unique = EXTRACT_UNIQUE_XPATHS

    # Set mirror of ``paths`` so each duplicate check is O(1) instead of a
    # scan over the whole result list.
    seen = set(paths) if unique else None

    def local_name(qualified: str) -> str:
        # ElementTree spells namespaced names as "{uri}local".
        return qualified.split('}')[1] if '}' in qualified else qualified

    def record(path: str) -> None:
        if seen is None:
            paths.append(path)
        elif path not in seen:
            seen.add(path)
            paths.append(path)

    # Explicit stack instead of recursion: deeply nested documents cannot hit
    # the interpreter's recursion limit.
    pending = [(element, current_path)]
    while pending:
        node, parent_path = pending.pop()
        tag = node.tag
        if not isinstance(tag, str):
            # Comments and processing instructions carry a callable as tag.
            continue

        name = local_name(tag)
        node_path = f"{parent_path}/{name}" if parent_path else name
        record(node_path)

        for attr in node.attrib:
            record(f"{node_path}/@{local_name(attr)}")

        # Push children reversed so they are popped in document order.
        pending.extend((child, node_path) for child in reversed(list(node)))

    return paths

In [None]:
def process_xml_files(input_dir: Path) -> Tuple[List[str], Dict[str, List[str]]]:
    """Extract the paths of every ``*.xml`` file directly inside ``input_dir``.

    Files are processed in sorted order so that the resulting lists (and the
    CSV files derived from them) are reproducible across runs and platforms.
    A file that cannot be parsed is reported on stdout and skipped; it does
    not appear in the returned mapping.

    Args:
        input_dir: Directory that is searched (non-recursively) for XML files.

    Returns:
        A tuple ``(all_paths, file_path_map)``.  ``all_paths`` is the combined
        path list over all files (de-duplicated when ``EXTRACT_UNIQUE_XPATHS``
        is true); ``file_path_map`` maps each file name to the path list
        produced for that file.
    """
    all_paths: List[str] = []
    file_path_map: Dict[str, List[str]] = {}
    # Set mirror of ``all_paths`` for O(1) duplicate checks in unique mode.
    seen_paths = set()

    xml_files = sorted(input_dir.glob('*.xml'))

    print(f"Found {len(xml_files)} XML files to process")
    print(f"{'Unique' if EXTRACT_UNIQUE_XPATHS else 'All'} XPaths will be extracted")

    for xml_file in tqdm(xml_files, desc="Processing XML files"):
        try:
            root = ET.parse(xml_file).getroot()
            file_paths = get_all_paths(root)
        except Exception as e:
            # Deliberate best effort: one broken file must not abort the batch.
            print(f"Error processing {xml_file.name}: {e}")
            continue

        file_path_map[xml_file.name] = file_paths

        if EXTRACT_UNIQUE_XPATHS:
            for path in file_paths:
                if path not in seen_paths:
                    seen_paths.add(path)
                    all_paths.append(path)
        else:
            all_paths.extend(file_paths)

    return all_paths, file_path_map

In [None]:
class XPathStatisticsAnalyzer:
    """Derive tabular statistics from extracted XPaths.

    Args:
        all_paths: Combined list of paths over all files (may contain
            duplicates; each duplicate counts as one occurrence).
        file_path_mapping: Mapping of file name to the list of paths that were
            extracted from that file.

    Every ``get_*`` method returns a ``pandas.DataFrame`` that always carries
    its full set of columns, even when it has no rows, so that consumers can
    index columns without first checking for emptiness.
    """

    def __init__(self, all_paths: List[str], file_path_mapping: Dict[str, List[str]]):
        self.all_paths = all_paths
        self.file_path_mapping = file_path_mapping
        # Distinct paths in first-seen order.
        self.unique_paths = list(dict.fromkeys(all_paths))

    def _occurrence_counts(self) -> Counter:
        """Return how often each path occurs in ``all_paths`` (one pass)."""
        return Counter(self.all_paths)

    def _file_counts(self) -> Counter:
        """Return, per path, the number of files whose path list contains it."""
        counts = Counter()
        for paths in self.file_path_mapping.values():
            # set(): a path repeated within one file still counts as one file.
            counts.update(set(paths))
        return counts

    @staticmethod
    def _element_steps(xpath: str) -> List[str]:
        """Return the non-empty, non-attribute steps of ``xpath``."""
        return [step for step in xpath.split('/') if step and not step.startswith('@')]

    def get_path_depth_stats(self) -> pd.DataFrame:
        """Return one row per unique path with its depth and occurrence count.

        The depth is the number of element steps; a trailing ``@attr`` step is
        not counted.
        """
        occurrences = self._occurrence_counts()
        return pd.DataFrame({
            'xpath': self.unique_paths,
            'depth': [len(self._element_steps(path)) for path in self.unique_paths],
            'occurrence_count': [occurrences[path] for path in self.unique_paths],
        })

    def get_attribute_analysis(self) -> pd.DataFrame:
        """Return one row per unique attribute path (``element/@attribute``)."""
        occurrences = self._occurrence_counts()
        file_counts = self._file_counts()

        attr_data = []
        for path in self.unique_paths:
            pieces = path.split('/@')
            if len(pieces) < 2:
                # Not an attribute path.
                continue
            attr_data.append({
                'element_path': pieces[0],
                'attribute': pieces[1],
                'occurrence_count': occurrences[path],
                'files_count': file_counts[path],
            })
        return pd.DataFrame(
            attr_data,
            columns=['element_path', 'attribute', 'occurrence_count', 'files_count'],
        )

    def get_path_patterns(self) -> pd.DataFrame:
        """Return structural facts (element count, terminal element, ...) per unique path."""
        occurrences = self._occurrence_counts()
        file_counts = self._file_counts()

        pattern_data = []
        for path in self.unique_paths:
            elements = self._element_steps(path)
            pattern_data.append({
                'xpath': path,
                'element_count': len(elements),
                'has_attributes': '@' in path,
                'terminal_element': elements[-1] if elements else '',
                'occurrence_count': occurrences[path],
                'files_count': file_counts[path],
            })
        return pd.DataFrame(
            pattern_data,
            columns=['xpath', 'element_count', 'has_attributes',
                     'terminal_element', 'occurrence_count', 'files_count'],
        )

    def get_file_complexity_metrics(self) -> pd.DataFrame:
        """Return size/depth metrics and a complexity score per file.

        Depth here is the number of ``/``-separated steps of a path, so an
        attribute step counts as a level (unlike ``get_path_depth_stats``).
        The complexity score is ``unique path count * average depth``.  A file
        with no paths gets zeros instead of raising.
        """
        metrics_data = []
        for file, paths in self.file_path_mapping.items():
            unique_paths = list(dict.fromkeys(paths))
            depths = [len(p.split('/')) for p in unique_paths]
            avg_depth = np.mean(depths) if depths else 0.0
            metrics_data.append({
                'file_name': file,
                'total_paths': len(paths),
                'unique_paths': len(unique_paths),
                'max_depth': max(depths, default=0),
                'avg_depth': avg_depth,
                'attribute_count': sum(1 for p in unique_paths if '@' in p),
                'complexity_score': len(unique_paths) * avg_depth,
            })
        return pd.DataFrame(
            metrics_data,
            columns=['file_name', 'total_paths', 'unique_paths', 'max_depth',
                     'avg_depth', 'attribute_count', 'complexity_score'],
        )

    def get_path_relationships(self) -> pd.DataFrame:
        """Return parent/child pairs found among the ancestors of each unique path.

        For every unique path, each proper prefix pair ``(parts[:i],
        parts[:i + 1])`` is emitted, except the pair whose child is the path
        itself.  ``level_difference`` holds ``i``, i.e. the number of steps in
        the parent.
        """
        # NOTE(review): the ``child != path`` filter drops the relationship
        # between a path and its direct parent and repeats ancestor pairs once
        # per descendant.  Behaviour is kept as-is; confirm it is intended.
        occurrences = self._occurrence_counts()

        relationships = []
        for path in self.unique_paths:
            parts = path.split('/')
            for i in range(1, len(parts)):
                parent = '/'.join(parts[:i])
                child = '/'.join(parts[:i + 1])
                if parent and child != path:
                    relationships.append({
                        'parent_path': parent,
                        'child_path': child,
                        'level_difference': i,
                        'parent_occurrence': occurrences[parent],
                        'child_occurrence': occurrences[child],
                    })
        return pd.DataFrame(
            relationships,
            columns=['parent_path', 'child_path', 'level_difference',
                     'parent_occurrence', 'child_occurrence'],
        )

In [None]:
class XPathExtractor:
    """Extract XPaths from a folder of XML files and persist the results.

    ``process()`` runs the whole pipeline: it calls ``process_xml_files`` on
    the input folder, writes the CSV outputs (and, when ``GENERATE_STATISTICS``
    is true, the statistics files and a text summary) into a timestamped
    ``results_<timestamp>`` directory below the output folder, and prints a
    short summary.

    Args:
        input_path: Folder containing the XML files.
        output_path: Folder below which the results directory is created.
    """

    def __init__(self, input_path: Path, output_path: Path):
        self.input_path = input_path
        self.output_path = output_path
        self.stats_path = output_path / 'statistics'
        self.all_paths: List[str] = []
        self.file_path_mapping: Dict[str, List[str]] = {}
        # Fixed once per instance so that every output of a run shares a folder.
        self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    def save_results(self) -> None:
        """Write the path list, the path/file matrix and optional statistics."""
        results_dir = self.output_path / f'results_{self.timestamp}'
        results_dir.mkdir(parents=True, exist_ok=True)

        paths_df = pd.DataFrame(self.all_paths, columns=['xpath'])
        if EXTRACT_UNIQUE_XPATHS:
            output_name = 'unique_xpaths.csv'
        else:
            output_name = 'all_xpaths.csv'
            paths_df['occurrence_count'] = paths_df['xpath'].map(
                paths_df['xpath'].value_counts()
            )
        paths_df.to_csv(results_dir / output_name, index=False)

        # Count each file's paths once instead of calling list.count() for
        # every (xpath, file) cell of the matrix.
        per_file_counts = {
            file: Counter(file_paths)
            for file, file_paths in self.file_path_mapping.items()
        }
        matrix_data = [
            {'xpath': xpath,
             **{file: counts[xpath] for file, counts in per_file_counts.items()}}
            for xpath in sorted(set(self.all_paths))
        ]
        # Explicit columns keep the header present even when there are no rows.
        matrix_df = pd.DataFrame(matrix_data, columns=['xpath', *per_file_counts])
        matrix_df.to_csv(results_dir / 'xpath_file_matrix.csv', index=False)

        if GENERATE_STATISTICS:
            stats_dir = results_dir / 'statistics'
            stats_dir.mkdir(parents=True, exist_ok=True)

            analyzer = XPathStatisticsAnalyzer(self.all_paths, self.file_path_mapping)

            stats_files = {
                'depth_analysis.csv': analyzer.get_path_depth_stats(),
                'attribute_analysis.csv': analyzer.get_attribute_analysis(),
                'path_patterns.csv': analyzer.get_path_patterns(),
                'file_complexity.csv': analyzer.get_file_complexity_metrics(),
                'path_relationships.csv': analyzer.get_path_relationships()
            }

            for filename, df in stats_files.items():
                df.to_csv(stats_dir / filename, index=False)

            self._generate_summary_report(stats_dir, stats_files)

        print(f"Results saved to: {results_dir}")

    def _generate_summary_report(self, stats_dir: Path, stats_files: Dict[str, pd.DataFrame]) -> None:
        """Write ``summary_report.txt`` into ``stats_dir``.

        Args:
            stats_dir: Existing directory that receives the report.
            stats_files: Statistics tables keyed by file name; the entries
                ``depth_analysis.csv``, ``attribute_analysis.csv`` and
                ``file_complexity.csv`` are read.

        Empty attribute or complexity tables (no attributes found, no files
        processed) are reported as such instead of raising.
        """
        depth_stats = stats_files['depth_analysis.csv']
        attr_stats = stats_files['attribute_analysis.csv']
        complexity = stats_files['file_complexity.csv']

        # An attribute table without rows may come without any columns at all.
        if 'attribute' in attr_stats.columns:
            top_attributes = attr_stats['attribute'].value_counts().head().index.tolist()
        else:
            top_attributes = []

        if complexity.empty:
            most_complex = 'N/A'
            average_score = 'N/A'
        else:
            most_complex = complexity.loc[complexity['complexity_score'].idxmax(), 'file_name']
            average_score = f"{complexity['complexity_score'].mean():.2f}"

        # Explicit encoding: file names in the report may be non-ASCII and the
        # platform default encoding is not guaranteed to handle them.
        with open(stats_dir / 'summary_report.txt', 'w', encoding='utf-8') as f:
            f.write("XPath Analysis Summary Report\n")
            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

            f.write(f"Total number of XML files: {len(self.file_path_mapping)}\n")
            f.write(f"Total XPaths found: {len(self.all_paths)}\n")
            f.write(f"Unique XPaths: {len(set(self.all_paths))}\n\n")

            f.write("Depth Statistics:\n")
            f.write(f"Maximum depth: {depth_stats['depth'].max()}\n")
            f.write(f"Average depth: {depth_stats['depth'].mean():.2f}\n\n")

            f.write("Attribute Statistics:\n")
            f.write(f"Total attributes found: {len(attr_stats)}\n")
            f.write(f"Most common attributes: {', '.join(top_attributes)}\n\n")

            f.write("File Complexity:\n")
            f.write(f"Most complex file: {most_complex}\n")
            f.write(f"Average complexity score: {average_score}\n")

    def display_statistics(self) -> None:
        """Print a summary of the current results to stdout."""
        print("=== Analysis Results ===")
        print(f"Results directory: {self.output_path}/results_{self.timestamp}")
        print("\nBasic Statistics:")
        print(f"- Total XML files processed: {len(self.file_path_mapping)}")
        print(f"- Total {'unique ' if EXTRACT_UNIQUE_XPATHS else ''}XPaths found: {len(self.all_paths)}")

        if not GENERATE_STATISTICS:
            return

        analyzer = XPathStatisticsAnalyzer(self.all_paths, self.file_path_mapping)

        depth_stats = analyzer.get_path_depth_stats()
        print("\nDepth Statistics:")
        print(f"- Maximum depth: {depth_stats['depth'].max()}")
        print(f"- Average depth: {depth_stats['depth'].mean():.2f}")

        attr_stats = analyzer.get_attribute_analysis()
        if not attr_stats.empty:
            print("\nAttribute Statistics:")
            print(f"- Total attributes found: {len(attr_stats)}")
            print("- Top 5 most common attributes:")
            for attr, count in attr_stats['attribute'].value_counts().head().items():
                print(f"  - {attr}: {count} occurrences")

        complexity = analyzer.get_file_complexity_metrics()
        # Nothing to rank when no file was processed successfully.
        if not complexity.empty:
            print("\nFile Complexity Metrics:")
            most_complex = complexity.loc[complexity['complexity_score'].idxmax()]
            print(f"- Most complex file: {most_complex['file_name']}")
            print(f"  - Complexity score: {most_complex['complexity_score']:.2f}")
            print(f"  - Unique paths: {most_complex['unique_paths']}")
            print(f"  - Max depth: {most_complex['max_depth']}")

    def process(self) -> None:
        """Extract the paths, save all outputs and print the summary."""
        self.all_paths, self.file_path_mapping = process_xml_files(
            self.input_path
        )
        self.save_results()
        self.display_statistics()

## Execute Processing

In [None]:
# Build the extractor from the notebook-level folder settings and run the whole
# pipeline: process() extracts the paths, saves the results and prints a summary.
extractor = XPathExtractor(INPUT_FOLDER_PATH, OUTPUT_FOLDER_PATH)
extractor.process()