In [11]:
import logging
from pathlib import Path
from typing import List

from tqdm import tqdm

from ted_data_sampler.core.adapters.XPathValidator import XPATHValidator
from ted_data_sampler.core.services.import_eforms_fields import extract_xpaths_by_sdk_version



In [12]:
def generate_coverage(notices_folder_path: Path, xpaths_to_cover: List[str], sdk_version: str):
    """Report which XPaths match at least one XML notice found under a folder.

    Every ``*.xml`` file found recursively below ``notices_folder_path`` is loaded
    into an ``XPATHValidator``. Each XPath that is not covered yet is evaluated
    against it and counts as covered as soon as one notice yields a non-empty
    result. Scanning stops early once every distinct XPath is covered.

    The following files are (re)written inside ``notices_folder_path``:

    * ``stats.txt`` - counters, the number of notices needed for full coverage
      (only when full coverage is reached) and the final progress-bar line;
    * ``covered_xpaths.txt`` - covered XPaths, in the order they were first matched;
    * ``uncovered_xpaths.txt`` - XPaths that matched no notice;
    * ``xpaths_to_cover.txt`` - the de-duplicated input XPaths.

    Args:
        notices_folder_path: Folder that is searched recursively for ``*.xml`` notices
            and that receives the report files.
        xpaths_to_cover: XPath expressions to look for; duplicates are ignored.
        sdk_version: Label used in the progress-bar description and in ``stats.txt``.

    Raises:
        Exception: Whatever ``XPATHValidator.validate`` raises. The failing XPath and
            notice are appended to ``stats.txt`` before the exception is re-raised;
            the three XPath list files are not written in that case.
    """
    stats_file_path = notices_folder_path / "stats.txt"

    requested_count = len(xpaths_to_cover)
    # dict.fromkeys drops duplicates but keeps first-seen order, so the generated
    # files are stable between runs (set() ordering of str varies per process).
    xpaths_to_cover = list(dict.fromkeys(xpaths_to_cover))
    notices_path = list(notices_folder_path.rglob('*.xml'))

    covered_xpaths: List[str] = []  # keeps discovery order for the output file
    covered_lookup = set()          # constant-time "already covered?" checks

    # Mode "w" truncates any previous report; the context manager guarantees the
    # handle is flushed and closed even when validation raises.
    with stats_file_path.open("w") as stats_file:
        stats_file.write(f"Xpaths to cover: {requested_count}\n")
        stats_file.write(f"Distinct Xpaths to cover: {len(xpaths_to_cover)}\n")
        stats_file.write(f"Nr. of eform notices in {sdk_version} sdk version folder: {len(notices_path)}\n")

        pbar = tqdm(total=len(xpaths_to_cover), desc=f"The coverage of XPaths for {sdk_version}\tsdk version queried only in sampled notices ({len(notices_path)})\twith the same version", dynamic_ncols=True)
        try:
            for idx, notice_path in enumerate(notices_path):
                validator = XPATHValidator(xml_content=notice_path.read_text(), logger=logging.getLogger())
                for xpath in xpaths_to_cover:
                    if xpath in covered_lookup:
                        continue
                    try:
                        result = validator.validate(xpath_expression=xpath)
                        is_match = len(result) > 0
                    except Exception as e:
                        stats_file.write(f"Something went wrong with: {e}\n")
                        stats_file.write(f"Xpath: {xpath}\n")
                        stats_file.write(f"id: {idx} | File: {notice_path}\n")
                        raise
                    if is_match:
                        covered_lookup.add(xpath)
                        covered_xpaths.append(xpath)
                        pbar.update(1)

                # Compare counts (not an int with a list); idx is 0-based, hence + 1.
                if xpaths_to_cover and len(covered_xpaths) == len(xpaths_to_cover):
                    stats_file.write(f"Covered in: {idx + 1} notices\n")
                    break  # nothing left to look for in the remaining notices
        finally:
            pbar.close()

        # Render the final state of the bar as plain text for the report.
        progress_bar_str = tqdm.format_meter(
            n=pbar.n,  # Current iteration count
            total=pbar.total,  # Total iterations
            elapsed=pbar.format_dict["elapsed"],  # Elapsed time
            ncols=pbar.ncols,  # Number of columns
            prefix=pbar.desc  # Prefix (if any)
        )

        uncovered_xpaths = [xpath for xpath in xpaths_to_cover if xpath not in covered_lookup]
        stats_file.write(f"Nr. of covered xpaths: {len(covered_xpaths)}\n")
        stats_file.write(f"Uncovered xpaths: {len(uncovered_xpaths)}\n")
        stats_file.write(f"{progress_bar_str}\n")

    (notices_folder_path / "uncovered_xpaths.txt").write_text("\n".join(uncovered_xpaths))
    (notices_folder_path / "covered_xpaths.txt").write_text("\n".join(covered_xpaths))
    (notices_folder_path / "xpaths_to_cover.txt").write_text("\n".join(xpaths_to_cover))

In [13]:

# Root folder of the sampled notices; per-version samples are looked up in
# "eforms-sdk-<major>.<minor>" sub-folders directly below it.
input_folder = Path("/mnt/c/Users/user/Desktop/data_samples_eforms_575_group_by_sdk_version_notice_type_notice_subtype")
xpaths_versions: List[str] = [
    "1.3.0",
    "1.4.0",
    "1.5.0",
    "1.6.0",
    "1.7.0",
    "1.8.0",
    "1.9.1",
    "1.10.0",
    "1.11.0",
    "1.12.0",
    "1.13.0-rc.3",
]

assert input_folder.is_dir()

all_xpaths = []
for xpath_version in xpaths_versions:
    # Only the first two dot-separated components of the version name the folder.
    samples_folder_name = input_folder / ("eforms-sdk-" + ".".join(xpath_version.split(".")[:2]))
    if not samples_folder_name.is_dir():
        # No samples for this SDK version: nothing to measure.
        continue
    xpaths: List[str] = extract_xpaths_by_sdk_version(xpath_version)
    coverage_result = generate_coverage(samples_folder_name, xpaths, xpath_version)
    all_xpaths += xpaths

# Final pass: the union of every version's XPaths against all notices under the root.
all_xpaths = list(set(all_xpaths))
generate_coverage(input_folder, all_xpaths, "all")

The coverage of XPaths for 1.3.0	sdk version queried only in sampled notices (33)	with the same version:  35%|███▌      | 343/971 [00:03<00:05, 111.81it/s]
The coverage of XPaths for 1.5.0	sdk version queried only in sampled notices (10)	with the same version:  28%|██▊       | 273/984 [00:00<00:01, 454.58it/s] 
The coverage of XPaths for 1.6.0	sdk version queried only in sampled notices (39)	with the same version:  43%|████▎     | 418/982 [00:02<00:02, 192.03it/s] 
The coverage of XPaths for 1.7.0	sdk version queried only in sampled notices (98)	with the same version:  68%|██████▊   | 670/986 [00:04<00:01, 165.55it/s] 
The coverage of XPaths for 1.8.0	sdk version queried only in sampled notices (70)	with the same version:  52%|█████▏    | 536/1040 [00:03<00:03, 134.85it/s] 
The coverage of XPaths for 1.9.1	sdk version queried only in sampled notices (81)	with the same version:  61%|██████    | 920/1507 [00:06<00:03, 151.62it/s] 
The coverage of XPaths for 1.10.0	sdk version queried onl