In [13]:
import logging
import os
from logging import Logger
from pathlib import Path
from typing import List

from pydantic import BaseModel

from ted_data_sampler.core.adapters.XPathValidator import XPATHValidator

In [14]:
# Folder that is searched for notice XML files. The default is the absolute
# path this notebook was originally written against; set the
# TED_NOTICES_FOLDER_PATH environment variable to run the notebook against
# another checkout or machine without editing this cell.
NOTICES_FOLDER_PATH: Path = Path(
    os.environ.get(
        "TED_NOTICES_FOLDER_PATH",
        "/mnt/c/Users/user/Desktop/ted-data-sampler/tests/test_data/test_eform_notices",
    )
)

# XPath expressions evaluated against every notice; each one becomes a column
# of the tab-separated report printed at the end of the notebook.
XPATHS: List[str] = [
    "/*/cbc:ID",
    "/*/cbc:ID/@schemeName",
    "/*/cbc:ID[@schemeName='notice-id']",
    ".//ext:UBLExtensions/ext:UBLExtension/ext:ExtensionContent/efext:EformsExtension/efac:Publication/efbc:NoticePublicationID[@schemeName='ojs-notice-id']",
]

In [15]:
# Sort the discovered files: rglob yields entries in whatever order the
# filesystem returns them, and the report rows follow the order of this list,
# so sorting keeps the output reproducible across machines and runs.
NOTICE_PATHS: List[Path] = sorted(NOTICES_FOLDER_PATH.rglob("*.xml"))
print(len(NOTICE_PATHS))

10


In [16]:


class XPathQueryResult(BaseModel):
    """Values produced by evaluating one XPath expression."""
    xpath: str
    query_result: List[str]

    def __str__(self):
        """Return the values joined by commas, or ``[]`` when there are none."""
        return ",".join(self.query_result) if self.query_result else "[]"

class NoticeQueryResult(BaseModel):
    """Per-XPath query results collected for a single notice file."""
    file_path: Path
    xpath_query_results: List[XPathQueryResult]
    file_name: str

    def __str__(self):
        """Return one tab-separated row: the file name, then one cell per XPath result."""
        cells = "\t".join(map(str, self.xpath_query_results))
        return f"{self.file_name}\t{cells}"
    
        
class NoticeQuerySummary(BaseModel):
    """The queried XPaths together with the results of every notice."""
    xpaths: List[str]
    notices_query_result: List[NoticeQueryResult]

    def __str__(self):
        """Return only the tab-separated header row: ``Notice`` followed by each XPath."""
        header_cells = "\t".join(self.xpaths)
        return f"Notice\t{header_cells}"


def query_notices_with_given_xpaths(xpaths: List[str], notice_paths: List[Path], logger: Logger) -> NoticeQuerySummary:
    """Evaluate every XPath against every notice file and collect the values.

    Args:
        xpaths: XPath expressions to evaluate; stored on the returned summary.
        notice_paths: Notice XML files to query, processed in the given order.
        logger: Handed to ``XPATHValidator`` and used to report XPaths whose
            evaluation raised.

    Returns:
        A ``NoticeQuerySummary`` with one ``NoticeQueryResult`` per notice, each
        holding one ``XPathQueryResult`` per XPath in input order. An XPath whose
        evaluation raises is logged and recorded with an empty result list, so a
        single failing expression does not abort the whole run.
    """
    summary = NoticeQuerySummary(xpaths=xpaths, notices_query_result=[])

    for notice_path in notice_paths:
        # Decode explicitly instead of relying on the platform locale. UTF-8 is
        # XML's default encoding; this assumes the notices do not declare a
        # different one -- TODO confirm for non-UTF-8 inputs.
        xml_content = notice_path.read_text(encoding="utf-8")
        xpath_validator = XPATHValidator(logger=logger, xml_content=xml_content)
        notice_result = NoticeQueryResult(file_path=notice_path, xpath_query_results=[], file_name=notice_path.name)
        for xpath in xpaths:
            try:
                validate_result = xpath_validator.validate(xpath)
            except Exception as e:
                # Best effort: report which XPath failed on which notice and
                # keep going with an empty cell for it.
                logger.error("XPath %r failed on notice %s: %s", xpath, notice_path.name, e)
                query_result: List[str] = []
            else:
                query_result = [xpath_result.value for xpath_result in validate_result]
            notice_result.xpath_query_results.append(XPathQueryResult(xpath=xpath, query_result=query_result))

        summary.notices_query_result.append(notice_result)

    return summary

In [17]:
# Query all discovered notices with the configured XPaths, logging through the root logger.
result = query_notices_with_given_xpaths(
    xpaths=XPATHS,
    notice_paths=NOTICE_PATHS,
    logger=logging.getLogger(),
)

In [18]:
# Header row first (the summary's string form), then one tab-separated row per notice.
# print() already applies str() to its arguments, so no explicit conversion is needed.
print(result)
for notice_result in result.notices_query_result:
    print(notice_result)

Notice	/*/cbc:ID	/*/cbc:ID/@schemeName	/*/cbc:ID[@schemeName='notice-id']	.//ext:UBLExtensions/ext:UBLExtension/ext:ExtensionContent/efext:EformsExtension/efac:Publication/efbc:NoticePublicationID[@schemeName='ojs-notice-id']
00468415_2024.xml	45602351-9e8c-493c-b9cf-bd71db4ffa08	notice-id	45602351-9e8c-493c-b9cf-bd71db4ffa08	00468415-2024
00468603_2024.xml	[]	[]	[]	00468603-2024
00468629_2024.xml	44d27597-9a3c-47f9-92c2-b66ee39392b3	notice-id	44d27597-9a3c-47f9-92c2-b66ee39392b3	00468629-2024
00468697_2024.xml	27866296-b033-419b-bf98-a8e62a8b74bd	notice-id	27866296-b033-419b-bf98-a8e62a8b74bd	00468697-2024
00468740_2024.xml	d44b7539-fc43-4284-b679-70a08ab8e886	notice-id	d44b7539-fc43-4284-b679-70a08ab8e886	00468740-2024
00468749_2024.xml	b8050444-3a2d-4777-8663-ee1235e09ef1,b8050444-3a2d-4777-8663-ee1235e09ef2	notice-id,notice-id	b8050444-3a2d-4777-8663-ee1235e09ef1,b8050444-3a2d-4777-8663-ee1235e09ef2	00468749-2024
00468785_2024.xml	d05b1f29-139a-4009-9975-3b9260cfd683	notice-id	d05b