In [1]:
from enum import Enum
import bs4 as bs
import re

class VocabularyXMLFileType(Enum):
    """Identifies which XML layout a vocabulary file uses when it is loaded."""

    # Files read record by record from <DescriptorRecord> elements
    XML_BIREME = "Bireme"
    # Files read record by record from <Term> elements
    XML_ERIC_THESAURUS = "Eric"
    # Files read from <rdf:Description> elements with skos:prefLabel / skos:altLabel
    SKOS = "SKOS"

class VocabularyLabel:
    """A single vocabulary label plus the preferential label and URI given for it."""

    def __init__(self, label:str, preferential_label: str = None, uri: str = None):
        # The three constructor arguments are stored unchanged as attributes
        self.label, self.preferential_label, self.uri = label, preferential_label, uri

class Vocabulary:
    """Controlled vocabulary loaded from an XML file and exportable as text files.

    ``contents`` maps every label to its preferential label; a value of ``None``
    or a value equal to the key marks the label as preferential itself.
    ``labels`` / ``labels_lowercase`` index the same labels as
    ``VocabularyLabel`` objects by exact and lower-cased text.
    """

    # Prefix a BIREME <TermUI> must start with for the term to be loaded.
    # NOTE(review): hard-coded; the ``lang`` argument of loadFromXmlFile is not
    # consulted for BIREME files - confirm whether that is intended.
    _BIREME_TERM_UI_PREFIX = 'spa'

    # Parenthesised groups made only of digits and spaces, removed from ERIC names
    _ERIC_PAREN_PATTERN = r"\([0-9 ]+\)"

    def __init__(self):
        self.name = ''
        self.base_uri = ''
        # label -> preferential label (None or the label itself when preferential)
        self.contents = {}
        self.origin = None
        # exact label text -> VocabularyLabel
        self.labels = {}
        # lower-cased label text -> VocabularyLabel (later labels overwrite earlier
        # ones that differ only by case)
        self.labels_lowercase = {}

    def loadFromXmlFile(self, file_path: str, lang : str = 'en',
                        file_type : VocabularyXMLFileType = VocabularyXMLFileType.SKOS,
                        base_uri: str = ''):
        """Parse an XML vocabulary file and populate this object from it.

        ``self.contents`` is replaced on every call, whereas labels registered by
        earlier calls remain in ``self.labels`` / ``self.labels_lowercase``.

        Args:
            file_path: Path of the XML file to read.
            lang: Value compared against ``xml:lang`` for SKOS files. It is not
                used by the ERIC and BIREME loaders.
            file_type: A ``VocabularyXMLFileType`` member, or the string value of
                one (e.g. ``'SKOS'``).
            base_uri: Stored in ``self.base_uri`` and used to build label URIs.

        Raises:
            ValueError: If ``file_type`` is neither a member of
                ``VocabularyXMLFileType`` nor one of its values.
        """
        # Coerce so that a plain string value works and an unknown type fails
        # loudly instead of silently loading nothing.
        file_type = VocabularyXMLFileType(file_type)
        self.base_uri = base_uri

        # Read bytes so the parser decodes using the XML declaration rather than
        # the platform's default text encoding.
        with open(file_path, 'rb') as xml_file:
            xml_soup = bs.BeautifulSoup(xml_file.read(), "xml")

        if file_type == VocabularyXMLFileType.SKOS:
            self.contents = self._loadSkos(xml_soup, lang)
        elif file_type == VocabularyXMLFileType.XML_ERIC_THESAURUS:
            self.contents = self._loadEric(xml_soup)
        else:
            self.contents = self._loadBireme(xml_soup)

    def _loadSkos(self, xml_soup, lang: str):
        """Register SKOS pref/alt labels in ``lang``; return the label -> preferential map."""
        contents = {}
        for description in xml_soup.find_all('rdf:Description'):
            pref = description.find('skos:prefLabel', {'xml:lang' : lang})
            if not pref:
                # Descriptions without a preferred label in `lang` are ignored,
                # including their alternative labels.
                continue
            pref_text = ' '.join(pref.contents)
            contents[pref_text] = None
            self._appendLabel(VocabularyLabel(pref_text, pref_text))
            for alt in description.find_all('skos:altLabel'):
                # .get(): an altLabel without xml:lang is skipped, not a KeyError
                if alt.get('xml:lang') == lang:
                    alt_text = ' '.join(alt.contents)
                    contents[alt_text] = pref_text
                    self._appendLabel(VocabularyLabel(alt_text, pref_text))
        return contents

    def _cleanEricName(self, raw_name: str):
        """Remove parenthesised digit groups from an ERIC name and trim whitespace."""
        without_groups = re.sub(self._ERIC_PAREN_PATTERN, '', raw_name.strip())
        # Strip again: removing a trailing "(1966 1980)" leaves a dangling space
        return without_groups.strip()

    def _loadEric(self, xml_soup):
        """Register ERIC <Term> records; return the label -> preferential map."""
        contents = {}
        for term in xml_soup.find_all("Term"):
            label_name = self._cleanEricName(term.find('Name').text)
            # A term is its own preferential label unless it has a type "U"
            # relationship, whose <Is> text then takes that role.
            preferred = label_name
            relationships = term.find('Relationships')
            if relationships is not None:
                use = relationships.find('Relationship', {'type' : 'U'})
                if use is not None:
                    preferred = use.find('Is').text.strip()
            contents[label_name] = preferred
            self._appendLabel(VocabularyLabel(label_name, preferred))
        return contents

    def _loadBireme(self, xml_soup):
        """Register BIREME <DescriptorRecord> terms; return the label -> preferential map."""
        prefix = self._BIREME_TERM_UI_PREFIX
        # Keyed by DescriptorUI, so a repeated UI keeps only its last record
        descriptors = {}
        for record in xml_soup.find_all("DescriptorRecord"):
            preferred_term = ''
            other_terms = []
            for concept in record.find('ConceptList').find_all('Concept'):
                concept_is_preferred = concept.get('PreferredConceptYN') == 'Y'
                for term in concept.find('TermList').find_all("Term"):
                    term_ui = term.find("TermUI")
                    if term_ui is None or not term_ui.text.startswith(prefix):
                        continue
                    text = term.find("String").text.strip()
                    # Only the preferred term of the preferred concept becomes the
                    # descriptor's preferential label; everything else is an entry term.
                    if concept_is_preferred and term.get('ConceptPreferredTermYN') == 'Y':
                        preferred_term = text
                    else:
                        other_terms.append(text)
            descriptors[record.find("DescriptorUI").text] = (preferred_term, other_terms)

        contents = {}
        for preferred_term, other_terms in descriptors.values():
            if not preferred_term:
                # No preferential term matched the prefix: skip the descriptor so
                # an empty-string label is never registered or exported.
                continue
            contents[preferred_term] = None
            self._appendLabel(VocabularyLabel(preferred_term, preferred_term))
            for term in other_terms:
                contents[term] = preferred_term
                self._appendLabel(VocabularyLabel(term, preferred_term))
        return contents

    def generateCompatibleSisaFile(self, file_path:str, encoding: str = 'utf-8'):
        """Write every label, sorted, one per line, with a USE line for non-preferential ones.

        Args:
            file_path: Destination file; it is overwritten.
            encoding: Text encoding of the output file.
        """
        with open(file_path, 'w', encoding=encoding) as out_file:
            for label in sorted(self.contents):
                preferred = self.contents[label]
                out_file.write(label + "\n")
                if preferred is not None and label != preferred:
                    out_file.write("\tUSE " + preferred + "\n")

    def generateTSVFile(self, file_path: str):
        """Write one ``<uri>`` TAB ``label`` line per preferential label.

        Args:
            file_path: Destination file; it is overwritten (UTF-8).
        """
        with open(file_path, 'w', encoding="utf8") as txt_file:
            for label, preferred in self.contents.items():
                if preferred is None or label == preferred:
                    txt_file.write('<' + self._getLabelUri(label) + '>\t' + label + '\n')

    def getPreferentialLabel(self, label: str, ignore_case: bool = True):
        """Return the preferential label recorded for ``label``, or None if it is unknown."""
        label_obj = self.getLabel(label, ignore_case)
        return None if label_obj is None else label_obj.preferential_label

    def _getLabelUri(self, label: str):
        """Build a URI from ``base_uri`` and ``label`` with spaces turned into underscores.

        Slashes are stripped from both ends of ``base_uri`` before a single '/'
        and the label are appended.
        """
        return self.base_uri.strip('/') + '/' + label.replace(' ', '_')

    def getLabel(self, label: str, ignore_case: bool = True):
        """Return the ``VocabularyLabel`` registered for ``label``, or None.

        Args:
            label: Text to look up.
            ignore_case: When True the lookup uses the lower-cased index.
        """
        if ignore_case:
            return self.labels_lowercase.get(label.lower())
        return self.labels.get(label)

    def _appendLabel(self, label: VocabularyLabel):
        """Index ``label`` by its exact and lower-cased text, replacing any existing entry."""
        self.labels[label.label] = label
        self.labels_lowercase[label.label.lower()] = label

In [2]:
# eric = Vocabulary()
# eric.loadFromXmlFile('../education-en/raw-files/vocabulary/ERICThesaurus2025.xml', 'en', VocabularyXMLFileType.XML_ERIC_THESAURUS, 'https://eric.ed.gov/?qt=')

In [15]:
# eric.generateCompatibleSisaFile('../education-en/prepared-files/vocabulary/eric_thesaurus-2025.txt')

In [3]:
# eric.generateTSVFile('../education-en/prepared-files/vocabulary/eric_thesaurus-2025.tsv')