In [89]:
import glob
import xml.etree.ElementTree as ET

_R3D_NS = '{http://www.re3data.org/schema/2-2}'
_LANGUAGE_VOCABULARY = 'ISO 639-2'


def _r3d(tag):
    """Return the descendant search path for *tag* in the re3data 2.2 namespace."""
    return './/' + _R3D_NS + tag


def _text(element):
    """Return ``element.text``, or ``''`` when the element or its text is missing.

    ElementTree reports the text of an empty element as ``None`` (not ``''``),
    so both cases have to be treated as "no value".
    """
    if element is None or element.text is None:
        return ''
    return element.text


def _add_text(parent, tag, value, **attrib):
    """Append ``<tag>`` holding *value* to *parent*; do nothing for an empty value."""
    if value:
        ET.SubElement(parent, tag, **attrib).text = value


def _add_name(parent, source, plain_tag='name'):
    """Append a name taken from *source* to *parent*.

    A source with a non-empty ``language`` attribute becomes a ``<name>``
    carrying ``nameLanguage``/``languageVocabulary``; otherwise a bare
    ``<plain_tag>`` element is written. Sources without text are skipped.
    """
    value = _text(source)
    if not value:
        return
    language = source.get('language')
    if language:
        ET.SubElement(
            parent,
            'name',
            nameLanguage=language,
            languageVocabulary=_LANGUAGE_VOCABULARY,
        ).text = value
    else:
        ET.SubElement(parent, plain_tag).text = value


def _add_identifier(parent, tag, raw):
    """Append ``<tag type="SCHEME">value</tag>`` parsed from ``SCHEME:value``.

    Only the first colon separates scheme and value, so values that contain
    colons themselves (e.g. URLs) are kept whole. Text without any colon is
    written as-is with no ``type`` attribute.
    """
    if not raw:
        return
    scheme, separator, value = raw.partition(':')
    if separator:
        ET.SubElement(parent, tag, type=scheme).text = value
    else:
        ET.SubElement(parent, tag).text = raw


def _add_access(entry, container, type_tag, restriction_tag, label):
    """Append an ``<access>`` element labelled *label* described by *container*.

    Nothing is written when *container* is missing or has no access type; the
    ``restriction`` attribute is only set when a restriction value exists.
    """
    if container is None:
        return
    access_type = _text(container.find(_r3d(type_tag)))
    if not access_type:
        return
    attrib = {'type': access_type}
    restriction = _text(container.find(_r3d(restriction_tag)))
    if restriction:
        attrib['restriction'] = restriction
    ET.SubElement(entry, 'access', attrib).text = label


def _add_licences(entry, licences, name_tag, url_tag, label):
    """Append one ``<licence>`` labelled *label* per licence with a name and a URL."""
    for licence in licences:
        licence_name = _text(licence.find(_r3d(name_tag)))
        licence_url = _text(licence.find(_r3d(url_tag)))
        # Attribute values must be strings, so incomplete licences are skipped
        # instead of producing an element that cannot be serialized.
        if licence_name and licence_url:
            ET.SubElement(entry, 'licence', type=licence_name, url=licence_url).text = label


def _build_organization(institution):
    """Return an ``<organization>`` for *institution*, or ``None`` if it has no name."""
    institution_name = institution.find(_r3d('institutionName'))
    if not _text(institution_name):
        return None
    # Acronym is not mapped since re3data presents the broader additionalName;
    # we could end up mapping something different from an acronym. The common
    # model should be changed to include additionalName for the organization.
    organization = ET.Element('organization')
    _add_name(organization, institution_name)
    _add_text(organization, 'country', _text(institution.find(_r3d('institutionCountry'))))
    _add_text(organization, 'organizationUrl', _text(institution.find(_r3d('institutionURL'))))
    for identifier in institution.findall(_r3d('institutionIdentifier')):
        _add_identifier(organization, 'id', _text(identifier))
    return organization


def mapEntry(repo):
    """Map one re3data (schema 2.2) record to a common-model ``registryEntry``.

    Args:
        repo: Path to (or file object of) the re3data XML record to parse.

    Returns:
        An ``xml.etree.ElementTree.Element`` named ``registryEntry``, or
        ``None`` when the record has no ``re3data.orgIdentifier`` value.

    Raises:
        xml.etree.ElementTree.ParseError: If the record is not well-formed XML.
        OSError: If *repo* is a path that cannot be read.
    """
    root = ET.parse(repo).getroot()

    identifier = _text(root.find(_r3d('re3data.orgIdentifier')))
    if not identifier:
        return None

    entry = ET.Element('registryEntry')
    ET.SubElement(entry, 'internalIdentifier').text = identifier

    _add_name(entry, root.find(_r3d('repositoryName')))
    # NOTE(review): a language-tagged additionalName is written as <name>, an
    # untagged one as <additionalName>. Kept as in the original mapping --
    # confirm this asymmetry is intended by the common model.
    _add_name(entry, root.find(_r3d('additionalName')), plain_tag='additionalName')

    _add_text(entry, 'url', _text(root.find(_r3d('repositoryURL'))))
    _add_text(entry, 'type', _text(root.find(_r3d('type'))))
    _add_text(entry, 'description', _text(root.find(_r3d('description'))))

    for content_type in root.findall(_r3d('contentType')):
        _add_text(entry, 'content', _text(content_type))

    for repository_identifier in root.findall(_r3d('repositoryIdentifier')):
        _add_identifier(entry, 'identifier', _text(repository_identifier))

    for subject in root.findall(_r3d('subject')):
        if 'subjectScheme' in subject.attrib:
            _add_text(entry, 'subject', _text(subject), scheme=subject.attrib['subjectScheme'])
        else:
            _add_text(entry, 'subject', _text(subject))

    for keyword in root.findall(_r3d('keyword')):
        _add_text(entry, 'keyword', _text(keyword))

    for institution in root.findall(_r3d('institution')):
        organization = _build_organization(institution)
        # Only institutions that actually produced an organization are
        # attached; nameless ones are skipped.
        if organization is not None:
            entry.append(organization)

    for software in root.findall(_r3d('software')):
        _add_text(entry, 'softwareName', _text(software.find(_r3d('softwareName'))))

    # NOTE(review): any non-empty versioning value is mapped to "True",
    # regardless of what the value says -- confirm against the re3data schema.
    if _text(root.find(_r3d('versioning'))):
        ET.SubElement(entry, 'versioning').text = 'True'

    # Only the first API that has both a URL and a type is mapped.
    for api in root.findall(_r3d('api')):
        api_type = api.get('apiType')
        if _text(api) and api_type:
            ET.SubElement(entry, 'apiUrl', type=api_type).text = _text(api)
            break

    start_date = _text(root.find(_r3d('startDate')))
    if start_date:
        # A four-character value is a bare year; anything else is treated as a
        # full date, possibly followed by a time after a space.
        date_format = 'yyyy' if len(start_date) == 4 else 'yyyy-MM-dd'
        ET.SubElement(entry, 'startDate', format=date_format).text = start_date.split(' ')[0]

    last_update = _text(root.find(_r3d('lastUpdate')))
    if last_update:
        ET.SubElement(entry, 'updateDate', format='yyyy-MM-dd').text = last_update.split(' ')[0]

    for policy in root.findall(_r3d('policy')):
        _add_text(entry, 'policyUrl', _text(policy.find(_r3d('policyUrl'))))

    _add_access(
        entry,
        root.find(_r3d('databaseAccess')),
        'databaseAccessType',
        'databaseAccessRestriction',
        'databaseAccess',
    )
    _add_access(
        entry,
        root.find(_r3d('dataUpload')),
        'dataUploadType',
        'dataUploadRestriction',
        'dataUpload',
    )

    _add_licences(
        entry,
        root.findall(_r3d('databaseLicense')),
        'databaseLicenseName',
        'databaseLicenseURL',
        'databaseLicence',
    )
    _add_licences(
        entry,
        root.findall(_r3d('dataLicense')),
        'dataLicenseName',
        'dataLicenseURL',
        'dataLicence',
    )
    return entry




In [90]:
# Convert every downloaded re3data record and collect the results in a single
# <registryEntries> document. The file list is sorted so that the output order
# is reproducible (glob returns files in arbitrary order).
repo_list = sorted(glob.glob('../data/re3dataRecords/*.xml'))

entries = ET.Element("registryEntries")


for repo in repo_list:
    try:
        entry = mapEntry(repo)
    except Exception as err:
        # Top-level boundary: report the failing record together with its
        # cause and carry on, so one bad file does not truncate the output.
        print("error on " + repo + ": " + repr(err))
        continue
    if entry is None:
        # mapEntry returns None for records without a re3data identifier;
        # appending None to an Element would raise TypeError.
        print("skipped " + repo + ": no re3data.orgIdentifier")
        continue
    entries.append(entry)

tree = ET.ElementTree(entries)
tree.write("../commonModel/r3d.xml")