### Issue is EZ API converts _profile back to datacite, and returns xml

* No Way Around XML Need to translate 

In [48]:
from lxml import objectify, etree

In [719]:
doi = {
  "@context": "http://schema.org",
  "@type": "Dataset",
  "@id": "10.5072/test9999OR",
  "identifier": "10.5072/test9999OR",
    "includedInDataCatalog": "ark:/99999/fk4RatDC",
  "additionalType": "Data dictionary",
  "name": "A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt",
  "author": {
    "@type": "Organization",
    "name": "The GTEx Consortium"
  },
  "keywords": "gtex, annotation, phenotype, gene regulation, transcriptomics",
  "datePublished": "2017",
  "version": "v7",
  "url": "https://www.gtexportal.org/home/datasets",
  "contentSize": "5.4 Mb",
  "fileFormat": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
  "funder": {
   "@type": "Organization",
   "@id": "https://doi.org/10.13039/100000050",
   "name": "National Heart, Lung, and Blood Institute"
  }
}

### Design of Datacite Payload

#### Mandatory Elements

Simple

* Identifier (with mandatory type sub-property) DOI √
* Title (optional type sub-properties) √
* PublicationYear √
* ResourceType -> Dataset √

Need to be unwrapped
* Publisher √
* Creator (optional, given name, name id) √

### Optional Elements

* funder √
* url √ non homepage url
* version √
* dateCreated
* additionalType
* description
* keywords
* liscence
* citation
* isBasedOn
* isPredecessor
* isSuccessor
* hasPart
* isPartOf
* contentSize
* fileFormat
* contentUrl


In [720]:
nameType_xml = {
    'Organization': 'Organizational',
    'Person': 'Personal'
}
nameType_json = {
    'Organizational': 'Organization',
    'Person': 'Personal'
}

In [855]:
def outputDataciteXML(doi_json):
    E = objectify.ElementMaker(
        annotate=False
    )
    
    resource = E.resource(
        E.identifier(doi_json.get('@id'), identifierType="DOI"),
      
        E.publicationYear(doi_json.get('datePublished')),
        E.resourceType(doi_json.get('@type'), resourceTypeGeneral="Dataset")
    )
    
    # add titles
    titles = etree.SubElement(resource, "titles")
    etree.SubElement(titles, "title")._setText(doi_json.get('name'))
    
    # creators tag always a list
    creators = etree.SubElement(resource, "creators")
    
    # get the author
    auth = doi_json.get('author')
    
    # is there a list of authors
    if isinstance(auth, list):
        c = etree.SubElement(creators, "creator")
        # every author in the list
        for author in auth:
            auth_name = author.get('name')
            auth_type = auth.get('@type')
            name_type = nameType_xml.get(auth_type)
            etree.SubElement(c, "name", nameType=name_type)._setText(auth_name)
    
    # a single dict for authors
    if isinstance(auth, dict):
        creator = etree.SubElement(creators, "creator")
        
        auth_name = auth.get('name')
        auth_type = auth.get('@type')
        name_type = nameType_xml.get(auth_type)
            
        etree.SubElement(creator, "creatorName")._setText(auth_name)
        etree.SubElement(resource, "publisher")._setText(auth_name)
    
    
    
    # Url homepage
    url = doi_json.get('url')
    related_identifiers = etree.SubElement(resource, "relatedIdentifiers")
    etree.SubElement(related_identifiers, "relatedIdentifier", relatedIdentifierType="URL", 
                     relationType= "IsDocumentedBy")._setText(url)

    # includedInDatacatalog
    # etree.SubElement(related_identifiers, "relatedIdentifier", relatedIdentifierType="",
    #                    relationType="")._setText(dc)
    
    # Version
    version = doi_json.get('version')
    etree.SubElement(resource, "version")._setText(version)
    
    # description
    desc = doi_json.get('description')
    descriptions = etree.SubElement(resource, "descriptions")
    etree.SubElement(descriptions, "description", descriptionType="Abstract")._setText(desc)
    
    
    
    # keywords
    subjects = etree.SubElement(resource, "subjects")
    subject_list = doi_json.get('keywords').split(',')
    
    for sub in subject_list:
        etree.SubElement(subjects, "subject")._setText(sub)
    
    
    # funder 
    funder = doi_json.get('funder')
    funding_reference = etree.SubElement(resource, "fundingReferences")
    if isinstance(funder,dict):
        soleFunder = etree.SubElement(funding_reference, "fundingReference")
        
        fund_name = funder.get('name')
        fund_id = funder.get('@id')
        
        etree.SubElement(soleFunder, "funderName")._setText(fund_name)
        etree.SubElement(soleFunder, "funderIdentifier", funderIdentifierType="Other")._setText(fund_id)
        #etree.SubElement(soleFunder, "awardNumber")._setText()
        #etree.SubElement(soleFunder, "awardTitle")._setText()
    
    
    
    return etree.tostring(resource)

In [856]:
output = outputDataciteXML(doi).decode('utf-8').replace('<resource>', properResourceTag)

print(output)


<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"><identifier identifierType="DOI">10.5072/test9999OR</identifier><publicationYear>2017</publicationYear><resourceType resourceTypeGeneral="Dataset">Dataset</resourceType><titles><title>A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt</title></titles><creators><creator><creatorName>The GTEx Consortium</creatorName></creator></creators><publisher>The GTEx Consortium</publisher><relatedIdentifiers><relatedIdentifier relatedIdentifierType="URL" relationType="IsDocumentedBy">https://www.gtexportal.org/home/datasets</relatedIdentifier></relatedIdentifiers><version>v7</version><descriptions><description descriptionType="Abstract"/></descriptions><subjects><subject>gtex</subject><subject> annotation</subject><subject> phenotype</su

In [731]:
properResourceTag ='<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">'

In [527]:
root = etree.fromstring(output)

In [530]:
child_list = root.getchildren()
creators = child_list[4]

In [553]:
creators.findtext('{http://datacite.org/schema/kernel-4}creator/{http://datacite.org/schema/kernel-4}name')

'The GTEx Consortium'

In [552]:
creator.getchildren()

[<Element {http://datacite.org/schema/kernel-4}name at 0x115e78848>]

In [575]:
creators.getchildren()[0].getchildren()[0].text

'The GTEx Consortium'

In [870]:
def unpack_xml(input_root):
    xml_dict = {}
    for child in input_root.getchildren():
        temp_tag = child.tag.replace("{http://datacite.org/schema/kernel-4}", "")
        
       
        # special rule for subjects
        if temp_tag == "Subjects":
            subj_list = []
            for sub in child.getchildren():
                subj_list.append(sub.text.strip())
            xml_dict['Subjects']=subj_list
        
        if temp_tag == "creators":
            creat_list = []
            for sub in child.getchildren():
                creat_list.append(sub.findtext('{http://datacite.org/schema/kernel-4}creatorName'))
            xml_dict['creators'] = creat_list
        
        if temp_tag == 'descriptions':
            xml_dict['description']=child.findtext('{http://datacite.org/schema/kernel-4}description')
        
        if temp_tag == 'fundingReferences':
            if len(child.getchildren())>1:
                fund_list = []
                for funder in child.getchildren():
                    
                    fund_name = funder.findtext('{http://datacite.org/schema/kernel-4}funderName')
                    fund_id = funder.findtext('{http://datacite.org/schema/kernel-4}funderIdentifier')
                    
                    fund_obj = {
                        'name': fund_name,
                        '@id': fund_id,
                        '@type': 'Organization'
                        }
                    
                    fund_list.append(fund_obj)
                    
                xml_dict['funder'] = fund_list
            else:
                fund_name = child.findtext('{http://datacite.org/schema/kernel-4}fundingReference/{http://datacite.org/schema/kernel-4}funderName')
                fund_id = child.findtext('{http://datacite.org/schema/kernel-4}fundingReference/{http://datacite.org/schema/kernel-4}funderIdentifier')
                    
                xml_dict['funder'] = {
                        'name': fund_name,
                        '@id': fund_id,
                        '@type': 'Organization'
                        }
            
        
        # unparse attributs
        temp_attr = child.attrib
        
        # unpack value
        if child.text != None:
            if temp_tag == "ResourceType":
                xml_dict['@type'] = child.text
            else:
                xml_dict[temp_tag] = child.text
                
        # unpack nested values
        if len(child.getchildren()) != 0 and temp_tag not in ["Subjects", "creators", "descriptions", 'fundingReferences']:
            xml_dict[temp_tag] = unpack_xml(child)
        #tags to add to dict
        if child.attrib.get('nameType') == "Organizational":
            # add the name type to dictionary
            xml_dict['@type'] = 'Organization'
            
        if child.attrib.get('nameType') == "Personal":
            xml_dict['@type'] = 'Person'
            
            
            
    return xml_dict

In [871]:
read_xml = ezapi_get.content.decode('utf-8')
# need to translate datacite
xml_form = read_xml.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
xml_form = re.sub("%0A", "", xml_form)
xml_form = xml_form.strip()


unpack_xml(etree.fromstring(xml_form) )

{'creators': ['The GTEx Consortium'],
 'description': '',
 'funder': {'@id': None,
  '@type': 'Organization',
  'name': 'National Heart, Lung, and Blood Institute'},
 'identifier': '10.5072/test9999OR',
 'publicationYear': '2017',
 'publisher': 'The GTEx Consortium',
 'relatedIdentifiers': {'relatedIdentifier': 'https://www.gtexportal.org/home/datasets'},
 'resourceType': 'Dataset',
 'subjects': {'subject': ' transcriptomics'},
 'titles': {'title': 'A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt'},
 'version': 'v7'}

In [818]:
xml_form

'<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"><identifier identifierType="DOI">10.5072/test9999OR</identifier><publicationYear>2017</publicationYear><resourceType resourceTypeGeneral="Dataset">Dataset</resourceType><titles><title>A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt</title></titles><creators><creator><creatorName>The GTEx Consortium</creatorName></creator></creators><publisher>The GTEx Consortium</publisher><relatedIdentifiers><relatedIdentifier relatedIdentifierType="URL" relationType="IsDocumentedBy">https://www.gtexportal.org/home/datasets</relatedIdentifier></relatedIdentifiers><version>v7</version><descriptions><description descriptionType="Abstract"/></descriptions><subjects><subject>gtex</subject><subject> annotation</subject><subject> phenotype</s

In [815]:
root = etree.fromstring(xml_form)

In [830]:
descriptions= root.getchildren()[8]

In [835]:
descriptions.getchildren()[0]

<Element {http://datacite.org/schema/kernel-4}description at 0x116478e88>

In [684]:
xml_form

'<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">  <identifier identifierType="DOI">10.5072/TEST9999MDSORSTEST</identifier>  <creators/>  <titles>    <title/>  </titles>  <publisher/>  <publicationYear/>  <dates/>  <version/></resource>'

In [594]:
almostJson = unpack_xml(etree.fromstring(output) )
almostJson

{'@type': 'Dataset',
 'FundingReference': {'@type': 'Organization',
  'funderName': 'National Heart, Lung, and Blood Institute'},
 'Identifier': 'doi:10.5072/test9999mdsorstest',
 'PublicationYear': '2017',
 'RelatedIdentifier': 'https://www.gtexportal.org/home/datasets',
 'Subjects': ['gtex',
  'annotation',
  'phenotype',
  'gene regulation',
  'transcriptomics'],
 'Title': 'A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt',
 'Version': 'v7',
 'creators': [{'@type': 'Organizational', 'name': 'The GTEx Consortium'}],
 'publisher': 'The GTEx Consortium'}

In [579]:
# rename keys
renamedKeys = {
    'FundingReference': "funder",
    'funderName': 'name',
    'Identifier': 'identifier',
    'PublicationYear': 'datePublished',
    'RelatedIdentifier': 'url', # should be object full of stuff but ok
    'Subjects': 'keywords',
    'Title': 'title',
    'Version': 'version',
    'creators': 'author',
}



In [591]:
# add @id @context and @type
input_dict = almostJson

final['@id'] = input_dict.get('Identifier')
final['@context'] = 'https://schema.org'


def rename(almost_json):
    final = {}
    for key, value in almost_json.items():
        
        if key == '@type':
            value = nameType_json.get(value, value)
        
        new_key = renamedKeys.get(key,key)
        if isinstance(value,str): 
            final[new_key] = value
            
        if isinstance(value,list):
            
            final[new_key] = [rename(elem) if isinstance(elem,dict) else elem for elem in value ]

                    
            
        if isinstance(value, dict):
            final[new_key] = rename(value)
            
    return final
    

json_payload = rename(input_dict)



In [592]:
json_payload.update(
    {
        '@id':json_payload['identifier'],
        '@context': 'https://schema.org'
    }
)

In [593]:
json_payload

{'@context': 'https://schema.org',
 '@id': 'doi:10.5072/test9999mdsorstest',
 '@type': 'Dataset',
 'author': [{'@type': 'Organization', 'name': 'The GTEx Consortium'}],
 'datePublished': '2017',
 'funder': {'@type': 'Organization',
  'name': 'National Heart, Lung, and Blood Institute'},
 'identifier': 'doi:10.5072/test9999mdsorstest',
 'keywords': ['gtex',
  'annotation',
  'phenotype',
  'gene regulation',
  'transcriptomics'],
 'publisher': 'The GTEx Consortium',
 'title': 'A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt',
 'url': 'https://www.gtexportal.org/home/datasets',
 'version': 'v7'}

In [199]:
# Generate root element with proper datacite attributes
NSMAP = {
    "xsi": "http://www.w3.org/2001/XMLSchema-instance",
    #"xlink" : 'http://www.w3.org/1999/xlink'
}

# first pass create single elements
resource = E.resource(
    E.Identifier(doi.get('@id'), IdentifierType="DOI"),
    E.Title(doi.get('title') ),
    E.PublicationYear(doi.get('datePublished')),
    E.ResourceType(doi.get('@type'), resourceTypeGeneral="Dataset")
)


#creator = resource.SubElement("Creator")
#    creator.SubElement("")
#resource.SubElement("Title", )

# cant set proper attributes whatever
#resource.attrib['xmlns:xsi'] = "http://www.w3.org/2001/XMLSchema-instance",
#resource.attrib['xsi:schemaLocation'] = "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4.1/metadata.xsd"


# add subelements
#resource.

# generate all the required elements for DOI


In [200]:
# supports many elemnts for creators
creators = etree.SubElement(resource, "creators")

# if author object is a dict there is a single creator
if isinstance(doi.get('author'), dict):
    creator_element = etree.SubElement(creators, "creator")
    etree.SubElement(creator_element, "name", nameType="Organizational")._setText(doi.get('author').get('name') )

In [202]:
print()

<resource xmlns="http://datacite.org/schema/kernel-4"><Identifier IdentifierType="DOI">doi:10.5072/test9999mdsorstest</Identifier><Title>GTEx</Title><PublicationYear>2017</PublicationYear><ResourceType resourceTypeGeneral="Dataset">Dataset</ResourceType><creators><creator><Name nameType="Organizational">The GTEx Consortium</Name></creator></creators><Publisher nameType="Organizational">The GTEx Consortium</Publisher></resource>


In [124]:
import requests
from helper_functions import *

In [182]:
xml_payload = etree.tostring(resource)

In [667]:
payload = {
    
    '_profile': 'datacite',
    '_status': '_reserved',
    '_target': 'https://example.org/',
    'datacite':  output
}

In [668]:
payload

{'_profile': 'datacite',
 '_status': '_reserved',
 '_target': 'https://example.org/',
 'datacite': b'<resource xmlns="http://datacite.org/schema/kernel-4"><Identifier identifierType="DOI">doi:10.5072/test9999mdsorstest</Identifier><Title>A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt</Title><PublicationYear>2017</PublicationYear><ResourceType resourceTypeGeneral="Dataset">Dataset</ResourceType><creators/><creator><creatorName nameType="Organizational">The GTEx Consortium</creatorName></creator><Publisher>The GTEx Consortium</Publisher><RelatedIdentifier relatedIdentifierType="URL">https://www.gtexportal.org/home/datasets</RelatedIdentifier><Version>v7</Version><description/><Subjects><subject>gtex</subject><subject> annotation</subject><subject> phenotype</subject><subject> gene regulation</subject><subject> transcriptomics</subject></Subjects><FundingReference><funderName nameType="Organizational">National Heart, Lung, and Blood Insti

In [669]:
anvl_payload = outputAnvl(payload)

In [781]:
anvl_payload

b'_profile: datacite\n_status: _reserved\n_target: https%3A//example.org/\ndatacite: b\'<resource xmlns="http%3A//datacite.org/schema/kernel-4"><Identifier identifierType="DOI">doi%3A10.5072/test9999mdsorstest</Identifier><Title>A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt</Title><PublicationYear>2017</PublicationYear><ResourceType resourceTypeGeneral="Dataset">Dataset</ResourceType><creators/><creator><creatorName nameType="Organizational">The GTEx Consortium</creatorName></creator><Publisher>The GTEx Consortium</Publisher><RelatedIdentifier relatedIdentifierType="URL">https%3A//www.gtexportal.org/home/datasets</RelatedIdentifier><Version>v7</Version><description/><Subjects><subject>gtex</subject><subject> annotation</subject><subject> phenotype</subject><subject> gene regulation</subject><subject> transcriptomics</subject></Subjects><FundingReference><funderName nameType="Organizational">National Heart, Lung, and Blood Institute</f

In [764]:
# store in the ezapi
ezapi_put = requests.put(
    url = "https://ez.test.datacite.org/id/"+doi['@id'],
    data = anvl_payload,
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)
ezapi_put.content

b'success: doi:10.5072/test9999or\n_status: reserved\n_target: https://example.org/\ndatacite: <?xml version="1.0" encoding="UTF-8"?>%0A<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">%0A  <identifier identifierType="DOI">10.5072/TEST9999OR</identifier>%0A  <creators/>%0A  <titles>%0A    <title/>%0A  </titles>%0A  <publisher/>%0A  <publicationYear/>%0A  <dates/>%0A  <version/>%0A</resource>%0A\n_profile: datacite'

In [880]:
doi_xml ='<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"><identifier identifierType="DOI">10.5072/test9999OR</identifier><publicationYear>2017</publicationYear><resourceType resourceTypeGeneral="Dataset">Dataset</resourceType><titles><title>A data dictionary that describes each variable in the GTEx_v7_Annotations_SubjectPhenotypesDS.txt</title></titles><creators><creator><creatorName>The GTEx Consortium</creatorName></creator></creators><publisher>The GTEx Consortium</publisher><relatedIdentifiers><relatedIdentifier relatedIdentifierType="URL" relationType="IsDocumentedBy">https://www.gtexportal.org/home/datasets</relatedIdentifier><relatedIdentifier relatedIdentifierType="DOI" relationType="IsPartOf">ark:/99999/fk4RatDC</relatedIdentifier></relatedIdentifiers><version>v7</version><subjects><subject>gtex</subject><subject> annotation</subject><subject> phenotype</subject><subject> gene regulation</subject><subject> transcriptomics</subject></subjects><fundingReferences><fundingReference><funderName>National Heart, Lung, and Blood Institute</funderName><funderIdentifier funderIdentifierType="Other">https://doi.org/10.13039/100000050</funderIdentifier></fundingReference></fundingReferences></resource>'

In [902]:
# register metadata
create_metadata = requests.post(
    url = "https://mds.test.datacite.org/metadata/",
    data = doi_xml,
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)

In [903]:
create_metadata.content

b'OK (10.5072/test9999OR)'

In [901]:
# create doi
reserve_doi = requests.put(
    url = "https://mds.test.datacite.org/doi/"+doi['@id'],
    data = "doi="+doi['@id']+"\nurl=http://example.com",
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)

print(reserve_doi.status_code)
print(reserve_doi.content)



201
b'OK'


In [None]:
# create metadata record

In [900]:
"doi="+doi['@id']+"\nurl=http://example.com"

'doi=10.5072/test9999OR\nurl=http://example.com'

In [896]:
# retrieve from ezapi
ezapi_get = requests.get(
    url = "https://mds.test.datacite.org/metadata/"+doi['@id'],
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)

In [897]:
ezapi_get.content

b'dataset inactive'

In [793]:
datacite_dict

{}

In [637]:
read_xml = datacite_dict['datacite']
read_xml

'<?xml version="1.0" encoding="UTF-8"?>%0A<resource xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://datacite.org/schema/kernel-4" xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd">%0A  <identifier identifierType="DOI">10.5072/TEST9999MDSORSTEST</identifier>%0A  <creators/>%0A  <titles>%0A    <title/>%0A  </titles>%0A  <publisher/>%0A  <publicationYear/>%0A  <dates/>%0A  <version/>%0A</resource>%0A'

In [904]:
# delete
ezapi_delete = requests.delete(
    url = "https://mds.test.datacite.org/metadata/"+doi['@id'],
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)
assert ezapi_delete.status_code == 200

In [895]:
ezapi_delete.content

b'OK'

In [276]:
# delete localhost
# delete
ezapi_delete = requests.delete(
    url = "https://mds.test.datacite.org/metadata/"+doi['@id'],
    auth = requests.auth.HTTPBasicAuth('DATACITE.DCPPC',  'Player&Chemo+segment')
)
assert ezapi_delete.status_code == 200

In [309]:
xml_form

'<resource xmlns="http://datacite.org/schema/kernel-4">  <Identifier IdentifierType="DOI">doi:10.5072/test9999mdsorstest</Identifier>  <Title>GTEx</Title>  <PublicationYear>2017</PublicationYear>  <ResourceType resourceTypeGeneral="Dataset">Dataset</ResourceType>  <creators/>  <creator>    <Name nameType="Organizational">The GTEx Consortium</Name>  </creator>  <Publisher nameType="Organizational">The GTEx Consortium</Publisher></resource>'

In [310]:
# read into lxml
root= etree.fromstring(xml_form)

In [311]:
root.nsmap

{None: 'http://datacite.org/schema/kernel-4'}

In [336]:
for child in root.getchildren():
    print(child.text)

doi:10.5072/test9999mdsorstest
GTEx
2017
Dataset
None
    
The GTEx Consortium


In [350]:
unpack_xml(root)

{'IdentifierType': 'DOI'}
{}
{}
{'resourceTypeGeneral': 'Dataset'}
{}
{}
{'nameType': 'Organizational'}
{'nameType': 'Organizational'}


{'@type': 'Organization',
 'Identifier': 'doi:10.5072/test9999mdsorstest',
 'PublicationYear': '2017',
 'Publisher': 'The GTEx Consortium',
 'Title': 'GTEx',
 'creator': {'@type': 'Organization', 'Name': 'The GTEx Consortium'}}

In [323]:
  # if sub object look for attributes to add to object
            for key, value in child.attrib.items():
                if re.match('.*Type.*', key):
                xml_dict['@type'] = value
       # check attributes to turn into schema.org metadata
        

In [353]:
# take xml tag names to json-ld keys
translation = {
    'Identifier': 'identifier',
    'PublicationYear': 'publicationDate',
    'Publisher': '',
    'Title': '',
    'creator': ''
    
}

In [874]:
ver = "hello"

In [875]:
if ver:
    print("hi")

hi
