# In [12]:
import pandas as pd
import lxml.etree 
import csv
import tarfile
import os
from tqdm import tqdm



# In [19]:

# Path of the gzip-compressed tar archive (.tar.gz) to read; point this at your own copy.

file='ORCID_2021_10_summaries.tar.gz'


def readTar(file:str):
    """Yield the raw bytes of every XML file stored in a gzip-compressed tar archive.

    Members are visited in archive order. Only regular files whose name ends
    in ``.xml`` are read; directories, links and other special members are
    skipped, because ``TarFile.extractfile`` returns ``None`` for them.

    Args:
        file: Path of the ``.tar.gz`` archive to read.

    Yields:
        The complete content of each matching member, as ``bytes``.
    """
    with tarfile.open(file, "r:gz") as tar:
        # Go over each member
        for member in tar:
            if not (member.isfile() and member.name.endswith('.xml')):
                continue
            extracted = tar.extractfile(member)
            if extracted is None:
                # Defensive: nothing readable behind this member.
                continue
            # Read fully and close the member handle before handing the data out.
            with extracted:
                content = extracted.read()
            yield content

# Compiled stylesheets, keyed by id() of the parsed stylesheet they came from.
# The parsed stylesheet is stored next to its compiled form so that it stays
# alive and its id() cannot be reused by a different object.
_COMPILED_XSLT = {}


def _compileXslt(xslt):
    """Return a callable transform for *xslt*, compiling each stylesheet only once."""
    if callable(xslt):
        # Already a compiled transform (or any callable taking the tree).
        return xslt
    cached = _COMPILED_XSLT.get(id(xslt))
    if cached is None:
        cached = (xslt, lxml.etree.XSLT(xslt))
        _COMPILED_XSLT[id(xslt)] = cached
    return cached[1]


def writeRows(writer,tree,header,xslt):
    """Transform *tree* with *xslt* and write the resulting rows to *writer*.

    The string form of the transformation result is split into rows on the
    literal marker ``$end_line$``; each kept row is split into cells on ``¬``
    and passed to ``writer.writerow``.

    Args:
        writer: Object with a ``writerow(list)`` method (e.g. a ``csv.writer``).
        tree: Parsed XML document handed to the transform.
        header: Column names of the target file; only its length is used.
        xslt: Parsed XSLT stylesheet, or an already compiled/callable
            transform. A parsed stylesheet is compiled on first use and
            reused on later calls instead of being recompiled every time.
    """
    transform = _compileXslt(xslt)
    transformed=transform(tree)
    lines=str(transformed).split('$end_line$')
    for line in lines:
        # Write only if there are values in the line: the row is kept when it
        # has more characters than the header has columns.
        # NOTE(review): presumably a row holding nothing but separators is
        # shorter than that -- confirm against the stylesheets.
        if (len(header)<len(line)):
            writer.writerow(line.split('¬'))

# Export script: runs every XML document yielded by readTar(file) through four
# XSLT stylesheets and writes the resulting rows to four CSV files in `outdir`.
import itertools

outdir='data'
os.makedirs(outdir, exist_ok=True)

# Set to an integer to process only the first N documents (handy for a quick
# trial run, e.g. 10000); None processes the whole archive.
profile_limit = None

# newline="" is what the csv module expects for files handed to csv.writer:
# it prevents the text layer from translating the "\n" line terminator and
# newlines embedded in field values.
with open(os.path.join(outdir, "orcid_profiles.csv"), "w", encoding="utf8", newline="") as op, \
        open(os.path.join(outdir, "orcid_external-identifiers.csv"), "w", encoding="utf8", newline="") as oei, \
        open(os.path.join(outdir, "orcid_researcher-url.csv"), "w", encoding="utf8", newline="") as oru, \
        open(os.path.join(outdir, "orcid_distinctions.csv"), "w", encoding="utf8", newline="") as od:
    op_writer = csv.writer(op, lineterminator="\n")
    oei_writer = csv.writer(oei, lineterminator="\n")
    oru_writer = csv.writer(oru, lineterminator="\n")
    od_writer = csv.writer(od, lineterminator="\n")

    # Column names of orcid_profiles.csv.
    header_profile = [
        "orcid_uri",
        "orcid_path",
        "preferences:locale",
        "history:creation-method",
        "history:submission-date",
        "history:last-modified-date",
        "history:claimed",
        "history:verified-email",
        "history:verified-primary-email",
        "person:name__@visibility",
        "person:name__common:created-date",
        "person:name__common:last-modified-date",
        "personal-details:given-names",
        "personal-details:family-name",
        "personal-details:credit-name",
        "other-name:other-name",
        "person:biography__personal-details:content",
        "address:address__address:country",
        "keyword:keyword__keyword:content",
    ]

    # Column names of orcid_external-identifiers.csv.
    header_identifiers=[
        "orcid_uri",
        "orcid_path",
        "@display-index",
        "common:created-date",
        "common:last-modified-date",
        "common:external-id-type",
        "common:external-id-value",
        "common:external-id-url",
        "common:external-id-relationship",
        "common:source-name",
        "common:assertion-origin-name",
    ]

    # Column names of orcid_researcher-url.csv.
    header_researcher_url=[
        "orcid_uri",
        "orcid_path",
        "@display-index",
        "common:created-date",
        "common:last-modified-date",
        "researcher-url:url-name",
        "researcher-url:url",
        "common:source-name",
    ]

    # Column names of orcid_distinctions.csv.
    header_distinctions=[
        "orcid_uri",
        "orcid_path",
        "@display-index",
        "common:created-date",
        "common:last-modified-date",
        "common:role-title",
        "common:start-date__common:year",
        "common:start-date__common:month",
        "common:start-date__common:day",
        "common:end-date__common:year",
        "common:end-date__common:month",
        "common:end-date__common:day",
        "common:organization__common:name",
        "common:organization__common:address__common:city",
        "common:organization__common:address__common:region",
        "common:organization__common:address__common:country",
        "common:organization__common:disambiguated-organization__common:disambiguated-organization-identifier",
        "common:organization__common:disambiguated-organization__common:disambiguation-source",
        "common:source-name",
    ]

    # Stylesheets are parsed once, relative to the current working directory.
    xslt_profile=lxml.etree.parse('xslt/profile.xsl')
    xslt_external_identifier=lxml.etree.parse('xslt/external-identifier.xsl')
    xslt_researcher_url=lxml.etree.parse('xslt/researcher-url.xsl')
    xslt_distinctions=lxml.etree.parse('xslt/distinctions.xsl')

    # Each file starts with its header row.
    op_writer.writerow(header_profile)
    oei_writer.writerow(header_identifiers)
    oru_writer.writerow(header_researcher_url)
    od_writer.writerow(header_distinctions)

    # islice(..., None) iterates everything, so one code path serves both the
    # full run and a limited trial run.
    documents = itertools.islice(readTar(file), profile_limit)
    # 12000000 is the hard-coded progress-bar total used for a full run
    # (presumably the approximate number of documents in the archive).
    expected_total = 12000000 if profile_limit is None else profile_limit

    for xml in tqdm(documents, total=expected_total):
        # Parse the member once and feed the same tree to all four stylesheets.
        tree = lxml.etree.fromstring(xml)
        writeRows(op_writer,tree,header_profile,xslt_profile)
        writeRows(oei_writer,tree,header_identifiers,xslt_external_identifier)
        writeRows(oru_writer,tree,header_researcher_url,xslt_researcher_url)
        writeRows(od_writer,tree,header_distinctions,xslt_distinctions)
        
            
    

#  34%|███▍      | 4136144/12000000 [2:23:59<4:08:51, 526.67it/s]