In [105]:
from sickle import Sickle
import pandas as pd
import xmltodict
from tqdm import tqdm
import os

JOURNAL_ID = 18
OAI_ENDPOINT = 'https://ojs.letras.up.pt/index.php/tm/oai'

In [55]:
# jats extraction
def extract_metadata(meta):
    result = {}

    # --- Article IDs ---
    for id_ in meta.get("article-id", []):
        if isinstance(id_, dict):
            id_type = id_.get("@pub-id-type")
            if id_type and "#text" in id_:
                result[id_type] = id_["#text"]

    # --- Categories ---
    for subj in meta.get("article-categories", {}).get("subj-group", []):
        if isinstance(subj, dict):
            lang = subj.get("@xml:lang", "unknown")
            result[f"category_{lang}"] = subj.get("subject")

    # --- Titles ---
    if "title-group" in meta:
        main_title = meta["title-group"].get("article-title", {})
        if isinstance(main_title, dict):
            lang = main_title.get("@xml:lang", "unknown")
            result[f"title_{lang}"] = main_title.get("#text", "")
        elif isinstance(main_title, str):
            result["title"] = main_title

        for trans in meta["title-group"].get("trans-title-group", []):
            if isinstance(trans, dict):
                lang = trans.get("@xml:lang", "unknown")
                result[f"title_{lang}"] = trans.get("trans-title", "")

    # --- Authors ---
    contribs = meta.get("contrib-group", {}).get("contrib", [])
    if isinstance(contribs, dict):  # pojedynczy autor
        contribs = [contribs]

    authors = []
    for contrib in contribs:
        if not isinstance(contrib, dict):
            continue
        name = contrib.get("name-alternatives", {}).get("name", {})
        if isinstance(name, dict):
            full_name = f"{name.get('given-names','')} {name.get('surname','')}".strip()
            authors.append(full_name)
        if "email" in contrib:
            result.setdefault("author_emails", []).append(contrib["email"])
        bio = contrib.get("bio")
        if isinstance(bio, dict):
            result[f"author_bio_{bio.get('@xml:lang','unknown')}"] = bio.get("#text", "")
        elif isinstance(bio, str):
            result["author_bio"] = bio

    if authors:
        result["authors"] = " | ".join(authors)

    # --- Affiliations ---
    affs = meta.get("aff", [])
    if isinstance(affs, dict):  # pojedyncza afiliacja
        affs = [affs]

    institutions = []
    for aff in affs:
        if isinstance(aff, dict):
            inst = aff.get("institution")
            if isinstance(inst, dict):
                institutions.append(inst.get("#text", ""))
            elif isinstance(inst, str):
                institutions.append(inst)
    if institutions:
        result["affiliations"] = "; ".join(institutions)

    # --- Publication Dates ---
    for pub in meta.get("pub-date", []):
        if not isinstance(pub, dict):
            continue
        if pub.get("@date-type") == "pub":
            result["pub_date"] = f"{pub.get('year','')}-{pub.get('month','')}-{pub.get('day','')}"
        elif pub.get("@date-type") == "collection":
            result["collection_year"] = pub.get("year")

    # --- Issue ---
    result["issue_id"] = meta.get("issue-id")
    issue = meta.get("issue", {})
    if isinstance(issue, dict):
        result["issue_number"] = issue.get("#text")
        result["issue_seq"] = issue.get("@seq")
    for issue_title in meta.get("issue-title", []):
        if isinstance(issue_title, dict):
            lang = issue_title.get("@xml:lang", "unknown")
            result[f"issue_title_{lang}"] = issue_title.get("#text")
        elif isinstance(issue_title, str):
            result.setdefault("issue_title", issue_title)

    # --- Pages ---
    result["fpage"] = meta.get("fpage")
    result["lpage"] = meta.get("lpage")

    # --- Permissions ---
    perms = meta.get("permissions", {})
    if isinstance(perms, dict):
        result["copyright_statement"] = perms.get("copyright-statement")
        result["copyright_year"] = perms.get("copyright-year")
        result["copyright_holder"] = perms.get("copyright-holder")
        license_ = perms.get("license", {})
        if isinstance(license_, dict):
            result["license_url"] = license_.get("@xlink:href")
            result["license_text"] = license_.get("license-p")

    # --- Links ---
    for i, uri in enumerate(meta.get("self-uri", []), start=1):
        if isinstance(uri, dict):
            result[f"self_uri_{i}"] = uri.get("@xlink:href")
        elif isinstance(uri, str):
            result[f"self_uri_{i}"] = uri

    # --- Abstracts ---
    abs_val = meta.get("abstract")
    if isinstance(abs_val, dict):
        result["abstract_es"] = abs_val.get("p", "")
    elif isinstance(abs_val, str):
        result["abstract"] = abs_val

    for trans in meta.get("trans-abstract", []):
        if isinstance(trans, dict):
            lang = trans.get("@xml:lang", "unknown")
            result[f"abstract_{lang}"] = trans.get("p") or trans.get("#text")
        elif isinstance(trans, str):
            result.setdefault("abstract_trans", []).append(trans)

    # --- Keywords ---
    for kwd_group in meta.get("kwd-group", []):
        if not isinstance(kwd_group, dict):
            continue
        lang = kwd_group.get("@xml:lang", "unknown")
        kwds = kwd_group.get("kwd")
        if isinstance(kwds, list):
            result[f"keywords_{lang}"] = " | ".join(kwds)
        elif isinstance(kwds, str):
            result[f"keywords_{lang}"] = kwds

    return result


In [103]:
# oai_dc metadata
sickle = Sickle(OAI_ENDPOINT, verify=False, headers={"User-Agent": "Mozilla/5.0 (Windows 11; Win64; x64) Python/3.13.5 (via VSCode; +https://www.python.org)"}) 
oai_sets = {s.setSpec:s.setName for s in sickle.ListSets()}
records = sickle.ListRecords(metadataPrefix='oai_dc', ignore_deleted=True)

output_records = []
for record in tqdm(records):
    try:
        meta = {k: ' | '.join(v) for k,v in record.metadata.items()}
    except: continue
    meta['oai_id'] = record.header.identifier
    meta['set_specs'] = ' | '.join(record.header.setSpecs)
    meta['set_names'] = ' | '.join([oai_sets.get(e) for e in meta['set_specs'].split(' | ')])
    output_records.append(meta)

df = pd.DataFrame(output_records)
df.to_excel(f'./metadata/journal_{JOURNAL_ID}_oai_dc.xlsx', index=False)

175it [00:02, 82.36it/s]


In [None]:
# oai_dc metadata
sickle = Sickle(OAI_ENDPOINT)
oai_sets = {s.setSpec:s.setName for s in sickle.ListSets()}
print(oai_sets)
records = sickle.ListRecords(metadataPrefix='jats', ignore_deleted=True, set='all')

output_records = []
for record in tqdm(records):
    meta = xmltodict.parse(record.raw)['record']['metadata']['article']['front']['article-meta']
    meta = extract_metadata(meta)
    meta['oai_id'] = record.header.identifier
    meta['set_specs'] = ' | '.join(record.header.setSpecs)
    meta['set_names'] = ' | '.join([oai_sets.get(e, '') for e in meta['set_specs'].split(' | ')])
    output_records.append(meta)
    break

df = pd.DataFrame(output_records)
df.to_excel(f'./metadata/journal_{JOURNAL_ID}_oai_jats.xlsx', index=False)

{'all': 'All articles', 'set:numer-51': 'Numer 51', 'set:numer-50-50': 'Numer 50 – 50!', 'set:numer-49-mikrojezyki': 'Numer 49 – Mikrojęzyki ', 'set:numer-48-czulosc-w-przekladzie': 'Numer 48 – Czułość w przekładzie', 'set:numer-47-biografie-tlumaczy': 'Numer 47 – Biografie tłumaczy', 'set:numer-46-przeklad-i-przemoc': 'Numer 46 – Przekład i przemoc', 'set:special-issue-2-2023-experimental-translation': 'Special Issue 2/2023 – Experimental Translation', 'set:special-issue-1-2023-translation-criticism-and-its-vicinity': 'Special Issue 1/2023 – Translation Criticism and Its Vicinity', 'set:numer-45': 'Numer 45', 'set:special-issue-2022-east-west-transactions': 'Special Issue 2022 – East-West. Transactions', 'set:special-issue-2022-translating-genre-literature': 'Special Issue 2022 – Translating Genre Literature', 'set:numer-44': 'Numer 44', 'set:numer-43-przeklad-eksperymentalny': 'Numer 43  – Przekład eksperymentalny', 'set:numer-42-krytyka-przekladu-i-okolice': 'Numer 42 – Krytyka prze

0it [00:00, ?it/s]

oai:/en/journal/przekladaniec/oai:article/29577
all | driver





In [108]:
counter = 0
for file in os.listdir('metadata'):
    df = pd.read_excel('./metadata/' + file)
    counter += len(df)
print('Number of records:', counter)

Number of records: 9433
