In [None]:
%pip install --upgrade avro==1.10.0 kafka-python
%pip install selenium

Collecting avro==1.10.0
  Downloading avro-1.10.0.tar.gz (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting kafka-python
  Downloading kafka_python-2.0.2-py2.py3-none-any.whl (246 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.5/246.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: avro
  Building wheel for avro (setup.py) ... [?25l[?25hdone
  Created wheel for avro: filename=avro-1.10.0-py3-none-any.whl size=96719 sha256=6b86205a408fb9f4717c40171c9add7bb894935a96c83ee3503e12cadbc22d4e
  Stored in directory: /root/.cache/pip/wheels/b5/45/02/59d22f799de5f011ddd515bf6e2e3b8c929ef56129008fe2b8
Successfully built avro
Installing collected packages: kafka-python, avro
Successfully installed avro-1.10.0 kafka-python-2.0.2
Collecting selenium
  Downloading selenium-4.20.0-py3-no

In [None]:
import hashlib
import io
import json
import os
import zipfile

from avro import schema
from avro.io import DatumWriter, BinaryEncoder
from kafka import KafkaConsumer, KafkaProducer

import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

import pprint  # Import pprint for pretty printing

# <font color="yellow">1. Read file</font>
* The given data is hard to read in its raw form, so loading it is the first step towards formatting it.
* We use the ```json``` library to handle the structure of the data.

### (Optional) Read file of specified year (not finished)

- Upload the zip archive for each year you want (e.g. `2018.zip`); the cell below unzips it and stores all of its JSON files in a folder named after that year.

In [None]:
!rm -r "2018"

rm: cannot remove '2018': No such file or directory


In [None]:
# Folder that holds the uploaded <year>.zip archives; the per-year output
# folders are created next to them.
base_path = './'

# One archive is expected per publication year, 2018 through 2023 inclusive
years = range(2018, 2024)
for year in years:
    year_dir = os.path.join(base_path, str(year))
    zip_path = os.path.join(base_path, f'{year}.zip')

    # The output folder is created whether or not the archive is present
    os.makedirs(year_dir, exist_ok=True)

    # Nothing to extract for this year: report it and move on
    if not os.path.exists(zip_path):
        print(f"No zip file found for {year} at {zip_path}")
        continue

    # Unpack the whole archive into the folder of its year
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(year_dir)
        print(f"All files from {zip_path} have been extracted to {year_dir}")

print("All applicable files have been extracted and sorted by year.")


All files from ./2018.zip have been extracted to ./2018
All files from ./2019.zip have been extracted to ./2019
All files from ./2020.zip have been extracted to ./2020
All files from ./2021.zip have been extracted to ./2021
All files from ./2022.zip have been extracted to ./2022
All files from ./2023.zip have been extracted to ./2023
All applicable files have been extracted and sorted by year.


In [None]:
# Load one sample record (the file has no extension but holds JSON).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; UTF-8 is the encoding JSON interchange uses.
with open("./2018/2018 copy/201800051", 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Pretty-printed text form of the record, kept in a variable (not printed)
# so that a slice of it can be inspected without flooding the output.
json_data_part = json.dumps(json_data, indent=4, ensure_ascii=False)

In [None]:
def print_dict(d, indent=0):
    """Print a nested dictionary as an indented tree.

    A key whose value is itself a dictionary is printed alone on its line and
    its content follows one level deeper (four spaces per level); any other
    value is printed on the same line as ``key: value``.

    Parameters
    ----------
    d : dict
        Dictionary to print.
    indent : int
        Current nesting depth; callers normally leave it at 0.
    """
    prefix = '    ' * indent
    for key, value in d.items():
        label = prefix + str(key)
        if not isinstance(value, dict):
            print(label + ': ' + str(value))
            continue
        # Nested dictionary: the key gets its own line, then recurse
        print(label)
        print_dict(value, indent + 1)

In [None]:
# Uncomment this to see a big pile of data that is un-formatted
#print_dict(json_data)

In [None]:
# Helper used throughout the notebook to show parsed structures
def jprint(data):
  """Print ``data`` as JSON with a 4-space indent.

  Non-ASCII characters are printed as themselves rather than as ``\\uXXXX``
  escapes, matching the ``ensure_ascii=False`` used when the record is loaded.

  Parameters
  ----------
  data : any JSON-serialisable object
  """
  print(json.dumps(data, indent=4, ensure_ascii=False))

# <font color='yellow'>2. Abstract Info</font>
* This is the outermost scope of our data.
* It contains 8 inner sections (listed by the cell below).
  * We will dive into the important ones,
  * which are ```item```, ```coredata```, ```idxterms```, ```authkeywords``` and ```subject-areas```.

In [None]:
# Everything in the record sits under this single top-level key
abstracts_info = json_data["abstracts-retrieval-response"]
# Print the name of each section available under it
for topic in abstracts_info:
  print(topic)

item
affiliation
coredata
idxterms
language
authkeywords
subject-areas
authors


In [None]:
# Accessing different parts of the abstract information
abstracts_info_item = abstracts_info["item"]
abstract_info_coredata = abstracts_info["coredata"]
abstract_info_idxterms = abstracts_info["idxterms"]
abstract_info_authkeywords = abstracts_info.get("authkeywords", [])  # Default to empty list if null
abstract_info_subject = abstracts_info.get("subject-areas", [])  # Default to empty list if null

## Initialize it accordingly

In [None]:
def process_abstract_data(abstract_info):
    """Group a record's affiliations by country and city and count them.

    Parameters
    ----------
    abstract_info : dict
        Mapping whose optional "affiliation" entry is a single affiliation
        dict, a list of them, null, or absent.

    Returns
    -------
    dict
        ``{country: {city: [{"affilname": ..., "href": ...}, ...]}}`` plus a
        "Statistics" entry holding the number of affiliations, of distinct
        names containing "University", of countries and of (country, city)
        pairs. On any error the message is printed and ``{}`` is returned.
    """
    try:
        affiliation_info = abstract_info.get("affiliation")
        if affiliation_info is None:
            # A missing key and an explicit null both mean "no affiliations"
            affiliation_info = []
        elif isinstance(affiliation_info, dict):
            affiliation_info = [affiliation_info]  # Normalize single dict to list
        elif not isinstance(affiliation_info, list):
            raise ValueError("affiliation data is neither a dictionary nor a list")

        affiliation_structure = {}
        num_affi = num_city = num_country = 0
        uni = []

        for affiliation in affiliation_info:
            # Skip entries that are not dictionaries
            if not isinstance(affiliation, dict):
                continue

            # `or` makes null values fall back to the placeholder too, so a
            # null city cannot become a `None` key and a null name cannot
            # break the "University" check below.
            country = affiliation.get("affiliation-country") or "Unknown Country"
            city = affiliation.get("affiliation-city") or "Unknown City"
            affi_name = affiliation.get("affilname") or ""

            if "University" in affi_name:
                uni.append(affi_name)

            affil_dict = {
                "affilname": affi_name,
                "href": affiliation.get("@href", "")
            }

            if country not in affiliation_structure:
                affiliation_structure[country] = {}
                num_country += 1

            if city not in affiliation_structure[country]:
                affiliation_structure[country][city] = []
                num_city += 1

            affiliation_structure[country][city].append(affil_dict)
            num_affi += 1

        affiliation_structure["Statistics"] = {
            "num_affiliations": num_affi,
            "num_university": len(set(uni)),  # distinct names only
            "num_country": num_country,
            "num_city": num_city
        }
        return affiliation_structure

    except Exception as e:
        # Best effort: report the problem and hand back an empty result
        print(f"An error occurred: {e}")
        return {}


### Old split

In [None]:
# The sections the rest of the notebook works with
abstracts_info_item = abstracts_info["item"]
abstract_info_coredata = abstracts_info["coredata"]
abstract_info_idxterms = abstracts_info["idxterms"]
# These two are optional in a record: fall back to an empty list when the key
# is missing or its value is null instead of raising KeyError.
abstract_info_authkeywords = abstracts_info.get("authkeywords") or []
abstract_info_subject = abstracts_info.get("subject-areas") or []


In [None]:
# Also keep the two secondary sections for inspection:
# the affiliation entries and the language descriptor of the record
affiliation_info = abstracts_info["affiliation"]
language = abstracts_info["language"]

In [None]:
# Inline grouping of affiliations by country and city.
# A single affiliation arrives as a dict, several as a list of dicts;
# wrap the single case so the loop below handles both.
affiliation_info = [affiliation_info] if isinstance(affiliation_info, dict) else affiliation_info

affiliation_structure = {}  # country -> city -> list of affiliation dicts
uni = []                    # names containing "University" (may repeat)
num_affi = 0

for affiliation in affiliation_info:
    affi_name = affiliation.get("affilname", "")
    country = affiliation.get("affiliation-country", "Unknown Country")
    city = affiliation.get("affiliation-city", "Unknown City")

    if "University" in affi_name:
      uni.append(affi_name)

    # Name and API link are all that is kept per affiliation
    affil_dict = {
        "affilname": affi_name,
        "href": affiliation.get("@href", "")
    }

    # Create the country and city levels on first sight, then file the entry
    affiliation_structure.setdefault(country, {}).setdefault(city, []).append(affil_dict)
    num_affi += 1

# Counted before "Statistics" is added, so only real countries/cities are included
num_country = len(affiliation_structure)
num_city = sum(len(cities) for cities in affiliation_structure.values())

affiliation_structure["Statistics"] = {
    "num_affiliations": num_affi,
    "num_university": len(set(uni)),
    "num_country": num_country,
    "num_city": num_city
}
# Show the grouped result
jprint(affiliation_structure)

{
    "Thailand": {
        "Bangkok": [
            {
                "affilname": "Chulalongkorn University",
                "href": "https://api.elsevier.com/content/affiliation/affiliation_id/60028190"
            }
        ]
    },
    "Statistics": {
        "num_affiliations": 1,
        "num_university": 1,
        "num_country": 1,
        "num_city": 1
    }
}


In [None]:
# Display the language descriptor read from the record
language

{'@xml:lang': 'eng'}

# <font color="yellow">3. Item</font>
* This is probably where most of the data is stored.
* After examining the data carefully, the Item data turns out to be structured as in this diagram:
```
item -> "ait:process-info"
       -> "xocs:meta"
       -> "bibrecord" -> "head"
                      -> "item-info"
                      -> "tail"
```

In [None]:
# Show the first of the two sections that sit next to "bibrecord" under "item"
jprint(abstracts_info_item['ait:process-info'])

# The second one may or may not be present in a record; uncomment to view it
# jprint(abstracts_info_item['xocs:meta'])

{
    "ait:status": {
        "@state": "update",
        "@type": "core",
        "@stage": "S300"
    },
    "ait:date-delivered": {
        "@day": "10",
        "@year": "2020",
        "@timestamp": "2020-02-10T17:30:52.000052-05:00",
        "@month": "02"
    },
    "ait:date-sort": {
        "@day": "30",
        "@year": "2018",
        "@month": "12"
    }
}


## 3.1. Look at the "bibrecord"
which will be most of the data in the "item" field

In [None]:
# The "bibrecord" section of "item"; the head / item-info / tail cells below read from it
bibrecord_data = abstracts_info_item["bibrecord"]

### 3.1.1. Starting with head
Extract the necessary fields.
```
-> "bibrecord" -> "head"   // looking at this one
               -> "item-info"
               -> "tail"
```

```
-> "head" -> "author-group"  
          -> "correspondence"   // might not have
          -> "citation-title"
          -> "abstracts"
          -> "citation-info"
          -> "source"
          -> "enhancement"
          -> "grantlist"       // not important
```

In [None]:
# Raw "head" section of the bibrecord
head_output_data = bibrecord_data['head']

In [None]:
# Print the name of each field present in the head section
for i in head_output_data:
  print(i)

author-group
citation-title
abstracts
citation-info
source
enhancement


In [None]:
def process_bibliographic_head_data(bibrecord_data):
    """Flatten the ``head`` section of a bibrecord into a plain summary dict.

    Parameters
    ----------
    bibrecord_data : dict
        The ``bibrecord`` mapping; its optional "head" entry is read.

    Returns
    -------
    dict
        Keys, in order: ``author_groups`` (one entry per author, each carrying
        the affiliation of the group it was listed in), ``correspondence``,
        ``enhancement``, ``citation_title``, ``abstracts`` and ``source``.

    Notes
    -----
    A field may arrive as a single dict, a list of dicts, null, or be absent.
    All of these are normalised here, so malformed or missing parts yield
    empty values instead of raising.
    """
    def as_list(value):
        # None -> [], list -> unchanged, anything else -> one-item list
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def as_dict(value):
        # Anything that is not a dict (null, string, list) counts as empty
        return value if isinstance(value, dict) else {}

    def organization_names(affiliation):
        # "organization" holds one {"$": name} dict or a list of them
        return [
            org.get("$", "")
            for org in as_list(as_dict(affiliation).get("organization"))
            if isinstance(org, dict)
        ]

    head = as_dict(bibrecord_data.get("head"))
    head_output_data = {
        "author_groups": [],
        "correspondence": [],
        "enhancement": [],
        "citation_title": head.get("citation-title", ""),
        "abstracts": head.get("abstracts", "")
    }

    # Author groups: every author inherits the affiliation of its group
    for group in as_list(head.get("author-group")):
        if not isinstance(group, dict):
            continue
        affi = as_dict(group.get("affiliation"))
        names = organization_names(affi)

        for person in as_list(group.get("author")):
            if not isinstance(person, dict):
                continue
            head_output_data["author_groups"].append({
                "indexed-name": as_dict(person.get("preferred-name")).get("ce:indexed-name", ""),
                "seq": person.get("@seq", ""),
                "auid": person.get("@auid", ""),
                "affiliation": {
                    "affiliation_id": affi.get("@afid", ""),
                    "dpt_id": affi.get("@dptid", ""),
                    "country": affi.get("country", ""),
                    "organization": names
                }
            })

    # Correspondence (optional). The organization is normalised the same way
    # as for author groups, so a single-dict organization is not lost.
    for item in as_list(head.get("correspondence")):
        if not isinstance(item, dict):
            continue
        person_info = as_dict(item.get("person"))
        head_output_data["correspondence"].append({
            "affiliation": {
                "organization": organization_names(item.get("affiliation"))
            },
            "person": {
                "given_name": person_info.get("ce:given-name", ""),
                "initials": person_info.get("ce:initials", ""),
                "surname": person_info.get("ce:surname", ""),
                "indexed_name": person_info.get("ce:indexed-name", "")
            }
        })

    # Enhancements: one entry per classification type
    classification_group = as_dict(as_dict(head.get("enhancement")).get("classificationgroup"))
    for classification in as_list(classification_group.get("classifications")):
        if not isinstance(classification, dict):
            continue
        # A classification value is a {"$": code} dict, a list of them, or a
        # bare string; dicts are unwrapped, anything else is kept as is.
        codes = [
            entry.get("$", entry) if isinstance(entry, dict) else entry
            for entry in as_list(classification.get("classification"))
        ]
        head_output_data["enhancement"].append({
            "type": classification.get("@type"),
            "classifications": codes
        })

    # Source information
    source_info = as_dict(head.get("source"))
    website_info = source_info.get("website")
    if isinstance(website_info, list):
        # Only the first entry is consulted; an empty list means no website
        website_info = website_info[0] if website_info else None
    website_address = as_dict(as_dict(website_info).get("ce:e-address")).get("$", "")

    head_output_data["source"] = {
        "website_address": website_address,
        "publication_date": source_info.get("publicationdate", ""),
        "publisher_name": as_dict(source_info.get("publisher")).get("publishername", "")
    }

    return head_output_data


#### Old Data Processing

In [None]:
# Inline (older) version of the author-group extraction.
# "author-group" is a dict for one group and a list for several.
author_groups = bibrecord_data["head"]["author-group"]
author_groups = [author_groups] if isinstance(author_groups, dict) else author_groups

# Fresh container for the processed head data
head_output_data = {"author_groups": []}

for author in author_groups:
    # Affiliation shared by every author of this group
    affi = author.get("affiliation", {})

    # "organization" is a list of {"$": name} dicts, or a single such dict
    org = affi.get("organization", [])
    if isinstance(org, list):
        organization_names = [entry["$"] for entry in org]
    elif isinstance(org, dict):
        organization_names = [org["$"]]
    else:
        organization_names = []

    affi_info = dict(
        affiliation_id=affi.get("@afid", ""),
        dpt_id=affi.get("@dptid", ""),
        country=affi.get("country", ""),
        organization=organization_names,
    )

    # A single author arrives as a dict rather than a list
    authors = author.get("author", [])
    authors = [authors] if isinstance(authors, dict) else authors

    # One output entry per author, each pointing at the group's affiliation
    head_output_data["author_groups"].extend(
        {
            "indexed-name": person.get("preferred-name", {}).get("ce:indexed-name", ""),
            "seq": person.get("@seq", ""),
            "auid": person.get("@auid", ""),
            "affiliation": affi_info,
        }
        for person in authors
    )

In [None]:
# "correspondence" is an optional part of the head; it is a dict for one
# entry and a list for several.
if "correspondence" in bibrecord_data["head"]:
    head_output_data["correspondence"] = []
    correspondence_data = bibrecord_data["head"]["correspondence"]
    if not isinstance(correspondence_data, list):
        correspondence_data = [correspondence_data]

    for index, item in enumerate(correspondence_data):
        if not isinstance(item, dict):
            print(f"Error: Item {index} in correspondence is not a dictionary, but {type(item)}")
            continue

        # Defaults have the same keys as the populated case, so every entry
        # has one schema. Dedicated names are used so the notebook-level
        # `affiliation_info` loaded earlier is not overwritten.
        corr_affiliation = {"organization": []}
        corr_person = {"given_name": "", "initials": "", "surname": "", "indexed_name": ""}

        affiliation = item.get("affiliation")
        if isinstance(affiliation, dict):
            # "organization" is a single {"$": name} dict or a list of them
            organization_data = affiliation.get("organization") or []
            if isinstance(organization_data, dict):
                organization_data = [organization_data]
            corr_affiliation = {
                "organization": [
                    org["$"] for org in organization_data
                    if isinstance(org, dict) and "$" in org
                ]
            }

        person = item.get("person")
        if isinstance(person, dict):
            corr_person = {
                "given_name": person.get("ce:given-name", ""),
                "initials": person.get("ce:initials", ""),
                "surname": person.get("ce:surname", ""),
                "indexed_name": person.get("ce:indexed-name", "")
            }

        head_output_data["correspondence"].append({
            "affiliation": corr_affiliation,
            "person": corr_person
        })
else:
    # Only reached when the key is absent, which is a normal situation
    print("Note: this record has no 'correspondence' entry")

Error: 'correspondence' not found or is not a list


In [None]:
# Keep three fields of the head's "source" section: the website address,
# the publication date and the publisher name.
source_section = bibrecord_data["head"]["source"]
head_output_data["source"] = dict(
    website_address=source_section["website"]["ce:e-address"]["$"],
    publication_date=source_section["publicationdate"],
    publisher_name=source_section["publisher"]["publishername"],
)

In [None]:
head_output_data["enhancement"] = []

# Proceed only when the head has an "enhancement" section containing a
# "classificationgroup".
head_section = bibrecord_data["head"]
if "classificationgroup" in head_section.get("enhancement", ()):
    classifications = head_section["enhancement"]["classificationgroup"].get("classifications", [])

    # A single classification arrives as a dict; wrap it for uniform handling
    classifications = [classifications] if isinstance(classifications, dict) else classifications

    for classification in classifications:
        # Only the subject-abbreviation classifications are kept
        if classification["@type"] != "SUBJABBR":
            continue

        # The value is a list, a single {"$": code} dict, or a plain string
        raw_value = classification.get("classification")
        if isinstance(raw_value, list):
            all_classifications = [
                entry.get("$", entry) if isinstance(entry, dict) else entry
                for entry in raw_value
            ]
        elif isinstance(raw_value, dict):
            all_classifications = [raw_value.get("$", raw_value)]
        else:
            all_classifications = [classification["classification"]]

        class_info = dict(type=classification["@type"], classifications=all_classifications)
        head_output_data["enhancement"].append(class_info)

In [None]:
# Copy the citation title and the abstracts from the head as they are
head_output_data["citation_title"] = bibrecord_data["head"]["citation-title"]
head_output_data["abstracts"] = bibrecord_data["head"]["abstracts"]

In [None]:
# Print the keys collected in the processed head data
for i in head_output_data:
  print(i)

author_groups
source
enhancement
citation_title
abstracts


#### Author Group

In [None]:
# Count the authors of a paper from the processed author entries
def num_author_group(head_data):
  """Return the highest author sequence number found in ``head_data``.

  Parameters
  ----------
  head_data : iterable of dict
      Author entries whose "seq" value is the author's position as a numeric
      string (or int).

  Returns
  -------
  int
      The largest "seq" value, or 0 when there is none. Entries whose "seq" is
      missing, empty or not numeric are ignored instead of raising; an empty
      "seq" is what the extraction step stores when the record has none.
  """
  num = 0
  for author in head_data:
    try:
      seq = int(author.get('seq', ''))
    except (TypeError, ValueError):
      continue
    if seq > num:
      num = seq

  return num

In [None]:
# Show the processed author entries, followed by the author count
author_group = head_output_data["author_groups"]
jprint(author_group)
print(f"Number of Author in this paper is: {num_author_group(author_group)}")

[
    {
        "indexed-name": "Karnkawinpong T.",
        "seq": "1",
        "auid": "57207728099",
        "affiliation": {
            "affiliation_id": "60028190",
            "dpt_id": "113891981",
            "country": "Thailand",
            "organization": [
                "Department of Computer Engineering",
                "Chulalongkorn University"
            ]
        }
    },
    {
        "indexed-name": "Limpiyakorn Y.",
        "seq": "2",
        "auid": "56032668700",
        "affiliation": {
            "affiliation_id": "60028190",
            "dpt_id": "113891981",
            "country": "Thailand",
            "organization": [
                "Department of Computer Engineering",
                "Chulalongkorn University"
            ]
        }
    }
]
Number of Author in this paper is: 2


#### Enhancement

In [None]:
# Show the classification entries collected under "enhancement"
classifications = head_output_data['enhancement']
jprint(classifications)

[
    {
        "type": "SUBJABBR",
        "classifications": [
            "BIOC"
        ]
    }
]


#### Correspondence

In [None]:
# The key is only present when correspondence data was collected, so check first
if 'correspondence' in head_output_data:
  jprint(head_output_data['correspondence'])

[
    {
        "affiliation": {
            "organization": [
                "Division of Medical Genetics and Metabolism",
                "Department of Pediatrics",
                "Faculty of Medicine",
                "Chulalongkorn University"
            ]
        },
        "person": {
            "given_name": "Kanya",
            "initials": "K.",
            "surname": "Suphapeetiporn",
            "indexed_name": "Suphapeetiporn K."
        }
    }
]


### 3.1.2. Look at the "item-info"
```
-> "bibrecord" -> "head"   
               -> "item-info" // looking at this one
               -> "tail"
```


In [None]:
def process_bibliography_item_info_data(bibrecord_data):
    """Summarise the ``item-info`` section of a bibrecord.

    Parameters
    ----------
    bibrecord_data : dict
        The ``bibrecord`` mapping; its optional "item-info" entry is read.

    Returns
    -------
    dict
        ``{"external_source": ..., "history": {...}}``. When "item-info" is
        missing or empty, the source is ``""`` and the history is ``{}``.
        Otherwise the history holds the "day", "timestamp", "year" and "month"
        attributes of the creation date, each ``""`` when absent.
    """
    item_info = bibrecord_data.get("item-info", {})
    if not item_info:
        # Nothing to read: hand back the empty skeleton
        return {"external_source": "", "history": {}}

    date_created = item_info.get("history", {}).get("date-created", {})
    return {
        "external_source": item_info.get("external-source", ""),
        # Each output field maps to the attribute of the same name ("@day", ...)
        "history": {
            field: date_created.get("@" + field, "")
            for field in ("day", "timestamp", "year", "month")
        },
    }


### old fn

In [None]:
# Skeleton returned as is when the record has no "item-info"
item_info_output = dict(external_source="", history={})

if "item-info" in bibrecord_data:
    item_info = bibrecord_data["item-info"]

    # The external source is optional
    item_info_output["external_source"] = item_info.get("external-source", "")

    # The creation date sits under history -> date-created; both are optional
    if "history" in item_info and "date-created" in item_info["history"]:
        history = item_info["history"]
        date_created = history["date-created"]
        item_info_output["history"] = {
            field: date_created.get("@" + field, "")
            for field in ("day", "timestamp", "year", "month")
        }

# Show the extracted item-info data
jprint(item_info_output)

{
    "external_source": "MEDLINE",
    "history": {
        "day": "25",
        "timestamp": "BST 16:36:48",
        "year": "2018",
        "month": "10"
    }
}


### 3.1.3 Look at the "tail"
```
-> "bibrecord" -> "head"   
               -> "item-info"
               -> "tail"  // looking at this one
```

In [None]:
def process_bibliography_tail_data(bibrecord_data):
    """Summarise the reference list stored in the ``tail`` of a bibrecord.

    Parameters
    ----------
    bibrecord_data : dict
        The ``bibrecord`` mapping; its optional "tail" entry is read.

    Returns
    -------
    dict
        ``{"refcount": ..., "references": [...]}``. With a missing or empty
        tail the count is ``""`` and the list is empty. Each reference holds
        its id, full text, source text, a ``ref_info`` dict (publication
        year, title, source title), the indexed names of its authors, their
        number, and its collaborations.
    """
    def listify(value):
        # Lists pass through; a truthy single value becomes a one-item list;
        # anything falsy becomes the empty list.
        if isinstance(value, list):
            return value
        return [value] if value else []

    tail_info = bibrecord_data.get("tail", {})
    if not tail_info:
        return {"refcount": "", "references": []}

    bibliography = tail_info.get("bibliography", {})
    parsed_references = []

    for ref in listify(bibliography.get("reference", [])):
        ref_info = ref.get("ref-info", {})
        ref_authors = ref_info.get("ref-authors", {})

        author_names = [
            author.get("ce:indexed-name", "")
            for author in listify(ref_authors.get("author", []))
        ]
        collab_entries = [
            {"collaboration_name": collab.get("ce:text", "")}
            for collab in listify(ref_authors.get("collaboration", []))
        ]

        parsed_references.append({
            "id": ref.get("@id", ""),
            "ref_fulltext": ref.get("ref-fulltext", ""),
            "ref_text": ref.get("ce:source-text", ""),
            "ref_info": {
                "ref_publicationyear": ref_info.get("ref-publicationyear", {}).get("@first", ""),
                "ref_title": ref_info.get("ref-title", {}).get("ref-titletext", "Title Not Available"),
                "ref_sourcetitle": ref_info.get("ref-sourcetitle", ""),
            },
            "ref_authors": author_names,
            "ref_authors_count": len(author_names),
            "ref_collab": collab_entries,
        })

    return {
        "refcount": bibliography.get("@refcount", "0"),
        "references": parsed_references,
    }


### old fn

In [None]:
# The "tail" section of the bibrecord; the next cell reads its "bibliography" entry
tail_info = bibrecord_data["tail"]

In [None]:
# Output container for the bibliography summary
# (`json` is already imported in the notebook's import cell, so the duplicate
# import that used to open this cell is gone.)
bibliography_output = {
    "refcount": "",
    "references": []
}

# The tail may be null or lack a bibliography; both leave the skeleton untouched
if isinstance(tail_info, dict) and "bibliography" in tail_info:
    bibliography = tail_info["bibliography"]
    bibliography_output["refcount"] = bibliography.get("@refcount", "0")  # Get refcount safely

    # A single reference arrives as a dict rather than a list
    references = bibliography.get("reference", [])
    if isinstance(references, dict):
        references = [references]
    elif not isinstance(references, list):
        references = []

    for ref in references:
        reference_info = {
            "id": ref.get("@id", ""),
            "ref_fulltext": ref.get("ref-fulltext", ""),
            "ref_text": ref.get("ce:source-text", ""),  # Some might not have this field
            "ref_info": {}
        }

        # Extract ref-info if available
        if "ref-info" in ref:
            ref_info = ref["ref-info"]
            reference_info["ref_info"] = {
                "ref_publicationyear": ref_info.get("ref-publicationyear", {}).get("@first", ""),
                "ref_title": ref_info.get("ref-title", {}).get("ref-titletext", "Title Not Available"),
                "ref_sourcetitle": ref_info.get("ref-sourcetitle", "")
            }

            # Authors and collaborations are both optional
            if "ref-authors" in ref_info:
                ref_authors = ref_info["ref-authors"]

                if "author" in ref_authors:
                    authors = ref_authors["author"]
                    if isinstance(authors, dict):  # single author
                        authors = [authors]
                    author_names = [author.get("ce:indexed-name", "") for author in authors]
                    reference_info["ref_authors_count"] = len(author_names)
                    reference_info["ref_authors"] = author_names

                if "collaboration" in ref_authors:
                    collaborations = ref_authors["collaboration"]
                    if not isinstance(collaborations, list):  # single collaboration
                        collaborations = [collaborations]
                    # Collaborations get their own list; they used to be
                    # appended to "ref_authors", which mixed them in with the
                    # author names and failed when a reference had no authors.
                    reference_info["ref_collab"] = [
                        {"collaboration_name": collab.get("ce:text", "")}
                        for collab in collaborations
                    ]

        bibliography_output["references"].append(reference_info)

# Show the extracted bibliography data
jprint(bibliography_output)

{
    "refcount": "20",
    "references": [
        {
            "id": "1",
            "ref_fulltext": "Brassier, A., Gobin, S., Arnoux, J.B., Valayannopoulos, V., Habarou, F., Kossorotoff, M., Servais, A., Barbier, V., Dubois, S., Touati, G., Barouki, R., Lesage, F., Dupic, L., Bonnefont, J.P., Ottolenghi, C., De Lonlay, P., Long-term outcomes in ornithine transcarbamylase deficiency: a series of 90 patients. Orphanet J. Rare Dis., 10, 2015, 58.",
            "ref_text": "",
            "ref_info": {
                "ref_publicationyear": "2015",
                "ref_title": "Long-term outcomes in ornithine transcarbamylase deficiency: a series of 90 patients",
                "ref_sourcetitle": "Orphanet J. Rare Dis."
            },
            "ref_authors_count": 16,
            "ref_authors": [
                "Brassier A.",
                "Gobin S.",
                "Arnoux J.B.",
                "Valayannopoulos V.",
                "Habarou F.",
                "Kossorotoff 

# <font color="yellow">4.Coredata

In [None]:
# The "coredata" section of the record, examined in this chapter
abstract_info_coredata = abstracts_info["coredata"]

## Integrate with web scrape

In [None]:
def get_citation_count(link):
    """Scrape the number shown in the page header of ``link``.

    The page is opened in a headless Firefox session and the first run of
    digits inside ``<span id="pageTitleHeader">`` is returned.

    Args:
        link: URL to open in the browser.

    Returns:
        str: the first number found in the header span, or ``"0"`` when the
        span is missing or holds no digits.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(link)
        # NOTE(review): an implicit wait only applies to element lookups, so it
        # presumably does not delay reading page_source; if the header renders
        # late an explicit wait may be needed — confirm.
        driver.implicitly_wait(5)
        redirected_html = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    soup = BeautifulSoup(redirected_html, 'html.parser')
    page_title_header = soup.find('span', id='pageTitleHeader')
    if not page_title_header:
        return "0"

    text_inside_span = page_title_header.get_text(strip=True)
    numbers = re.findall(r'\d+', text_inside_span)
    return numbers[0] if numbers else "0"

def process_coredata_info(abstract_info_coredata, citation_rel=None):
    """Flatten the ``coredata`` section of a record into a plain dictionary.

    Args:
        abstract_info_coredata: Mapping with the coredata fields
            (``srctype``, ``dc:title``, ``dc:creator``, ``link``, ...).
        citation_rel: Optional ``@rel`` value of the link whose page shows the
            citation count. With the default ``None`` no page is scraped and
            ``citation_count`` stays ``"0"``. This replaces a hard-coded
            placeholder string that looked like it could never match a real
            link, so the default keeps the previous effective behaviour.
            NOTE(review): for Scopus records the value is presumably
            ``"scopus-citedby"`` — confirm against the data.

    Returns:
        dict: the scalar coredata fields plus ``dc_creator`` (list of author
        dicts), ``links`` (list of ``{"rel", "href"}``) and ``citation_count``.
    """
    def _as_list(value):
        # A lone element may be stored as a bare dict instead of a one-item list.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    coredata_info_output = {
        "srctype": abstract_info_coredata.get("srctype", ""),
        "dc_description": abstract_info_coredata.get("dc:description", ""),
        "prism_aggregationType": abstract_info_coredata.get("prism:aggregationType", ""),
        "prism_url": abstract_info_coredata.get("prism:url", ""),
        "dc_title": abstract_info_coredata.get("dc:title", ""),
        "article_number": abstract_info_coredata.get("article-number", ""),
        "prism_publicationName": abstract_info_coredata.get("prism:publicationName", "Publication Name Not Available"),
        "dc_identifier": abstract_info_coredata.get("dc:identifier", ""),
        "dc_publisher": abstract_info_coredata.get("dc:publisher", ""),
        "dc_creator": [],
        "links": [],
        "citation_count": "0"
    }

    # "dc:creator" may be absent or null; only a mapping can carry authors.
    creator = abstract_info_coredata.get("dc:creator")
    authors = creator.get("author") if isinstance(creator, dict) else None
    for author in _as_list(authors):
        if not isinstance(author, dict):
            continue
        coredata_info_output["dc_creator"].append({
            "preferred_name": author.get("preferred-name", {}),
            "auid": author.get("@auid", ""),
            "author_url": author.get("author-url", "")
        })

    for link in _as_list(abstract_info_coredata.get("link")):
        if not isinstance(link, dict):
            continue
        rel = link.get("@rel", "")
        href = link.get("@href", "")
        coredata_info_output["links"].append({"rel": rel, "href": href})
        # Scrape only when the caller asked for it and there is a URL to open.
        if citation_rel is not None and href and rel == citation_rel:
            coredata_info_output["citation_count"] = get_citation_count(href)

    return coredata_info_output

## the fn

In [None]:
def process_coredata_info(abstract_info_coredata):
    """Flatten the ``coredata`` section of a record into a plain dictionary.

    Args:
        abstract_info_coredata: Mapping with the coredata fields
            (``srctype``, ``dc:title``, ``dc:creator``, ``link``, ...).

    Returns:
        dict: the scalar coredata fields plus ``dc_creator`` (list of author
        dicts) and ``links`` (list of ``{"rel", "href"}`` dicts).
    """
    def _as_list(value):
        # A lone element may be stored as a bare dict instead of a one-item list.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    # Scalar fields are copied as-is, falling back to a default when missing
    coredata_info_output = {
        "srctype": abstract_info_coredata.get("srctype", ""),
        "dc_description": abstract_info_coredata.get("dc:description", ""),
        "prism_aggregationType": abstract_info_coredata.get("prism:aggregationType", ""),
        "prism_url": abstract_info_coredata.get("prism:url", ""),
        "dc_title": abstract_info_coredata.get("dc:title", ""),
        "article_number": abstract_info_coredata.get("article-number", ""),
        "prism_publicationName": abstract_info_coredata.get("prism:publicationName", "Publication Name Not Available"),
        "dc_identifier": abstract_info_coredata.get("dc:identifier", ""),
        "dc_publisher": abstract_info_coredata.get("dc:publisher", ""),
        "dc_creator": [],
        "links": []
    }

    # Extract creator details; "dc:creator" may be absent or null
    creator = abstract_info_coredata.get("dc:creator")
    authors = creator.get("author") if isinstance(creator, dict) else None
    for author in _as_list(authors):
        if not isinstance(author, dict):
            continue
        coredata_info_output["dc_creator"].append({
            "preferred_name": author.get("preferred-name", {}),
            "auid": author.get("@auid", ""),
            "author_url": author.get("author-url", "")
        })

    # Extract links
    for link in _as_list(abstract_info_coredata.get("link")):
        if not isinstance(link, dict):
            continue
        coredata_info_output["links"].append({
            "rel": link.get("@rel", ""),
            "href": link.get("@href", "")
        })

    return coredata_info_output

In [None]:
# Run the extractor on the current record's coredata and pretty-print the result.
coredata_info = process_coredata_info(abstract_info_coredata)
jprint(coredata_info)

{
    "srctype": "j",
    "dc_description": "Ornithine transcarbamylase deficiency (OTCD) is an X-linked urea cycle disorder affecting both males and females. Hemizygous males commonly present with severe hyperammonemic encephalopathy during the neonatal period. Heterozygous females have great phenotypic variability. The majority of female patients can manifest later in life or have unrecognized symptoms, making the diagnosis of OTCD in females very challenging. Here we report on three unrelated Thai female cases with OTCD presenting with different manifestations including aggressive behavior, acute liver failure and severe encephalopathy. Whole exome sequencing successfully identified disease-causing mutations in all three cases including two novel ones: the c.209_210delAA (p.Lys70Argfs*17) and the c.850T>A (p.Tyr284Asn). This study affirms variable symptoms in female patients with OTCD and emphasizes the importance of early recognition and prompt management for favorable outcomes. In

In [None]:
import re
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup

# URL of the third extracted link.
# NOTE(review): presumably the "cited by" page; relies on link position — confirm.
l = coredata_info['links'][2]['href']

def get_citation_count(links):
    """Scrape the number shown in the page header of a single URL.

    The page is opened in a headless Firefox session and the first run of
    digits inside ``<span id="pageTitleHeader">`` is returned.

    Args:
        links: URL to open (a single link, despite the plural name).

    Returns:
        str: the first number found in the header span, or ``"0"`` when the
        span is missing or holds no digits. The other definitions of this
        function in the notebook use the same ``"0"`` fallback.
    """
    # Set up Firefox options
    options = Options()
    options.add_argument('--headless')

    # Initialize WebDriver
    driver = webdriver.Firefox(options=options)
    try:
        # Load the URL (following any redirect)
        driver.get(links)

        # NOTE(review): an implicit wait only applies to element lookups, so it
        # presumably does not delay reading page_source; if the header renders
        # late an explicit wait may be needed — confirm.
        driver.implicitly_wait(10)

        # Get the HTML content of the redirected URL
        redirected_html = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    # Parse the HTML content with Beautiful Soup
    soup = BeautifulSoup(redirected_html, 'html.parser')

    # Find the span tag with id "pageTitleHeader"
    page_title_header = soup.find('span', id='pageTitleHeader')
    if not page_title_header:
        print("Span tag with id 'pageTitleHeader' not found.")
        return "0"

    # Use a regular expression to pull the numbers out of the span text
    text_inside_span = page_title_header.get_text(strip=True)
    numbers = re.findall(r'\d+', text_inside_span)
    if not numbers:
        print("No numbers found in the text.")
        return "0"
    return numbers[0]


# Scrape the count from the link selected above and show it.
c = get_citation_count(l)
print('citation count:', c)

citation count: 14


# <font color="yellow">5. Authkeywords & SubjectAreas</font>
* **authkeywords**: keywords selected by the authors of the research paper. They are typically chosen to highlight the core topics, methods, and areas of focus of the research.
* **subject_areas**: the fields of study that the paper is associated with.

In [None]:
def process_auth_subject_data(abstract_info_authkeywords, abstract_info_subject):
    """Extract author keywords and subject areas from a record.

    Args:
        abstract_info_authkeywords: Container that may hold an
            ``"author-keyword"`` entry (one dict or a list of dicts, each with
            the keyword text under ``"$"``), or ``None``.
        abstract_info_subject: Container that may hold a ``"subject-area"``
            entry (one dict or a list of dicts with ``"$"``, ``"@code"`` and
            ``"@abbrev"``), or ``None``.

    Returns:
        tuple: ``({"author_keyword": [...]}, {"subject_area": [...]})``; each
        list is empty when the corresponding entry is missing.
    """
    keyword_texts = []
    has_keywords = (
        abstract_info_authkeywords is not None
        and "author-keyword" in abstract_info_authkeywords
    )
    if has_keywords:
        raw_keywords = abstract_info_authkeywords["author-keyword"]
        # A lone keyword arrives as a dict; treat it as a one-item list
        if isinstance(raw_keywords, dict):
            raw_keywords = [raw_keywords]
        for entry in raw_keywords:
            keyword_texts.append(entry["$"])

    area_records = []
    has_areas = (
        abstract_info_subject is not None
        and "subject-area" in abstract_info_subject
    )
    if has_areas:
        raw_areas = abstract_info_subject["subject-area"]
        # Same single-dict normalisation as for the keywords
        if isinstance(raw_areas, dict):
            raw_areas = [raw_areas]
        for entry in raw_areas:
            area_records.append({
                "name": entry["$"],
                "code": entry["@code"],
                "abbrev": entry["@abbrev"]
            })

    return {"author_keyword": keyword_texts}, {"subject_area": area_records}


### old fn

In [None]:
# Remember whether this record has no author keywords at all (value is None)
keywordsNull = abstract_info_authkeywords is None

In [None]:
# Older inline version of the keyword / subject-area extraction.
# Unlike process_auth_subject_data it does not wrap a single dict in a list,
# so "author-keyword" and "subject-area" must already be lists of dicts here.
if not keywordsNull:
  # Extracting data for authkeywords (keyword text is stored under "$")
  authkeywords_output = {
      "author_keyword": [keyword["$"] for keyword in abstract_info_authkeywords["author-keyword"]]
  }
  jprint(authkeywords_output)
else:
  print("AuthKeywords is null")
# Extracting data for subject-areas (runs whether or not keywords exist)
subject_areas_output = {
    "subject_area": [
        {
            "name": area["$"],
            "code": area["@code"],
            "abbrev": area["@abbrev"]
        } for area in abstract_info_subject["subject-area"]
    ]
}

# Print the extracted data
print("Subject Areas:", json.dumps(subject_areas_output, indent=4))

{
    "author_keyword": [
        "Female",
        "Hyperammonemia",
        "Novel mutations",
        "Ornithine transcarbamylase deficiency",
        "OTC"
    ]
}
Subject Areas: {
    "subject_area": [
        {
            "name": "Genetics",
            "code": "1311",
            "abbrev": "BIOC"
        }
    ]
}


# <font color="yellow">6. IdxTerms</font>
* **weight**: This attribute is typically used to indicate the importance or relevance of a term within the context of the document (a > b > c)
* **candidate**: This attribute usually indicates whether a term is being considered for a particular role or function within the document or system. Put simply: is the term important for our analysis? (`n` = no, `y` = yes)

In [None]:
# Remember whether this record has no index terms at all (value is None)
idxNull = abstract_info_idxterms is None

In [None]:
# Show the index terms when present. idxNull comes from the previous cell,
# so that cell must be run first or this one fails with a NameError.
if not idxNull:
  jprint(abstract_info_idxterms)

NameError: name 'idxNull' is not defined

# Process all data (in functions way)

## Test dic

In [None]:
# Download the small "<year>_test.zip" archives, one per year.
years = range(2018, 2024)  # Years from 2018 to 2023
for year in years:
    # Construct the URL
    url = f"https://github.com/nnatchy/DSDE_Project/raw/main/{year}_test.zip"
    # Download and save as "<year>.zip" (-O sets the output name), i.e. the
    # same file name the "Real dic" cell writes to
    !wget {url} -O {year}.zip

--2024-05-05 19:06:06--  https://github.com/nnatchy/DSDE_Project/raw/main/2018_test.zip
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/nnatchy/DSDE_Project/main/2018_test.zip [following]
--2024-05-05 19:06:06--  https://raw.githubusercontent.com/nnatchy/DSDE_Project/main/2018_test.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6913340 (6.6M) [application/zip]
Saving to: ‘2018.zip’


2024-05-05 19:06:07 (72.0 MB/s) - ‘2018.zip’ saved [6913340/6913340]

--2024-05-05 19:06:07--  https://github.com/nnatchy/DSDE_Project/raw/main/2019_test.zip
Resolving github.com (github.com)... 140.82.113.3
Co

## Real dic

In [None]:
# Download the "<year>.zip" archives, one per year.
years = range(2018, 2024)  # Years from 2018 to 2023
for year in years:
    # Construct the URL
    url = f"https://github.com/nnatchy/DSDE_Project/raw/main/{year}.zip"
    # Saved as "<year>.zip", replacing any file of that name from the test cell
    !wget {url} -O {year}.zip

## Applying

In [None]:
def process_abstract_data(abstract_info):
    """Group a record's affiliations by country and city and count them.

    Args:
        abstract_info: Mapping whose ``"affiliation"`` entry is one
            affiliation dict or a list of them. A bare list of affiliation
            dicts is accepted as well.

    Returns:
        dict: ``{country: {city: [{"affilname", "href"}, ...]}}`` plus a
        ``"Statistics"`` entry with ``num_affiliations``, ``num_university``,
        ``num_country`` and ``num_city``. Returns ``{}`` (after printing the
        error) when the input cannot be processed.
    """
    try:
        if isinstance(abstract_info, list):
            # Caller handed over the affiliation list itself.
            affiliation_info = abstract_info
        else:
            affiliation_info = abstract_info.get("affiliation", [])

        # Ensure affiliation_info is always a list
        if affiliation_info is None:
            affiliation_info = []  # a null entry means "no affiliations"
        elif isinstance(affiliation_info, dict):
            affiliation_info = [affiliation_info]  # Normalize single dict to list
        elif not isinstance(affiliation_info, list):
            raise ValueError("affiliation data is neither a dictionary nor a list")

        affiliation_structure = {}
        num_affi = num_city = num_country = 0
        uni = []

        for affiliation in affiliation_info:
            # Skip anything that is not an affiliation dictionary
            if not isinstance(affiliation, dict):
                continue

            # `or` also covers keys that are present but null/empty, which a
            # .get() default would let through.
            country = affiliation.get("affiliation-country") or "Unknown Country"
            city = affiliation.get("affiliation-city") or "Unknown City"
            affi_name = affiliation.get("affilname") or ""

            if "University" in affi_name:
                uni.append(affi_name)

            affil_dict = {
                "affilname": affi_name,
                "href": affiliation.get("@href", "")
            }

            if country not in affiliation_structure:
                affiliation_structure[country] = {}
                num_country += 1

            if city not in affiliation_structure[country]:
                affiliation_structure[country][city] = []
                num_city += 1

            affiliation_structure[country][city].append(affil_dict)
            num_affi += 1

        affiliation_structure["Statistics"] = {
            "num_affiliations": num_affi,
            "num_university": len(set(uni)),  # distinct names containing "University"
            "num_country": num_country,
            "num_city": num_city
        }
        return affiliation_structure

    except Exception as e:
        # Best effort: one malformed record should not stop a whole batch.
        print(f"An error occurred: {e}")
        return {}


In [None]:
def process_bibliographic_head_data(bibrecord_data):
    """Extract authors, correspondence, classifications and source info.

    Reads the ``"head"`` section of a bibrecord. Every nested element may be
    missing, null, a single dict or a list of dicts; all of these are handled.

    Args:
        bibrecord_data: Mapping that may contain a ``"head"`` entry.

    Returns:
        dict with keys ``author_groups`` (one entry per author, carrying the
        affiliation of its group), ``correspondence``, ``enhancement``
        (classification groups), ``citation_title``, ``abstracts`` and
        ``source`` (``website_address``, ``publication_date``,
        ``publisher_name``).
    """
    def _as_list(value):
        # A lone element may be stored bare instead of in a one-item list.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def _texts(entries):
        # Collect the "$" text of every dict in a dict-or-list value.
        return [entry.get("$", "") for entry in _as_list(entries) if isinstance(entry, dict)]

    head = bibrecord_data.get("head") or {}
    head_output_data = {
        "author_groups": [],
        "correspondence": [],
        "enhancement": [],
        "citation_title": head.get("citation-title", ""),
        "abstracts": head.get("abstracts", "")
    }

    # Process author groups: every author inherits the group's affiliation
    for group in _as_list(head.get("author-group")):
        if not isinstance(group, dict):
            continue
        affi = group.get("affiliation") or {}
        organization_names = _texts(affi.get("organization"))

        for person in _as_list(group.get("author")):
            if not isinstance(person, dict):
                continue
            author_info = {
                "indexed-name": (person.get("preferred-name") or {}).get("ce:indexed-name", ""),
                "seq": person.get("@seq", ""),
                "auid": person.get("@auid", ""),
                "affiliation": {
                    "affiliation_id": affi.get("@afid", ""),
                    "dpt_id": affi.get("@dptid", ""),
                    "country": affi.get("country", ""),
                    "organization": organization_names
                }
            }
            head_output_data["author_groups"].append(author_info)

    # Process correspondence
    for item in _as_list(head.get("correspondence")):
        if not isinstance(item, dict):
            continue
        person_info = item.get("person") or {}
        correspondence_info = {
            "affiliation": {
                # _texts also keeps a single organization stored as one dict
                "organization": _texts((item.get("affiliation") or {}).get("organization"))
            },
            "person": {
                "given_name": person_info.get("ce:given-name", ""),
                "initials": person_info.get("ce:initials", ""),
                "surname": person_info.get("ce:surname", ""),
                "indexed_name": person_info.get("ce:indexed-name", "")
            }
        }
        head_output_data["correspondence"].append(correspondence_info)

    # Enhancements
    enhancements = head.get("enhancement") or {}
    classification_group = enhancements.get("classificationgroup") or {}
    for classification in _as_list(classification_group.get("classifications")):
        if not isinstance(classification, dict):
            continue
        class_content = _as_list(classification.get("classification"))
        class_info = {
            "type": classification.get("@type"),
            # Values are either {"$": text} dicts or plain strings; keep both.
            "classifications": [
                item.get("$", item) if isinstance(item, dict) else item
                for item in class_content
                if isinstance(item, (dict, str))
            ]
        }
        head_output_data["enhancement"].append(class_info)

    # Source information
    source_info = head.get("source") or {}
    website_info = source_info.get("website")
    if isinstance(website_info, list):
        # Use the first entry; an empty list simply means "no website"
        website_info = website_info[0] if website_info else None
    website_address = ""
    if isinstance(website_info, dict):
        website_address = (website_info.get("ce:e-address") or {}).get("$", "")

    head_output_data["source"] = {
        "website_address": website_address,
        "publication_date": source_info.get("publicationdate", ""),
        "publisher_name": (source_info.get("publisher") or {}).get("publishername", "")
    }

    return head_output_data

def process_bibliography_item_info_data(bibrecord_data):
    """Extract the external source and creation date from ``"item-info"``.

    Args:
        bibrecord_data: Mapping that may contain an ``"item-info"`` entry.

    Returns:
        dict: ``{"external_source": ..., "history": {...}}``. ``history``
        holds ``day``, ``timestamp``, ``year`` and ``month`` taken from
        ``history -> date-created``; it is an empty dict when ``"item-info"``
        is missing or empty.
    """
    item_info = bibrecord_data.get("item-info", {})
    if not item_info:
        # Nothing to read: hand back the empty skeleton
        return {"external_source": "", "history": {}}

    created = item_info.get("history", {}).get("date-created", {})
    history_fields = ("day", "timestamp", "year", "month")
    return {
        "external_source": item_info.get("external-source", ""),
        # Source attributes carry an "@" prefix that the output keys drop
        "history": {field: created.get(f"@{field}", "") for field in history_fields},
    }

def process_bibliography_tail_data(bibrecord_data):
    """Extract the reference list from the ``"tail"`` section of a bibrecord.

    Args:
        bibrecord_data: Mapping that may contain a ``"tail"`` entry with a
            ``"bibliography"`` inside it.

    Returns:
        dict: ``{"refcount": ..., "references": [...]}``. Each reference has
        ``id``, ``ref_fulltext``, ``ref_text``, ``ref_info`` (publication
        year, title, source title), ``ref_authors`` (indexed names),
        ``ref_authors_count`` and ``ref_collab``. When ``"tail"`` is missing
        or empty, ``refcount`` is ``""`` and ``references`` is empty.
    """
    def _listify(value):
        # One element may be stored bare; empty values become an empty list
        if isinstance(value, list):
            return value
        return [value] if value else []

    tail_info = bibrecord_data.get("tail", {})
    if not tail_info:
        return {"refcount": "", "references": []}

    bibliography = tail_info.get("bibliography", {})
    extracted = []
    for ref in _listify(bibliography.get("reference", [])):
        ref_info = ref.get("ref-info", {})
        ref_authors = ref_info.get("ref-authors", {})

        # Authors are reduced to their indexed names
        author_names = [
            person.get("ce:indexed-name", "")
            for person in _listify(ref_authors.get("author", []))
        ]
        # Collaborations keep only their display text
        collab_entries = [
            {"collaboration_name": group.get("ce:text", "")}
            for group in _listify(ref_authors.get("collaboration", []))
        ]

        extracted.append({
            "id": ref.get("@id", ""),
            "ref_fulltext": ref.get("ref-fulltext", ""),
            "ref_text": ref.get("ce:source-text", ""),
            "ref_info": {
                "ref_publicationyear": ref_info.get("ref-publicationyear", {}).get("@first", ""),
                "ref_title": ref_info.get("ref-title", {}).get("ref-titletext", "Title Not Available"),
                "ref_sourcetitle": ref_info.get("ref-sourcetitle", "")
            },
            "ref_authors": author_names,
            "ref_authors_count": len(author_names),
            "ref_collab": collab_entries
        })

    return {
        "refcount": bibliography.get("@refcount", "0"),
        "references": extracted
    }


In [None]:
def get_citation_count(link):
    """Scrape the number shown in the page header of ``link``.

    The page is opened in a headless Firefox session and the first run of
    digits inside ``<span id="pageTitleHeader">`` is returned.

    Args:
        link: URL to open in the browser.

    Returns:
        str: the first number found in the header span, or ``"0"`` when the
        span is missing or holds no digits.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)
    try:
        driver.get(link)
        # NOTE(review): an implicit wait only applies to element lookups, so it
        # presumably does not delay reading page_source; if the header renders
        # late an explicit wait may be needed — confirm.
        driver.implicitly_wait(5)
        redirected_html = driver.page_source
    finally:
        # Always release the browser, even when loading the page fails.
        driver.quit()

    soup = BeautifulSoup(redirected_html, 'html.parser')
    page_title_header = soup.find('span', id='pageTitleHeader')
    if not page_title_header:
        return "0"

    text_inside_span = page_title_header.get_text(strip=True)
    numbers = re.findall(r'\d+', text_inside_span)
    return numbers[0] if numbers else "0"

def process_coredata_info(abstract_info_coredata, citation_rel=None):
    """Flatten the ``coredata`` section of a record into a plain dictionary.

    Args:
        abstract_info_coredata: Mapping with the coredata fields
            (``srctype``, ``dc:title``, ``dc:creator``, ``link``, ...).
        citation_rel: Optional ``@rel`` value of the link whose page shows the
            citation count. With the default ``None`` no page is scraped and
            ``citation_count`` stays ``"0"``. This replaces a hard-coded
            placeholder string that looked like it could never match a real
            link, so the default keeps the previous effective behaviour.
            NOTE(review): for Scopus records the value is presumably
            ``"scopus-citedby"`` — confirm against the data.

    Returns:
        dict: the scalar coredata fields plus ``dc_creator`` (list of author
        dicts), ``links`` (list of ``{"rel", "href"}``) and ``citation_count``.
    """
    def _as_list(value):
        # A lone element may be stored as a bare dict instead of a one-item list.
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    coredata_info_output = {
        "srctype": abstract_info_coredata.get("srctype", ""),
        "dc_description": abstract_info_coredata.get("dc:description", ""),
        "prism_aggregationType": abstract_info_coredata.get("prism:aggregationType", ""),
        "prism_url": abstract_info_coredata.get("prism:url", ""),
        "dc_title": abstract_info_coredata.get("dc:title", ""),
        "article_number": abstract_info_coredata.get("article-number", ""),
        "prism_publicationName": abstract_info_coredata.get("prism:publicationName", "Publication Name Not Available"),
        "dc_identifier": abstract_info_coredata.get("dc:identifier", ""),
        "dc_publisher": abstract_info_coredata.get("dc:publisher", ""),
        "dc_creator": [],
        "links": [],
        "citation_count": "0"
    }

    # "dc:creator" may be absent or null; only a mapping can carry authors.
    creator = abstract_info_coredata.get("dc:creator")
    authors = creator.get("author") if isinstance(creator, dict) else None
    for author in _as_list(authors):
        if not isinstance(author, dict):
            continue
        coredata_info_output["dc_creator"].append({
            "preferred_name": author.get("preferred-name", {}),
            "auid": author.get("@auid", ""),
            "author_url": author.get("author-url", "")
        })

    for link in _as_list(abstract_info_coredata.get("link")):
        if not isinstance(link, dict):
            continue
        rel = link.get("@rel", "")
        href = link.get("@href", "")
        coredata_info_output["links"].append({"rel": rel, "href": href})
        # Scrape only when the caller asked for it and there is a URL to open.
        if citation_rel is not None and href and rel == citation_rel:
            coredata_info_output["citation_count"] = get_citation_count(href)

    return coredata_info_output

In [None]:
def process_auth_subject_data(abstract_info_authkeywords, abstract_info_subject):
    """Extract author keywords and subject areas from a record.

    Args:
        abstract_info_authkeywords: Container that may hold an
            ``"author-keyword"`` entry (one dict or a list of dicts, each with
            the keyword text under ``"$"``), or ``None``.
        abstract_info_subject: Container that may hold a ``"subject-area"``
            entry (one dict or a list of dicts with ``"$"``, ``"@code"`` and
            ``"@abbrev"``), or ``None``.

    Returns:
        tuple: ``({"author_keyword": [...]}, {"subject_area": [...]})``; each
        list is empty when the corresponding entry is missing.
    """
    keyword_texts = []
    has_keywords = (
        abstract_info_authkeywords is not None
        and "author-keyword" in abstract_info_authkeywords
    )
    if has_keywords:
        raw_keywords = abstract_info_authkeywords["author-keyword"]
        # A lone keyword arrives as a dict; treat it as a one-item list
        if isinstance(raw_keywords, dict):
            raw_keywords = [raw_keywords]
        for entry in raw_keywords:
            keyword_texts.append(entry["$"])

    area_records = []
    has_areas = (
        abstract_info_subject is not None
        and "subject-area" in abstract_info_subject
    )
    if has_areas:
        raw_areas = abstract_info_subject["subject-area"]
        # Same single-dict normalisation as for the keywords
        if isinstance(raw_areas, dict):
            raw_areas = [raw_areas]
        for entry in raw_areas:
            area_records.append({
                "name": entry["$"],
                "code": entry["@code"],
                "abbrev": entry["@abbrev"]
            })

    return {"author_keyword": keyword_texts}, {"subject_area": area_records}


In [None]:
import os
import json
import zipfile

def process_files(year_range, base_path):
    """Unpack each year's archive and summarise every JSON record in it.

    For every year, ``<base_path>/<year>.zip`` (when present) is extracted
    into ``<base_path>/<year>/``. Each sub-folder of that directory is then
    scanned and every file inside it is parsed as one JSON record.

    Args:
        year_range: Iterable of years to process.
        base_path: Directory holding the ``<year>.zip`` archives; the per-year
            folders are created inside it.

    Returns:
        list: one list per year, in ``year_range`` order. Each element is a
        dict with the keys ``affiliationData``, ``bibliographicData``,
        ``itemInfoData``, ``bibliographyTailData``, ``coreDataInfo``,
        ``authKeywordsInfo`` and ``subjectAreasInfo``.
    """
    batch_years = []
    for year in year_range:
        year_directory = f"{base_path}/{year}/"

        # Ensure the year directory exists and create if not
        os.makedirs(year_directory, exist_ok=True)

        # Check for a zip file in the base path and unzip it to the year directory
        zip_file_path = os.path.join(base_path, f'{year}.zip')
        if os.path.exists(zip_file_path):
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                zip_ref.extractall(year_directory)
                print(f"Extracted {zip_file_path} to {year_directory}")

        # Process JSON files in the unzipped directory
        batch_data = []
        print("cur year:", year_directory)
        # sorted() makes the record order reproducible between runs
        for filename in sorted(os.listdir(year_directory)):
            if filename == "__MACOSX":
                continue
            file_copy = os.path.join(year_directory, filename)
            # Stray files (e.g. .DS_Store) next to the data folders are not listable
            if not os.path.isdir(file_copy):
                continue
            print("cur cop:", file_copy)
            for file_elem in sorted(os.listdir(file_copy)):
                if file_elem == '.DS_Store':
                    continue
                file_path = os.path.join(file_copy, file_elem)
                if not os.path.isfile(file_path):
                    continue
                print("cur path:", file_path)
                with open(file_path, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)

                # Assuming 'abstracts-retrieval-response' is the key where the data starts
                abstracts_info = json_data.get("abstracts-retrieval-response", {})

                # Missing or null sections fall back to empty containers so one
                # incomplete record does not abort the whole batch.
                bibrecord_data = (abstracts_info.get("item") or {}).get("bibrecord") or {}
                abstract_info_coredata = abstracts_info.get("coredata") or {}
                abstract_info_authkeywords = abstracts_info.get("authkeywords", [])
                abstract_info_subject = abstracts_info.get("subject-areas", [])

                # process_abstract_data looks up "affiliation" itself, so it
                # needs the whole response rather than the affiliation value.
                affiliation_data = process_abstract_data(abstracts_info)
                bibliographic_data = process_bibliographic_head_data(bibrecord_data)
                item_info_data = process_bibliography_item_info_data(bibrecord_data)
                bibliography_tail_data = process_bibliography_tail_data(bibrecord_data)
                core_data_info = process_coredata_info(abstract_info_coredata)
                authkeywords_output, subject_areas_output = process_auth_subject_data(
                    abstract_info_authkeywords, abstract_info_subject
                )

                batch_data.append({
                    "affiliationData": affiliation_data,
                    "bibliographicData": bibliographic_data,
                    "itemInfoData": item_info_data,
                    "bibliographyTailData": bibliography_tail_data,
                    "coreDataInfo": core_data_info,
                    "authKeywordsInfo": authkeywords_output,
                    "subjectAreasInfo": subject_areas_output,
                })

        batch_years.append(batch_data)

    return batch_years

# Example to call the function
by = process_files(range(2018, 2024), ".")
pprint.pprint(by[0])  # Use pprint to print the final data structure nicely