# Literate source code of helper functions to update publications page

## Read local data

In [1]:
import os
import pandas as pd
import json

In [2]:
%ls ~/GitHub/nicomarr.github.io/_data # Contains the actual data files
%ls ~/GitHub/nicomarr.github.io/_py/test_data # For testing and development

PMID-export.txt        articles-metadata.csv  update-log.json
PMID-export.txt             articles-metadata_copy.csv
PMID-export_copy.txt        update-log.json
articles-metadata.csv       update-log_copy.json


In [3]:
# Set path to the test data directory
data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_py/test_data") # uncomment for local testing

In [4]:
# Set path to the data files
pmid_list_path = os.path.join(data_dir, "PMID-export.txt")
print(pmid_list_path)
articles_metadata_path = os.path.join(data_dir, "articles-metadata.csv")
print(articles_metadata_path)
update_log_path = os.path.join(data_dir, "update-log.json")
print(update_log_path)

/Users/user2/GitHub/nicomarr.github.io/_py/test_data/PMID-export.txt
/Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata.csv
/Users/user2/GitHub/nicomarr.github.io/_py/test_data/update-log.json


In [5]:
with open(pmid_list_path) as f:
    pmids = f.read().splitlines()
print(f"No. of PMIDs: {len(pmids)}\n{pmids}")

No. of PMIDs: 73
['38776920', '38701783', '38557723', '38422122', '38363432', '38175961', '38157855', '38048195', '37875108', '37779520', '37448622', '37083451', '37047709', '36763636', '36515678', '36736301', '36326697', '36342405', '36425144', '36094518', '36003377', '35670811', '35091979', '35090163', '34427831', '34623332', '34183838', '34413140', '34137790', '34214472', '34183371', '33876776', '33529170', '33497357', '33510449', '32960813', '33296702', '32972995', '32163377', '31995689', '31784499', '31270247', '31346092', '31046570', '30578925', '31231515', '31559014', '30578352', '30143481', '29907691', '29537367', '28437470', '28069966', '27347375', '26720836', '25819983', '24949794', '24332264', '24603545', '24391215', '24119913', '23543769', '23467413', '23487427', '22535679', '21695123', '21050116', '20176798', '18582518', '18424515', '17230419', '39048830', '38776920']


In [6]:
df_articles_metadata = pd.read_csv(articles_metadata_path, dtype=str)
df_articles_metadata["publication_date"] = pd.to_datetime(df_articles_metadata["publication_date"])
df_articles_metadata = df_articles_metadata.sort_values(by="publication_date", ascending=False)
df_articles_metadata

Unnamed: 0,first_author_last_name,article_title,journal,publication_year,publication_date,pmid,pmcid,oaid,pdf_url,doi_url,cited_by_count,cited_by_ui_url,type,type_crossref,updated_date
0,Chan,Human TMEFF1 is a restriction factor for herpe...,Nature,2024,2024-07-24,39048830,,https://openalex.org/W4400948832,not available,https://doi.org/10.1038/s41586-024-07745-x,0,https://openalex.org/works?filter=cites:W44009...,article,journal-article,2024-08-01T05:43:34.150760
1,Momenilandi,FLT3L governs the development of partially ove...,Cell,2024,2024-05-01,38701783,,https://openalex.org/W4396620188,http://www.cell.com/article/S0092867424004045/pdf,https://doi.org/10.1016/j.cell.2024.04.009,0,https://openalex.org/works?filter=cites:W43966...,article,journal-article,2024-08-01T05:41:23.662223
2,Guérin,Helper T cell immunity in humans with inherite...,The Journal of Experimental Medicine,2024,2024-04-01,38557723,,https://openalex.org/W4393386550,https://rupress.org/jem/article-pdf/221/5/e202...,https://doi.org/10.1084/jem.20231044,2,https://openalex.org/works?filter=cites:W43933...,article,journal-article,2024-08-05T00:29:35.879343
3,Materna,The immunopathological landscape of human pre-...,Science,2024,2024-03-01,38422122,,https://openalex.org/W4392282847,not available,https://doi.org/10.1126/science.adh4059,1,https://openalex.org/works?filter=cites:W43922...,article,journal-article,2024-08-01T12:58:02.773428
4,Rosain,Recombinant IFN-γ1b Treatment in a Patient wit...,Journal of Clinical Immunology,2024,2024-02-16,38363432,,https://openalex.org/W4391883684,https://link.springer.com/content/pdf/10.1007/...,https://doi.org/10.1007/s10875-024-01661-5,0,https://openalex.org/works?filter=cites:W43918...,article,journal-article,2024-08-05T02:10:08.539190
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,Marr,Variability in the Lipooligosaccharide Structu...,The Journal of Infectious Diseases,2010,2010-12-15,21050116,,https://openalex.org/W2114791730,https://academic.oup.com/jid/article-pdf/202/1...,https://doi.org/10.1086/657409,27,https://openalex.org/works?filter=cites:W21147...,article,journal-article,2024-08-02T14:33:23.728804
66,Marr,Substitution of the <i>Bordetella pertussis</i...,Infection and Immunity,2010,2010-05-01,20176798,2863497,https://openalex.org/W2126122440,https://iai.asm.org/content/iai/78/5/2060.full...,https://doi.org/10.1128/iai.01346-09,43,https://openalex.org/works?filter=cites:W21261...,article,journal-article,2024-08-01T03:57:55.659991
67,Marr,Protective activity of the Bordetella pertussi...,Vaccine,2008,2008-08-01,18582518,,https://openalex.org/W2044914152,not available,https://doi.org/10.1016/j.vaccine.2008.06.017,64,https://openalex.org/works?filter=cites:W20449...,article,journal-article,2024-08-06T08:15:01.441660
68,Marr,Glucosamine Found as a Substituent of Both Pho...,Journal of Bacteriology,2008,2008-06-15,18424515,2446747,https://openalex.org/W2153119855,https://www.ncbi.nlm.nih.gov/pmc/articles/2446...,https://doi.org/10.1128/jb.01875-07,58,https://openalex.org/works?filter=cites:W21531...,article,journal-article,2024-08-02T23:24:47.866990


In [7]:
with open(update_log_path) as f:
    update_log = json.load(f)
update_log

{'last_modified': '2024-08-06'}

## Make call to the OpenAlex API using the `requests` library

In [8]:
import requests
from pprint import pprint

id = "34427831" # pmid from the list; for testing
base_url = "https://api.openalex.org/works/"
params = {
    "mailto": os.environ["EMAIL"],
    "select": "id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
}
url = f"{base_url}pmid:{id}"

verbose = True  # Define verbose or pass it as a parameter

try:
    response = requests.get(url, params=params)
    response.raise_for_status()  # Raises an HTTPError for bad responses
    work = response.json()
except requests.RequestException as e:
    if verbose:
        print(f"An error occurred while making an API call with UID {id}: {e}")
    work = None
except json.JSONDecodeError as e:
    if verbose:
        print(f"Failed to decode JSON response for UID {id}: {e}")
        print(f"Response content: {response.text}")
    work = None

if work is None:
    print(f"Status code: {response.status_code}")
    print("Response headers:")
    pprint(dict(response.headers), indent=2)
    print("Response content:")
    try:
        pprint(response.json(), indent=2)
    except json.JSONDecodeError:
        print(response.text)
else:
    pprint(work, indent=2)

{ 'authorships': [ { 'affiliations': [ { 'institution_ids': [],
                                         'raw_affiliation_string': 'Research '
                                                                   'Branch, '
                                                                   'Sidra '
                                                                   'Medicine, '
                                                                   'Doha, '
                                                                   'Qatar'}],
                     'author': { 'display_name': 'Andrea Guennoun',
                                 'id': 'https://openalex.org/A5023942449',
                                 'orcid': 'https://orcid.org/0000-0002-4303-8653'},
                     'author_position': 'first',
                     'countries': ['QA'],
                     'institutions': [],
                     'is_corresponding': False,
                     'raw_affiliation_strings': [ 'Research Br

## Use `openalex_api_utils` to get metadata from the OpenAlex API

In [9]:
import sys
sys.path.append(os.path.expanduser("../utils"))
from openalex_api_utils import get_works

#### Get metadata for a single entry


In [10]:
works, failed_calls = get_works(
    ids=["34427831"], 
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)

Retrieving works: 100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


#### Print relevant metadata


In [11]:
try:
    print(works[0]["metadata"]["title"])
    print(works[0]["metadata"]["authorships"][0]["author"]["display_name"])
    print(works[0]["metadata"]["publication_date"])
    print(works[0]["metadata"]["publication_year"])
    print(works[0]["metadata"]["doi"])
    print(works[0]["metadata"]["ids"].get("pmid").split("/")[-1]) # PMID
    print(works[0]["metadata"]["ids"].get("pmcid").split("/")[-1]) # PMCID
    print(works[0]["metadata"]["id"]) # OpenAlex ID
    print(works[0]["metadata"].get("best_oa_location").get("pdf_url")) # may be None
    print(works[0]["metadata"]["cited_by_count"])
    print(works[0]["metadata"]["cited_by_api_url"])
    print(works[0]["metadata"].get("type"))
    print(works[0]["metadata"].get("type_crossref"))
    print(works[0]["metadata"]["updated_date"])
except KeyError as e:
    print(f"KeyError: {e}")

A Novel STK4 Mutation Impairs T Cell Immunity Through Dysregulation of Cytokine-Induced Adhesion and Chemotaxis Genes
Andrea Guennoun
2021-08-24
2021
https://doi.org/10.1007/s10875-021-01115-2
34427831
8604862
https://openalex.org/W3194117818
https://link.springer.com/content/pdf/10.1007/s10875-021-01115-2.pdf
6
https://api.openalex.org/works?filter=cites:W3194117818
article
journal-article
2024-09-13T16:04:21.884242


#### Get metadata from multiple entries and iterate over a list of works to extract relevant information

In [12]:
works, failed_calls = get_works( 
    ids=pmids,
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)


Retrieving works: 100%|██████████| 73/73 [00:33<00:00,  2.17it/s]


In [13]:
len(works), len(failed_calls)

(73, 0)

In [14]:
# Extract the relevant data from the API response
oa_data = []
for work in works:
    metadata = work["metadata"]
    first_author_last_name = metadata["authorships"][0]["author"]["display_name"].split(" ")[-1]
    article_title = metadata["title"]
    journal = metadata["primary_location"]["source"]["display_name"]
    publication_year = str(metadata["publication_year"])
    publication_date = metadata["publication_date"]
    pmid = metadata["ids"].get("pmid", "").split("/")[-1] # remove the url prefix
    pmcid = metadata["ids"].get("pmcid")
    if pmcid is not None:
        pmcid = pmcid.split("/")[-1] # remove the url prefix
    else: 
        pmcid = "NaN"
    oaid = metadata["id"]
    try:
        pdf_url = metadata.get("best_oa_location", {}).get("pdf_url", "not available")
    except AttributeError:
        pdf_url = "not available"
    if pdf_url is None:
        pdf_url = "not available"
    doi_url = metadata["doi"]
    cited_by_count = str(metadata["cited_by_count"])
    cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
    type = metadata.get("type")
    type_crossref = metadata.get("type_crossref")
    updated_date = metadata.get("updated_date")

    # Append the extracted data to the list
    oa_data.append([
        first_author_last_name, article_title, journal, publication_year,
        publication_date, pmid, pmcid, oaid, pdf_url,
        doi_url, cited_by_count, cited_by_ui_url, type, type_crossref, updated_date
    ])
print(f"{len(oa_data)} entries:")
print(json.dumps(oa_data, indent=2))

73 entries:
[
  [
    "Neehus",
    "Human inherited CCR2 deficiency underlies progressive polycystic lung disease",
    "Cell",
    "2024",
    "2024-06-01",
    "38776920",
    "NaN",
    "https://openalex.org/W4398170248",
    "not available",
    "https://doi.org/10.1016/j.cell.2024.05.021",
    "0",
    "https://openalex.org/works?filter=cites:W4398170248",
    "erratum",
    "journal-article",
    "2024-09-09T19:41:48.100693"
  ],
  [
    "Momenilandi",
    "FLT3L governs the development of partially overlapping hematopoietic lineages in humans and mice",
    "Cell",
    "2024",
    "2024-05-01",
    "38701783",
    "NaN",
    "https://openalex.org/W4396620188",
    "http://www.cell.com/article/S0092867424004045/pdf",
    "https://doi.org/10.1016/j.cell.2024.04.009",
    "2",
    "https://openalex.org/works?filter=cites:W4396620188",
    "article",
    "journal-article",
    "2024-09-12T13:50:59.480262"
  ],
  [
    "Gu\u00e9rin",
    "Helper T cell immunity in humans with inheri

In [15]:
# Create a DataFrame with the specified columns
columns = [
    'first_author_last_name', 'article_title', 'journal',
    'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid',
    'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type',
    'type_crossref', 'updated_date'
]

df_works = pd.DataFrame(oa_data, columns=columns)
df_works = df_works.drop_duplicates(subset=["pmid"])
df_works = df_works[df_works["type"] != "erratum"]
df_works.head()

Unnamed: 0,first_author_last_name,article_title,journal,publication_year,publication_date,pmid,pmcid,oaid,pdf_url,doi_url,cited_by_count,cited_by_ui_url,type,type_crossref,updated_date
1,Momenilandi,FLT3L governs the development of partially ove...,Cell,2024,2024-05-01,38701783,,https://openalex.org/W4396620188,http://www.cell.com/article/S0092867424004045/pdf,https://doi.org/10.1016/j.cell.2024.04.009,2,https://openalex.org/works?filter=cites:W43966...,article,journal-article,2024-09-12T13:50:59.480262
2,Guérin,Helper T cell immunity in humans with inherite...,The Journal of Experimental Medicine,2024,2024-04-01,38557723,,https://openalex.org/W4393386550,https://rupress.org/jem/article-pdf/221/5/e202...,https://doi.org/10.1084/jem.20231044,2,https://openalex.org/works?filter=cites:W43933...,article,journal-article,2024-08-31T09:37:24.431170
3,Materna,The immunopathological landscape of human pre-...,Science,2024,2024-03-01,38422122,,https://openalex.org/W4392282847,not available,https://doi.org/10.1126/science.adh4059,2,https://openalex.org/works?filter=cites:W43922...,article,journal-article,2024-09-12T13:20:05.823215
4,Rosain,Recombinant IFN-γ1b Treatment in a Patient wit...,Journal of Clinical Immunology,2024,2024-02-16,38363432,,https://openalex.org/W4391883684,https://link.springer.com/content/pdf/10.1007/...,https://doi.org/10.1007/s10875-024-01661-5,1,https://openalex.org/works?filter=cites:W43918...,article,journal-article,2024-09-07T09:38:30.231251
5,Bastard,Higher COVID-19 pneumonia risk associated with...,The Journal of Experimental Medicine,2024,2024-01-04,38175961,,https://openalex.org/W4390576813,https://rupress.org/jem/article-pdf/221/2/e202...,https://doi.org/10.1084/jem.20231353,16,https://openalex.org/works?filter=cites:W43905...,article,journal-article,2024-09-15T11:59:54.875091


## Define a function that parses metadata and returns the relevant information

In [16]:
from typing import List, Dict, Any

def parse_data(works: List[Dict[str, Any]], exclude_errata = True) -> pd.DataFrame:
    """
    Parse the raw data from the OpenAlex API and create a DataFrame.

    This function extracts relevant information from each work in the input list
    and creates a DataFrame with specified columns. It also removes duplicates
    based on PMID and filters out errata (if specified).

    Args:
        works (List[Dict[str, Any]]): A list of dictionaries, where each dictionary
            contains metadata about a work.
        exclude_errata (bool): Whether to exclude errata from the DataFrame.

    Returns:
        pd.DataFrame: A DataFrame containing extracted and processed information
        from the works.

    Example:
        >>> df_works = create_works_dataframe(works)
        >>> df_works.head()

    Note:
        The function extracts the following information for each work:
        - First author's last name
        - Article title
        - Journal name
        - Publication year and date
        - PMID, PMCID, and OpenAlex ID
        - PDF URL (if available)
        - DOI URL
        - Citation count and URL
        - Work type and Crossref type
        - Updated date (from the API)
    """
    oa_data = []
    for work in works:
        metadata = work["metadata"]
        first_author_last_name = metadata["authorships"][0]["author"]["display_name"].split(" ")[-1]
        article_title = metadata["title"]
        journal = metadata["primary_location"]["source"]["display_name"]
        publication_year = str(metadata["publication_year"])
        publication_date = metadata["publication_date"]
        pmid = metadata["ids"].get("pmid", "").split("/")[-1] # remove the url prefix
        pmcid = metadata["ids"].get("pmcid")
        if pmcid is not None:
            pmcid = pmcid.split("/")[-1] # remove the url prefix
        else: 
            pmcid = "NaN"
        oaid = metadata["id"]
        try:
            pdf_url = metadata.get("best_oa_location", {}).get("pdf_url", "not available")
        except AttributeError:
            pdf_url = "not available"
        if pdf_url is None:
            pdf_url = "not available"
        doi_url = metadata["doi"]
        cited_by_count = str(metadata["cited_by_count"])
        cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
        work_type = metadata.get("type")
        type_crossref = metadata.get("type_crossref")
        updated_date = metadata.get("updated_date")

        oa_data.append([
            first_author_last_name, article_title, journal, publication_year,
            publication_date, pmid, pmcid, oaid, pdf_url, doi_url,
            cited_by_count, cited_by_ui_url, work_type, type_crossref, updated_date
        ])

    columns = [
        'first_author_last_name', 'article_title', 'journal',
        'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid',
        'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type',
        'type_crossref', 'updated_date'
    ]

    df_works = pd.DataFrame(oa_data, columns=columns, dtype=str)
    df_works = df_works.drop_duplicates(subset=["pmid"])
    if exclude_errata:
        df_works = df_works[df_works["type"] != "erratum"]

    df_works["publication_date"] = pd.to_datetime(df_works["publication_date"], format='%Y-%m-%d', errors='coerce')
    df_works = df_works.sort_values(by="publication_date", ascending=False)
    df_works.reset_index(drop=True, inplace=True)

    return df_works

In [17]:
df_works = parse_data(works)
df_works.head()

Unnamed: 0,first_author_last_name,article_title,journal,publication_year,publication_date,pmid,pmcid,oaid,pdf_url,doi_url,cited_by_count,cited_by_ui_url,type,type_crossref,updated_date
0,Chan,Human TMEFF1 is a restriction factor for herpe...,Nature,2024,2024-07-24,39048830,,https://openalex.org/W4400948832,not available,https://doi.org/10.1038/s41586-024-07745-x,1,https://openalex.org/works?filter=cites:W44009...,article,journal-article,2024-09-07T20:39:15.475471
1,Momenilandi,FLT3L governs the development of partially ove...,Cell,2024,2024-05-01,38701783,,https://openalex.org/W4396620188,http://www.cell.com/article/S0092867424004045/pdf,https://doi.org/10.1016/j.cell.2024.04.009,2,https://openalex.org/works?filter=cites:W43966...,article,journal-article,2024-09-12T13:50:59.480262
2,Guérin,Helper T cell immunity in humans with inherite...,The Journal of Experimental Medicine,2024,2024-04-01,38557723,,https://openalex.org/W4393386550,https://rupress.org/jem/article-pdf/221/5/e202...,https://doi.org/10.1084/jem.20231044,2,https://openalex.org/works?filter=cites:W43933...,article,journal-article,2024-08-31T09:37:24.431170
3,Materna,The immunopathological landscape of human pre-...,Science,2024,2024-03-01,38422122,,https://openalex.org/W4392282847,not available,https://doi.org/10.1126/science.adh4059,2,https://openalex.org/works?filter=cites:W43922...,article,journal-article,2024-09-12T13:20:05.823215
4,Rosain,Recombinant IFN-γ1b Treatment in a Patient wit...,Journal of Clinical Immunology,2024,2024-02-16,38363432,,https://openalex.org/W4391883684,https://link.springer.com/content/pdf/10.1007/...,https://doi.org/10.1007/s10875-024-01661-5,1,https://openalex.org/works?filter=cites:W43918...,article,journal-article,2024-09-07T09:38:30.231251


## Compare local data with external data and find mismatches

In [18]:
# Compare the shapes of the DataFrames
print("df_works shape:", df_works.shape)
print("df_articles_metadata shape:", df_articles_metadata.shape)
assert df_works.shape == df_articles_metadata.shape, "DataFrames have different shapes."

df_works shape: (70, 15)
df_articles_metadata shape: (70, 15)


In [19]:
# Compare the column names:
print("df_works columns:", df_works.columns.tolist())
print("articles_metadata columns:", df_articles_metadata.columns.tolist())
assert df_works.columns.tolist() == df_articles_metadata.columns.tolist(), "Column names are different."

df_works columns: ['first_author_last_name', 'article_title', 'journal', 'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid', 'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type', 'type_crossref', 'updated_date']
articles_metadata columns: ['first_author_last_name', 'article_title', 'journal', 'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid', 'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type', 'type_crossref', 'updated_date']


In [20]:
# Check for common columns:
common_columns = set(df_works.columns) & set(df_articles_metadata.columns)
print("Common columns:", common_columns)
print("No. of common columns:", len(common_columns))

# Check for columns only in one of the DataFrames:
works_columns = set(df_works.columns)
articles_metadata_columns = set(df_articles_metadata.columns)
print("Columns only in df_works:", works_columns - articles_metadata_columns)
print("Columns only in articles_metadata:", articles_metadata_columns - works_columns)

Common columns: {'publication_year', 'doi_url', 'publication_date', 'pmid', 'pdf_url', 'cited_by_ui_url', 'updated_date', 'oaid', 'first_author_last_name', 'cited_by_count', 'type_crossref', 'article_title', 'pmcid', 'type', 'journal'}
No. of common columns: 15
Columns only in df_works: set()
Columns only in articles_metadata: set()


In [21]:
# Compare the datatypes:
common_columns = list(common_columns)
df_works_dtypes = df_works[common_columns].dtypes
articles_metadata_dtypes = df_articles_metadata[common_columns].dtypes
dtypes_comparison = df_works_dtypes == articles_metadata_dtypes

print("Columns with mismatched datatypes:")
for column in common_columns:
    if df_works_dtypes[column] == articles_metadata_dtypes[column]:
        continue
    print(f"{column}:")
    print(f"  df_works: {df_works_dtypes[column]}")
    print(f"  articles_metadata: {articles_metadata_dtypes[column]}")
    print(f"  Match: {dtypes_comparison[column]}")
    print()

# leave pmid dtypes as int for consistency

Columns with mismatched datatypes:


In [22]:
# Compare the "pmid" values:
df_works_pmids = set(df_works["pmid"])
articles_metadata_pmids = set(df_articles_metadata["pmid"].astype(str))
common_pmids = df_works_pmids & articles_metadata_pmids

print("Number of common PMIDs:", len(common_pmids))
print("PMIDs only in df_works:", len(df_works_pmids - articles_metadata_pmids))
print("PMIDs only in articles_metadata:", len(articles_metadata_pmids - df_works_pmids))

Number of common PMIDs: 70
PMIDs only in df_works: 0
PMIDs only in articles_metadata: 0


In [23]:
# Compare the values for article_title in both DataFrames
series1 = df_works["article_title"].reset_index(drop=True)
series2 = df_articles_metadata["article_title"].reset_index(drop=True)
title_mismatch = series1.compare(series2)
title_mismatch

Unnamed: 0,self,other
17,DETECTION OF ANTINUCLEAR ANTIBODIES TARGETING ...,Detection of Antinuclear Antibodies Targeting ...
68,Glucosamine Found as a Substituent of Both Pho...,Glucosamine Found as a Substituent of Both Pho...
69,<i>Bordetella pertussis</i>Binds Human C1 Este...,<i>Bordetella pertussis</i> Binds Human C1 Est...


In [24]:
# Compare the values for pdf_url in both DataFrames
series1 = df_works["pdf_url"].reset_index(drop=True)
series2 = df_articles_metadata["pdf_url"].reset_index(drop=True)
pdf_url_mismatch = series1.compare(series2)
pdf_url_mismatch

Unnamed: 0,self,other
28,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/8446...
29,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/8217...
49,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/5915...
52,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/4916...
61,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/3636...
62,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/3656...
68,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/2446...


In [25]:
# Compare the values for the doi_url in both DataFrames
series1 = df_works["doi_url"].reset_index(drop=True)
series2 = df_articles_metadata["doi_url"].reset_index(drop=True)
pdf_url_mismatch = series1.compare(series2)
pdf_url_mismatch

Unnamed: 0,self,other


In [26]:
# Compare all columns
result = df_works.reset_index(drop=True).compare(df_articles_metadata.reset_index(drop=True))
result

Unnamed: 0_level_0,first_author_last_name,first_author_last_name,article_title,article_title,pmcid,pmcid,pdf_url,pdf_url,cited_by_count,cited_by_count,updated_date,updated_date
Unnamed: 0_level_1,self,other,self,other,self,other,self,other,self,other,self,other
0,,,,,,,,,1,0,2024-09-07T20:39:15.475471,2024-08-01T05:43:34.150760
1,,,,,,,,,2,0,2024-09-12T13:50:59.480262,2024-08-01T05:41:23.662223
2,,,,,,,,,,,2024-08-31T09:37:24.431170,2024-08-05T00:29:35.879343
3,,,,,,,,,2,1,2024-09-12T13:20:05.823215,2024-08-01T12:58:02.773428
4,,,,,,,,,1,0,2024-09-07T09:38:30.231251,2024-08-05T02:10:08.539190
...,...,...,...,...,...,...,...,...,...,...,...,...
65,,,,,,,,,29,27,2024-09-16T10:17:28.168230,2024-08-02T14:33:23.728804
66,,,,,,,,,46,43,2024-08-31T23:36:48.582534,2024-08-01T03:57:55.659991
67,,,,,,,,,66,64,2024-09-09T11:37:16.017456,2024-08-06T08:15:01.441660
68,,,Glucosamine Found as a Substituent of Both Pho...,Glucosamine Found as a Substituent of Both Pho...,,,not available,https://www.ncbi.nlm.nih.gov/pmc/articles/2446...,61,58,2024-09-16T03:10:59.488199,2024-08-02T23:24:47.866990


## Define a function to add metadata of new articles to the local/existing data

In [27]:
pmids.append("39198650") # add a new PMID for testing

In [28]:
# find IDs in the PMID list missing from the local metadata
new_pmids = set(pmids) - set(df_articles_metadata["pmid"].astype(str))
new_pmids

{'36342405', '38776920', '39198650'}

In [29]:
# make API calls to get the missing PMIDs
exclude_errata=True
new_works, failed_calls = get_works(
    ids=list(new_pmids),
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)

df_new_works = parse_data(new_works, exclude_errata=exclude_errata)
if len(df_new_works) == 0:
    print("No new articles found.")
    print(f"Failed calls: {len(failed_calls)}")
    print(f"Errata excluded: {exclude_errata}")
    print(f"No. of errata in new works: {len(df_new_works[df_new_works['type'] == 'erratum'])}")
    
else:
    print(f"\n{len(df_new_works)} new article(s) found.\n")
    print(df_new_works[["article_title"]])
assert len(df_new_works) == 1, "Number of new articles is not match the expected number."

Retrieving works: 100%|██████████| 3/3 [00:01<00:00,  2.64it/s]


1 new article(s) found.

                                       article_title
0  Tuberculosis in otherwise healthy adults with ...





In [30]:
if len(df_new_works) == 0:
    print("No new articles found.")
else:
    df_updated_works = pd.concat([df_works, df_new_works], ignore_index=False)

In [31]:
df_updated_works = df_updated_works.sort_values(by="publication_date", ascending=False)
assert df_new_works.at[0, 'article_title'] in df_updated_works["article_title"].values
df_updated_works.head(3)

Unnamed: 0,first_author_last_name,article_title,journal,publication_year,publication_date,pmid,pmcid,oaid,pdf_url,doi_url,cited_by_count,cited_by_ui_url,type,type_crossref,updated_date
0,Arias,Tuberculosis in otherwise healthy adults with ...,Nature,2024,2024-08-28,39198650,,https://openalex.org/W4401947283,not available,https://doi.org/10.1038/s41586-024-07866-3,1,https://openalex.org/works?filter=cites:W44019...,article,journal-article,2024-09-14T05:25:27.821423
0,Chan,Human TMEFF1 is a restriction factor for herpe...,Nature,2024,2024-07-24,39048830,,https://openalex.org/W4400948832,not available,https://doi.org/10.1038/s41586-024-07745-x,1,https://openalex.org/works?filter=cites:W44009...,article,journal-article,2024-09-07T20:39:15.475471
1,Momenilandi,FLT3L governs the development of partially ove...,Cell,2024,2024-05-01,38701783,,https://openalex.org/W4396620188,http://www.cell.com/article/S0092867424004045/pdf,https://doi.org/10.1016/j.cell.2024.04.009,2,https://openalex.org/works?filter=cites:W43966...,article,journal-article,2024-09-12T13:50:59.480262


In [33]:
import os
import pandas as pd
from datetime import datetime
import argparse
from typing import Tuple, Set, List, Dict, Any

def append_metadata(metadata_file_path: str, pmid_file_path: str, exclude_errata: bool = True, verbose: bool = True) -> Tuple[bool, str]:
    """
    Append metadata for missing PMIDs to an existing metadata file.

    Args:
        metadata_file_path (str): Path to CSV file containing existing metadata.
        pmid_file_path (str): Path to file containing list of PMIDs.
        verbose (bool): Whether to show verbose messages during the process.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.
    """

    # Input validation
    assert metadata_file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert os.path.exists(metadata_file_path), "Metadata file not found."
    assert pmid_file_path.endswith(".txt"), "Invalid file format. Please provide a TXT file."
    assert os.path.exists(pmid_file_path), "PMID file not found."
    assert isinstance(verbose, bool), "Verbose must be a boolean."

    # Read existing metadata
    if verbose: print("Reading the existing metadata file...")
    try:
        metadata = pd.read_csv(metadata_file_path, dtype=str)
        metadata_bkp = deepcopy(metadata) # Make a deepcopy of the DataFrame to save a backup
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"

    # Read PMIDs from file
    if verbose: print("Reading the PMID file...")
    try:
        with open(pmid_file_path, 'r') as f:
            pmids = set(line.strip() for line in f)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the PMID file: {e}")
        return False, f"An error occurred while reading the PMID file: {e}"

    # Find missing PMIDs
    if verbose: print("Searching for new PMIDs not in the metadata...")
    existing_pmids: Set[str] = set(metadata["pmid"])
    new_pmids: Set[str] = pmids - existing_pmids
    new_pmids_str = ", ".join(new_pmids) # Convert to a string with comma-separated values
    if verbose: print(f"Found {len(new_pmids)} new PMID(s): {new_pmids_str}.")
    if len(new_pmids) == 0:
        return False, "No new PMIDs found."
    else:
        try:
            # Make API calls to get the missing PMIDs
            exclude_errata: bool = True
            select_fields: str = (
                "id,title,doi,primary_location,authorships,publication_year,"
                "publication_date,ids,best_oa_location,cited_by_count,"
                "cited_by_api_url,type,type_crossref,updated_date"
            )
            new_articles, failed_calls = get_works(
                ids=list(new_pmids),
                email=os.environ.get("EMAIL"),
                select_fields=select_fields,
                show_progress=verbose
            )
            if verbose: print(f"API calls completed. Failed calls: {len(failed_calls)}")
        except Exception as e:
            if verbose:
                print(f"An error occurred while fetching works data from the API: {e}")
            return False, f"An error occurred while fetching works data from the API: {e}"

        # Parse the data for new articles
        if verbose: print("Parsing the data for new articles...")
        try:
            df_new_articles = parse_data(new_articles, exclude_errata=True)
        except Exception as e:
            if verbose:
                print(f"An error occurred while parsing the data for new articles: {e}")
            return False, f"An error occurred while parsing the data for new articles: {e}"
        
        if df_new_articles.empty:
            if exclude_errata:
                if verbose: print("No new articles found (Errata excluded).")
                return False, "No new articles found (Errata excluded)."
            else:
                if verbose: print("No new articles found.")
                return False, "No new articles found."
        else:
            # Append the new articles to the existing metadata
            new_pmids = set(df_new_articles["pmid"])
            new_pmids = ", ".join(new_pmids) # Convert to a string with comma-separated values
            if verbose: print(f"Appending {len(df_new_articles)} new article(s) with PMID(s) {new_pmids} to the existing metadata...")
            try:
                metadata = pd.concat([metadata, df_new_articles], ignore_index=True)
            except Exception as e:
                if verbose:
                    print(f"An error occurred while appending the new articles to the existing metadata: {e}")
                return False, f"An error occurred while appending the new articles to the existing metadata: {e}"

            # Save the updated metadata to a CSV file
            if verbose: print("Saving the updated metadata to a CSV file...")
            metadata.to_csv(metadata_file_path, index=False)
            if verbose: print("Saving a backup file to disk...")
            bkp_file_path = metadata_file_path.replace(".csv", f"_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}.csv")
            metadata_bkp.to_csv(bkp_file_path, index=False)
            if verbose: print("Metadata updated successfully.")

            # Update the log file
            log_file_path = os.path.join(os.path.dirname(metadata_file_path), "update-log.json")
            try:
                # get the path to the log file from the metadata file path
                if verbose: print("Updating the log file...")
                with open(log_file_path, "r") as f:
                    update_log = json.load(f)
                # format {"last_modified": "2024-08-06"}
                update_log["last_modified"] = datetime.now().strftime("%Y-%m-%d")
                with open(log_file_path, "w") as f:
                    json.dump(update_log, f)
                if verbose: print(f"Log file updated successfully.")
            except Exception as e: # Note: Additional error handling has been added in the final version below
                if verbose: print(f"No log file found. Error: {e}. Creating a new log file...")
                with open(log_file_path, "w") as f:
                    json.dump({"last_modified": datetime.now().strftime("%Y-%m-%d")}, f)
                if verbose: print(f"Log file created successfully.")

            return True, f"Appended {len(df_new_articles)} article(s) and saved file to {metadata_file_path}. Backup saved as {bkp_file_path}"

## Define a function to update citation counts of all articles in the local data

In [34]:
# get up-to-date metadata for articles in the local metadata file; only cited_by_count and updated_date will be updated
works, failed_calls = get_works( 
    ids=df_articles_metadata["pmid"].astype(str).tolist(),
    email=os.environ.get("EMAIL"),
    select_fields="id,doi,title,cited_by_count,updated_date",
    show_progress=True
)

Retrieving works: 100%|██████████| 70/70 [00:26<00:00,  2.59it/s]


In [35]:
# Print the citation counts
# for work in works:
#     print(f"{work["metadata"]["title"][:30]}... {work["metadata"]["doi"].replace("https://doi.org/","")}, Citation count: {work["metadata"]["cited_by_count"]}")

In [36]:
# Subset a row in df_articles_metadata based on a condition
df_articles_metadata[df_articles_metadata["doi_url"] == "https://doi.org/10.1038/s41586-024-07745-x"]

Unnamed: 0,first_author_last_name,article_title,journal,publication_year,publication_date,pmid,pmcid,oaid,pdf_url,doi_url,cited_by_count,cited_by_ui_url,type,type_crossref,updated_date
0,Chan,Human TMEFF1 is a restriction factor for herpe...,Nature,2024,2024-07-24,39048830,,https://openalex.org/W4400948832,not available,https://doi.org/10.1038/s41586-024-07745-x,0,https://openalex.org/works?filter=cites:W44009...,article,journal-article,2024-08-01T05:43:34.150760


In [37]:
# Iterate over the rows in df_articles_metadata and update the cited_by_count and updated_date
counter = 0

# make a deepcopy of the DataFrame to save a backup
from copy import deepcopy
df_articles_metadata_bkp = deepcopy(df_articles_metadata)

for index, row in df_articles_metadata.iterrows():
    id = row["oaid"]
    current_cited_by_count = row["cited_by_count"]
    work = next((work for work in works if work["metadata"]["id"] == id), None)
    new_cited_by_count = work["metadata"]["cited_by_count"]
    if new_cited_by_count > int(current_cited_by_count):
        try:
            print(f"Updating the cited_by_count for ID: {id.split('/')[-1]} from {current_cited_by_count} to {new_cited_by_count}")
            df_articles_metadata.at[index, "cited_by_count"] = new_cited_by_count
            df_articles_metadata.at[index, "updated_date"] = work["metadata"]["updated_date"]
            counter += 1
        except Exception as e:
            print(f"Failed to update the cited_by_count for PMID: {pmid}")
            print(e)
    else:
        print(f"Citation count for ID: {id.split('/')[-1]} is up-to-date. Citattion count: {current_cited_by_count}. Skipping...")
        continue

print(f"Updated values for {counter} articles.")
if counter > 0:
    pass

Updating the cited_by_count for ID: W4400948832 from 0 to 1
Updating the cited_by_count for ID: W4396620188 from 0 to 2
Citation count for ID: W4393386550 is up-to-date. Citattion count: 2. Skipping...
Updating the cited_by_count for ID: W4392282847 from 1 to 2
Updating the cited_by_count for ID: W4391883684 from 0 to 1
Updating the cited_by_count for ID: W4390576813 from 10 to 16
Updating the cited_by_count for ID: W4390350774 from 8 to 12
Citation count for ID: W4389120393 is up-to-date. Citattion count: 2. Skipping...
Updating the cited_by_count for ID: W4387882122 from 12 to 13
Citation count for ID: W4377103638 is up-to-date. Citattion count: 2. Skipping...
Citation count for ID: W4376134695 is up-to-date. Citattion count: 0. Skipping...
Citation count for ID: W4366603686 is up-to-date. Citattion count: 14. Skipping...
Citation count for ID: W4362637082 is up-to-date. Citattion count: 4. Skipping...
Updating the cited_by_count for ID: W4320032950 from 34 to 38
Updating the cited_b

In [47]:
from copy import deepcopy
from typing import Tuple
import os
from datetime import datetime
import sys
import pandas as pd
sys.path.append(os.path.expanduser("../utils"))
from openalex_api_utils import get_works
import json

def update_citations(file_path: str, verbose: bool = True) -> Tuple[bool, str]:
    """
    Update the citation counts in the articles metadata file.

    Args:
        file_path (str): Path to the articles metadata file.
        verbose (bool): Whether to show verbose messages the process, including progress and errors.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.
    """

    # Input validation
    assert file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert isinstance(verbose, bool), "Verbose must be a boolean."
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False

    # Read the metadata file and sort by publication date, descending
    if verbose: print("Reading the metadata file...")
    try:
        metadata = pd.read_csv(file_path, dtype=str)
        metadata_bkp = deepcopy(metadata) # Make a deepcopy of the DataFrame to save a backup
        metadata["publication_date"] = pd.to_datetime(metadata["publication_date"]) # Note: Date parsing has been modified in the final version
        metadata = metadata.sort_values(by="publication_date", ascending=False)
        metadata.reset_index(drop=True, inplace=True)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"
    
    # Extract PMIDs from the metadata
    if verbose: print("Extracting PMIDs from the metadata...")
    try:
        pmids = metadata["pmid"].astype(str).tolist()
    except Exception as e:
        if verbose:
            print(f"An error occurred while extracting PMIDs from the metadata: {e}")
        return False, f"An error occurred while extracting PMIDs from the metadata: {e}"
    
    # Make API calls to get the works for the PMIDs
    if verbose: print("Fetching works data from the API...")
    works, failed_calls = get_works(
        ids=pmids,
        email=os.environ.get("EMAIL"), # Will return an empty string if the variable is not set
        select_fields="id,doi,title,cited_by_count,updated_date",
        show_progress=verbose
    )
    
    # Iterate over the rows in metadata and update the cited_by_count and updated_date
    if verbose: print("Updating the citation counts...")
    counter = 0
    for index, row in metadata.iterrows():
        id = row["oaid"]
        doi = row["doi_url"]
        updated_date = row["updated_date"]
        current_cited_by_count = row["cited_by_count"]
        work = next((work for work in works if work["metadata"]["id"] == id), None)
        new_cited_by_count = work["metadata"].get("cited_by_count")
        if new_cited_by_count > int(current_cited_by_count):
            try:
                if verbose: print(f"Updating the cited_by_count for ID: {id} / DOI: {doi} from {current_cited_by_count} to {new_cited_by_count}")
                metadata.at[index, "cited_by_count"] = new_cited_by_count
                metadata.at[index, "updated_date"] = work["metadata"]["updated_date"]
                counter += 1
            except Exception as e:
                if verbose: print(f"Failed to update the cited_by_count for PMID: {pmid}. Error: {e}")
        else:
            if verbose: print(f"Citation count {current_cited_by_count} for ID {id} with DOI {doi} is up-to-date. Skipping...")
            continue

    if verbose: print(f"Updated values for {counter} articles.")
    if counter > 0:
        if verbose: print("Saving the updated metadata to a CSV file...")
        metadata.to_csv(file_path, index=False)
        if verbose: print("Saving a backup file to disk...")
        bkp_file_path = file_path.replace(".csv", f"_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}.csv")
        metadata_bkp.to_csv(bkp_file_path, index=False)
        if verbose: print("Metadata updated successfully.")

        # Update the log file
        try:
            # get the path to the log file from the metadata file path
            if verbose: print("Updating the log file...")
            log_file_path = os.path.join(os.path.dirname(file_path), "update-log.json")
            with open(log_file_path, "r") as f:
                update_log = json.load(f)
            # format {"last_modified": "2024-08-06"}
            update_log["last_modified"] = datetime.now().strftime("%Y-%m-%d")
            with open(log_file_path, "w") as f:
                json.dump(update_log, f)
            if verbose: print(f"Log file updated successfully.")
        except Exception as e: # Note: Additional error handling has been added in the final version below
            if verbose: print(f"No log file found. Error: {e}. Creating a new log file...")
            with open(log_file_path, "w") as f:
                json.dump({"last_modified": datetime.now().strftime("%Y-%m-%d")}, f)
            if verbose: print(f"Log file created successfully.")

        return True, f"Updated values for {counter} articles and saved file to {file_path}. Backup saved as {bkp_file_path}"
    else:
        return False, "Loaded metadata were up-to-date. No changes were made."

In [39]:
data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_py/test_data") # uncomment for local testing
test_file_path = os.path.join(data_dir, "articles-metadata_copy.csv")

In [48]:
update_citations(test_file_path, verbose=True)

Reading the metadata file...
Extracting PMIDs from the metadata...
Fetching works data from the API...


Retrieving works: 100%|██████████| 70/70 [00:27<00:00,  2.58it/s]

Updating the citation counts...
Updating the cited_by_count for ID: https://openalex.org/W4400948832 / DOI: https://doi.org/10.1038/s41586-024-07745-x from 0 to 1
Updating the cited_by_count for ID: https://openalex.org/W4396620188 / DOI: https://doi.org/10.1016/j.cell.2024.04.009 from 0 to 2
Citation count 2 for ID https://openalex.org/W4393386550 with DOI https://doi.org/10.1084/jem.20231044 is up-to-date. Skipping...
Updating the cited_by_count for ID: https://openalex.org/W4392282847 / DOI: https://doi.org/10.1126/science.adh4059 from 1 to 2
Updating the cited_by_count for ID: https://openalex.org/W4391883684 / DOI: https://doi.org/10.1007/s10875-024-01661-5 from 0 to 1
Updating the cited_by_count for ID: https://openalex.org/W4390576813 / DOI: https://doi.org/10.1084/jem.20231353 from 10 to 16
Updating the cited_by_count for ID: https://openalex.org/W4390350774 / DOI: https://doi.org/10.1016/j.cell.2023.11.036 from 8 to 12
Citation count 2 for ID https://openalex.org/W4389120393 w




(True,
 'Updated values for 48 articles and saved file to /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata_copy.csv. Backup saved as /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata_copy_bkp-20240916-18h46m.csv')

## Generate module from the above functions

In [20]:
%%writefile ../utils/init.py
__version__ = "0.0.2"

Overwriting ../utils/init.py


In [5]:
%%writefile ../utils/website_utils.py
import os
from typing import Tuple, Set, List, Dict, Any
from copy import deepcopy
from datetime import datetime
import pandas as pd
from openalex_api_utils import get_works
import json

def update_citations(file_path: str, verbose: bool = True) -> Tuple[bool, str]:
    """
    Update the citation counts in the articles metadata file.

    Args:
        file_path (str): Path to the articles metadata file.
        verbose (bool): Whether to show verbose messages during the process, including progress and errors.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.
    """

    # Input validation
    assert file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert isinstance(verbose, bool), "Verbose must be a boolean."
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return False, f"File not found: {file_path}"

    # Read the metadata file and sort by publication date, descending
    if verbose: print("Reading the metadata file...")
    try:
        metadata = pd.read_csv(file_path, dtype=str)
        metadata_bkp = deepcopy(metadata) # Make a deepcopy of the DataFrame to save a backup
        metadata['publication_date'] = pd.to_datetime(metadata['publication_date'], format='%Y-%m-%d', errors='coerce') # The errors='coerce' parameter will replace any unparsable dates with NaT (Not a Time) values
        if metadata['publication_date'].isna().any(): # Check if there are any NaT values
            if verbose:
                print("Warning: Some publication dates could not be parsed. These will be excluded from sorting.")
                entries_with_missing_dates = metadata[metadata['publication_date'].isna()]
                print(
                    f"Entries with publication dates that could not be parsed: idx {entries_with_missing_dates.index.tolist()}, "
                    f"PMIDs {entries_with_missing_dates['pmid'].tolist()}, "
                    f"Article titles {entries_with_missing_dates['article_title'].tolist()}"
                    )
            metadata = metadata.dropna(subset=['publication_date']) # Drop rows with missing publication dates
        metadata = metadata.sort_values(by="publication_date", ascending=False)
        metadata.reset_index(drop=True, inplace=True)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"
    
    # Extract PMIDs from the metadata
    if verbose: print("Extracting PMIDs from the metadata...")
    try:
        pmids = metadata["pmid"].astype(str).tolist()
    except Exception as e:
        if verbose:
            print(f"An error occurred while extracting PMIDs from the metadata: {e}")
        return False, f"An error occurred while extracting PMIDs from the metadata: {e}"
    
    # Make API calls to get the works for the PMIDs
    if verbose: print("Fetching works data from the API...")
    works, failed_calls = get_works(
        ids=pmids,
        email=os.environ.get("EMAIL"), # Will return an empty string if the variable is not set
        select_fields="id,doi,title,cited_by_count,updated_date",
        show_progress=verbose
    )
    
    # Iterate over the rows in metadata and update the cited_by_count and updated_date
    if verbose: print("Updating the citation counts...")
    counter = 0
    for index, row in metadata.iterrows():
        id = row["oaid"]
        doi = row["doi_url"]
        updated_date = row["updated_date"]
        current_cited_by_count = row["cited_by_count"]
        work = next((work for work in works if work["metadata"]["id"] == id), None)
        new_cited_by_count = work["metadata"]["cited_by_count"]
        if new_cited_by_count > int(current_cited_by_count):
            try:
                if verbose: print(f"Updating the cited_by_count for ID: {id} / DOI: {doi} from {current_cited_by_count} to {new_cited_by_count}")
                metadata.at[index, "cited_by_count"] = new_cited_by_count
                metadata.at[index, "updated_date"] = work["metadata"]["updated_date"] # Leave the date as a string
                counter += 1
            except Exception as e:
                if verbose: print(f"Failed to update the cited_by_count for PMID: {pmid}. Error: {e}")
        else:
            if verbose: print(f"Citation count {current_cited_by_count} for ID {id} with DOI {doi} is up-to-date. Skipping...")
            continue

    if verbose: print(f"Updated values for {counter} articles.")
    if counter > 0:
        if verbose: print("Saving the updated metadata to a CSV file...")
        metadata.to_csv(file_path, index=False)
        if verbose: print("Saving a backup file to disk...")
        bkp_file_path = file_path.replace(".csv", f"_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}.csv")
        metadata_bkp.to_csv(bkp_file_path, index=False)
        if verbose: print("Metadata updated successfully.")

        # Get the path to the log file from the metadata file path
        log_file_path = os.path.join(os.path.dirname(file_path), "update-log.json") 
        
        # Update the log file
        try:
            if verbose: print("Updating the log file...")
            with open(log_file_path, "r") as f:
                update_log = json.load(f)
            current_date = datetime.now().strftime("%Y-%m-%d")
            update_log["last_modified"] = current_date # Expected format: {"last_modified": "2024-08-06"}
            with open(log_file_path, "w") as f:
                json.dump(update_log, f)
            if verbose: print(f"Log file updated successfully.")
        except Exception as e:
            if verbose: print(f"Error updating log file: {e}. Creating a new log file...")
            current_date = datetime.now().strftime("%Y-%m-%d")
            with open(log_file_path, "w") as f:
                json.dump({"last_modified": current_date}, f)
            if verbose: print(f"New log file created successfully.")

        return True, f"Updated values for {counter} articles and saved file to {file_path}. Backup saved as {bkp_file_path}"
    else:
        return False, "Loaded metadata were up-to-date. No changes were made."

Overwriting ../utils/website_utils.py


In [6]:
%%writefile -a ../utils/website_utils.py

from typing import List, Dict, Any

def parse_data(works: List[Dict[str, Any]], exclude_errata: bool = True) -> pd.DataFrame:
    """
    Parse the raw data from the OpenAlex API and create a DataFrame.

    This function extracts relevant information from each work in the input list
    and creates a DataFrame with specified columns. It also removes duplicates
    based on PMID and filters out errata (if specified).

    Args:
        works (List[Dict[str, Any]]): A list of dictionaries, where each dictionary
            contains metadata about a work.
        exclude_errata (bool): Whether to exclude errata from the DataFrame.

    Returns:
        pd.DataFrame: A DataFrame containing extracted and processed information
        from the works.

    Example:
        >>> df_works = parse_data(works)
        >>> df_works.head()

    Note:
        The function extracts the following information for each work:
        - First author's last name
        - Article title
        - Journal name
        - Publication year and date
        - PMID, PMCID, and OpenAlex ID
        - PDF URL (if available)
        - DOI URL
        - Citation count and URL
        - Work type and Crossref type
        - Updated date (from the API)
    """

    # Initialize an empty list to store the extracted data, and iterate over the works data to extract relevant information
    oa_data = []
    for work in works:
        metadata = work["metadata"]
        first_author_last_name = metadata["authorships"][0]["author"]["display_name"].split(" ")[-1]
        article_title = metadata["title"]
        journal = metadata["primary_location"]["source"]["display_name"]
        publication_year = str(metadata["publication_year"])
        publication_date = metadata["publication_date"]
        if publication_date:
            try:
                publication_date = pd.to_datetime(publication_date).strftime('%Y-%m-%d')
            except ValueError:
                pass # If the date can't be parsed, keep the original string
        pmid = metadata["ids"].get("pmid", "").split("/")[-1] # To remove the url prefix
        pmcid = metadata["ids"].get("pmcid")
        if pmcid is not None:
            pmcid = pmcid.split("/")[-1] # To remove the url prefix
        else: 
            pmcid = "" # To replace None with an empty string
        oaid = metadata["id"]
        try:
            pdf_url = metadata.get("best_oa_location", {}).get("pdf_url", "not available")
        except AttributeError:
            pdf_url = "not available"
        if pdf_url is None:
            pdf_url = "not available"
        doi_url = metadata["doi"]
        cited_by_count = str(metadata["cited_by_count"])
        cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
        work_type = metadata.get("type")
        type_crossref = metadata.get("type_crossref")
        updated_date = metadata.get("updated_date")

        oa_data.append([
            first_author_last_name, article_title, journal, publication_year,
            publication_date, pmid, pmcid, oaid, pdf_url, doi_url,
            cited_by_count, cited_by_ui_url, work_type, type_crossref, updated_date
        ])

    columns = [
        'first_author_last_name', 'article_title', 'journal',
        'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid',
        'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type',
        'type_crossref', 'updated_date'
    ]

    # Create a DataFrame with the specified columns
    df_works = pd.DataFrame(oa_data, columns=columns, dtype=str)
    df_works = df_works.drop_duplicates(subset=["pmid"])
    if exclude_errata:
        df_works = df_works[df_works["type"] != "erratum"]

    # Parse the publication date as a datetime object with the format 'YYYY-MM-DD'
    df_works["publication_date"] = pd.to_datetime(df_works["publication_date"], errors='coerce').dt.strftime('%Y-%m-%d')
    
    # Sort the DataFrame by publication date in descending order
    df_works = df_works.sort_values(by="publication_date", ascending=False)
    df_works.reset_index(drop=True, inplace=True)

    return df_works

Appending to ../utils/website_utils.py


In [7]:
%%writefile -a ../utils/website_utils.py

import os
import pandas as pd
from datetime import datetime
import argparse
from copy import deepcopy

def append_metadata(metadata_file_path: str, pmid_file_path: str, exclude_errata: bool = True, verbose: bool = True) -> Tuple[bool, str]:
    """
    Append metadata for missing PMIDs to an existing metadata file.

    Args:
        metadata_file_path (str): Path to CSV file containing existing metadata.
        pmid_file_path (str): Path to file containing list of PMIDs.
        exclude_errata (bool): Whether to exclude errata from the metadata.
        verbose (bool): Whether to show verbose messages during the process.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.
    """

    # Input validation
    assert metadata_file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert os.path.exists(metadata_file_path), "Metadata file not found."
    assert pmid_file_path.endswith(".txt"), "Invalid file format. Please provide a TXT file."
    assert os.path.exists(pmid_file_path), "PMID file not found."
    assert isinstance(verbose, bool), "Verbose must be a boolean."

    # Read existing metadata
    if verbose: print("Reading the existing metadata file...")
    try:
        metadata = pd.read_csv(metadata_file_path, dtype=str)
        metadata_bkp = deepcopy(metadata) # Make a deepcopy of the DataFrame to save a backup
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"

    # Read PMIDs from file
    if verbose: print("Reading the PMID file...")
    try:
        with open(pmid_file_path, 'r') as f:
            pmids = set(line.strip() for line in f)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the PMID file: {e}")
        return False, f"An error occurred while reading the PMID file: {e}"

    # Find missing PMIDs
    if verbose: print("Searching for new PMIDs not in the metadata...")
    existing_pmids: Set[str] = set(metadata["pmid"])
    new_pmids: Set[str] = pmids - existing_pmids
    new_pmids_str = ", ".join(new_pmids) # Convert to a string with comma-separated values
    if verbose: print(f"Found {len(new_pmids)} new PMID(s): {new_pmids_str}.")
    if len(new_pmids) == 0:
        return False, "No new PMIDs found."
    else:
        try:
            # Make API calls to get the missing PMIDs
            select_fields: str = (
                "id,title,doi,primary_location,authorships,publication_year,"
                "publication_date,ids,best_oa_location,cited_by_count,"
                "cited_by_api_url,type,type_crossref,updated_date"
            )
            new_articles, failed_calls = get_works(
                ids=list(new_pmids),
                email=os.environ.get("EMAIL"),
                select_fields=select_fields,
                show_progress=verbose
            )
            if verbose: print(f"API calls completed. Failed calls: {len(failed_calls)}")
        except Exception as e:
            if verbose:
                print(f"An error occurred while fetching works data from the API: {e}")
            return False, f"An error occurred while fetching works data from the API: {e}"

        # Parse the data for new articles
        if verbose: print("Parsing the data for new articles...")
        try:
            df_new_articles = parse_data(new_articles, exclude_errata=exclude_errata)
        except Exception as e:
            if verbose:
                print(f"An error occurred while parsing the data for new articles: {e}")
            return False, f"An error occurred while parsing the data for new articles: {e}"
        
        if df_new_articles.empty:
            if exclude_errata:
                if verbose: print("No new articles found (Errata excluded).")
                return False, "No new articles found (Errata excluded)."
            else:
                if verbose: print("No new articles found.")
                return False, "No new articles found."
        else:
            # Append the new articles to the existing metadata
            new_pmids = set(df_new_articles["pmid"])
            new_pmids = ", ".join(new_pmids) # Convert to a string with comma-separated values
            if verbose: print(f"Appending {len(df_new_articles)} new article(s) with PMID(s) {new_pmids} to the existing metadata...")
            try:
                metadata = pd.concat([df_new_articles, metadata], ignore_index=True)
            except Exception as e:
                if verbose:
                    print(f"An error occurred while appending the new articles to the existing metadata: {e}")
                return False, f"An error occurred while appending the new articles to the existing metadata: {e}"

            # Save the updated metadata to a CSV file
            if verbose: print("Saving the updated metadata to a CSV file...")
            metadata.to_csv(metadata_file_path, index=False)
            if verbose: print("Saving a backup file to disk...")
            bkp_file_path = metadata_file_path.replace(".csv", f"_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}.csv")
            metadata_bkp.to_csv(bkp_file_path, index=False)
            if verbose: print("Metadata updated successfully.")

            # Get the path to the log file from the metadata file path
            log_file_path = os.path.join(os.path.dirname(metadata_file_path), "update-log.json")
            
            # Update the log file
            try:
                if verbose: print("Updating the log file...")
                with open(log_file_path, "r") as f:
                    update_log = json.load(f)
                current_date = datetime.now().strftime("%Y-%m-%d")    
                update_log["last_modified"] = current_date # Expected format: {"last_modified": "2024-08-06"}
                with open(log_file_path, "w") as f:
                    json.dump(update_log, f)
                if verbose: print(f"Log file updated successfully.")
            except Exception as e:
                if verbose: print(f"Error updating log file: {e}. Creating a new log file...")
                current_date = datetime.now().strftime("%Y-%m-%d")
                with open(log_file_path, "w") as f:
                    json.dump({"last_modified": current_date}, f)
                if verbose: print(f"New log file created successfully.")

            return True, f"Appended {len(df_new_articles)} article(s) and saved file to {metadata_file_path}. Backup saved as {bkp_file_path}"


Appending to ../utils/website_utils.py


In [8]:
%%writefile ../utils/main.py

import argparse
import os
from website_utils import update_citations, append_metadata

def main():
    parser = argparse.ArgumentParser(description="Manage website metadata and citations.")
    
    # Main operation group
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--update-citations", action="store_true", help="Update citation counts in the metadata file")
    group.add_argument("--append-metadata", action="store_true", help="Append metadata for missing PMIDs")
    group.add_argument("--update-and-append", action="store_true", help="Perform both update and append operations")

    # Common arguments
    parser.add_argument("directory", type=str, help="Directory containing the metadata, log, and PMID files")
    parser.add_argument("--quiet", action="store_true", help="Run in quiet mode (no verbose output)")
    parser.add_argument("--include-errata", action="store_true", help="Include errata in the appended metadata")

    args = parser.parse_args()

    # Define file paths
    metadata_file = os.path.join(args.directory, "articles-metadata.csv")
    log_file = os.path.join(args.directory, "update-log.json")
    pmid_file = os.path.join(args.directory, "PMID-export.txt")

    # Validate file existence
    if not os.path.exists(metadata_file):
        parser.error(f"Metadata file not found: {metadata_file}")
    if not os.path.exists(log_file):
        parser.error(f"Log file not found: {log_file}")
    if (args.append_metadata or args.update_and_append) and not os.path.exists(pmid_file):
        parser.error(f"PMID file not found: {pmid_file}")

    success_messages = []
    error_messages = []

    if args.update_citations or args.update_and_append:
        success, message = update_citations(metadata_file, verbose=not args.quiet)
        if success:
            success_messages.append(f"Update citations operation: {message}")
        else:
            error_messages.append(f"Update citations operation completed without saving new data: {message}")

    if args.append_metadata or args.update_and_append:
        success, message = append_metadata(metadata_file, pmid_file, 
                                           exclude_errata=not args.include_errata, 
                                           verbose=not args.quiet)
        if success:
            success_messages.append(f"Append metadata operation: {message}")
        else:
            error_messages.append(f"Append metadata operation completed without saving new data: {message}")

    # Print results
    for message in success_messages:
        print(message)
    for message in error_messages:
        print(message)

    # Exit with error if any operation failed
    if error_messages:
        exit(1)

if __name__ == "__main__":
    main()



Overwriting ../utils/main.py


## Test the functions

### For execution from REPL / Jupyter

In [53]:
import sys, os
sys.path.append(os.path.expanduser("../utils"))
from website_utils import update_citations, parse_data, append_metadata
from openalex_api_utils import get_works

In [54]:
%ls ~/GitHub/nicomarr.github.io/_data
%ls ~/GitHub/nicomarr.github.io/_py/test_data


PMID-export.txt        articles-metadata.csv  update-log.json
PMID-export.txt
PMID-export_copy.txt
articles-metadata.csv
articles-metadata_copy.csv
articles-metadata_copy_bkp-20240916-18h46m.csv
update-log.json
update-log_copy.json


In [55]:
# Set path to the data directory
# data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_data")
data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_py/test_data") # uncomment for local testing

In [56]:
# Set path to the data files
pmid_list_path = os.path.join(data_dir, "PMID-export.txt")
print(pmid_list_path)
articles_metadata_path = os.path.join(data_dir, "articles-metadata.csv")
print(articles_metadata_path)
update_log_path = os.path.join(data_dir, "update-log.json")
print(update_log_path)

/Users/user2/GitHub/nicomarr.github.io/_py/test_data/PMID-export.txt
/Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata.csv
/Users/user2/GitHub/nicomarr.github.io/_py/test_data/update-log.json


In [59]:
append_metadata(articles_metadata_path, pmid_list_path, exclude_errata=True, verbose=True)

Reading the existing metadata file...
Reading the PMID file...
Searching for new PMIDs not in the metadata...
Found 3 new PMID(s): 36342405, 39198650, 38776920.


Retrieving works: 100%|██████████| 3/3 [00:01<00:00,  2.55it/s]

API calls completed. Failed calls: 0
Parsing the data for new articles...
Appending 1 new article(s) with PMID(s) 39198650 to the existing metadata...
Saving the updated metadata to a CSV file...
Saving a backup file to disk...
Metadata updated successfully.
Updating the log file...
Log file updated successfully.





(True,
 'Appended 1 article(s) and saved file to /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata.csv. Backup saved as /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata_bkp-20240916-18h51m.csv')

In [58]:
update_citations(articles_metadata_path, verbose=True)

Reading the metadata file...
Extracting PMIDs from the metadata...
Fetching works data from the API...


Retrieving works: 100%|██████████| 70/70 [00:26<00:00,  2.68it/s]

Updating the citation counts...
Updating the cited_by_count for ID: https://openalex.org/W4400948832 / DOI: https://doi.org/10.1038/s41586-024-07745-x from 0 to 1
Updating the cited_by_count for ID: https://openalex.org/W4396620188 / DOI: https://doi.org/10.1016/j.cell.2024.04.009 from 0 to 2
Citation count 2 for ID https://openalex.org/W4393386550 with DOI https://doi.org/10.1084/jem.20231044 is up-to-date. Skipping...
Updating the cited_by_count for ID: https://openalex.org/W4392282847 / DOI: https://doi.org/10.1126/science.adh4059 from 1 to 2
Updating the cited_by_count for ID: https://openalex.org/W4391883684 / DOI: https://doi.org/10.1007/s10875-024-01661-5 from 0 to 1
Updating the cited_by_count for ID: https://openalex.org/W4390576813 / DOI: https://doi.org/10.1084/jem.20231353 from 10 to 16
Updating the cited_by_count for ID: https://openalex.org/W4390350774 / DOI: https://doi.org/10.1016/j.cell.2023.11.036 from 8 to 12
Citation count 2 for ID https://openalex.org/W4389120393 w




(True,
 'Updated values for 48 articles and saved file to /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata.csv. Backup saved as /Users/user2/GitHub/nicomarr.github.io/_py/test_data/articles-metadata_bkp-20240916-18h48m.csv')

### For execution from command line

***Make sure to activate the virtual environment before running the commands and that the required packages are installed!***

:::{.callout-note}
#### Update citations
To update citations in the `test_data` directory:
```sh
python ./_py/utils/main.py --update-citations ./_py/test_data
```
To update citations in the `_data` directory:
```sh
python ./_py/utils/main.py --update-citations ./_data
```
Or in quiet mode:
```sh
python ./_py/utils/main.py --update-citations ./_data --quiet
```
:::

:::{.callout-note}
#### Append metadata 
To append metadata in the `test_data` directory:
```sh
python ./_py/utils/main.py --append-metadata ./_py/test_data
```

To append metadata in the `_data` directory:
```sh
python ./_py/utils/main.py --append-metadata ./_data
```
To include errata and run in quiet mode:
```sh
python ./_py/utils/main.py --append-metadata ./_data --quiet
```
:::

:::{.callout-note}
#### Perform both update and append operations
```sh
python ./_py/utils/main.py --update-and-append ./_data
```

For help:

In [88]:
%run ~/GitHub/nicomarr.github.io/_py/utils/main.py --help

usage: main.py [-h]
               (--update-citations | --append-metadata | --update-and-append)
               [--quiet] [--include-errata]
               directory

Manage website metadata and citations.

positional arguments:
  directory            Directory containing the metadata, log, and PMID files

options:
  -h, --help           show this help message and exit
  --update-citations   Update citation counts in the metadata file
  --append-metadata    Append metadata for missing PMIDs
  --update-and-append  Perform both update and append operations
  --quiet              Run in quiet mode (no verbose output)
  --include-errata     Include errata in the appended metadata
