# Source code of helper functions to update the publications page

## Read local data

In [None]:
import os
import pandas as pd
import json

In [None]:
%ls ~/GitHub/nicomarr.github.io/_data
# %ls ~/GitHub/nicomarr.github.io/_py/test_data # Uncomment for testing and development

In [None]:
data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_data")
# data_dir = os.path.expanduser("~/GitHub/nicomarr.github.io/_py/test_data") # Uncomment for local testing


In [None]:
# Set path to the data files
pmid_list_path = os.path.join(data_dir, "PMID-export.txt")
articles_metadata_path = os.path.join(data_dir, "articles-metadata.csv")
update_log_path = os.path.join(data_dir, "update-log.json")
print(f"{pmid_list_path}\n{articles_metadata_path}\n{update_log_path}")

In [None]:
# Load the exported PMIDs, one per line. Blank lines and surrounding whitespace
# are dropped so that an empty line in the export cannot become an empty ID
# that is later passed on to the API.
with open(pmid_list_path, encoding="utf-8") as f:
    pmids = [line.strip() for line in f if line.strip()]
print(f"No. of PMIDs: {len(pmids)}\n{pmids}")

In [None]:
df_articles_metadata = pd.read_csv(articles_metadata_path, dtype=str)
df_articles_metadata["publication_date"] = pd.to_datetime(df_articles_metadata["publication_date"])
df_articles_metadata = df_articles_metadata.sort_values(by="publication_date", ascending=False)
df_articles_metadata

In [None]:
with open(update_log_path) as f:
    update_log = json.load(f)
update_log

## Call OpenAlex API (for testing)

In [None]:
import requests
from pprint import pprint

# One-off request against the OpenAlex works endpoint, used to inspect the raw
# response for a single PMID.
test_pmid = "34427831"  # PMID from the list; for testing (not named `id`, which shadows the builtin)
base_url = "https://api.openalex.org/works/"
params = {
    # .get() keeps the cell runnable when EMAIL is unset, like the get_works() calls in this notebook.
    "mailto": os.environ.get("EMAIL"),
    "select": "id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
}
url = f"{base_url}pmid:{test_pmid}"

verbose = True  # Define verbose or pass it as a parameter

# Initialise both names up front: when the request fails before a response
# object exists (e.g. a connection error), the diagnostics below must not
# raise a NameError.
response = None
work = None

try:
    # A timeout prevents the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()  # Raises an HTTPError for bad responses
    work = response.json()
except requests.RequestException as e:
    if verbose:
        print(f"An error occurred while making an API call with UID {test_pmid}: {e}")
except json.JSONDecodeError as e:
    if verbose:
        print(f"Failed to decode JSON response for UID {test_pmid}: {e}")
        print(f"Response content: {response.text}")

if work is not None:
    pprint(work, indent=2)
elif response is None:
    print("No response received; the request failed before a response was returned.")
else:
    print(f"Status code: {response.status_code}")
    print("Response headers:")
    pprint(dict(response.headers), indent=2)
    print("Response content:")
    try:
        pprint(response.json(), indent=2)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        print(response.text)

## Use `openalex_api_utils` to get metadata from the OpenAlex API

In [None]:
import sys
sys.path.append(os.path.expanduser("../utils"))
from openalex_api_utils import get_works

#### Get metadata for a single entry


In [None]:
works, failed_calls = get_works(
    ids=["34427831"], 
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)

#### Print relevant metadata


In [None]:
# Print the fields of the first returned work that are used later in this notebook.
try:
    first_work_metadata = works[0]["metadata"]
    first_work_ids = first_work_metadata["ids"]
    print(first_work_metadata["title"])
    print(first_work_metadata["authorships"][0]["author"]["display_name"])
    print(first_work_metadata["publication_date"])
    print(first_work_metadata["publication_year"])
    print(first_work_metadata["doi"])
    # PMID / PMCID are stored as URLs; either may be absent, so only split when a value exists.
    pmid_url = first_work_ids.get("pmid")
    print(pmid_url.split("/")[-1] if pmid_url else None) # PMID
    pmcid_url = first_work_ids.get("pmcid")
    print(pmcid_url.split("/")[-1] if pmcid_url else None) # PMCID
    print(first_work_metadata["id"]) # OpenAlex ID
    # best_oa_location may be None, in which case there is no PDF URL to show.
    best_oa_location = first_work_metadata.get("best_oa_location") or {}
    print(best_oa_location.get("pdf_url")) # may be None
    print(first_work_metadata["cited_by_count"])
    print(first_work_metadata["cited_by_api_url"])
    print(first_work_metadata.get("type"))
    print(first_work_metadata.get("type_crossref"))
    print(first_work_metadata["updated_date"])
except (KeyError, IndexError) as e:
    # IndexError covers an empty `works` list or an empty authorships list.
    print(f"{e.__class__.__name__}: {e}")

#### Get metadata from multiple entries and iterate over a list of works to extract relevant information

In [None]:
works, failed_calls = get_works( 
    ids=pmids,
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)


In [None]:
len(works), len(failed_calls)

### Parse metadata

In [None]:
# Flatten each work's metadata into one row (a list of values) per work.
oa_data = []
for work in works:
    metadata = work["metadata"]

    # authorships may be empty; fall back to the same "NaN" placeholder used for a missing PMCID.
    authorships = metadata.get("authorships") or []
    display_name = (authorships[0].get("author") or {}).get("display_name") if authorships else None
    first_author_last_name = display_name.split(" ")[-1] if display_name else "NaN"

    article_title = metadata["title"]

    # primary_location and its source may both be None.
    source = (metadata.get("primary_location") or {}).get("source") or {}
    journal = source.get("display_name") or "NaN"

    publication_year = str(metadata["publication_year"])
    publication_date = metadata["publication_date"]
    pmid = (metadata["ids"].get("pmid") or "").split("/")[-1] # remove the url prefix
    pmcid = metadata["ids"].get("pmcid")
    if pmcid is not None:
        pmcid = pmcid.split("/")[-1] # remove the url prefix
    else:
        pmcid = "NaN"
    oaid = metadata["id"]

    best_oa_location = metadata.get("best_oa_location")
    pdf_url = best_oa_location.get("pdf_url") if isinstance(best_oa_location, dict) else None
    if pdf_url is None:
        pdf_url = "not available"

    doi_url = metadata["doi"]
    cited_by_count = str(metadata["cited_by_count"])
    cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
    # Not named `type`: at module level that would shadow the builtin for the rest of the session.
    work_type = metadata.get("type")
    type_crossref = metadata.get("type_crossref")
    updated_date = metadata.get("updated_date")

    # Append the extracted data to the list
    oa_data.append([
        first_author_last_name, article_title, journal, publication_year,
        publication_date, pmid, pmcid, oaid, pdf_url,
        doi_url, cited_by_count, cited_by_ui_url, work_type, type_crossref, updated_date
    ])
print(f"{len(oa_data)} entries:")
print(json.dumps(oa_data, indent=2))

In [None]:
# Column order mirrors the order in which each row of `oa_data` was assembled.
columns = [
    'first_author_last_name',
    'article_title',
    'journal',
    'publication_year',
    'publication_date',
    'pmid',
    'pmcid',
    'oaid',
    'pdf_url',
    'doi_url',
    'cited_by_count',
    'cited_by_ui_url',
    'type',
    'type_crossref',
    'updated_date',
]

# Build the frame, keep the first row per PMID, then drop errata.
df_works = (
    pd.DataFrame(oa_data, columns=columns)
    .drop_duplicates(subset=["pmid"])
    .loc[lambda frame: frame["type"] != "erratum"]
)
df_works.head()

## Function definition to parse metadata

In [None]:
from typing import List, Dict, Any

def parse_data(works: List[Dict[str, Any]], exclude_errata: bool = True) -> pd.DataFrame:
    """
    Parse raw works returned by the OpenAlex API into a DataFrame.

    One row is produced per work. Rows are de-duplicated on PMID (first
    occurrence wins), optionally filtered to drop errata, and sorted by
    publication date, newest first, with a fresh 0..n-1 index.

    Args:
        works (List[Dict[str, Any]]): A list of dictionaries, each holding the
            work's fields under the key ``"metadata"``.
        exclude_errata (bool): Whether to drop rows whose ``type`` is
            ``"erratum"``. Default is True.

    Returns:
        pd.DataFrame: Columns are ``first_author_last_name``, ``article_title``,
        ``journal``, ``publication_year``, ``publication_date`` (datetime),
        ``pmid``, ``pmcid``, ``oaid``, ``pdf_url``, ``doi_url``,
        ``cited_by_count``, ``cited_by_ui_url``, ``type``, ``type_crossref``
        and ``updated_date``. All columns except ``publication_date`` are
        created with ``dtype=str``.

    Raises:
        KeyError: If a work lacks ``metadata`` or one of the required fields
            (``title``, ``publication_year``, ``publication_date``, ``ids``,
            ``id``, ``doi``, ``cited_by_count``, ``cited_by_api_url``).

    Example:
        >>> df_works = parse_data(works)
        >>> df_works.head()

    Note:
        Missing optional values use placeholders rather than raising:
        ``"NaN"`` for first author, journal and PMCID, ``""`` for PMID, and
        ``"not available"`` for the PDF URL.
    """
    missing = "NaN"
    no_pdf = "not available"

    columns = [
        'first_author_last_name', 'article_title', 'journal',
        'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid',
        'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type',
        'type_crossref', 'updated_date'
    ]

    oa_data = []
    for work in works:
        metadata = work["metadata"]

        # A work may have no authorships at all; avoid indexing into an empty list.
        authorships = metadata.get("authorships") or []
        display_name = (authorships[0].get("author") or {}).get("display_name") if authorships else None
        first_author_last_name = display_name.split(" ")[-1] if display_name else missing

        article_title = metadata["title"]

        # primary_location and its nested source can each be None.
        source = (metadata.get("primary_location") or {}).get("source") or {}
        journal = source.get("display_name") or missing

        publication_year = str(metadata["publication_year"])
        publication_date = metadata["publication_date"]

        # IDs are stored as URLs; keep only the trailing identifier.
        ids = metadata["ids"]
        pmid = (ids.get("pmid") or "").split("/")[-1]
        pmcid_url = ids.get("pmcid")
        pmcid = pmcid_url.split("/")[-1] if pmcid_url is not None else missing
        oaid = metadata["id"]

        best_oa_location = metadata.get("best_oa_location")
        pdf_url = best_oa_location.get("pdf_url") if isinstance(best_oa_location, dict) else None
        if pdf_url is None:
            pdf_url = no_pdf

        doi_url = metadata["doi"]
        cited_by_count = str(metadata["cited_by_count"])
        # Turn the API URL into the equivalent human-facing OpenAlex URL.
        cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
        work_type = metadata.get("type")
        type_crossref = metadata.get("type_crossref")
        updated_date = metadata.get("updated_date")

        oa_data.append([
            first_author_last_name, article_title, journal, publication_year,
            publication_date, pmid, pmcid, oaid, pdf_url, doi_url,
            cited_by_count, cited_by_ui_url, work_type, type_crossref, updated_date
        ])

    df_works = pd.DataFrame(oa_data, columns=columns, dtype=str)
    df_works = df_works.drop_duplicates(subset=["pmid"])
    if exclude_errata:
        df_works = df_works[df_works["type"] != "erratum"]

    # Work on an independent copy so the column assignment below does not
    # target a slice of the unfiltered frame.
    df_works = df_works.copy()
    # Unparseable dates become NaT instead of raising.
    df_works["publication_date"] = pd.to_datetime(df_works["publication_date"], format='%Y-%m-%d', errors='coerce')
    df_works = df_works.sort_values(by="publication_date", ascending=False)
    df_works.reset_index(drop=True, inplace=True)

    return df_works

In [None]:
df_works = parse_data(works)
df_works.head()

## Compare local data with external data and find mismatches

In [None]:
# Compare the shapes of the DataFrames
print("df_works shape:", df_works.shape)
print("df_articles_metadata shape:", df_articles_metadata.shape)
assert df_works.shape == df_articles_metadata.shape, "DataFrames have different shapes."

In [None]:
# Compare the column names:
print("df_works columns:", df_works.columns.tolist())
print("articles_metadata columns:", df_articles_metadata.columns.tolist())
assert df_works.columns.tolist() == df_articles_metadata.columns.tolist(), "Column names are different."

In [None]:
# Check for common columns:
common_columns = set(df_works.columns) & set(df_articles_metadata.columns)
print("Common columns:", common_columns)
print("No. of common columns:", len(common_columns))

# Check for columns only in one of the DataFrames:
works_columns = set(df_works.columns)
articles_metadata_columns = set(df_articles_metadata.columns)
print("Columns only in df_works:", works_columns - articles_metadata_columns)
print("Columns only in articles_metadata:", articles_metadata_columns - works_columns)

In [None]:
# Compare the datatypes:
common_columns = list(common_columns)
df_works_dtypes = df_works[common_columns].dtypes
articles_metadata_dtypes = df_articles_metadata[common_columns].dtypes
dtypes_comparison = df_works_dtypes == articles_metadata_dtypes

print("Columns with mismatched datatypes:")
for column in common_columns:
    if df_works_dtypes[column] == articles_metadata_dtypes[column]:
        continue
    print(f"{column}:")
    print(f"  df_works: {df_works_dtypes[column]}")
    print(f"  articles_metadata: {articles_metadata_dtypes[column]}")
    print(f"  Match: {dtypes_comparison[column]}")
    print()

# leave pmid dtypes as int for consistency

In [None]:
# Compare the "pmid" values:
df_works_pmids = set(df_works["pmid"])
articles_metadata_pmids = set(df_articles_metadata["pmid"].astype(str))
common_pmids = df_works_pmids & articles_metadata_pmids

print("Number of common PMIDs:", len(common_pmids))
print("PMIDs only in df_works:", len(df_works_pmids - articles_metadata_pmids))
print("PMIDs only in articles_metadata:", len(articles_metadata_pmids - df_works_pmids))

In [None]:
# Compare the values for article_title in both DataFrames
series1 = df_works["article_title"].reset_index(drop=True)
series2 = df_articles_metadata["article_title"].reset_index(drop=True)
title_mismatch = series1.compare(series2)
title_mismatch

In [None]:
# Compare the values for pdf_url in both DataFrames
series1 = df_works["pdf_url"].reset_index(drop=True)
series2 = df_articles_metadata["pdf_url"].reset_index(drop=True)
pdf_url_mismatch = series1.compare(series2)
pdf_url_mismatch

In [None]:
# Compare the values for doi_url in both DataFrames.
# The result gets its own name so it does not overwrite `pdf_url_mismatch`
# from the pdf_url comparison.
series1 = df_works["doi_url"].reset_index(drop=True)
series2 = df_articles_metadata["doi_url"].reset_index(drop=True)
doi_url_mismatch = series1.compare(series2)
doi_url_mismatch

In [None]:
# Compare all columns
result = df_works.reset_index(drop=True).compare(df_articles_metadata.reset_index(drop=True))
result

## Function definition to add metadata of new articles to the local/existing data

In [None]:
pmids.append("39198650") # Add a new PMID, for testing

In [None]:
# Find IDs in the PMID list missing from the local metadata
new_pmids = set(pmids) - set(df_articles_metadata["pmid"].astype(str))
new_pmids

In [None]:
# Make API calls to get the missing PMIDs
exclude_errata = True
new_works, failed_calls = get_works(
    ids=list(new_pmids),
    email=os.environ.get("EMAIL"),
    select_fields="id,title,doi,primary_location,authorships,publication_year,publication_date,ids,best_oa_location,cited_by_count,cited_by_api_url,type,type_crossref,updated_date",
    show_progress=True
)

df_new_works = parse_data(new_works, exclude_errata=exclude_errata)
if df_new_works.empty:
    # Count errata in the raw API results: df_new_works is empty in this
    # branch, so counting its rows would always report 0.
    errata_count = sum(1 for w in new_works if w["metadata"].get("type") == "erratum")
    print("No new articles found.")
    print(f"Failed calls: {len(failed_calls)}")
    print(f"Errata excluded: {exclude_errata}")
    print(f"No. of errata in new works: {errata_count}")
else:
    print(f"\n{len(df_new_works)} new article(s) found.\n")
    print(df_new_works[["article_title"]])
# assert len(df_new_works) == 1, "Number of new articles does not match the expected number."

In [None]:
# Merge the newly fetched works into the parsed works and show the newest entries.
if df_new_works.empty:
    print("No new articles found.")
else:
    df_updated_works = pd.concat([df_works, df_new_works], ignore_index=True)
    df_updated_works = df_updated_works.sort_values(by="publication_date", ascending=False)
    # Positional lookup: does not depend on the index labels of df_new_works.
    assert df_new_works["article_title"].iloc[0] in df_updated_works["article_title"].values
    # An expression nested inside if/else is not auto-displayed by the notebook, so print it.
    print(df_updated_works.head(3))

In [None]:
import os
import pandas as pd
from datetime import datetime
import argparse
from typing import Tuple, Set, List, Dict, Any

def append_metadata(metadata_file_path: str, pmid_file_path: str, exclude_errata: bool = True, verbose: bool = True) -> Tuple[bool, str]:
    """
    Append metadata for PMIDs that are listed in a text file but missing from a metadata CSV.

    The metadata for the missing PMIDs is fetched with ``get_works`` and parsed
    with ``parse_data``. When new rows are found, a timestamped backup of the
    original CSV is written next to it, the CSV is overwritten with the
    combined data, and ``update-log.json`` in the same directory gets a new
    ``last_modified`` date.

    Args:
        metadata_file_path (str): Path to CSV file containing existing metadata.
            Must contain a ``pmid`` column.
        pmid_file_path (str): Path to TXT file containing one PMID per line.
            Blank lines are ignored.
        exclude_errata (bool): Whether errata are dropped from the newly
            fetched works. Default is True.
        verbose (bool): Whether to show verbose messages during the process.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.

    Raises:
        AssertionError: If a path has the wrong extension or does not exist, or
            if ``verbose`` is not a boolean.
    """

    # Input validation
    assert metadata_file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert os.path.exists(metadata_file_path), "Metadata file not found."
    assert pmid_file_path.endswith(".txt"), "Invalid file format. Please provide a TXT file."
    assert os.path.exists(pmid_file_path), "PMID file not found."
    assert isinstance(verbose, bool), "Verbose must be a boolean."

    # Read existing metadata
    if verbose: print("Reading the existing metadata file...")
    try:
        metadata = pd.read_csv(metadata_file_path, dtype=str)
        # Independent copy of the file contents, written out as the backup later.
        # DataFrame.copy avoids depending on `deepcopy` being imported elsewhere.
        metadata_bkp = metadata.copy(deep=True)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"

    if "pmid" not in metadata.columns:
        if verbose: print("Missing required column: pmid")
        return False, "Missing required column: pmid"

    # Read PMIDs from file
    if verbose: print("Reading the PMID file...")
    try:
        with open(pmid_file_path, 'r') as f:
            # Skip blank lines so an empty string is never treated as a new PMID.
            pmids = {line.strip() for line in f if line.strip()}
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the PMID file: {e}")
        return False, f"An error occurred while reading the PMID file: {e}"

    # Find missing PMIDs
    if verbose: print("Searching for new PMIDs not in the metadata...")
    existing_pmids: Set[str] = set(metadata["pmid"].dropna())
    new_pmids: List[str] = sorted(pmids - existing_pmids)  # sorted for reproducible output
    new_pmids_str = ", ".join(new_pmids) # Convert to a string with comma-separated values
    if verbose: print(f"Found {len(new_pmids)} new PMID(s): {new_pmids_str}.")
    if not new_pmids:
        return False, "No new PMIDs found."

    # Make API calls to get the missing PMIDs
    try:
        select_fields: str = (
            "id,title,doi,primary_location,authorships,publication_year,"
            "publication_date,ids,best_oa_location,cited_by_count,"
            "cited_by_api_url,type,type_crossref,updated_date"
        )
        new_articles, failed_calls = get_works(
            ids=new_pmids,
            email=os.environ.get("EMAIL"),
            select_fields=select_fields,
            show_progress=verbose
        )
        if verbose: print(f"API calls completed. Failed calls: {len(failed_calls)}")
    except Exception as e:
        if verbose:
            print(f"An error occurred while fetching works data from the API: {e}")
        return False, f"An error occurred while fetching works data from the API: {e}"

    # Parse the data for new articles
    if verbose: print("Parsing the data for new articles...")
    try:
        # Honour the caller's choice instead of a hard-coded True.
        df_new_articles = parse_data(new_articles, exclude_errata=exclude_errata)
    except Exception as e:
        if verbose:
            print(f"An error occurred while parsing the data for new articles: {e}")
        return False, f"An error occurred while parsing the data for new articles: {e}"

    if df_new_articles.empty:
        message = "No new articles found (Errata excluded)." if exclude_errata else "No new articles found."
        if verbose: print(message)
        return False, message

    # Append the new articles to the existing metadata
    appended_pmids_str = ", ".join(df_new_articles["pmid"].astype(str))
    if verbose: print(f"Appending {len(df_new_articles)} new article(s) with PMID(s) {appended_pmids_str} to the existing metadata...")
    try:
        # The existing file is read with dtype=str while parse_data returns
        # datetime values; format the new dates as YYYY-MM-DD strings so the
        # saved column keeps a single date format.
        df_new_articles = df_new_articles.copy()
        df_new_articles["publication_date"] = pd.to_datetime(
            df_new_articles["publication_date"], errors="coerce"
        ).dt.strftime("%Y-%m-%d")
        metadata = pd.concat([metadata, df_new_articles], ignore_index=True)
    except Exception as e:
        if verbose:
            print(f"An error occurred while appending the new articles to the existing metadata: {e}")
        return False, f"An error occurred while appending the new articles to the existing metadata: {e}"

    # Write the backup first, so the original data is safe on disk before the file is overwritten.
    if verbose: print("Saving a backup file to disk...")
    path_root, path_ext = os.path.splitext(metadata_file_path)
    bkp_file_path = f"{path_root}_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}{path_ext}"
    metadata_bkp.to_csv(bkp_file_path, index=False)

    # Save the updated metadata to a CSV file
    if verbose: print("Saving the updated metadata to a CSV file...")
    metadata.to_csv(metadata_file_path, index=False)
    if verbose: print("Metadata updated successfully.")

    # Update the log file, which lives next to the metadata file
    log_file_path = os.path.join(os.path.dirname(metadata_file_path), "update-log.json")
    today = datetime.now().strftime("%Y-%m-%d")  # format {"last_modified": "2024-08-06"}
    try:
        if verbose: print("Updating the log file...")
        with open(log_file_path, "r") as f:
            update_log = json.load(f)
        if not isinstance(update_log, dict):
            raise ValueError("log file does not contain a JSON object")
        update_log["last_modified"] = today
        with open(log_file_path, "w") as f:
            json.dump(update_log, f)
        if verbose: print(f"Log file updated successfully.")
    except (OSError, ValueError) as e:
        # Missing, unreadable or malformed log (json.JSONDecodeError is a ValueError): start a fresh one.
        if verbose: print(f"No log file found. Error: {e}. Creating a new log file...")
        with open(log_file_path, "w") as f:
            json.dump({"last_modified": today}, f)
        if verbose: print(f"Log file created successfully.")

    return True, f"Appended {len(df_new_articles)} article(s) and saved file to {metadata_file_path}. Backup saved as {bkp_file_path}"

## Define a function to update citation counts of all articles in the metadata file

In [None]:
# Subset a row in df_articles_metadata based on a condition
# df_articles_metadata[df_articles_metadata["doi_url"] == "https://doi.org/10.1038/s41586-024-07745-x"]
df_articles_metadata[df_articles_metadata["doi_url"] == "https://doi.org/10.12688/f1000research.18048.2"]

In [None]:
# Find a specific article by PMID; for debugging
pmid = "31559014"
df_articles_metadata[df_articles_metadata["pmid"] == pmid]

In [None]:
# Get up-to-date metadata for articles in the local metadata file; only cited_by_count and updated_date will be modified
works, failed_calls = get_works( 
    ids=df_articles_metadata["doi_url"].astype(str).tolist(),
    email=os.environ.get("EMAIL"),
    select_fields="id,ids,doi,title,cited_by_count,updated_date",
    show_progress=True
)
# Report failed calls before the count check, so the cause is visible when the assert fails.
if failed_calls:
    print(f"Failed calls: {failed_calls}")
else:
    print("All API calls successful.")
assert len(works) == len(df_articles_metadata), "Number of works does not match the number of articles in the metadata."

In [None]:
len(works), len(failed_calls)

In [None]:
# Print the citation counts
for work in works:
    work_metadata = work["metadata"]
    # title and doi may be None; fall back to an empty string so one record cannot abort the listing.
    short_title = (work_metadata["title"] or "")[:50]
    bare_doi = (work_metadata["doi"] or "").replace("https://doi.org/", "")
    # Single quotes inside the f-string: reusing the outer quote type is a syntax error before Python 3.12.
    print(f"{short_title}... {bare_doi}, Citation count: {work_metadata['cited_by_count']}, {work_metadata['id']}")

In [None]:
# Test loop for debugging
id = "https://openalex.org/W2914632197"
work = next((work for work in works if work["metadata"]["id"] == id), None)
if work is not None:
    print(f"{work['metadata']['title'][:50]}... {work['metadata']['doi'].replace('https://doi.org/','')}, Citation count: {work['metadata']['cited_by_count']}, {work['metadata']['id']}")
else:
    print(f"Work with ID {id} not found.")


In [None]:
# Load the metadata file
df_articles_metadata = pd.read_csv(articles_metadata_path, dtype=str)
df_articles_metadata["publication_date"] = pd.to_datetime(df_articles_metadata["publication_date"])
df_articles_metadata = df_articles_metadata.sort_values(by="publication_date", ascending=False)

# make a deepcopy of the DataFrame to save a backup
from copy import deepcopy
df_articles_metadata_bkp = deepcopy(df_articles_metadata)

# Index the API results by OpenAlex ID once instead of scanning `works` for every row.
# Iterating in reverse keeps the first occurrence of a duplicated ID.
works_by_oaid = {w["metadata"]["id"]: w for w in reversed(works)}

# Iterate over the rows in df_articles_metadata and update the cited_by_count and updated_date
counter = 0
for index, row in df_articles_metadata.iterrows():
    oaid = row["oaid"]
    title = row["article_title"]
    current_cited_by_count = row["cited_by_count"]
    work = works_by_oaid.get(oaid)
    try:
        new_cited_by_count = work["metadata"]["cited_by_count"] # <-- raises TypeError: 'NoneType' object is not subscriptable for one of the PMIDs, https://api.openalex.org/works/W2914632197
    except TypeError as e:
        print(f"TypeError: {e}") # For debugging
        print(f"Work: {work}") # For debugging
        print(f"row: {row['type']}") # For debugging
        continue
    # A missing local count is treated as 0 rather than crashing int().
    current_count = int(current_cited_by_count) if pd.notna(current_cited_by_count) else 0
    if new_cited_by_count > current_count:
        try:
            print(f"Updating the cited_by_count for {oaid.split('/')[-1]}: {title[:50]} from {current_cited_by_count} to {new_cited_by_count}")
            # The frame was read with dtype=str, so the count is stored as a string as well.
            df_articles_metadata.at[index, "cited_by_count"] = str(new_cited_by_count)
            df_articles_metadata.at[index, "updated_date"] = work["metadata"]["updated_date"]
            counter += 1
        except Exception as e:
            # Report the ID of the row being processed, not a PMID left over from an earlier cell.
            print(f"Failed to update the cited_by_count for ID: {oaid}")
            print(e)
    else:
        print(f"Citation count for ID: {oaid.split('/')[-1]} is up-to-date. Citattion count: {current_cited_by_count}. Skipping...")

print(f"Updated values for {counter} articles.")


In [None]:
import os
import sys
import json
from typing import Tuple, Set, List, Dict, Any
from copy import deepcopy
from datetime import datetime
import pandas as pd

sys.path.append(os.path.expanduser("../utils"))
from openalex_api_utils import get_works

def update_citations(
    file_path: str,
    save_metadata_to_disk: bool = True,
    save_backup: bool = True,
    save_log_file: bool = True, 
    verbose: bool = True
) -> Tuple[bool, str]:
    """
    Update citation counts in articles metadata file using OpenAlex API data.

    For every row whose OpenAlex ID is found in the API results, the row's
    ``cited_by_count`` and ``updated_date`` are replaced when the API reports
    a strictly higher citation count. Rows are never decreased.

    Args:
        file_path (str): Path to the articles metadata CSV file. ``~`` is expanded.
        save_metadata_to_disk (bool): Whether to save the updated metadata to disk. Default is True. Set to False for testing on actual metadata.
        save_backup (bool): Whether to save a backup of the original metadata. Default is True. Only honoured when ``save_metadata_to_disk`` is True.
        save_log_file (bool): Whether to update the log file. Default is True. Only honoured when ``save_metadata_to_disk`` is True.
        verbose (bool): Whether to show detailed progress messages. Default is True.

    Returns:
        Tuple[bool, str]: (success status, detailed message). When some rows
        could not be processed, the number of such rows is appended to the
        message of an otherwise successful run.

    Raises:
        AssertionError: If one of the boolean flags is not a ``bool``.
    """
    # Basic input validation
    file_path = os.path.expanduser(file_path)  # Expand a leading "~" to the user's home directory
    if not os.path.exists(file_path):
        return False, f"File not found: {file_path}"
    if not file_path.endswith('.csv'):
        return False, "Invalid file format. Must be CSV."
    assert isinstance(save_metadata_to_disk, bool), "save_metadata_to_disk must be a boolean."
    assert isinstance(save_backup, bool), "save_backup must be a boolean."
    assert isinstance(save_log_file, bool), "save_log_file must be a boolean."
    assert isinstance(verbose, bool), "verbose must be a boolean."

    # Read metadata file
    if verbose:
        print("Reading metadata file...")
    try:
        metadata = pd.read_csv(file_path, dtype=str)
        metadata["publication_date"] = pd.to_datetime(metadata["publication_date"])
        metadata = metadata.sort_values(by="publication_date", ascending=False)

        if metadata.empty:
            return False, "Empty metadata file"

        required_cols = ['oaid', 'cited_by_count', 'updated_date', 'doi_url']
        if not all(col in metadata.columns for col in required_cols):
            return False, f"Missing required columns: {set(required_cols) - set(metadata.columns)}"

        # Independent copy of the data as read, used for the backup file.
        metadata_backup = metadata.copy(deep=True)

    except Exception as e:
        return False, f"Error reading metadata file: {str(e)}"

    # Fetch works data from OpenAlex API
    if verbose:
        print("Calling OpenAlex API ...")
    try:
        # Test for missing values BEFORE converting to str: str(NaN) is the
        # non-missing string "nan", which would otherwise be sent as an ID.
        valid_ids = [
            str(oaid).split('/')[-1]  # keep only the trailing ID of an OpenAlex URL
            for oaid in metadata['oaid']
            if pd.notna(oaid) and str(oaid).strip()
        ]

        if not valid_ids:
            return False, "No valid OpenAlex IDs found"

        works, failed_calls = get_works(
            ids=valid_ids,
            email=os.getenv("EMAIL"),
            select_fields="id,doi,cited_by_count,updated_date",
            show_progress=verbose
        )
    except Exception as e:
        return False, f"API error: {str(e)}"

    # Index the returned works by their full OpenAlex ID so each row needs a
    # single dictionary lookup. The first occurrence of an ID wins; entries
    # without a usable metadata dict are skipped.
    works_by_id: Dict[str, Any] = {}
    for candidate in works:
        candidate_metadata = candidate.get("metadata") if isinstance(candidate, dict) else None
        if isinstance(candidate_metadata, dict) and "id" in candidate_metadata:
            works_by_id.setdefault(candidate_metadata["id"], candidate)

    # Update citation counts
    updated_count = 0
    errors: List[str] = []

    if verbose:
        print("Updating citation counts...")
    for idx, row in metadata.iterrows():
        oaid = str(row["oaid"])
        try:
            doi = row["doi_url"]
            # A missing local count is treated as 0.
            current_citations = int(row["cited_by_count"]) if pd.notna(row["cited_by_count"]) else 0

            work = works_by_id.get(oaid)
            if work is None:
                # No API result for this row (e.g. a failed call); leave it untouched.
                continue

            new_citations = work["metadata"]["cited_by_count"]

            if new_citations > current_citations:
                if verbose:
                    print(f"Updating citations for OAID: {oaid} / DOI: {doi} from {current_citations} to {new_citations}")
                metadata.at[idx, 'cited_by_count'] = str(new_citations)
                metadata.at[idx, 'updated_date'] = work["metadata"]["updated_date"]
                updated_count += 1
            else:
                if verbose:
                    print(f"Citation count for OAID: {oaid} / DOI: {doi} is up-to-date. Citation count: {current_citations}. Skipping...")

        except Exception as e:
            # Best effort: one bad row must not abort the whole update, but it is recorded.
            errors.append(f"Error processing {oaid}: {str(e)}")
            if verbose:
                print(f"Failed to update the cited_by_count for ID: {oaid}")
                print(e)
            continue

    # Surface skipped rows in the returned message; otherwise they are invisible when verbose is False.
    error_note = f" {len(errors)} row(s) could not be processed." if errors else ""

    if updated_count == 0:
        return True, "No updates made. Citation counts are up-to-date." + error_note

    if not save_metadata_to_disk:
        return True, f"Successfully updated citation counts for {updated_count} articles. No changes saved to disk." + error_note

    # Save updates: backup first, then the metadata file, then the log.
    if save_backup:
        try:
            if verbose:
                print("Saving a backup of the original metadata file...")
            backup_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
            # splitext only touches the final extension, unlike str.replace(".csv", ...).
            path_root, path_ext = os.path.splitext(file_path)
            backup_path = f"{path_root}_bkp-{backup_timestamp}{path_ext}"
            metadata_backup.to_csv(backup_path, index=False)
        except Exception as e:
            return False, f"Error saving backup: {str(e)}"

    if verbose:
        print("Saving updated metadata to disk...")
    try:
        metadata.to_csv(file_path, index=False)
    except Exception as e:
        return False, f"Error saving updated metadata to disk: {str(e)}"

    if save_log_file:
        if verbose:
            print("Updating the log file...")
        try:
            # The log file in the metadata file's directory is rewritten with these two keys.
            log_data = {
                "last_modified": datetime.now().strftime('%Y-%m-%d'),
                "status_message": f"Successfully updated citation counts for {updated_count} articles",
            }
            with open(os.path.join(os.path.dirname(file_path), "update-log.json"), 'w') as f:
                json.dump(log_data, f, indent=2)
        except Exception as e:
            return False, f"Error updating log file: {str(e)}"

    return True, f"Successfully updated citation counts for {updated_count} articles and saved metadata to disk." + error_note

### Test cases for update_citations()

In [None]:
ls ~/GitHub/nicomarr.github.io/_data/articles-metadata_bkp-20241018-16h30m.csv

In [None]:
# Test the function without saving the metadata to disk
file_path = "~/GitHub/nicomarr.github.io/_data/articles-metadata_bkp-20241018-16h30m.csv"
update_citations(
    file_path, 
    save_metadata_to_disk=False, 
    save_backup=False, 
    save_log_file=False,
    verbose=True)

In [None]:
ls ~/GitHub/nicomarr.github.io/_py/test_data

In [None]:
# Test the function on the actual metadata file in the test_data directory, update the metadata file and log file
file_path = "~/GitHub/nicomarr.github.io/_py/test_data/articles-metadata.csv"
update_citations(
    file_path, 
    save_metadata_to_disk=True, 
    save_backup=True, 
    save_log_file=True,
    verbose=True)

In [None]:
# Restore the test metadata from the most recent backup: strip the timestamp suffix from
# the backup's file name so that it overwrites the existing metadata file for subsequent tests
from pathlib import Path
import re

test_dir_path = Path("~/GitHub/nicomarr.github.io/_py/test_data/").expanduser()
# Oldest first, so the last element is the most recently modified backup
backup_files = sorted(test_dir_path.glob("articles-metadata_bkp-*.csv"), key=lambda p: p.stat().st_mtime)
if backup_files:
    last_backup_file = backup_files[-1]
    # Accept both timestamp styles used for backups: HHMMSS (update_citations)
    # and HHhMMm (append_metadata)
    new_name = re.sub(r'_bkp-\d{8}-(?:\d{6}|\d{2}h\d{2}m)', '', last_backup_file.stem) + '.csv'
    # Path.replace() overwrites an existing target on every platform, whereas
    # Path.rename() raises FileExistsError on Windows when the target exists
    last_backup_file.replace(last_backup_file.parent / new_name)
    print(f"Renamed {last_backup_file.name} to {new_name}")
else:
    print("No backup files found.")

In [None]:
# Run comprehensive test cases for update_citations function
import pandas as pd
import os
from datetime import datetime, UTC
import json

def test_update_citations():
    """Run comprehensive test cases for update_citations function.

    Every test case creates its fixtures inside its own temporary directory, so
    no file in the current working directory (in particular a real
    ``update-log.json``) is created, overwritten or deleted by the tests.

    Returns:
        list: One dict per test case with the keys 'name', 'status'
        ('PASSED', 'FAILED' or 'ERROR') and 'message'.
    """
    import tempfile  # Local import: keeps this cell independent of the import cells above

    print("Running update_citations tests...\n")
    results = []

    # Column layout of the metadata file, shared by the fixtures below
    metadata_columns = [
        'oaid', 'cited_by_count', 'updated_date', 'doi_url',
        'first_author_last_name', 'article_title', 'journal',
        'publication_year', 'publication_date', 'type', 'type_crossref'
    ]

    def run_test(name, test_function):
        """Helper to run a test and format results"""
        try:
            success, message = test_function()
            status = "PASSED" if success else "FAILED"
            print(f"{name}: {status}")
            print(f"Message: {message}\n")
            return {'name': name, 'status': status, 'message': message}
        except Exception as e:
            print(f"{name}: ERROR")
            print(f"Error: {str(e)}\n")
            return {'name': name, 'status': 'ERROR', 'message': str(e)}

    def test_valid_update():
        """Test successful citation update with actual data.

        NOTE(review): update_citations() queries OpenAlex, so this case presumably
        needs network access and the EMAIL environment variable - confirm.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_file = os.path.join(tmp_dir, 'test_metadata.csv')

            # Create test data matching actual metadata structure
            test_data = pd.DataFrame({
                'oaid': ['https://openalex.org/W3194117818'],
                'cited_by_count': ['5'],  # Citation count to be updated
                'updated_date': ['2024-07-13T16:04:21.884242'],
                'doi_url': ['https://doi.org/10.1007/s10875-021-01115-2'],
                'first_author_last_name': ['Guennoun'],
                'article_title': ['Test Article'],
                'journal': ['Test Journal'],
                'publication_year': ['2021'],
                'publication_date': ['2021-08-24'],
                'type': ['article'],
                'type_crossref': ['journal-article']
            })
            test_data.to_csv(test_file, index=False)

            # Create a log file for testing, next to the metadata file
            log_data = {
                "last_modified": "2024-07-13",
                "status_message": "Initial metadata file created"
            }
            with open(os.path.join(tmp_dir, "update-log.json"), 'w') as f:
                json.dump(log_data, f, indent=2)

            try:
                # Run update
                success, message = update_citations(
                    test_file,
                    save_metadata_to_disk=True,
                    save_backup=True,
                    save_log_file=False,
                    verbose=True
                )

                print("\nDebug Information:")
                print(f"Update success: {success}")
                print(f"Update message: {message}")

                # A failed update must fail the test with its own message instead
                # of being reported as an unexpected citation count
                if not success:
                    return False, f"update_citations reported a failure: {message}"

                # Read updated file and verify the update
                updated_data = pd.read_csv(test_file)
                new_count = updated_data['cited_by_count'].iloc[0]

                # Convert new count to int for comparison
                if int(new_count) < 6:
                    return False, f"Expected citation count >5, but got {new_count}"

                # Return success with specific message about the update
                return True, f"Citation count successfully updated from 5 to {new_count}"

            except Exception as e:
                return False, f"Test failed with error: {str(e)}"

    def test_invalid_format():
        """Test handling of invalid file format"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_file = os.path.join(tmp_dir, 'test.txt')
            with open(test_file, 'w') as f:
                f.write("test")

            success, message = update_citations(test_file)

            if success:
                return False, "Should fail for invalid file format"
            if "Invalid file format" not in message:
                return False, "Wrong error message for invalid format"

            return True, "Correctly handled invalid format"

    def test_missing_file():
        """Test handling of non-existent file"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            # The directory is freshly created, so this path cannot exist
            success, message = update_citations(os.path.join(tmp_dir, "nonexistent.csv"))

        if success:
            return False, "Should fail for missing file"
        if "File not found" not in message:
            return False, "Wrong error message for missing file"

        return True, "Correctly handled missing file"

    def test_empty_file():
        """Test handling of empty CSV file"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            test_file = os.path.join(tmp_dir, "empty.csv")

            # Create empty file with correct columns (header row only)
            pd.DataFrame(columns=metadata_columns).to_csv(test_file, index=False)

            success, message = update_citations(test_file)

            if success:
                return False, "Should fail for empty file"
            if "Empty metadata file" not in message:
                return False, "Wrong error message for empty file"

            return True, "Correctly handled empty file"

    # Run all tests
    tests = [
        ("Valid Update", test_valid_update),
        ("Invalid Format", test_invalid_format),
        ("Missing File", test_missing_file),
        ("Empty File", test_empty_file)
    ]

    for test_name, test_func in tests:
        results.append(run_test(test_name, test_func))

    # Print summary
    print("\nTest Summary:")
    passed = sum(1 for r in results if r['status'] == 'PASSED')
    failed = sum(1 for r in results if r['status'] == 'FAILED')
    errors = sum(1 for r in results if r['status'] == 'ERROR')

    print(f"Passed: {passed}")
    print(f"Failed: {failed}")
    print(f"Errors: {errors}")

    return results

# Run tests
if __name__ == "__main__":
    test_results = test_update_citations()


## Generate module to execute the functions from the command line interface

In [None]:
%%writefile ../utils/__init__.py
__version__ = "0.0.3"

In [None]:
%%writefile ../utils/website_utils.py
import os
import json
from typing import Tuple, Set, List, Dict, Any
from copy import deepcopy
from datetime import datetime
import pandas as pd
from openalex_api_utils import get_works

def update_citations(
    file_path: str,
    save_metadata_to_disk: bool = True,
    save_backup: bool = True,
    save_log_file: bool = True, 
    verbose: bool = True
) -> Tuple[bool, str]:
    """
    Update citation counts in articles metadata file using OpenAlex API data.

    A row is updated only when the citation count reported by the API is higher
    than the stored one. Rows without an OpenAlex ID are skipped. The backup and
    the log file are written only when `save_metadata_to_disk` is True and at
    least one row was updated.

    Args:
        file_path (str): Path to the articles metadata CSV file. A leading "~" is expanded.
        save_metadata_to_disk (bool): Whether to save the updated metadata to disk. Default is True. Set to False for testing on actual metadata.
        save_backup (bool): Whether to save a backup of the original metadata. Default is True. Set to False for testing on actual metadata.
        save_log_file (bool): Whether to update the log file. Default is True. Set to False for testing on actual metadata.
        verbose (bool): Whether to show detailed progress messages. Default is True.

    Returns:
        Tuple[bool, str]: (success status, detailed message)
    """
    # Basic input validation
    file_path = os.path.expanduser(file_path)  # Expand a leading "~" to the user's home directory
    if not os.path.exists(file_path):
        return False, f"File not found: {file_path}"
    if not file_path.endswith('.csv'):
        return False, "Invalid file format. Must be CSV."
    assert isinstance(save_metadata_to_disk, bool), "save_metadata_to_disk must be a boolean."
    assert isinstance(save_backup, bool), "save_backup must be a boolean."
    assert isinstance(save_log_file, bool), "save_log_file must be a boolean."
    assert isinstance(verbose, bool), "verbose must be a boolean."

    # Read metadata file
    if verbose:
        print("Reading metadata file...")
    try:
        metadata = pd.read_csv(file_path, dtype=str)
        metadata["publication_date"] = pd.to_datetime(metadata["publication_date"])
        metadata = metadata.sort_values(by="publication_date", ascending=False)
        
        if metadata.empty:
            return False, "Empty metadata file"
        
        required_cols = ['oaid', 'cited_by_count', 'updated_date', 'doi_url']
        if not all(col in metadata.columns for col in required_cols):
            return False, f"Missing required columns: {set(required_cols) - set(metadata.columns)}"
            
        metadata_backup = deepcopy(metadata)
        
    except Exception as e:
        return False, f"Error reading metadata file: {str(e)}"

    # Fetch works data from OpenAlex API
    if verbose:
        print("Calling OpenAlex API ...")
    try:
        # Test for missing values BEFORE converting to str: str(NaN) is the
        # non-missing string "nan", which would be sent to the API as an ID.
        # Only the last path segment is used, so both full URLs and bare IDs work.
        valid_ids = [
            str(raw_id).split('/')[-1]
            for raw_id in metadata['oaid']
            if pd.notna(raw_id)
        ]

        if not valid_ids:
            return False, "No valid OpenAlex IDs found"

        works, failed_calls = get_works(
            ids=valid_ids,
            email=os.getenv("EMAIL"),
            select_fields="id,doi,cited_by_count,updated_date",
            show_progress=verbose
        )
    except Exception as e:
        return False, f"API error: {str(e)}"

    # Index the returned works by the last path segment of their ID: one pass
    # instead of a scan per row. Entries without usable metadata are skipped so
    # that a single malformed result cannot break the lookup of other rows.
    works_by_id = {}
    for fetched in works or []:
        fetched_meta = fetched.get("metadata") if isinstance(fetched, dict) else None
        if isinstance(fetched_meta, dict) and fetched_meta.get("id"):
            # setdefault keeps the first work returned for an ID
            works_by_id.setdefault(str(fetched_meta["id"]).split('/')[-1], fetched)

    # Update citation counts
    updated_count = 0
    errors = []

    if verbose:
        print("Updating citation counts...")
    for idx, row in metadata.iterrows():
        oaid = row["oaid"]  # Bound before the try block so the error handler can always report it
        try:
            if pd.isna(oaid):
                continue  # No OpenAlex ID for this row, nothing to look up
            oaid = str(oaid)
            doi = row["doi_url"]
            current_citations = int(row["cited_by_count"]) if pd.notna(row["cited_by_count"]) else 0
            
            work = works_by_id.get(oaid.split('/')[-1])
            
            if not work:
                continue

            # Raises for a missing or non-numeric count; handled by the except below
            new_citations = int(work["metadata"]["cited_by_count"])

            if new_citations > current_citations:
                if verbose:
                    print(f"Updating citations for OAID: {oaid} / DOI: {doi} from {current_citations} to {new_citations}")
                metadata.at[idx, 'cited_by_count'] = str(new_citations)
                metadata.at[idx, 'updated_date'] = work["metadata"]["updated_date"]
                updated_count += 1
            else:
                if verbose:
                    print(f"Citation count for OAID: {oaid} / DOI: {doi} is up-to-date. Citation count: {current_citations}. Skipping...")
                    
        except Exception as e:
            errors.append(f"Error processing {oaid}: {str(e)}")
            if verbose:
                print(f"Failed to update the cited_by_count for ID: {oaid}")
                print(e)
            continue

    if verbose and errors:
        print(f"{len(errors)} row(s) could not be processed.")

    # Save updates if any were made
    if updated_count > 0:
        if save_metadata_to_disk:
            if save_backup:
                try:
                    if verbose:
                        print("Saving a backup of the original metadata file...")
                    backup_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
                    # file_path is known to end with ".csv" (validated above); slice the
                    # extension off instead of str.replace(), which would also rewrite
                    # any other ".csv" occurring earlier in the path
                    backup_path = f"{file_path[:-len('.csv')]}_bkp-{backup_timestamp}.csv"
                    metadata_backup.to_csv(backup_path, index=False)
                except Exception as e:
                    return False, f"Error saving backup: {str(e)}"

            if verbose:
                print("Saving updated metadata to disk...")
            try:
                metadata.to_csv(file_path, index=False)
            except Exception as e:
                return False, f"Error saving updated metadata to disk: {str(e)}"
            
            if save_log_file:
                if verbose:
                    print("Updating the log file...")
                try:
                    log_data = {
                        "last_modified": datetime.now().strftime('%Y-%m-%d'),
                        "status_message": f"Successfully updated citation counts for {updated_count} articles",
                    }
                    # The log file lives next to the metadata file
                    with open(os.path.join(os.path.dirname(file_path), "update-log.json"), 'w') as f:
                        json.dump(log_data, f, indent=2)
                except Exception as e:
                    return False, f"Error updating log file: {str(e)}"
        
            return True, f"Successfully updated citation counts for {updated_count} articles and saved metadata to disk."
        else: 
            return True, f"Successfully updated citation counts for {updated_count} articles. No changes saved to disk."
    else:
        return True, "No updates made. Citation counts were up-to-date."

In [None]:
%%writefile -a ../utils/website_utils.py

from typing import List, Dict, Any

def parse_data(works: List[Dict[str, Any]], exclude_errata: bool = True) -> pd.DataFrame:
    """
    Parse the raw data from the OpenAlex API and create a DataFrame.

    This function extracts relevant information from each work in the input list
    and creates a DataFrame with specified columns. It also removes duplicates
    based on PMID and filters out errata (if specified).

    Optional nested fields that are missing or null in a record (first author,
    journal/source, PMID, PMCID) are stored as an empty string, so a single
    sparse record does not abort parsing of the whole batch.

    Args:
        works (List[Dict[str, Any]]): A list of dictionaries, where each dictionary
            holds the work's data under the "metadata" key.
        exclude_errata (bool): Whether to exclude works of type "erratum" from the DataFrame.

    Returns:
        pd.DataFrame: A DataFrame containing extracted and processed information
        from the works, sorted by publication date (newest first).

    Example:
        >>> df_works = parse_data(works)
        >>> df_works.head()

    Note:
        The function extracts the following information for each work:
        - First author's last name
        - Article title
        - Journal name
        - Publication year and date
        - PMID, PMCID, and OpenAlex ID
        - PDF URL (if available)
        - DOI URL
        - Citation count and URL
        - Work type and Crossref type
        - Updated date (from the API)
    """

    columns = [
        'first_author_last_name', 'article_title', 'journal',
        'publication_year', 'publication_date', 'pmid', 'pmcid', 'oaid',
        'pdf_url', 'doi_url', 'cited_by_count', 'cited_by_ui_url', 'type',
        'type_crossref', 'updated_date'
    ]

    # Iterate over the works data and collect one row of extracted values per work
    oa_data = []
    for work in works:
        metadata = work["metadata"]

        # Last whitespace-separated token of the first author's display name;
        # empty when the work lists no authors or the name is missing
        authorships = metadata.get("authorships") or []
        first_author = (authorships[0].get("author") or {}) if authorships else {}
        first_author_last_name = (first_author.get("display_name") or "").split(" ")[-1]

        article_title = metadata["title"]

        # Both "primary_location" and its "source" may be null
        primary_source = (metadata.get("primary_location") or {}).get("source") or {}
        journal = primary_source.get("display_name") or ""

        publication_year = str(metadata["publication_year"])
        publication_date = metadata["publication_date"]
        if publication_date:
            try:
                publication_date = pd.to_datetime(publication_date).strftime('%Y-%m-%d')
            except ValueError:
                pass # If the date can't be parsed, keep the original string

        # The IDs are URLs; keep only the last path segment
        ids = metadata.get("ids") or {}
        pmid = (ids.get("pmid") or "").split("/")[-1]
        pmcid = (ids.get("pmcid") or "").split("/")[-1]

        oaid = metadata["id"]

        pdf_url = (metadata.get("best_oa_location") or {}).get("pdf_url")
        if pdf_url is None:
            pdf_url = "not available"

        doi_url = metadata["doi"]
        cited_by_count = str(metadata["cited_by_count"])
        # Turn the API link into a link to the OpenAlex web interface
        cited_by_ui_url = metadata["cited_by_api_url"].replace("api.openalex.org", "openalex.org")
        work_type = metadata.get("type")
        type_crossref = metadata.get("type_crossref")
        updated_date = metadata.get("updated_date")

        oa_data.append([
            first_author_last_name, article_title, journal, publication_year,
            publication_date, pmid, pmcid, oaid, pdf_url, doi_url,
            cited_by_count, cited_by_ui_url, work_type, type_crossref, updated_date
        ])

    # Create a DataFrame with the specified columns
    df_works = pd.DataFrame(oa_data, columns=columns, dtype=str)

    # Drop repeated PMIDs (keeping the first occurrence). Works without a PMID
    # all share the empty string and must not be treated as duplicates of each other.
    has_pmid = df_works["pmid"].fillna("").ne("")
    df_works = df_works[~(has_pmid & df_works.duplicated(subset=["pmid"]))]
    if exclude_errata:
        df_works = df_works[df_works["type"] != "erratum"]

    # Normalize the publication date to 'YYYY-MM-DD' strings; unparseable values become missing
    df_works["publication_date"] = pd.to_datetime(df_works["publication_date"], errors='coerce').dt.strftime('%Y-%m-%d')
    
    # Sort the DataFrame by publication date in descending order
    df_works = df_works.sort_values(by="publication_date", ascending=False)
    df_works.reset_index(drop=True, inplace=True)

    return df_works

In [None]:
%%writefile -a ../utils/website_utils.py

import os
import pandas as pd
from datetime import datetime
import argparse
from copy import deepcopy

def append_metadata(metadata_file_path: str, pmid_file_path: str, exclude_errata: bool = True, verbose: bool = True) -> Tuple[bool, str]:
    """
    Append metadata for missing PMIDs to an existing metadata file.

    PMIDs listed in the PMID file but absent from the metadata file are fetched
    via `get_works`, parsed with `parse_data` and placed on top of the existing
    rows. A timestamped backup of the previous metadata is written BEFORE the
    metadata file is overwritten, and `update-log.json` in the same directory
    gets a new "last_modified" date.

    Args:
        metadata_file_path (str): Path to CSV file containing existing metadata.
        pmid_file_path (str): Path to file containing list of PMIDs (one per line; blank lines are ignored).
        exclude_errata (bool): Whether to exclude errata from the metadata.
        verbose (bool): Whether to show verbose messages during the process.

    Returns:
        tuple: A tuple containing a boolean indicating if any updates were made, and a message string with details.
    """

    # Input validation
    assert metadata_file_path.endswith(".csv"), "Invalid file format. Please provide a CSV file."
    assert os.path.exists(metadata_file_path), "Metadata file not found."
    assert pmid_file_path.endswith(".txt"), "Invalid file format. Please provide a TXT file."
    assert os.path.exists(pmid_file_path), "PMID file not found."
    assert isinstance(verbose, bool), "Verbose must be a boolean."

    # Read existing metadata
    if verbose: print("Reading the existing metadata file...")
    try:
        metadata = pd.read_csv(metadata_file_path, dtype=str)
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the metadata file: {e}")
        return False, f"An error occurred while reading the metadata file: {e}"
    if "pmid" not in metadata.columns:
        if verbose: print("The metadata file has no 'pmid' column.")
        return False, "The metadata file has no 'pmid' column."

    # Read PMIDs from file
    if verbose: print("Reading the PMID file...")
    try:
        with open(pmid_file_path, 'r') as f:
            # Skip blank lines: an empty string would otherwise count as a new PMID
            pmids: Set[str] = {line.strip() for line in f if line.strip()}
    except Exception as e:
        if verbose:
            print(f"An error occurred while reading the PMID file: {e}")
        return False, f"An error occurred while reading the PMID file: {e}"

    # Find missing PMIDs
    if verbose: print("Searching for new PMIDs not in the metadata...")
    existing_pmids: Set[str] = set(metadata["pmid"].dropna().str.strip())
    new_pmids: List[str] = sorted(pmids - existing_pmids)  # Sorted for reproducible output
    if verbose: print(f"Found {len(new_pmids)} new PMID(s): {', '.join(new_pmids)}.")
    if not new_pmids:
        return False, "No new PMIDs found."

    try:
        # Make API calls to get the missing PMIDs
        select_fields: str = (
            "id,title,doi,primary_location,authorships,publication_year,"
            "publication_date,ids,best_oa_location,cited_by_count,"
            "cited_by_api_url,type,type_crossref,updated_date"
        )
        new_articles, failed_calls = get_works(
            ids=new_pmids,
            email=os.environ.get("EMAIL"),
            select_fields=select_fields,
            show_progress=verbose
        )
        if verbose: print(f"API calls completed. Failed calls: {len(failed_calls)}")
    except Exception as e:
        if verbose:
            print(f"An error occurred while fetching works data from the API: {e}")
        return False, f"An error occurred while fetching works data from the API: {e}"

    # Parse the data for new articles
    if verbose: print("Parsing the data for new articles...")
    try:
        df_new_articles = parse_data(new_articles, exclude_errata=exclude_errata)
    except Exception as e:
        if verbose:
            print(f"An error occurred while parsing the data for new articles: {e}")
        return False, f"An error occurred while parsing the data for new articles: {e}"

    if df_new_articles.empty:
        if exclude_errata:
            if verbose: print("No new articles found (Errata excluded).")
            return False, "No new articles found (Errata excluded)."
        else:
            if verbose: print("No new articles found.")
            return False, "No new articles found."

    # Append the new articles to the existing metadata (new rows on top)
    appended_pmids_str = ", ".join(set(df_new_articles["pmid"]))
    if verbose: print(f"Appending {len(df_new_articles)} new article(s) with PMID(s) {appended_pmids_str} to the existing metadata...")
    try:
        # pd.concat returns a new frame, so `metadata` still holds the original rows for the backup
        updated_metadata = pd.concat([df_new_articles, metadata], ignore_index=True)
    except Exception as e:
        if verbose:
            print(f"An error occurred while appending the new articles to the existing metadata: {e}")
        return False, f"An error occurred while appending the new articles to the existing metadata: {e}"

    # Write the backup first: if it cannot be written, the original file is left untouched
    # The path is known to end with ".csv" (asserted above); slice the extension off
    # instead of str.replace(), which would also rewrite ".csv" elsewhere in the path
    bkp_file_path = f"{metadata_file_path[:-len('.csv')]}_bkp-{datetime.now().strftime('%Y%m%d-%Hh%Mm')}.csv"
    try:
        if verbose: print("Saving a backup file to disk...")
        metadata.to_csv(bkp_file_path, index=False)
        if verbose: print("Saving the updated metadata to a CSV file...")
        updated_metadata.to_csv(metadata_file_path, index=False)
    except Exception as e:
        if verbose:
            print(f"An error occurred while saving the metadata: {e}")
        return False, f"An error occurred while saving the metadata: {e}"
    if verbose: print("Metadata updated successfully.")

    # Get the path to the log file from the metadata file path
    log_file_path = os.path.join(os.path.dirname(metadata_file_path), "update-log.json")

    # Update the log file; fall back to creating a fresh one if it cannot be read or updated
    current_date = datetime.now().strftime("%Y-%m-%d")
    try:
        if verbose: print("Updating the log file...")
        with open(log_file_path, "r") as f:
            update_log = json.load(f)
        update_log["last_modified"] = current_date # Expected format: {"last_modified": "2024-08-06"}
        with open(log_file_path, "w") as f:
            json.dump(update_log, f)
        if verbose: print(f"Log file updated successfully.")
    except Exception as e:
        if verbose: print(f"Error updating log file: {e}. Creating a new log file...")
        with open(log_file_path, "w") as f:
            json.dump({"last_modified": current_date}, f)
        if verbose: print(f"New log file created successfully.")

    return True, f"Appended {len(df_new_articles)} article(s) and saved file to {metadata_file_path}. Backup saved as {bkp_file_path}"


In [None]:
%%writefile ../utils/main.py
import argparse
import os
import sys
sys.path.append(os.path.expanduser("../utils"))
from website_utils import update_citations, append_metadata

def main():
    """Command-line entry point: update citation counts and/or append new articles.

    Exactly one of --update-citations, --append-metadata or --update-and-append
    must be given, together with the directory holding "articles-metadata.csv",
    "update-log.json" and (for append operations) "PMID-export.txt".

    The process exits through argparse's error handling when a required file is
    missing, and with status 1 when any operation returned a False status.
    """
    parser = argparse.ArgumentParser(description="Manage website metadata and citations.")
    
    # Main operation group
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--update-citations", action="store_true", help="Update citation counts in the metadata file")
    group.add_argument("--append-metadata", action="store_true", help="Append metadata for missing PMIDs")
    group.add_argument("--update-and-append", action="store_true", help="Perform both update and append operations")

    # Common arguments
    parser.add_argument("directory", type=str, help="Directory containing the metadata, log, and PMID files")
    parser.add_argument("--quiet", action="store_true", help="Run in quiet mode (no verbose output)")
    parser.add_argument("--include-errata", action="store_true", help="Include errata in the appended metadata")

    args = parser.parse_args()

    run_update = args.update_citations or args.update_and_append
    run_append = args.append_metadata or args.update_and_append

    # Define file paths
    metadata_file = os.path.join(args.directory, "articles-metadata.csv")
    log_file = os.path.join(args.directory, "update-log.json")
    pmid_file = os.path.join(args.directory, "PMID-export.txt")

    # Validate file existence
    if not os.path.exists(metadata_file):
        parser.error(f"Metadata file not found: {metadata_file}")
    if not os.path.exists(log_file):
        parser.error(f"Log file not found: {log_file}")
    if run_append and not os.path.exists(pmid_file):
        parser.error(f"PMID file not found: {pmid_file}")

    success_messages = []
    error_messages = []

    if run_update:
        success, message = update_citations(metadata_file, verbose=not args.quiet)
        if success:
            success_messages.append(f"Update citations operation: {message}")
        else:
            error_messages.append(f"Update citations operation completed without saving new data: {message}")

    if run_append:
        success, message = append_metadata(metadata_file, pmid_file, 
                                           exclude_errata=not args.include_errata, 
                                           verbose=not args.quiet)
        if success:
            success_messages.append(f"Append metadata operation: {message}")
        else:
            error_messages.append(f"Append metadata operation completed without saving new data: {message}")

    # Print results: successes first, then errors
    for message in success_messages + error_messages:
        print(message)

    # Exit with error if any operation failed. sys.exit() is used instead of the
    # exit() helper, which is injected by the `site` module for interactive use
    # and is not guaranteed to exist in every interpreter setup.
    if error_messages:
        sys.exit(1)

if __name__ == "__main__":
    main()



## Test the functions

### For execution from REPL / Jupyter

In [None]:
import sys, os
sys.path.append(os.path.expanduser("../utils"))
from website_utils import update_citations, parse_data, append_metadata
from openalex_api_utils import get_works

In [None]:
# Path to test metadata file
%ls ~/GitHub/nicomarr.github.io/_py/test_data


### Test command line execution

In [None]:
%run ../../_py/utils/main.py --update-citations ../../_py/test_data

In [None]:
# Restore the test metadata from the most recent backup: strip the timestamp suffix from
# the backup's file name.
# This will overwrite the existing metadata file in the test directory for subsequent tests
from pathlib import Path
import re

test_dir_path = Path("~/GitHub/nicomarr.github.io/_py/test_data/").expanduser()
# Oldest first, so the last element is the most recently modified backup
backup_files = sorted(test_dir_path.glob("articles-metadata_bkp-*.csv"), key=lambda p: p.stat().st_mtime)
if backup_files:
    last_backup_file = backup_files[-1]
    # Accept both timestamp styles used for backups: HHMMSS (update_citations)
    # and HHhMMm (append_metadata)
    new_name = re.sub(r'_bkp-\d{8}-(?:\d{6}|\d{2}h\d{2}m)', '', last_backup_file.stem) + '.csv'
    # Path.replace() overwrites an existing target on every platform, whereas
    # Path.rename() raises FileExistsError on Windows when the target exists
    last_backup_file.replace(last_backup_file.parent / new_name)
    print(f"Renamed {last_backup_file.name} to {new_name}")
else:
    print("No backup files found.")

## Instructions for execution from command line in the root directory of the repository

***Make sure to activate the virtual environment before running the commands and that the required packages are installed!***

### Update citation counts
To update citation counts in the `test_data` directory:
```sh
python ./_py/utils/main.py --update-citations ./_py/test_data
```
To update citation counts in the `_data` directory:
```sh
python ./_py/utils/main.py --update-citations ./_data
```
Or in quiet mode:
```sh
python ./_py/utils/main.py --update-citations ./_data --quiet
```

### Append metadata with new articles
To append metadata in the `test_data` directory:
```sh
python ./_py/utils/main.py --append-metadata ./_py/test_data
```

To append metadata in the `_data` directory:
```sh
python ./_py/utils/main.py --append-metadata ./_data
```
To include errata and run in quiet mode:
```sh
python ./_py/utils/main.py --append-metadata ./_data --include-errata --quiet
```

### Perform both update and append operations
```sh
python ./_py/utils/main.py --update-and-append ./_data
```

### For help

In [None]:
%run ~/GitHub/nicomarr.github.io/_py/utils/main.py --help