<a href="https://colab.research.google.com/github/micha-blip/Simple-article-reference-checker/blob/main/Reference_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Simple reference checker

This notebook provides tools to check the references of a scientific article by looking up each cited DOI in the Crossref database. The goal is to help identify potentially fabricated or hallucinated references, which can be a concern with AI-generated content, and thereby contribute to improving the reliability and integrity of scientific literature. A reference that is reported as missing is a prompt for manual verification, not proof that the reference is fabricated.

In [113]:
import requests
import numpy as np
import pandas as pd

def get_references_from_doi(doi, return_dataframe = True, verbose = True):
  """
  Fetch the reference list of an article from Crossref and check each cited DOI.

  The article's metadata is requested from the Crossref REST API, the DOI of
  every listed reference is collected (references without a DOI are recorded
  with the sentinel 'not found'), and the list is passed to
  check_references_for_doi. A one-line summary of the counts is always printed.

  Args:
    doi (str): The DOI of the article to fetch references from.
    return_dataframe (bool, optional): If True, returns a pandas DataFrame
                                        with one row per reference. If False,
                                        returns a dictionary summary.
                                        Defaults to True.
    verbose (bool, optional): If True, prints a status line for every
                              reference during the check. Only affects
                              printing, not the return value. Defaults to True.

  Returns:
    pandas.DataFrame or dict or None:
      - return_dataframe=True: DataFrame with columns 'DOI' and 'status'
        ('article found', 'no article', 'no DOI').
      - return_dataframe=False: dict with the integer counts
        {"Found", "Not found", "Missing DOI"}.
      - None if the request for the article itself fails.
  """
  # Local import so this function does not depend on the notebook's import cell.
  from urllib.parse import quote

  # DOIs may contain characters such as '#', '?', '<' or '>' that would
  # otherwise be interpreted as URL syntax; '/' must stay literal.
  api_url = f"https://api.crossref.org/v1/works/{quote(str(doi), safe='/')}"
  headers = {"User-Agent": "SimpleFetcher"}
  print('Sending request...')

  try:
    # A timeout keeps an unresponsive server from hanging the notebook forever.
    response = requests.get(api_url, headers=headers, timeout=30)
    response.raise_for_status()  # Check for bad responses
    data = response.json()
  except (requests.exceptions.RequestException, ValueError) as e:
    # Covers HTTP errors (e.g. 404), network failures and undecodable bodies.
    print(f"An error occurred: The article was not found ({e})")
    return None

  message = data.get("message", {})
  references = message.get("reference", [])

  # Crossref may omit the title or return an empty list; never index blindly.
  titles = message.get("title") or ["No Title Available"]
  title = (str(titles[0])
           .replace("\\n", " ")
           .replace("\n", " ")
           .replace('<sup>+</sup>', ""))
  # Reset the italic escape so later output is not rendered in italics.
  print('Processing response for: ' + '\x1B[3m' + title + '\x1B[0m')

  # One entry per reference, in order; 'not found' marks a reference that
  # carries no (or an empty) DOI.
  doi_list = [ref.get("DOI") or 'not found' for ref in references]

  check_status = check_references_for_doi(doi_list, verbose)

  n_found = int(np.sum(check_status == "article found"))
  n_missing = int(np.sum(check_status == "no article"))
  n_no_doi = int(np.sum(check_status == 'no DOI'))
  print("Done, found " + str(n_found) + " existing documents, "
        + str(n_missing) + " non-existing documents"
        + " and " + str(n_no_doi) + " missing DOI")

  if return_dataframe:
    references_checked = pd.DataFrame(doi_list, columns=['DOI'])
    references_checked['status'] = check_status
    return references_checked

  return {"Found": n_found, "Not found": n_missing, "Missing DOI": n_no_doi}



def check_references_for_doi(doi_list, verbose):
  """
  Checks the existence of a list of DOIs using the Crossref API.

  Each DOI is requested from the Crossref works endpoint. A successful
  response is recorded as 'article found'; any request failure is recorded
  as 'no article'.

  Args:
    doi_list (list): A list of DOIs (strings) to check. 'not found' indicates
                     a missing DOI in the original reference list; no request
                     is sent for such entries.
    verbose (bool): If True, prints the status and title (if found) for each DOI.

  Returns:
    numpy.ndarray: A numpy array of strings indicating the status for each DOI
                   in the input list ('article found', 'no article', 'no DOI').
                   An empty input yields an empty array.
  """
  # Local import so this function does not depend on the notebook's import cell.
  from urllib.parse import quote

  headers = {"User-Agent": "SimpleFetcher"}
  # Explicit string width: without it NumPy sizes the dtype from the
  # placeholder text, and any longer status label would be silently truncated.
  status = np.full(len(doi_list), "not processed", dtype="U32")

  for i, doi in enumerate(doi_list):
    if doi == 'not found': # if there is no doi
      status[i] = 'no DOI'
      if verbose:
        print('\033[33m' + str(i+1) +" " +  "DOI missing" + '\033[0m')
      continue

    # DOIs may contain characters such as '#', '?', '<' or '>' that would
    # otherwise be interpreted as URL syntax; '/' must stay literal.
    api_url = f"https://api.crossref.org/v1/works/{quote(str(doi), safe='/')}"
    try:
      # A timeout keeps one unresponsive request from stalling the whole run.
      response = requests.get(api_url, headers=headers, timeout=30)
      response.raise_for_status()
      data = response.json()
    except (requests.exceptions.RequestException, ValueError):
      # NOTE: this also covers timeouts, connection errors and rate limiting,
      # so 'no article' means "could not be confirmed", not only HTTP 404.
      if verbose:
        print('\033[31m' + str(i+1) + " " +  f"Error fetching data for DOI: {doi}" + '\033[0m')
      status[i] = 'no article'
      continue

    if verbose:
      # Crossref may omit the title or return an empty list for a record.
      title = data.get("message", {}).get("title") or ["No Title Available"]
      print('\033[32m'  + str(i+1) + " " + str(title[0]) + '\033[0m')
    status[i] = "article found"

  return status

In [114]:
# Single-article run with per-reference output enabled; the result is kept in x.
x = get_references_from_doi(
    doi="10.1186/s13023-025-03778-1",
    return_dataframe=True,
    verbose=True,
)

Sending request...
Processing response for: [3mExploring the uncharted role of cell senescence in rare diseases
[32m1 The role of senescent cells in ageing[0m
[32m2 Hallmarks of Cellular Senescence[0m
[32m3 The serial cultivation of human diploid cell strains[0m
[32m4 Cellular senescence: the good, the bad and the unknown[0m
[32m5 Chemotherapy-induced senescence, an adaptive mechanism driving resistance and tumor heterogeneity[0m
[32m6 Mitochondrial Dysfunction Induces Senescence with a Distinct Secretory Phenotype[0m
[32m7 Oncogene-induced senescence is part of the tumorigenesis barrier imposed by DNA damage checkpoints[0m
[32m8 Cellular Senescence, Neurological Function, and Redox State[0m
[32m9 Cellular senescence: when bad things happen to good cells[0m
[32m10 Aging of the cells: Insight into cellular senescence and detection Methods[0m
[32m11 Senescence-Associated Secretory Phenotypes Reveal Cell-Nonautonomous Functions of Oncogenic RAS and the p53 Tumor Supp

  return datetime.utcnow().replace(tzinfo=utc)


In [115]:
# Article DOIs whose reference lists are checked in the batch run below.
doi_list = [
    "10.1073/pnas.2426992122",
    "10.1172/JCI173193",
    "10.1186/s13023-025-03778-1",
    "10.1186/s13287-025-04610-0",
    "10.1186/s12951-025-03630-5",
    "10.1186/s12916-025-04358-7",
    "10.1186/s12967-025-06991-5",
]

In [None]:
# Batch run over every article in doi_list. The return value is discarded;
# only the printed progress and summary lines are used here.
for doi in doi_list:
    get_references_from_doi(
        doi=doi,
        return_dataframe=True,
        verbose=False,
    )

Sending request...
Processing response for: [3mTherapeutic restoration of mitochondria–endoplasmic reticulum cross talk for osteoarthritis
Done, found 61 existing documents, 0 non-existing documents and 2 missing DOI
Sending request...
Processing response for: [3mIntegrated screening identifies GPR31 as a key driver and druggable target for metabolic dysfunction–associated steatohepatitis
Done, found 45 existing documents, 0 non-existing documents and 0 missing DOI
Sending request...
Processing response for: [3mExploring the uncharted role of cell senescence in rare diseases
Done, found 171 existing documents, 0 non-existing documents and 7 missing DOI
Sending request...
Processing response for: [3mEngineering modification of human umbilical cord mesenchymal stem cell-derived small extracellular vesicles ameliorates polycystic ovary syndrome by enhancing the ovarian environment and regulating follicular development
Done, found 76 existing documents, 1 non-existing documents and 0 m