<a href="https://colab.research.google.com/github/micha-blip/Bird-images-unsupervised-clustering/blob/main/Reference_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [86]:
import requests
import numpy as np
import pandas as pd

# A simple function that takes a DOI, extracts the article's references and checks whether those articles actually exist.
# Made to be used as a simple weapon against LLM-hallucinated references.
# It takes a DOI such as 10.1016/j.cell.2025.08.007.
# In the output (colored lines are printed only when verbose is True):
#   green means the reference was found
#   red means the reference was not found
#   yellow means the DOI of the reference is missing

def get_references_from_doi(doi, return_dataframe = True, verbose = True):
    """Fetch an article's reference list from Crossref and verify each reference.

    The Crossref work record for ``doi`` is downloaded, the DOI of every entry
    in its ``reference`` list is collected (entries without a DOI are recorded
    as the placeholder ``'not found'``), and the collected DOIs are handed to
    ``check_references_for_doi``. A one-line summary of the counts is always
    printed.

    Args:
        doi: DOI of the article to inspect, e.g. ``10.1016/j.cell.2025.08.007``.
        return_dataframe: When true (and ``verbose`` is true), return a
            per-reference DataFrame.
        verbose: Forwarded to ``check_references_for_doi``. When false, a
            one-row summary DataFrame is returned regardless of
            ``return_dataframe``.

    Returns:
        * ``None`` if the Crossref request for ``doi`` fails.
        * A one-row DataFrame with columns ``Found``, ``Not found`` and
          ``Missing DOI`` when ``verbose`` is false.
        * A DataFrame with columns ``DOI`` and ``status`` (one row per
          reference) when ``verbose`` and ``return_dataframe`` are both true.
        * ``None`` otherwise.
    """
    api_url = f"https://api.crossref.org/v1/works/{doi}"
    headers = {"User-Agent": "SimpleFetcher"}
    print('Sending request...')

    # Keep the try body limited to the network round-trip; a timeout prevents
    # the call from hanging forever on an unresponsive connection.
    try:
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()  # Check for bad responses
        message = response.json().get("message", {})
    except (requests.exceptions.RequestException, ValueError) as e:
        print(f"An error occurred: The article was not found ({e})")
        return None

    # Crossref may return a missing or empty title list; fall back to a
    # placeholder instead of raising IndexError.
    titles = message.get("title") or ["No Title Available"]
    title = (
        str(titles[0])
        .replace("\\n", " ")
        .replace("\n", " ")
        .replace('<sup>+</sup>', "")
    )
    # Reset the italic attribute so it does not leak into later output.
    print('Processing response for: ' + '\x1B[3m' + title + '\x1B[0m')

    # One entry per reference; 'not found' marks a reference without a DOI.
    doi_list = [ref.get("DOI", "not found") for ref in message.get("reference", [])]

    check_status = np.asarray(check_references_for_doi(doi_list, verbose))
    found = int((check_status == "article found").sum())
    not_found = int((check_status == "no article").sum())
    missing_doi = int((check_status == "no DOI").sum())
    print(
        f"Done, found {found} existing documents, "
        f"{not_found} non-existing documents and {missing_doi} missing DOI"
    )

    if not verbose:
        result = {"Found": found, "Not found": not_found, "Missing DOI": missing_doi}
        return pd.DataFrame(result, index=range(1))

    if return_dataframe:
        references_checked = pd.DataFrame(doi_list, columns=['DOI'])
        references_checked['status'] = check_status
        return references_checked

    return None



def check_references_for_doi(doi_list, verbose):
    """Look up each DOI on Crossref and report whether a record exists.

    Args:
        doi_list: Sequence of DOI strings. The placeholder ``'not found'``
            marks a reference that has no DOI; no request is sent for it.
        verbose: When true, print one colored line per reference
            (green = found, red = lookup failed, yellow = DOI missing).

    Returns:
        A NumPy string array with one entry per input DOI, each one of
        ``'article found'``, ``'no article'`` or ``'no DOI'``.
    """
    # Local import keeps this block self-contained (no file-level import needed).
    from urllib.parse import quote

    headers = {"User-Agent": "SimpleFetcher"}
    # The array's fixed string width comes from this fill value (13 chars);
    # every status label assigned below fits within it.
    status = np.full(len(doi_list), "not processed")

    for i, doi in enumerate(doi_list):
        if doi == 'not found': # if there is no doi
            status[i] = 'no DOI'
            if verbose:
                print('\033[33m' + str(i+1) + " " + "DOI missing" + '\033[0m')
            continue

        # Percent-encode the DOI so characters such as '#' or '?' are sent as
        # part of the path instead of being parsed as a fragment/query.
        api_url = f"https://api.crossref.org/v1/works/{quote(str(doi), safe='/')}"

        try:
            response = requests.get(api_url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.exceptions.RequestException, ValueError):
            # No response body is available here, so report the DOI itself
            # (the previous iteration's data must not be reused).
            if verbose:
                print('\033[31m' + str(i+1) + " " + "Error fetching data for " + str(doi) + '\033[0m')
            status[i] = 'no article'
            continue

        if verbose:
            # Title may be missing or an empty list; fall back to a placeholder.
            titles = data.get("message", {}).get("title") or ["No Title Available"]
            print('\033[32m' + str(i+1) + " " + str(titles[0]) + '\033[0m')
        status[i] = "article found"

    return status



In [87]:
# Summary-only run for a single article: with verbose=False the function
# returns a one-row DataFrame of counts.
x = get_references_from_doi(
    doi='10.1093/jas/skae224',
    return_dataframe=True,
    verbose=False,
)

Sending request...
Processing response for: [3mEffects of yeast-enriched functionalized canola meal supplementation on apparent total tract macronutrient digestibility and fecal characteristics, fecal microbiota, and immune function of healthy adult dogs


KeyboardInterrupt: 

In [105]:
# DOIs of the articles whose reference lists will be checked.
doi_list = [
    "10.1073/pnas.2426992122",
    "10.1172/JCI173193",
    "10.1186/s13023-025-03778-1",
    "10.1186/s13287-025-04610-0",
    "10.1186/s12951-025-03630-5",
    "10.1186/s12916-025-04358-7",
    "10.1186/s12967-025-06991-5",
]

In [None]:
# Check every article in turn; only the printed summary line is used here,
# the returned DataFrame is discarded.
for doi in doi_list:
    get_references_from_doi(
        doi=doi,
        return_dataframe=True,
        verbose=False,
    )

Sending request...
Processing response for: [3mTherapeutic restoration of mitochondria–endoplasmic reticulum cross talk for osteoarthritis
Done, found 61 existing documents, 0 non-existing documents and 2 missing DOI
Sending request...
Processing response for: [3mIntegrated screening identifies GPR31 as a key driver and druggable target for metabolic dysfunction–associated steatohepatitis
Done, found 45 existing documents, 0 non-existing documents and 0 missing DOI
Sending request...
Processing response for: [3mExploring the uncharted role of cell senescence in rare diseases
Done, found 171 existing documents, 0 non-existing documents and 7 missing DOI
Sending request...
Processing response for: [3mEngineering modification of human umbilical cord mesenchymal stem cell-derived small extracellular vesicles ameliorates polycystic ovary syndrome by enhancing the ovarian environment and regulating follicular development
Done, found 76 existing documents, 1 non-existing documents and 0 m