<a href="https://colab.research.google.com/github/micha-blip/Bird-images-unsupervised-clustering/blob/main/Reference_checker.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
import requests
import numpy as np
import pandas as pd

# This is a simple function that takes a DOI, extracts the article's references
# and checks whether the referenced articles actually exist.
# It is meant as a simple weapon against LLM-hallucinated references.
# The DOI is passed as a bare identifier, for example 10.1016/j.cell.2025.08.007
# In the output:
#   green means the reference was found
#   red means the reference was not found
#   yellow means the DOI of the reference is missing

def get_references_from_doi(doi, return_dataframe=True):
    """Fetch an article's reference list from Crossref and verify each entry.

    The article's metadata is requested from the Crossref ``works`` endpoint.
    The DOI of every listed reference is collected (``'not found'`` where the
    reference carries no DOI) and passed to ``check_references_for_doi``,
    which looks each one up. A one-line summary is printed at the end.

    Args:
        doi: DOI of the article to inspect, e.g. ``10.1016/j.cell.2025.08.007``.
        return_dataframe: When true, return the results as a DataFrame.

    Returns:
        A ``pandas.DataFrame`` with the columns ``DOI`` and ``status`` (one row
        per reference) when ``return_dataframe`` is true, otherwise ``None``.
        ``None`` is also returned when the article itself cannot be fetched.
    """
    from urllib.parse import quote

    # Percent-encode the DOI so characters such as '#' or '?' stay in the path.
    api_url = f"https://api.crossref.org/v1/works/{quote(str(doi), safe='/')}"
    headers = {"User-Agent": "SimpleFetcher"}
    print('Sending request...')

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(api_url, headers=headers, timeout=30)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
        data = response.json()
    except requests.exceptions.RequestException as e:
        # Covers network failures and timeouts as well as an unknown DOI.
        print(f"An error occurred: the article could not be fetched ({e})")
        return None

    message = data.get("message", {})
    references = message.get("reference", [])

    # Crossref may return a missing or empty title list; fall back gracefully.
    titles = message.get("title") or ["No Title Available"]
    title = (
        str(titles[0])
        .replace("\\n", " ")
        .replace("\n", " ")
        .replace('<sup>+</sup>', "")
    )
    # Reset the italic style afterwards so later output is not affected.
    print('Processing response for: ' + '\x1B[3m' + title + '\x1B[0m')

    # 'not found' is the sentinel check_references_for_doi treats as "no DOI".
    doi_list = [ref.get("DOI", "not found") for ref in references]

    check_status = check_references_for_doi(doi_list)

    found = sum(1 for s in check_status if s == "article found")
    missing = sum(1 for s in check_status if s == "no article")
    no_doi = sum(1 for s in check_status if s == "no DOI")
    print("Done, found " + str(found) + " existing documents, " + str(missing) + " non-existing documents" + " and " + str(no_doi) + " missing DOI")

    if return_dataframe:
        references_checked = pd.DataFrame(doi_list, columns=['DOI'])
        references_checked['status'] = check_status
        return references_checked
    return None



def check_references_for_doi(doi_list):
    """Look up each DOI in Crossref and report whether a record exists.

    One coloured line is printed per entry: green with the article title when
    the DOI resolves, red when the request fails, yellow when the entry is the
    ``'not found'`` sentinel (the reference had no DOI).

    Args:
        doi_list: Sequence of DOI strings; ``'not found'`` marks a missing DOI.

    Returns:
        A NumPy array with one status per entry of ``doi_list``:
        ``'article found'``, ``'no article'`` or ``'no DOI'``.
    """
    from urllib.parse import quote

    headers = {"User-Agent": "SimpleFetcher"}
    # dtype=object keeps status strings from being truncated to the width of
    # the initial fill value, which a fixed-width unicode dtype would do.
    status = np.full(len(doi_list), "not processed", dtype=object)

    for i, doi in enumerate(doi_list):
        if doi == 'not found':  # the reference carried no DOI
            status[i] = 'no DOI'
            print('\033[33m' + str(i+1) + " " + "DOI missing" + '\033[0m')
            continue

        # Percent-encode the DOI so characters such as '#' or '?' stay in the path.
        api_url = f"https://api.crossref.org/v1/works/{quote(str(doi), safe='/')}"
        try:
            # A timeout keeps one stalled lookup from blocking the whole run.
            response = requests.get(api_url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException:
            # No response data exists on this path, so report the DOI itself.
            # NOTE: any request failure (including timeouts) counts as 'no article'.
            print('\033[31m' + str(i+1) + " " + "Error fetching data for " + str(doi) + '\033[0m')
            status[i] = 'no article'
            continue

        # The title list may be missing or empty; fall back to a placeholder.
        titles = data.get("message", {}).get("title") or ["No Title Available"]
        print('\033[32m' + str(i+1) + " " + str(titles[0]) + '\033[0m')
        status[i] = "article found"
    return status

In [137]:
# Example run: check the reference list of one published article.
x = get_references_from_doi(
    doi='10.1093/jas/skaf174',
    return_dataframe=True,
)

Sending request...
Processing response for: [3mChanges of fecal microbiota with supplementation of <i>Acremonium terricola</i> culture and yeast culture in ewes during lactation
[32m1 Effect of supplementing live Saccharomyces cerevisiae yeast on performance, rumen function, and metabolism during the transition period in Holstein dairy cows[0m
[32m2 Proteomics and metabolomics characterizing the pathophysiology of adaptive reactions to the metabolic challenges during the transition from late pregnancy to early lactation in dairy cows[0m
[32m3 Glycine, serine and threonine metabolism confounds efficacy of complement-mediated killing[0m
[32m4 Re-print of “Intestinal luminal nitrogen metabolism: Role of the gut microbiota and consequences for the host”[0m
[32m5 Utilizing the fecal microbiota to understand foal gut transitions from birth to weaning[0m
[32m6 Changes in rumen microbiota composition and in situ degradation kinetics during the dry period and early lactation as affe

In [138]:
# Display the result of the reference check (a DataFrame on success).
x

Unnamed: 0,DOI,status
0,10.3168/jds.2022-23046,article found
1,10.1016/j.jprot.2017.10.010,article found
2,10.1038/s41467-019-11129-5,article found
3,10.1016/j.phrs.2013.01.003,article found
4,10.1371/journal.pone.0216211,article found
5,10.3168/jds.2016-11982,article found
6,10.1007/s00253-007-1113-7,article found
7,10.3390/jof7060447,article found
8,10.1016/j.anscip.2023.01.240,article found
9,10.1007/s00726-017-2391-8,article found
