<a href="https://colab.research.google.com/github/micha-blip/Bird-images-unsupervised-clustering/blob/main/Reference_checker_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Simple reference checker

This notebook provides tools to check the validity of references in scientific articles using DOIs. The goal is to help identify potentially fabricated or hallucinated references, which can be a concern with AI-generated content, and thereby contribute to improving the reliability and integrity of scientific literature.

In [138]:
!pip install Bio

Collecting Bio
  Downloading bio-1.8.0-py3-none-any.whl.metadata (5.7 kB)
Collecting biopython>=1.80 (from Bio)
  Downloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl.metadata (11 kB)
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.4.1-py3-none-any.whl.metadata (10 kB)
Downloading bio-1.8.0-py3-none-any.whl (321 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.1/321.1 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading biopython-1.85-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m60.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gprofiler_official-1.0.0-py3-none-any.whl (9.3

In [154]:

import requests
import numpy as np
import pandas as pd
import json
from Bio import Entrez


def get_references_from_doi(doi, return_dataframe = True, verbose = True):
  """
  Fetches references for a given DOI, checks their existence via Crossref, and PubMed
  and returns a summary or a DataFrame of the results.

  Args:
    doi (str): The DOI of the article to fetch references from.
    return_dataframe (bool, optional): If True, returns a pandas DataFrame
                                        with DOI and status. If False, returns
                                        a dictionary summary. Defaults to True.
    verbose (bool, optional): If True, prints detailed status during the
                              reference check. Defaults to True.

  Returns:
    pandas.DataFrame or dict: A DataFrame with reference DOIs and their status
                              ('article found', 'no article', 'no DOI'),
                              or a dictionary summarizing the counts of each status.
                              Returns None if the initial article DOI is not found.
  """

  api_url = f"https://api.crossref.org/v1/works/{doi}"
  headers = {"User-Agent": "SimpleFetcher"}
  print('Sending request...')

  try:
      response = requests.get(api_url, headers=headers)
      response.raise_for_status()  # Check for bad responses
      data = response.json()

      # Get the list of references from the response
      references = data["message"].get("reference", [])
      title = data["message"].get('title', [])
      title = str(title[0]).replace("\\n", " ").replace('<sup>+</sup>', "")
      print('Processing response for: ' + '\x1B[3m'  + title)

      doi_list = []  # this is the list of DOIs obtained from references
      title_list = []
      reference_full_text = []

      if references:
          for i, ref in enumerate(references):
              ref_key = ref.get('key', f"substitute_key_{i}") # Use a substitute key if 'key' is missing
              if "DOI" in ref:
                  doi_list.append(ref['DOI'])
                  reference_full_text.append("skipped")
              elif "unstructured" in ref:
                  doi_list.append('not found')
                  reference_full_text.append(ref['unstructured'])
              else:
                  doi_list.append('not found')
                  reference_full_text.append('not found')
              if 'article-title' in ref:
                  title_list.append(ref['article-title'])
              else:
                  title_list.append('not found')

  except requests.exceptions.RequestException as e:
      print(f"An error occurred: The article was not found")
      return


  DOI_check_crossref = check_references_for_doi_crossref(doi_list, verbose)
  print("Done, found " + str(sum(DOI_check_crossref == "article found")) + " existing documents, " + str(sum(DOI_check_crossref == "no article")) + " non-existing documents" + " and " + str(sum(DOI_check_crossref == 'no DOI')) + " missing DOI")
  DOI_check_pubmed = check_references_for_doi_pubmed(doi_list, verbose)
  print("Done, found " + str(sum( DOI_check_pubmed == "article found")) + " existing documents, " + str(sum( DOI_check_pubmed == "no article")) + " non-existing documents" + " and " + str(sum( DOI_check_pubmed == 'no DOI')) + " missing DOI")

  if return_dataframe == True:
    references_checked = pd.DataFrame(doi_list,  columns=['reference_DOI'])
    references_checked['DOI_search_crossref'] = DOI_check_crossref
    references_checked['DOI_search_pubmed'] = DOI_check_pubmed
    references_checked['title'] = title
    references_checked['referenced_title'] = title_list
    references_checked['reference index'] = range(1,len( references_checked.index)+ 1)
    references_checked['full_reference_text'] = reference_full_text
    references_checked = references_checked[['title', 'reference index', 'referenced_title', 'reference_DOI', 'DOI_search_crossref', 'DOI_search_pubmed', 'full_reference_text']]
    return references_checked


def check_references_for_doi_crossref(doi_list, verbose):
  """
  Checks the existence of a list of DOIs using the Crossref API.

  Args:
    doi_list (list): A list of DOIs (strings) to check. 'not found' indicates
                     a missing DOI in the original reference list.
    verbose (bool): If True, prints the status and title (if found) for each DOI.

  Returns:
    numpy.ndarray: A numpy array of strings indicating the status for each DOI
                   in the input list ('article found', 'no article', 'no DOI').
  """

  crossref_status = np.full(len(doi_list), "not processed")
  headers = {"User-Agent": "SimpleFetcher"}

  for i, doi in enumerate(doi_list):
    if doi == 'not found': # if there is no doi in the original reference
      crossref_status[i] = 'no DOI'
      if verbose == True:
        print('\033[33m' + str(i+1) +" " +  "DOI missing in reference list" + '\033[0m')
      continue

    api_url = f"https://api.crossref.org/v1/works/{doi}"

    try:
        response = requests.get(api_url, headers=headers)
        response.raise_for_status()  # Check for bad responses

        if response.status_code == 200:
            data = response.json()
            title = data["message"].get('title', [])
            title = str(title[0]).replace("\\n", " ").replace('<sup>+</sup>', "") if title else 'No Title Available'
            if verbose == True:
               print('\033[32m'  + str(i+1) + " " + title + '\033[0m')
            crossref_status[i] = "article found"
        else:
            if verbose == True:
               print('\033[31m' + str(i+1) + " " +  f"No article found on Crossref for DOI: {doi}" + '\033[0m')
            crossref_status[i] = 'no article'

    except requests.exceptions.RequestException as e:
        if verbose == True:
           print('\033[31m' + str(i+1) + " " +  f"An error occurred checking DOI '{doi}' on Crossref: {e}" + '\033[0m')
        crossref_status[i] = 'no article' # Treat any other error as article not found for simplicity

  return crossref_status


def check_references_for_doi_pubmed(doi_list, verbose):
  """
  Checks the existence of a list of DOIs using the PubMed API (Entrez).

  Args:
    doi_list (list): A list of DOIs (strings) to check. 'not found' indicates
                     a missing DOI in the original reference list.
    verbose (bool): If True, prints the status and title (if found) for each DOI.

  Returns:
    numpy.ndarray: A numpy array of strings indicating the status for each DOI
                   in the input list ('article found', 'no article', 'no DOI').
  """
  # Always tell Entrez who you are
  Entrez.email = "your_email@example.com" # Replace with your email address

  status_pubmed = np.full(len(doi_list), "not processed")

  for i, doi in enumerate(doi_list):
    if doi == 'not found': # if there is no doi in the original reference
      status_pubmed[i] = 'no DOI'
      if verbose == True:
        print('\033[33m' + str(i+1) +" " +  "DOI missing in reference list" + '\033[0m')
      continue

    try:
        # Search PubMed for the DOI to get the PMID
        # Using a more specific search term to improve accuracy
        search_term = f'{doi}[doi]'
        search_handle = Entrez.esearch(db="pubmed", term=search_term, retmax=10) # Use retmax to limit results
        search_record = Entrez.read(search_handle)
        search_handle.close()
        pubmed_ids = search_record["IdList"]

        if pubmed_ids:
            # If PMID(s) found, fetch the article details to confirm existence and get title
            # Fetch details for the first result, as we expect only one match for a DOI
            fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_ids[0], retmode="xml")
            article_record = Entrez.read(fetch_handle)
            fetch_handle.close()

            if 'PubmedArticle' in article_record and len(article_record['PubmedArticle']) > 0:
                article = article_record['PubmedArticle'][0]['MedlineCitation']['Article']
                title = article.get('ArticleTitle', 'No Title Available')

                if verbose == True:
                   print('\033[32m'  + str(i+1) + " " + title + '\033[0m')
                status_pubmed[i] = "article found"
            else:
                # This case is less likely if a PMID was found, but included for robustness
                if verbose == True:
                   print('\033[31m' + str(i+1) + " " +  f"Could not fetch details for DOI: {doi}" + '\033[0m')
                status_pubmed[i] = 'no article'

        else:
            if verbose == True:
               print('\033[31m' + str(i+1) + " " +  f"No article found on PubMed for DOI: {doi}" + '\033[0m')
            status_pubmed[i] = 'no article' # No PMID found for the DOI

    except Exception as e: # Catch potential errors during Entrez interaction
        if verbose == True:
           print('\033[31m' + str(i+1) + " " +  f"An error occurred checking DOI '{doi}' on PubMed: {e}" + '\033[0m')
        status_pubmed[i] = 'no article' # Treat any other error as article not found for simplicity

  return status_pubmed


In [None]:
get_references_from_doi(doi =  '10.3390/ph16010014', return_dataframe=True, verbose = True)

Sending request...
Processing response for: [3mDehydroeburicoic Acid, a Dual Inhibitor against Oxidative Stress in Alcoholic Liver Disease
[32m1 Recent advances in alcohol-related liver disease (ALD): summary of a Gut round table meeting[0m
[32m2 Protective role of HO-1 and carbon monoxide in ethanol-induced hepatocyte cell death and liver injury in mice[0m
[32m3 Metadoxine improves the three- and six-month survival rates in patients with severe alcoholic hepatitis[0m
[32m4 Gastrointestinal and liver side effects of drugs in elderly patients[0m
[32m5 Hepatoprotective effect of gastrodin against alcohol-induced liver injury in mice[0m
[32m6 Role of mitochondria in alcoholic liver disease[0m
[33m7 DOI missing in reference list[0m
[32m8 Overexpression of FGF19 alleviates hypoxia/reoxygenation-induced injury of cardiomyocytes by regulating GSK-3β/Nrf2/ARE signaling[0m
[32m9 Dynamic Adaptation of Liver Mitochondria to Chronic Alcohol Feeding in Mice[0m
[32m10 Glucopyrano

In [None]:
# add column checking the publisher (is id indexed / is it predatory)

In [None]:
x[x.DOI_search != "article found"]

Unnamed: 0,title,reference index,referenced_title,reference_DOI,DOI_search,full_reference_text
6,"Dehydroeburicoic Acid, a Dual Inhibitor agains...",7,Mitochondrial dysfunction and alcohol-associat...,not found,no DOI,not found
34,"Dehydroeburicoic Acid, a Dual Inhibitor agains...",35,Antrodia cinnamomea reconsidered and A. salmon...,not found,no DOI,not found
46,"Dehydroeburicoic Acid, a Dual Inhibitor agains...",47,not found,not found,no DOI,"Wang, L., Lewis, T., Zhang, Y.L., Khodier, C.,..."
50,"Dehydroeburicoic Acid, a Dual Inhibitor agains...",51,Oxidation of human catalase by singlet oxygen ...,not found,no DOI,not found
55,"Dehydroeburicoic Acid, a Dual Inhibitor agains...",56,IL-6-deficient mice are susceptible to ethanol...,not found,no DOI,not found


In [128]:
doi =  '10.3390/ph16010014'
api_url = f"https://api.crossref.org/v1/works/{doi}"
headers = {"User-Agent": "SimpleFetcher"}
print('Sending request...')

response = requests.get(api_url, headers=headers)
response.raise_for_status()  # Check for bad responses
data = response.json()

# Get the list of references from the response
references = data["message"].get("reference", [])

Sending request...


In [None]:
article_list = ["https://doi.org/10.1007/978-3-031-66209-6_1",
                "https://doi.org/10.1007/978-3-031-66209-6_2",
                "https://doi.org/10.1007/978-3-031-66209-6_3",
                "https://doi.org/10.1007/978-3-031-66209-6_4",
                "https://doi.org/10.1007/978-3-031-66209-6_5",
                "https://doi.org/10.1007/978-3-031-66209-6_6",
                "https://doi.org/10.1007/978-3-031-66209-6_7",
                "https://doi.org/10.1007/978-3-031-66209-6_8",
                "https://doi.org/10.1007/978-3-031-66209-6_9"]




In [None]:
results_list = []

for article_doi in article_list:
  result = get_references_from_doi(doi = article_doi, return_dataframe=True, verbose = True)
  #result = result[result['DOI_search'] != 'article found']
  results_list.append(result)

combined_results = pd.concat(results_list, ignore_index=True)
combined_results[combined_results.DOI_search != "article found"]



Sending request...
Processing response for: [3mWhat is Circular Plastics Economy?
[33m1 DOI missing in reference list[0m
[33m2 DOI missing in reference list[0m
[33m3 DOI missing in reference list[0m
[32m4 Reassessing the projections of the World Water Development Report[0m
[33m5 DOI missing in reference list[0m
[33m6 DOI missing in reference list[0m
[33m7 DOI missing in reference list[0m
[33m8 DOI missing in reference list[0m
[32m9 Kunststofftechnik[0m
[33m10 DOI missing in reference list[0m
[33m11 DOI missing in reference list[0m
[32m12 Global Plastics Outlook[0m
[32m13 Global Plastics Outlook[0m
[32m14 Plastic futures and their CO2 emissions[0m
[33m15 DOI missing in reference list[0m
[31m16 An error occurred checking DOI '10.24406/umsicht-n-633611' on Crossref: 404 Client Error: Not Found for url: https://api.crossref.org/v1/works/10.24406/umsicht-n-633611[0m
[33m17 DOI missing in reference list[0m
[33m18 DOI missing in reference list[0m
[33m19 D

Unnamed: 0,title,reference index,referenced_title,reference_DOI,DOI_search,full_reference_text
0,What is Circular Plastics Economy?,1,not found,not found,no DOI,"United Nations Environment Program UNEP, Inter..."
1,What is Circular Plastics Economy?,2,not found,not found,no DOI,Circle Economy (ed.): The Circularity Gap Repo...
2,What is Circular Plastics Economy?,3,not found,not found,no DOI,International Energy Agency IEA (ed.): CO2 Emi...
4,What is Circular Plastics Economy?,5,not found,not found,no DOI,Global Cement and Concrete Association (ed.): ...
5,What is Circular Plastics Economy?,6,not found,not found,no DOI,Worldsteel Association (ed): World Steel in Fi...
...,...,...,...,...,...,...
618,Renewable Carbon for Plastics: Quo Vadis?,204,not found,not found,no DOI,N.N.: Covestro signs first global agreement to...
619,Renewable Carbon for Plastics: Quo Vadis?,205,not found,not found,no DOI,"Benzing, T: Einsatz nachwachsender Rohstoffe i..."
620,Renewable Carbon for Plastics: Quo Vadis?,206,not found,not found,no DOI,N.N.: BASF’s biomass balance approach. https:/...
621,Renewable Carbon for Plastics: Quo Vadis?,207,not found,not found,no DOI,European Commission Voluntary schemes set stan...


In [None]:
combined_results[combined_results.DOI_search != "article found"].to_csv("combined_results.csv")

In [None]:
# next steps
# this code if for searching without DOI

def search_without_doi(text):
  api_url = f"https://api.crossref.org/v1/works?query={text}" # Changed q to query as recommended in Crossref API docs for this type of search
  headers = {"User-Agent": "SimpleFetcher"} # Make sure headers is defined

  try:
      response = requests.get(api_url, headers=headers)
      response.raise_for_status()  # Check for bad responses
      print("Request successful:")

      data = response.json()
      items = data.get('message', {}).get('items', [])

      if not items:
          print("No items found in the search results.")
      else:
          # Find the item with the highest score
          best_match = max(items, key=lambda item: item.get('score', 0))
          print("Best match found:")
          title = best_match.get('title', ['No Title Available'])[0]
          doi = best_match.get('DOI', 'No DOI Available')
          author = best_match.get('author', 'No DOI Available')
          print(f"Title: {title}")
          print(f"DOI: {doi}")
          print(f"Author: {author}")


  except json.JSONDecodeError:
      print("Error decoding JSON response.")
  except Exception as e:
      print(f"An error occurred: {e}")