In [1]:
# NOTE: Create a .env file holding the API keys this notebook reads from the environment:
#   GEMINI_API_KEY=...   (read when configuring the Gemini client)
#   PUBMED_API_KEY=...   (read by the PubMed download cell)
%load_ext dotenv
%dotenv

In [None]:
import os

# All paths are resolved relative to the parent of the current working directory.
parent_dir = os.path.dirname(os.getcwd())
# Downloaded and cleaned PubMed TSV files are written to / read from this directory.
data_dir = f'{parent_dir}/data'
# The generation step writes one `<Text ID>.md` file per abstract into this directory.
results_dir = f'{parent_dir}/gemini_output/markdown_pubmed'

## Download data from PubMed

In [None]:
import os
import re
from time import sleep

from Bio import Entrez, Medline
from tqdm import tqdm

# Set your email and API key (replace the placeholder address with your own)
Entrez.email = "your_email@example.com"
api_key = os.environ['PUBMED_API_KEY']

# Search for the latest articles in below categories
search_term = "biology OR medicine OR healthcare"
# Each entry is [title, abstract, number of whitespace-separated words in the abstract]
articles = []

# Number of years to retrieve and batch size for each request
num_years = 5
max_size = 9999

for i in tqdm(range(num_years)):
    year = 2024 - i

    # Step 1: search for article ids restricted to a single year
    search_handle = Entrez.esearch(
        db="pubmed",
        sort="relevance",
        mindate=year,
        maxdate=year,
        term=search_term,
        retmax=max_size,
        api_key=api_key
    )
    try:
        search_result = Entrez.read(search_handle)
    finally:
        # Close the handle even when parsing the response fails
        search_handle.close()

    id_list = search_result["IdList"]
    if not id_list:
        # Nothing matched for this year, so there is nothing to fetch
        continue

    # Step 2: fetch details for the articles in MEDLINE text format
    fetch_handle = Entrez.efetch(db="pubmed", id=id_list, rettype="medline", retmode="text", api_key=api_key)
    try:
        for medline_record in Medline.parse(fetch_handle):
            title = medline_record.get('TI', None)
            abstract = medline_record.get('AB', None)
            # Keep only records that have both a title and an abstract
            if title and abstract:
                num_words = len(abstract.split())
                articles.append([title, abstract, num_words])
    finally:
        # The parser reads lazily from the handle, so close it only after iterating
        fetch_handle.close()

In [None]:
import pandas as pd

# Build the dataframe from the [title, abstract, word count] rows collected during download
df = pd.DataFrame(columns=['title', 'abstract', '#words'], data=articles)
# Some articles have both their peer reviewed and preprint versions listed
df = df.drop_duplicates()
# to_csv does not create missing directories, so make sure the target exists first
os.makedirs(data_dir, exist_ok=True)
df.to_csv(f'{data_dir}/PubMed_articles.tsv', sep='\t', index=False)

### Exploring the data

In [6]:
df.describe()

Unnamed: 0,#words
count,40891.0
mean,222.882223
std,86.550538
min,1.0
25%,163.0
50%,227.0
75%,272.0
max,1148.0


In [7]:
articles[0]

['N-of-1 medicine.',
 "The fields of precision and personalised medicine have led to promising advances in tailoring treatment to individual patients. Examples include genome/molecular alteration-guided drug selection, single-patient gene therapy design and synergy-based drug combination development, and these approaches can yield substantially diverse recommendations. Therefore, it is important to define each domain and delineate their commonalities and differences in an effort to develop novel clinical trial designs, streamline workflow development, rethink regulatory considerations, create value in healthcare and economics assessments, and other factors. These and other segments are essential to recognise the diversity within these domains to accelerate their respective workflows towards practice-changing healthcare. To emphasise these points, this article elaborates on the concept of digital health and digital medicine-enabled N-of-1 medicine, which individualises combination regim

### Clean abstracts
* Some abstracts may contain HTML tags whereas others may contain URL links.
* We decided to retain the URL links but remove the HTML tags.
* Section headings such as `AIM`, `OBSERVATION`, `CONCLUSION`, etc., present in the PubMed abstracts are removed, as most models tend to summarise and thus section headings are not needed.

In [8]:
from bs4 import BeautifulSoup
from typing import List

def clean_html(strings: List[str]) -> List[str]:
    '''
    Strip HTML markup from every string and report simple statistics.

    For each input string, `<<...>>` spans are deleted first (they can be
    misidentified as HTML tags), then the text is parsed with BeautifulSoup
    and, if at least one tag is found, replaced by its plain-text content.
    URLs are only counted, never removed.

    Args:
        strings: Texts to clean. The list is not modified.

    Returns:
        A new list with the cleaned texts, in the same order as the input.

    Side effects:
        Prints how many texts contained HTML tags and how many contain URLs.
    '''
    url_pattern = re.compile(r"(https?://[^\s]+)")
    # We remove special patterns which can be misidentified as HTML tags
    patterns_to_exclude = [re.compile(r'<<.*?>>')]

    count_html = 0
    count_url = 0
    cleaned = []

    for text in strings:
        for pattern in patterns_to_exclude:
            text = pattern.sub('', text)

        soup = BeautifulSoup(text, "html.parser")
        # find() with no arguments returns the first tag, or None for tag-free text
        if soup.find():
            text = soup.get_text()
            count_html += 1

        # URLs are counted after tag removal so the count reflects the cleaned text
        if url_pattern.search(text):
            count_url += 1

        cleaned.append(text)

    print(f"Number of abstracts with HTML tags: {count_html}")
    print(f"Number of abstracts with URLs: {count_url}")

    return cleaned

df['abstract'] = clean_html(df['abstract'].tolist())

  soup = BeautifulSoup(strings[i], "html.parser")


Number of abstracts with HTML tags: 25
Number of abstracts with URLs: 655


In [None]:
import re
from collections import Counter

# Sections headings were observed to be in CAPS followed by colon and whitespace.
# The {4,} quantifier means headings shorter than four letters are not captured.
regex = r"[A-Z]{4,}:\s+"

# Work on the abstracts currently stored in the dataframe
abstracts = df['abstract'].tolist()

matches = []
for abstract in abstracts:
    matches.extend(re.findall(regex, abstract))

# Count the frequency of each match and display in descending order
counts = Counter(matches)
counts = counts.most_common()

# Print the frequency of each match; headings seen fewer than 10 times are kept in the text
sections_to_remove = list()
for match, count in counts:
    if count < 10:
        # most_common() is sorted by count, so every remaining entry is rarer still
        break
    sections_to_remove.append(match)
    print(f"{match}: {count}")

# Removing these sections (each heading is escaped and matched literally)
section_regex = '|'.join(re.escape(section) for section in sections_to_remove)
if sections_to_remove:
    df['abstract'] = [re.sub(section_regex, '', abstract) for abstract in abstracts]

In [None]:
# Save the downloaded and cleaned data
df.to_csv(f'{data_dir}/PubMed_articles_cleaned.tsv', sep='\t', index=False)

## Using Google Gemini API

See the getting started guide for more information:
https://ai.google.dev/gemini-api/docs/get-started/python

In [2]:
import os

import google.generativeai as genai
from google.api_core.retry import Retry

# Configure the Gemini client with the key loaded from the environment
genai.configure(api_key=os.environ['GEMINI_API_KEY'])

# Create the model
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
# Sampling and output options passed to every model created in this notebook.
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 64,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
}
# NOTE(review): every category below uses "BLOCK_LOW_AND_ABOVE", so the safety
# filters are active, not disabled. Input text describing a patient's mental
# health experience can contain disturbing content, which may therefore still
# be blocked by the Gemini API filters. Confirm whether "BLOCK_NONE" was the
# intended threshold.
safety_settings = [
  {
    "category": "HARM_CATEGORY_HARASSMENT",
    "threshold": "BLOCK_LOW_AND_ABOVE",
  },
  {
    "category": "HARM_CATEGORY_HATE_SPEECH",
    "threshold": "BLOCK_LOW_AND_ABOVE",
  },
  {
    "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "threshold": "BLOCK_LOW_AND_ABOVE",
  },
  {
    "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
    "threshold": "BLOCK_LOW_AND_ABOVE",
  },
]

# Module-level model used by generate(); built from the two settings objects above.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

def generate(description: str) -> str:
    '''
    Generate simplified and translated versions of one text using the
    module-level Gemini model.

    The prompt is assembled as intro, then the text, then the instructions,
    so that the "below text" / "above text" wording in the prompt matches
    where the text actually appears.

    Args:
        description: The biomedical text to simplify.

    Returns:
        The text of the model response (expected to be markdown).
    '''
    intro = "The below text contains some biomedical literature which is difficult for a layperson to understand."
    # Below instructions are used by the model to convert the description into a structured format
    instructions = "For the above text, create a simplified English version of the text which can be understood by a native English layperson with no medical background. Put the section heading as English simplified. The output section should have 1 paragraphs corresponding to the input text.\nNext, create an even more simpler version of the text which can be understood by a native English school kid with no medical background. Put the section heading as English super simplified. The output section should have 1 paragraphs corresponding to the input text.\n\nNext, created translated version of the simplified text in the following languages: Mandarin, followed by Spanish, followed by Arabic, followed by Hindi, followed by Bengali, followed by Portuguese, followed by Russian, followed by Japanese, followed by Punjabi\nPut the section heading as Langauge name Simplified. The output section should have 1 paragraphs corresponding to the input text. If some English terms excluding acronyms and numbers can't be translated then transliterate them.\n\nLet the voice in simplified text be same as in the original text so that the person narrating appears consistent. If there any URL links present in the original text then retain them in the simplified text as well."

    prompt = f"{intro}\n\n{description}\n\n{instructions}"
    # The request is given a 150 second timeout and the default retry policy
    response = model.generate_content(prompt, request_options={'timeout': 150, 'retry': Retry()})
    return response.text

### Test sample

#### Sample PubMed abstract

A 68-year-old man was admitted with hematochezia. Emergency computed tomography showed multiple diverticula throughout the colon. Initial colonoscopy on day 2 showed no active bleeding, but massive hematochezia on day 3 led to the performance of an emergency endoscopy. Substantial bleeding in the ileocecal area obscured the visual field, making it challenging to view the area around the bleeding site. Two endoscopic band ligations (EBLs) were applied at the suspected bleeding sites. Hemostasis was achieved without active bleeding after EBL. However, the patient developed lower right abdominal pain and fever (39.4 degrees C) on day 6. Urgent computed tomography revealed appendiceal inflammation, necessitating emergency open ileocecal resection for acute appendicitis. Pathological examination confirmed acute phlegmonous appendicitis, with EBLs noted at the appendiceal orifice and on the anal side. This case illustrates the efficacy of EBL in managing colonic diverticular bleeding. However, it also highlights the risk of appendicitis due to EBL in cases of ileocecal hemorrhage exacerbated by poor visibility due to substantial bleeding. Endoscopists need to consider this rare but important complication when performing EBL in similar situations.

In [None]:
sample = "A 68-year-old man was admitted with hematochezia. Emergency computed tomography showed multiple diverticula throughout the colon. Initial colonoscopy on day 2 showed no active bleeding, but massive hematochezia on day 3 led to the performance of an emergency endoscopy. Substantial bleeding in the ileocecal area obscured the visual field, making it challenging to view the area around the bleeding site. Two endoscopic band ligations (EBLs) were applied at the suspected bleeding sites. Hemostasis was achieved without active bleeding after EBL. However, the patient developed lower right abdominal pain and fever (39.4 degrees C) on day 6. Urgent computed tomography revealed appendiceal inflammation, necessitating emergency open ileocecal resection for acute appendicitis. Pathological examination confirmed acute phlegmonous appendicitis, with EBLs noted at the appendiceal orifice and on the anal side. This case illustrates the efficacy of EBL in managing colonic diverticular bleeding. However, it also highlights the risk of appendicitis due to EBL in cases of ileocecal hemorrhage exacerbated by poor visibility due to substantial bleeding. Endoscopists need to consider this rare but important complication when performing EBL in similar situations."
output = generate(sample)

#### Simplified texts generated

In [None]:
from IPython.display import Markdown, display

display(Markdown(output.replace('**\n', '**<br>')))

### Loading PubMed dataset

* Load previously downloaded data as a pandas dataframe

In [3]:
import pandas as pd

df = pd.read_csv(f'{data_dir}/PubMed_articles_cleaned.tsv', sep='\t')

### Generate in batches & parallel

In [5]:
import importlib
import os
import re
from time import sleep
from typing import List, Tuple

import numpy as np
from google.api_core.exceptions import ResourceExhausted
from joblib import Parallel, delayed
from requests.exceptions import RequestException
from tqdm import tqdm

progress_bar = None # Shared tqdm bar: created in batch_generate, advanced in generate_and_save
RETRIES = 200 # Occasionally, the Gemini API can have glitches (budget shared by all batches)
failures = list() # All failed inputs get stored here
SAVE = True # When False, responses are generated but not written to disk
DEBUG = True # Displays errors

REQUEST_TIMEOUT = 180 # If it takes longer than 3 minutes then timeout
GEMINI_API_LIMIT = 1500 # Upper bound on the number of batches queued by one batch_generate call
GEMINI_MAX_OUT = 8192 # Maximum number of tokens that can be returned (same value as max_output_tokens)
TOKEN_RETURN_RATIO = 16 # Approximate output tokens returned for input text

# Create the output directory if needed; exist_ok avoids a check-then-create race
os.makedirs(results_dir, exist_ok=True)


def get_model(api_key: str) -> genai.GenerativeModel:
    '''
    Configure the Gemini client with `api_key` and return a gemini-1.5-flash model.

    The model is built with the module-level `safety_settings` and
    `generation_config`.

    NOTE(review): `genai.configure` is a module-level call, so it presumably
    sets library-wide state. Models created with different keys may therefore
    not be independent of each other; confirm before relying on that.

    Args:
        api_key: Gemini API key to configure the client with.

    Returns:
        A `genai.GenerativeModel` instance.
    '''
    # Importing the package again via importlib would return this same module
    # object, so the already-imported `genai` is used directly.
    genai.configure(api_key=api_key)
    return genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        safety_settings=safety_settings,
        generation_config=generation_config,
    )


def save_batch(text: str, output_dir: "str | None" = None) -> None:
    '''
    Split a batched model response into per-text sections and save each one.

    A section starts at a `Text ID <number>` heading (optionally preceded by
    `#` characters and whitespace) and runs until the next such heading or the
    end of `text`. Each section is stripped and written to
    `<output_dir>/<number>.md`, overwriting any existing file.
    If `text` contains no `Text ID` heading, nothing is written.

    Args:
        text: Full response text covering one or more texts.
        output_dir: Directory to write into. Defaults to the module-level
            `results_dir`.
    '''
    if output_dir is None:
        output_dir = results_dir

    matches = list(re.finditer(r"#*\s*Text ID (\d+)", text))
    # Split the text based on Text ids
    for position, match in enumerate(matches):
        start = match.start()
        # A section ends where the next heading begins, or at the end of the text
        end = matches[position + 1].start() if position + 1 < len(matches) else len(text)
        record_number = int(match.group(1))

        # The responses contain many scripts (Mandarin, Arabic, Hindi, ...), so the
        # encoding is fixed to UTF-8 instead of depending on the platform default.
        with open(f'{output_dir}/{record_number}.md', 'w', encoding='utf-8') as fp:
            fp.write(text[start:end].strip())


def generate_and_save(batch: List[Tuple[int, str]]) -> None:
    '''
    Generates and stores simplified text for the
    given batch using the Google Gemini Flash API.
    Response can be in markdown format or sometimes as plain text.

    Args:
        batch: (Text ID, text) pairs that are sent together in one request.

    Behaviour:
        * On success the response is saved via `save_batch` (when SAVE is set).
        * `RequestException` and `ValueError` cause the batch to be skipped.
        * Any other exception triggers a retry after a pause while the global
          RETRIES budget lasts; once it is used up the batch text is appended
          to `failures`.
        * The shared progress bar is advanced exactly once per batch.

    NOTE: RETRIES is a module-level counter shared by every batch. It is
    updated without synchronisation even when batches run in threads.
    '''
    global RETRIES

    intro = f"{len(batch)} biomedical literature texts are provided below which are difficult for a layperson to understand."
    # Below instructions are used by the model to convert the description into a structured format
    instructions = "For each of the above texts, create a simplified English version of the text which can be understood by a native English layperson with no medical background. Put the section heading as English simplified. The output section should have 1 paragraphs corresponding to the input text.\nNext, create an even more simpler version of the text which can be understood by a native English school kid with no medical background. Put the section heading as English super simplified. The output section should have 1 paragraphs corresponding to the input text.\n\nNext, created translated version of the simplified text in the following languages: Mandarin, followed by Spanish, followed by Arabic, followed by Hindi, followed by Bengali, followed by Portuguese, followed by Russian, followed by Japanese, followed by Punjabi\nPut the section heading as Langauge name Simplified. The output section should have 1 paragraphs corresponding to the input text. If some English terms excluding acronyms and numbers can't be translated then transliterate them. Put the heading for each text as ## Text ID X, where X is the id of the text.\n\nLet the voice in simplified text be same as in the original text so that the person narrating appears consistent. If there any URL links present in the original text then retain them in the simplified text as well."
    # Newlines inside a text are flattened so each text occupies a single paragraph
    batch_description = '\n\n'.join([f'Text ID {i}: {desc}'.replace('\n', ' ') for i, desc in batch])
    prompt = f"{intro}\n\n{batch_description}\n\n{instructions}"

    # Retry loop (replaces the earlier recursive retry; same budget and pauses)
    while True:
        try:
            response = model.generate_content(prompt, request_options={'timeout': REQUEST_TIMEOUT})

            if SAVE:
                save_batch(response.text)

        except (RequestException, ValueError) as e:
            # For very long output the request can timeout
            # For output containing unsafe text, ValueError is raised
            if DEBUG:
                indices = [i for i, desc in batch]
                if isinstance(e, ValueError):
                    print('Skipped the following indices for producing unsafe outputs:', indices)
                else:
                    print(f'Skipped the following indices after a {type(e).__name__}:', indices)

        except Exception as e:
            if RETRIES > 0:
                RETRIES -= 1
                if DEBUG:
                    print('Retries left:', RETRIES, f'| {type(e).__name__}')
                # Pause between 10 and 19 seconds before trying the same batch again
                sleep(10+RETRIES%10)
                continue
            print(f"Error for batch: {e}")
            failures.append(batch_description)

        break

    progress_bar.update(1)


def batch_generate(descriptions: List[str], start_at: int = 0, n_jobs: int = 1) -> None:
    '''
    Group the texts into batches and process the batches in parallel threads.

    Starting at index `start_at`, consecutive texts are packed into a batch
    until the running word count multiplied by TOKEN_RETURN_RATIO reaches
    GEMINI_MAX_OUT. Every batch holds at least one text. At most
    GEMINI_API_LIMIT batches are queued per call. Each batch is handed to
    `generate_and_save`, and the global progress bar is sized to the number
    of batches.
    '''
    global progress_bar

    total = len(descriptions)
    batches = []
    cursor = start_at

    # The batch size is chosen dynamically from the estimated output length.
    while cursor < total and len(batches) < GEMINI_API_LIMIT:
        # The first text of a batch is always accepted, whatever its length
        current = [(cursor, descriptions[cursor])]
        words_so_far = len(descriptions[cursor].split())

        following = cursor + 1
        while following < total:
            words_so_far += len(descriptions[following].split())
            if words_so_far * TOKEN_RETURN_RATIO >= GEMINI_MAX_OUT:
                # Adding this text would exceed the estimate; it starts the next batch
                break
            current.append((following, descriptions[following]))
            following += 1

        batches.append(current)
        cursor = following

    progress_bar = tqdm(total=len(batches))

    run_parallel = Parallel(n_jobs=n_jobs, prefer='threads')
    run_parallel(delayed(generate_and_save)(one_batch) for one_batch in batches)

In [None]:
model = get_model(os.environ['GEMINI_API_KEY'])

# Resume after the highest Text ID already saved as `<id>.md`. Only purely numeric
# file names are considered, so unrelated files in the folder cannot break parsing.
# NOTE: batches run in parallel and some are skipped on errors, so ids below the
# highest saved one may still be missing.
saved_ids = [int(name[:-len('.md')]) for name in os.listdir(results_dir) if re.fullmatch(r'\d+\.md', name)]

batch_generate(
    descriptions = df['abstract'].tolist(), # Get all records
    start_at = max(saved_ids, default=-1) + 1, # Skip if previously mined
    n_jobs = 8 # Adjust based on hardware and Gemini API per minute token rate limit
)