In [1]:
# NOTE: Create a `.env` file containing the Gemini API key as `GEMINI_API_KEY=...`.
# The magics below load the dotenv IPython extension and invoke it (expected to
# export the `.env` entries); a later cell reads os.environ['GEMINI_API_KEY'].
%load_ext dotenv
%dotenv

In [2]:
import os

# Project folders are resolved relative to the parent of the current working
# directory: input data lives in `data/`, model output goes to
# `gemini_output/markdown_cochrane/`.
parent_dir = os.path.split(os.getcwd())[0]
data_dir = '{}/data'.format(parent_dir)
results_dir = '{}/gemini_output/markdown_cochrane'.format(parent_dir)

In [3]:
import pandas as pd

# Read the tab-separated Cochrane file, then store the number of
# whitespace-delimited words of every abstract in a `#words` column.
df = pd.read_csv(f'{data_dir}/Cochrane.tsv', sep='\t')
df['#words'] = df['abstract'].map(lambda text: len(text.split()))

### Exploring the data

In [4]:
# Summary statistics of the numeric columns (here: the `#words` counts).
df.describe()

Unnamed: 0,#words
count,22439.0
mean,166.059361
std,132.96237
min,50.0
25%,68.0
50%,99.0
75%,249.0
max,511.0


In [5]:
# Peek at the first abstract to see what the input text looks like.
df['abstract'][0]

'Cranberries (particularly in the form of cranberry juice) have been used widely for several decades for the prevention and treatment of urinary tract infections (UTIs). The aim of this review is to assess the effectiveness of cranberries in treating such infections. To assess the effectiveness of cranberries for the treatment of UTIs. We searched the Cochrane Kidney and Transplant Register of Studies up to 1 August 2023 through contact with the Information Specialist using search terms relevant to this review. Studies in the Register are identified through searches of CENTRAL, MEDLINE, and EMBASE, conference proceedings, the International Clinical Trials Registry Portal (ICTRP) Search Portal and ClinicalTrials.gov. All randomised controlled trials (RCTs) or quasi-RCTs of cranberry juice or cranberry products for the treatment of UTIs. Studies of men, women or children were to be included. Titles and abstracts of studies that were potentially relevant to the review were screened and st

### Clean abstracts
* The abstracts were already cleaned before being loaded.
* The cleaning step is therefore skipped here.

## Using Google Gemini API

See the getting started guide for more information:
https://ai.google.dev/gemini-api/docs/get-started/python

In [6]:
import os

import google.generativeai as genai
from google.api_core.retry import Retry

# Authenticate the SDK with the key taken from the environment.
genai.configure(api_key=os.environ['GEMINI_API_KEY'])

# Generation options passed to the model.
# See https://ai.google.dev/api/python/google/generativeai/GenerativeModel
generation_config = dict(
    temperature=1,
    top_p=0.95,
    top_k=64,
    max_output_tokens=8192,
    response_mime_type="text/plain",
)
# Safety filters are disabled (threshold BLOCK_NONE) because the input text
# can contain disturbing content that the Gemini API filters would otherwise
# block. The previous value, BLOCK_LOW_AND_ABOVE, is the *strictest* threshold
# and therefore did the opposite of what this comment describes.
safety_settings = [
  {"category": category, "threshold": "BLOCK_NONE"}
  for category in (
    "HARM_CATEGORY_HARASSMENT",
    "HARM_CATEGORY_HATE_SPEECH",
    "HARM_CATEGORY_SEXUALLY_EXPLICIT",
    "HARM_CATEGORY_DANGEROUS_CONTENT",
  )
]

# Module-level Gemini Flash client built from the settings above;
# generate() reads this `model` global when it sends a request.
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    safety_settings=safety_settings,
    generation_config=generation_config,
)

def generate(description: str) -> str:
    '''
    Generate simplified and translated versions of one text using the Gemini-Flash API.
    Response is in markdown format.

    The prompt is laid out as intro -> text -> instructions, so that the
    intro's "below text" and the instructions' "above text" both refer to
    `description`. (Previously the intro was never sent and the instructions
    preceded the text they called "the above text".)

    Args:
        description: The biomedical text to simplify and translate.

    Returns:
        The text of the model response.
    '''
    intro = "The below text contains some biomedical literature which is difficult for a layperson to understand."
    # Below instructions are used by the model to convert the description into a structured format
    instructions = "For the above text, create a simplified English version of the text which can be understood by a native English layperson with no medical background. Put the section heading as English simplified. The output section should have 1 paragraphs corresponding to the input text.\nNext, create an even more simpler version of the text which can be understood by a native English school kid with no medical background. Put the section heading as English super simplified. The output section should have 1 paragraphs corresponding to the input text.\n\nNext, created translated version of the simplified text in the following languages: Mandarin, followed by Spanish, followed by Arabic, followed by Hindi, followed by Bengali, followed by Portuguese, followed by Russian, followed by Japanese, followed by Punjabi\nPut the section heading as Langauge name Simplified. The output section should have 1 paragraphs corresponding to the input text. If some English terms excluding acronyms and numbers can't be translated then transliterate them.\n\nLet the voice in simplified text be same as in the original text so that the person narrating appears consistent. If there any URL links present in the original text then retain them in the simplified text as well."

    # Same ordering as the batched prompt in generate_and_save().
    prompt = f"{intro}\n\n{description}\n\n{instructions}"
    response = model.generate_content(prompt, request_options={'timeout': 150, 'retry': Retry()})
    return response.text

### Test sample

#### Sample Cochrane abstract

We included diagnostic accuracy studies of symptomatic or asymptomatic nonpregnant women and men reproductive age. Included trials should have prospectively enrolled participants without previous diagnostic testing, co-infections or complications and consecutively or through random sampling at primary or secondary care facilities. Only studies reporting that all participants received the index test and the reference standard and presenting 2 x 2 data were eligible for inclusion. We excluded diagnostic case-control studies.

In [12]:
# Smoke test: run the single-text prompt on one abstract excerpt and keep the
# raw response in `output` for display in the next cell.
sample = "We included diagnostic accuracy studies of symptomatic or asymptomatic nonpregnant women and men reproductive age. Included trials should have prospectively enrolled participants without previous diagnostic testing, co-infections or complications and consecutively or through random sampling at primary or secondary care facilities. Only studies reporting that all participants received the index test and the reference standard and presenting 2 x 2 data were eligible for inclusion. We excluded diagnostic case-control studies."
output = generate(sample)

#### Simplified texts generated

In [13]:
from IPython.display import Markdown, display

# Render the response as markdown; a newline that directly follows a closing
# `**` is replaced with an HTML <br>.
# NOTE(review): this also joins consecutive bold bullet items onto a single
# line (see the rendered output below) - confirm that this is intended.
display(Markdown(output.replace('**\n', '**<br>')))

## English Simplified

We looked at studies that tested how well certain tests could find out if people had a particular condition. These studies focused on people who were not pregnant, both men and women of childbearing age. The studies needed to include people who were:

* **Not showing symptoms or had symptoms of the condition.**<br>* **Hadn't been tested for the condition before.**<br>* **Didn't have other infections or complications.**<br>* **Were recruited consecutively or randomly from primary or secondary healthcare facilities.**<br>
We only included studies where everyone received both the test being studied and the standard test used to confirm the diagnosis. These studies also needed to provide specific data about the results of both tests. We excluded studies that looked at cases and controls separately. 

## English Super Simplified

We studied how good certain tests were at finding out if people had a specific problem. We looked at people who were not pregnant, both men and women who could have babies. The people in the studies:

* **Didn't have any signs of the problem or had some signs.**<br>* **Had never been tested for the problem before.**<br>* **Didn't have other problems or infections.**<br>* **Were chosen from regular clinics or hospitals.**<br>
We only looked at studies where everyone took both the new test and the regular test used to find out if they had the problem.  We also needed the studies to show the results of both tests in a specific way. We didn't include studies that looked at groups of people with and without the problem separately. 

## Mandarin Simplified

我们研究了某些测试在发现人们是否患有特定疾病方面的效果。我们研究了非孕妇人群，包括能够生育的男性和女性。这些研究中的人们：

* **没有表现出任何疾病症状或表现出了一些症状。**<br>* **以前从未接受过该疾病的测试。**<br>* **没有其他疾病或感染。**<br>* **是从普通诊所或医院中随机选择的。**<br>
我们只研究了所有人接受了新测试和用于确定他们是否患有该疾病的常规测试的研究。我们还需要这些研究以特定方式展示两种测试的结果。我们没有包括单独研究有病和无病人群体的研究。

## Spanish Simplified

Estudiamos qué tan buenas eran ciertas pruebas para determinar si las personas tenían un problema específico. Analizamos a personas que no estaban embarazadas, tanto hombres como mujeres en edad fértil. Las personas en los estudios:

* **No tenían ningún signo del problema o tenían algunos signos.**<br>* **Nunca habían sido examinados para el problema antes.**<br>* **No tenían otros problemas o infecciones.**<br>* **Fueron seleccionados de clínicas o hospitales regulares.**<br>
Solo analizamos estudios donde todos realizaron tanto la nueva prueba como la prueba regular utilizada para determinar si tenían el problema. También necesitábamos que los estudios mostraran los resultados de ambas pruebas de una manera específica. No incluimos estudios que analizaran grupos de personas con y sin el problema por separado. 

## Arabic Simplified

درسنا مدى جودة بعض الاختبارات في الكشف عن وجود مشكلة معينة لدى الأشخاص. درسنا أشخاصًا غير حوامل، سواء كانوا رجالًا أو نساءً في سن الإنجاب. الأشخاص المشمولون في الدراسات:

* **لم تظهر عليهم أي علامات للمشكلة أو ظهرت عليهم بعض العلامات.**<br>* **لم يتم اختبارهم للمشكلة من قبل.**<br>* **لم يعانوا من مشاكل أو عدوى أخرى.**<br>* **تم اختيارهم من عيادات أو مستشفيات عادية.**<br>
درسنا فقط الدراسات التي خضع فيها جميع الأشخاص لكل من الاختبار الجديد والاختبار المعتاد المستخدم لمعرفة ما إذا كانت لديهم المشكلة. كما كنا بحاجة إلى أن تُظهر الدراسات نتائج الاختبارين بطريقة محددة. لم نقم بتضمين الدراسات التي درست مجموعات من الأشخاص المصابين بالمشكلة وغير المصابين بها بشكل منفصل.

## Hindi Simplified

हमने यह अध्ययन किया कि कुछ परीक्षणों में किसी विशिष्ट समस्या का पता लगाने में कितनी सफलता होती है। हमने ऐसे लोगों का अध्ययन किया जो गर्भवती नहीं थीं, जिसमें बच्चे पैदा करने योग्य पुरुष और महिलाएं शामिल थीं। इन अध्ययनों में शामिल लोगों ने:

* **समस्या के कोई लक्षण नहीं दिखाए या कुछ लक्षण दिखाए।**<br>* **इस समस्या के लिए पहले कभी परीक्षण नहीं कराया था।**<br>* **किसी अन्य समस्या या संक्रमण से ग्रस्त नहीं थे।**<br>* **नियमित क्लीनिक या अस्पतालों से चुने गए थे।**<br>
हमने केवल उन अध्ययनों को शामिल किया जिनमें सभी लोगों ने नया परीक्षण और समस्या का पता लगाने के लिए इस्तेमाल किया जाने वाला नियमित परीक्षण दोनों कराए थे। हमें यह भी जरूरी था कि ये अध्ययन दोनों परीक्षणों के परिणाम एक विशिष्ट तरीके से दिखाएं। हमने उन अध्ययनों को शामिल नहीं किया जो अलग-अलग रूप से समस्या वाले और समस्या न होने वाले लोगों के समूहों का अध्ययन करते थे।

## Bengali Simplified

আমরা কিছু পরীক্ষার ক্ষমতা পরীক্ষা করেছি, যেগুলো কোনও নির্দিষ্ট সমস্যার সনাক্তকরণে কতটা সফল। আমরা এমন লোকেদের উপর গবেষণা করেছি যারা গর্ভবতী ছিলেন না, যার মধ্যে রয়েছে সন্তানধারণের সক্ষম পুরুষ এবং মহিলা। এই গবেষণায় অংশগ্রহণকারীরা:

* **সমস্যার কোন লক্ষণ দেখাননি বা কিছু লক্ষণ দেখিয়েছেন।**<br>* **এই সমস্যার জন্য পূর্বে কখনও পরীক্ষা করাননি।**<br>* **অন্য কোনও সমস্যা বা সংক্রমণে ভুগছেন না।**<br>* **নিয়মিত ক্লিনিক বা হাসপাতাল থেকে নির্বাচিত হয়েছেন।**<br>
আমরা কেবলমাত্র এমন গবেষণা অন্তর্ভুক্ত করেছি যেখানে সকলেই নতুন পরীক্ষা এবং সমস্যা সনাক্তকরণের জন্য ব্যবহৃত নিয়মিত পরীক্ষা উভয়টিই করেছেন। আমাদের এটিও প্রয়োজন ছিল যে এই গবেষণাগুলো উভয় পরীক্ষার ফলাফল একটি নির্দিষ্ট উপায়ে দেখাবে। আমরা এমন গবেষণা অন্তর্ভুক্ত করিনি যা সমস্যাযুক্ত এবং সমস্যাবিহীন লোকেদের পৃথকভাবে গ্রুপ করে গবেষণা করে।

## Portuguese Simplified

Estudamos a precisão de certos testes na detecção de um problema específico. Analisamos pessoas que não estavam grávidas, tanto homens como mulheres em idade fértil. As pessoas nos estudos:

* **Não apresentaram nenhum sinal do problema ou apresentaram alguns sinais.**<br>* **Nunca tinham sido testadas para o problema antes.**<br>* **Não tinham outros problemas ou infecções.**<br>* **Foram selecionadas de clínicas ou hospitais regulares.**<br>
Incluímos apenas estudos onde todos realizaram tanto o novo teste quanto o teste regular usado para determinar se tinham o problema. Também precisávamos que os estudos mostrassem os resultados de ambos os testes de uma maneira específica. Não incluímos estudos que analisavam grupos de pessoas com e sem o problema separadamente. 

## Russian Simplified

Мы изучили точность некоторых тестов в выявлении определенной проблемы. Мы проанализировали людей, которые не были беременны, как мужчин, так и женщин репродуктивного возраста. Люди, участвующие в исследованиях:

* **Не демонстрировали никаких признаков проблемы или имели некоторые симптомы.**<br>* **Никогда ранее не проходили тестирование на эту проблему.**<br>* **Не имели других проблем или инфекций.**<br>* **Были отобраны в обычных клиниках или больницах.**<br>
Мы включили только исследования, в которых все участники прошли как новый тест, так и стандартный тест, используемый для определения наличия проблемы. Нам также требовалось, чтобы исследования показывали результаты обоих тестов определенным образом. Мы не включали исследования, которые анализировали группы людей с проблемой и без нее отдельно. 

## Japanese Simplified

私たちは、特定の問題を検出する特定のテストの精度を調べました。私たちは、妊娠していない、出産可能な年齢の男性と女性の両方を対象に分析を行いました。研究に参加した人は：

* **問題の兆候を示さなかったか、またはいくつかの兆候を示していました。**<br>* **以前にその問題について検査を受けたことがありませんでした。**<br>* **他の問題や感染症にかかっていませんでした。**<br>* **通常のクリニックまたは病院から選ばれました。**<br>
私たちは、すべての参加者が新しいテストと、問題があるかどうかを判断するために使用される標準的なテストの両方を受けた研究のみを含めました。また、研究には両方のテストの結果を特定の方法で示す必要もありました。私たちは、問題のある人と問題のない人のグループを別々に分析した研究は含めていませんでした。

## Punjabi Simplified

ਅਸੀਂ ਇੱਕ ਖਾਸ ਸਮੱਸਿਆ ਦਾ ਪਤਾ ਲਗਾਉਣ ਵਿੱਚ ਕੁਝ ਟੈਸਟਾਂ ਦੀ ਸ਼ੁੱਧਤਾ ਦਾ ਅਧਿਐਨ ਕੀਤਾ। ਅਸੀਂ ਗਰਭਵਤੀ ਨਾ ਹੋਣ ਵਾਲੇ, ਪ੍ਰਜਨਨ ਉਮਰ ਦੇ ਮਰਦਾਂ ਅਤੇ ਔਰਤਾਂ ਦਾ ਵਿਸ਼ਲੇਸ਼ਣ ਕੀਤਾ। ਇਨ੍ਹਾਂ ਅਧਿਐਨਾਂ ਵਿੱਚ ਸ਼ਾਮਲ ਲੋਕ:

* **ਸਮੱਸਿਆ ਦੇ ਕੋਈ ਲੱਛਣ ਨਹੀਂ ਦਿਖਾਉਂਦੇ ਸਨ ਜਾਂ ਕੁਝ ਲੱਛਣ ਦਿਖਾਉਂਦੇ ਸਨ।**<br>* **ਪਹਿਲਾਂ ਕਦੇ ਵੀ ਇਸ ਸਮੱਸਿਆ ਲਈ ਟੈਸਟ ਨਹੀਂ ਕਰਾਇਆ ਸੀ।**<br>* **ਕੋਈ ਹੋਰ ਸਮੱਸਿਆ ਜਾਂ ਇਨਫੈਕਸ਼ਨ ਨਹੀਂ ਸੀ।**<br>* **ਨਿਯਮਤ ਕਲੀਨਿਕਾਂ ਜਾਂ ਹਸਪਤਾਲਾਂ ਤੋਂ ਚੁਣੇ ਗਏ ਸਨ।**<br>
ਅਸੀਂ ਸਿਰਫ ਉਨ੍ਹਾਂ ਅਧਿਐਨਾਂ ਨੂੰ ਸ਼ਾਮਲ ਕੀਤਾ ਜਿੱਥੇ ਸਾਰੇ ਭਾਗੀਦਾਰਾਂ ਨੇ ਨਵਾਂ ਟੈਸਟ ਅਤੇ ਸਮੱਸਿਆ ਦੀ ਪਛਾਣ ਕਰਨ ਲਈ ਵਰਤੇ ਜਾਂਦੇ ਮਿਆਰੀ ਟੈਸਟ ਦੋਵੇਂ ਕਰਾਏ ਸਨ। ਸਾਨੂੰ ਇਹ ਵੀ ਜ਼ਰੂਰੀ ਸੀ ਕਿ ਇਨ੍ਹਾਂ ਅਧਿਐਨਾਂ ਵਿੱਚ ਦੋਵੇਂ ਟੈਸਟਾਂ ਦੇ ਨਤੀਜੇ ਇੱਕ ਖਾਸ ਤਰੀਕੇ ਨਾਲ ਦਿਖਾਏ ਜਾਣ। ਅਸੀਂ ਉਨ੍ਹਾਂ ਅਧਿਐਨਾਂ ਨੂੰ ਸ਼ਾਮਲ ਨਹੀਂ ਕੀਤਾ ਜੋ ਸਮੱਸਿਆ ਵਾਲੇ ਅਤੇ ਸਮੱਸਿਆ ਵਾਲੇ ਲੋਕਾਂ ਦੇ ਸਮੂਹਾਂ ਦਾ ਵੱਖਰਾ ਵਿਸ਼ਲੇਸ਼ਣ ਕਰਦੇ ਸਨ।


### Loading Cochrane dataset

* Load the previously cleaned and filtered data from Devaraj et al.'s work on paragraph-level text simplification.

In [7]:
import pandas as pd

# Re-read the dataset from disk; this rebinds `df`, so the `#words` column
# added during exploration is not carried over.
df = pd.read_csv(f'{data_dir}/Cochrane.tsv', sep='\t')

### Generate in batches & parallel
* We combine multiple queries into a single request, which we call a batch.

In [8]:
import importlib
import os
import re
from time import sleep
from typing import List, Tuple

import numpy as np
from google.api_core.exceptions import ResourceExhausted
from joblib import Parallel, delayed
from requests.exceptions import RequestException
from tqdm import tqdm

progress_bar = None # tqdm bar shared by all workers; assigned in batch_generate()
RETRIES = 200 # Occasionally, the Gemini API can have glitches (budget shared by all batches)
failures = list() # All failed inputs get stored here
SAVE = True # Write model responses to results_dir
DEBUG = True # Displays errors

REQUEST_TIMEOUT = 180 # If it takes longer than 3 minutes then timeout
GEMINI_API_LIMIT = 1500 # Upper bound on the number of batch requests created per run
# Maximum number of tokens that can be returned; kept equal to
# generation_config["max_output_tokens"] (was 8196, a typo for 8192).
GEMINI_MAX_OUT = 8192
TOKEN_RETURN_RATIO = 16 # Approximate output tokens returned for input text

# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(results_dir, exist_ok=True)


def get_model(api_key: str) -> genai.GenerativeModel:
    '''
    Configure the Gemini SDK with `api_key` and return a Gemini Flash model
    that is intended to be used for the parallel requests.

    NOTE(review): `importlib.import_module` hands back the module object that
    is already imported, so `configure` presumably changes SDK state shared
    with every other model instance - confirm before mixing several keys.
    '''
    sdk = importlib.import_module('google.generativeai')
    sdk.configure(api_key=api_key)
    model_kwargs = dict(
        model_name="gemini-1.5-flash",
        safety_settings=safety_settings,
        generation_config=generation_config,
    )
    return sdk.GenerativeModel(**model_kwargs)


def save_batch(text: str, out_dir: 'str | None' = None) -> None:
    '''
    Split a batched model response into one markdown file per text.

    Every occurrence of "Text ID <n>" (optionally preceded by `#` characters
    and whitespace) starts a new section that runs up to the next occurrence,
    or to the end of `text`. Each section is stripped and written to
    `<out_dir>/<n>.md`; an existing file with the same id is overwritten.
    Nothing is written when `text` contains no "Text ID <n>" marker.

    Args:
        text: Full response text returned by the model for one batch.
        out_dir: Target directory. Defaults to the module-level `results_dir`.
    '''
    if out_dir is None:
        out_dir = results_dir

    matches = list(re.finditer(r"#*\s*Text ID (\d+)", text))
    # Section i spans from its own marker to the next marker (or end of text).
    boundaries = [m.start() for m in matches] + [len(text)]

    for match, start, end in zip(matches, boundaries, boundaries[1:]):
        record_number = int(match.group(1))
        # The responses contain many scripts (Mandarin, Arabic, Hindi, ...), so
        # the encoding is pinned instead of relying on the platform default.
        with open(f'{out_dir}/{record_number}.md', 'w', encoding='utf-8') as fp:
            fp.write(text[start:end].strip())


def generate_and_save(batch: List[Tuple[int, str]]) -> None:
    '''
    Generates and stores simplified text for the
    given batch using the Google Gemini Flash API.
    Response can be in markdown format or sometimes as plain text.

    Outcomes for one batch:
      * success - the response is passed to save_batch() when SAVE is set;
      * RequestException / ValueError - the batch is skipped without retrying
        and recorded in `failures`;
      * any other exception - the request is retried after a pause while the
        global RETRIES budget lasts; once it is used up the batch is recorded
        in `failures`.
    The progress bar advances exactly once per batch in every case.

    Args:
        batch: (record id, text) pairs that are sent in a single request.
    '''
    # RETRIES is one budget shared by every batch (and by every worker thread).
    global RETRIES

    intro = f"{len(batch)} biomedical literature texts are provided below which are difficult for a layperson to understand."
    # Below instructions are used by the model to convert the description into a structured format
    instructions = "For each of the above texts, create a simplified English version of the text which can be understood by a native English layperson with no medical background. Put the section heading as English simplified. The output section should have 1 paragraphs corresponding to the input text.\nNext, create an even more simpler version of the text which can be understood by a native English school kid with no medical background. Put the section heading as English super simplified. The output section should have 1 paragraphs corresponding to the input text.\n\nNext, created translated version of the simplified text in the following languages: Mandarin, followed by Spanish, followed by Arabic, followed by Hindi, followed by Bengali, followed by Portuguese, followed by Russian, followed by Japanese, followed by Punjabi\nPut the section heading as Langauge name Simplified. The output section should have 1 paragraphs corresponding to the input text. If some English terms excluding acronyms and numbers can't be translated then transliterate them. Put the heading for each text as ## Text ID X, where X is the id of the text.\n\nLet the voice in simplified text be same as in the original text so that the person narrating appears consistent. If there any URL links present in the original text then retain them in the simplified text as well."
    # Newlines inside a text are flattened so every text occupies one paragraph.
    batch_description = '\n\n'.join([f'Text ID {i}: {desc}'.replace('\n', ' ') for i, desc in batch])
    prompt = f"{intro}\n\n{batch_description}\n\n{instructions}"

    # Loop instead of recursing so that long retry streaks do not grow the stack.
    while True:
        try:
            response = model.generate_content(prompt, request_options={'timeout': REQUEST_TIMEOUT})

            if SAVE:
                save_batch(response.text)
            break

        except (RequestException, ValueError) as e:
            # For very long output the request can timeout
            # For output containing unsafe text, ValueError is raised
            # Record the batch so that skipped inputs are not lost silently.
            failures.append(batch_description)
            if DEBUG:
                print('Skipped the following indices for producing unsafe outputs:', [i for i, desc in batch], f'| {type(e).__name__}')
            break

        except Exception as e:
            if RETRIES <= 0:
                print(f"Error for batch: {e}")
                failures.append(batch_description)
                break
            RETRIES -= 1
            if DEBUG:
                print('Retries left:', RETRIES, f'| {type(e).__name__}')
            # Pause 10-19 seconds before the next attempt.
            sleep(10+RETRIES%10)

    # progress_bar is only set by batch_generate(); tolerate direct calls.
    if progress_bar is not None:
        progress_bar.update(1)


def batch_generate(descriptions: List[str], start_at: int = 0, n_jobs: int = 1) -> None:
    '''
    Groups the descriptions into batches and processes the batches in parallel.

    Starting at index `start_at`, consecutive descriptions are packed into a
    batch until the running word count multiplied by TOKEN_RETURN_RATIO
    reaches GEMINI_MAX_OUT; the first description of a batch is always
    accepted, so no batch is empty. At most GEMINI_API_LIMIT batches are
    created. Each batch is handed to generate_and_save() through joblib
    threads, and the global `progress_bar` is replaced by a new tqdm bar
    sized to the number of batches.
    '''
    global progress_bar

    total = len(descriptions)
    tasks = []
    cursor = start_at

    while cursor < total and len(tasks) < GEMINI_API_LIMIT:
        # The leading description always goes in, whatever its length.
        batch = [(cursor, descriptions[cursor])]
        word_count = len(descriptions[cursor].split())

        for idx in range(cursor + 1, total):
            word_count += len(descriptions[idx].split())
            # Stop before the estimated output would exceed the token limit.
            if word_count * TOKEN_RETURN_RATIO >= GEMINI_MAX_OUT:
                break
            batch.append((idx, descriptions[idx]))

        tasks.append((batch,))
        cursor += len(batch)

    progress_bar = tqdm(total=len(tasks))

    runner = Parallel(n_jobs=n_jobs, prefer='threads')
    runner(delayed(generate_and_save)(*task) for task in tasks)

In [None]:
# Rebind the shared `model`, then process every abstract that has no saved
# result yet: generation resumes one past the highest record id found among
# the `.md` files in results_dir (0 when there are none).
model = get_model(os.environ['GEMINI_API_KEY'])
batch_generate(
    descriptions=df['abstract'].tolist(),  # Get all records
    start_at=max(
        (int(name.split('.')[0]) + 1 for name in os.listdir(results_dir) if '.md' in name),
        default=0,
    ),  # Skip if previously mined
    n_jobs=8,  # Adjust based on hardware and Gemini API per minute token rate limit
)

In [11]:
# Index a resumed run would start from: one past the highest saved record id.
max([int(n.split('.')[0])+1 for n in os.listdir(results_dir) if '.md' in n], default=0)

18406