# PubMed Search tool

The PubMed search tool itself uses Claude to conduct the literature searches, which allows it to formulate advanced queries intelligently.

In [1]:
import os
import sys
import json
from dotenv import load_dotenv

# Directory the notebook kernel is running in. The parent of this directory is
# put on sys.path so that the `src` package imported just below can be found.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder -- confirm if the notebook is launched from elsewhere.
cwd = os.getcwd()
backend_dir = os.path.join(cwd, '..')

# Only append when missing, so re-running this cell does not keep adding
# duplicate entries to sys.path.
if backend_dir not in sys.path:
    sys.path.append(backend_dir)
from src.tools.utils import (
    formulate_pico, 
    run_parallel_mesh_queries, 
    formulate_esearch_query, 
    esearch_abstracts, 
    efetch_query_with_key,
    answer_from_abstracts,
    simple_research_pipeline
)

In [2]:
# Load variables from a local .env file into the environment
# (presumably the credentials the helper functions need -- confirm).
load_dotenv()

# The metrics CSV is kept as one plain string; it is never parsed in this notebook.
with open("../data/metrics_only.csv", mode="r", encoding="utf-8") as metrics_file:
    csv_string = metrics_file.read()

# Free-text patient description that goes in front of the metrics.
patient_info = """
    PATIENT CHARACTERISTICS
    sex: Male
    age: 32
    height: 177cm
    weight: 72KG
    """

# Shared context: patient description and CSV text, joined by two newlines.
context = "\n\n".join((patient_info, csv_string))

# The single research question every later step is scoped to.
research_question = "Strategies to increase HDL cholesterol in adult males"

In [3]:
# Turn the research question (plus the patient context) into PICO keyword
# groups. Judging by the printed output, the result is a dict with population,
# intervention, comparator and outcome keyword lists.
raw_pico_terms = formulate_pico(
    research_question,
    context,
)

print(raw_pico_terms)

{'population_keywords': ['adult male', 'men', 'young adult', 'healthy male'], 'intervetion_keywords': ['HDL increase', 'cholesterol therapy', 'lipid management', 'diet', 'exercise', 'lifestyle modification', 'supplements'], 'comparator_keywords': ['placebo', 'standard care', 'no intervention'], 'outcome_keywords': ['HDL cholesterol', 'high density lipoprotein', 'lipid profile', 'cardiovascular risk']}


### By reducing the context down to one specific research question and removing the conversation history (while keeping the initial context), we get pretty good performance!

In [4]:
import asyncio
from concurrent.futures import ThreadPoolExecutor, as_completed

# Usage
data = {
    'population_keywords': ['adult male', 'men', 'young adult', 'male', 'healthy male'], 
    'intervetion_keywords': ['diet', 'exercise', 'lifestyle', 'omega-3', 'niacin', 'fibrates', 'statins', 'alcohol', 'physical activity', 'weight loss', 'nutraceuticals'], 
    'comparator_keywords': ['placebo', 'control', 'usual care'], 
    'outcome_keywords': ['HDL cholesterol', 'high density lipoprotein', 'HDL-C', 'lipoproteins', 'cholesterol', 'cardiovascular risk', 'lipid profile']
}

results = await run_parallel_mesh_queries(data, max_workers=5)

mesh_result_context = str(results)
    

### Again, a naïve approach: these outputs may not even need parsing to be useful to the LLM
- It's about 36k tokens, so let's feed them as-is



In [5]:
# Build the PubMed search query from the question, the patient context, the
# PICO keywords and the stringified MeSH lookup results.
results = formulate_esearch_query(research_question, context, raw_pico_terms, mesh_result_context)

# The query string is the text of the first content block of the response.
first_block = results.content[0]
query = first_block.text

print(query)

(("men"[MeSH Terms] OR "male"[MeSH Terms]) AND ("young adult"[MeSH Terms] OR "adult"[MeSH Terms])) AND ("cholesterol, hdl"[MeSH Terms] OR "lipoproteins, hdl"[MeSH Terms]) AND ("diet"[MeSH Terms] OR "exercise"[MeSH Terms] OR "life style"[MeSH Terms] OR "dietary supplements"[MeSH Terms] OR "fatty acids, omega-3"[MeSH Terms] OR "niacin"[MeSH Terms] OR "fibric acids"[MeSH Terms] OR "hydroxymethylglutaryl-coa reductase inhibitors"[MeSH Terms]) AND ("prevention and control"[Subheading] OR "therapy"[Subheading])


In [6]:
query = '(("HDL cholesterol"[MeSH Terms] OR "high density lipoprotein"[Title/Abstract] OR "HDL-C"[Title/Abstract]) AND ("male"[MeSH Terms] OR "men"[MeSH Terms] OR "adult male"[Title/Abstract] OR "young adult"[MeSH Terms]) AND ("diet"[MeSH Terms] OR "exercise"[MeSH Terms] OR "lifestyle"[MeSH Terms] OR "niacin"[MeSH Terms] OR "fibrates"[MeSH Terms] OR "statins"[MeSH Terms] OR "omega-3"[MeSH Terms] OR "physical activity"[Title/Abstract] OR "weight loss"[MeSH Terms])) AND ("therapy"[Subheading] OR "treatment outcome"[MeSH Terms])'

abstracts_ids = await esearch_abstracts(query)

abstracts_ids_list = abstracts_ids["esearchresult"]["idlist"]
webenv = abstracts_ids["esearchresult"]["webenv"]
query_key = abstracts_ids["esearchresult"]["querykey"]

In [7]:
efetch_result = await efetch_query_with_key(query_key, webenv)

for i, article in enumerate(efetch_result):
    article["link"] = "https://pubmed.ncbi.nlm.nih.gov/" + article["pmid"]
    article.pop("pmid")
    article.pop("doi")
    efetch_result[i] = str(article)

parsed_efetch_result = "\n".join(efetch_result)
print(parsed_efetch_result)

✅ Successfully parsed 15 articles from PubMed
{'title': 'Beneficial Effect of the Mediterranean Diet on the Reduction of Prediabetes-Results of the Bialystok PLUS Study.', 'abstract': 'NotFound', 'pagination': '', 'link': 'https://pubmed.ncbi.nlm.nih.gov/40573145'}
{'title': 'Adherence to Mediterranean Diet and Implications for Cardiovascular Risk Prevention.', 'abstract': 'NotFound', 'pagination': '', 'link': 'https://pubmed.ncbi.nlm.nih.gov/40573102'}
{'title': 'NotFound', 'abstract': 'Excessive fat intake results in lipid metabolic disorders accompanied by inflammation and other complications. However, the effectiveness of drug interventions for metabolic disorders is not ideal, owing to their inherent limitations. Here, we introduce the probiotic ', 'pagination': '1519058', 'link': 'https://pubmed.ncbi.nlm.nih.gov/40547525'}
{'title': 'Effects of combined diet and physical activity on glycemic control and body composition in male recreational athletes with type 2 diabetes mellitus.

# The full pipeline

In [8]:
# Run the simplified automated research pipeline
result = await simple_research_pipeline(
    research_question=research_question,
    context=context
)

if result['status'] == 'success':
    print(result)
    
elif result['status'] == 'no_results':
    print("\nNo results found for the research question")
    print(f"Query used: {result['query']}")
    
else:
    print(f"\nError occurred: {result.get('error', 'Unknown error')}")

✅ Successfully parsed 1 articles from PubMed
{'status': 'success', 'answer': "# Strategies to Increase HDL Cholesterol in Adult Males\n\n## Direct Answer\nBased on the search results, there is limited direct evidence from the provided abstracts specifically addressing strategies to increase HDL cholesterol in adult males. The most relevant study examined red grape seed extract (RGSE) as a potential intervention for improving lipid profiles, including HDL-C, in patients with mild to moderate hyperlipidemia.\n\n## Summary of Most Relevant Study\n**Red grape seed extract for hyperlipidemia** (Gharipour et al.)\n- RGSE contains oligomeric proanthocyanidin complexes (flavonoids) that function as potent antioxidants\n- The study examined RGSE's effects on various lipid parameters including HDL-C in patients with mild to moderate hyperlipidemia\n- While the abstract doesn't specify the results for HDL-C specifically, it investigated RGSE as a potential natural intervention for improving lipid