### Retrieval pipeline

Importing relevant modules

In [1]:
from elasticsearch import Elasticsearch
import os
import requests
import json
from bioBERT_encoder import BioBERTQueryEncooder
from medCPT_encoder import MedCPTQueryEncoder
from dotenv import load_dotenv

Initializing query encoder

In [2]:
bioBERT_encoder = BioBERTQueryEncooder()
med_cpt_encoder = MedCPTQueryEncoder()

Initializing Elastic Search connection

In [3]:
load_dotenv(dotenv_path="../pass.env", override=True)
elastic_password = os.getenv('ELASTIC_PASSWORD')
ca_cert = os.getenv("CA_CERT")
es = Elasticsearch(
    ['https://localhost:9200'],
    basic_auth=('elastic', elastic_password),
    verify_certs=True,
    ca_certs=ca_cert,
    request_timeout=60
)

es.info()

ObjectApiResponse({'name': 'LAP-CN-202', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'dZuZa2UEQhKesVFtLNUaBg', 'version': {'number': '9.2.0', 'build_flavor': 'default', 'build_type': 'zip', 'build_hash': '25d88452371273dd27356c98598287b669a03eae', 'build_date': '2025-10-21T10:06:21.288851013Z', 'build_snapshot': False, 'lucene_version': '10.3.1', 'minimum_wire_compatibility_version': '8.19.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# Define a search query
def bm25_search(query: str, k: int = 10):
    query = {
        "size": k,
        "query": {
            "match": {
                "content": f"{query}"
            }
        },
        "_source": ["PMID", "title", "embeddings"]
    }
    # Elasticsearch nutzt standardmässig das BM25-Modell, um die Relevanz der Dokumente zu berechnen
    return es.search(index='pubmed_index_embedded', body=query)

In [5]:
bm25_search("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 2)

ObjectApiResponse({'took': 6, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 0.9671856, 'hits': [{'_index': 'pubmed_index_embedded', '_id': 'YWJeU5oBpz1dlC2qjVaY', '_score': 0.9671856, '_source': {'PMID': '34567890', 'title': 'COVID-19 vaccine efficacy in clinical trials'}}]}})

In [6]:
def get_docs_via_PMIDs(PMIDs: list):
    query = {
        "size": len(PMIDs),
        "query": {
            "terms": {
                "pmid": PMIDs
            }
        },
        "_source": ["PMID", "title", "content"]
    }

    return es.search(index='pubmed_index_embedded', body=query)

In [7]:
def query_to_vector(text, encoder):
    embedding = encoder.encode(text)
    return embedding

def query(query: str, encoder:object, k: int = 2, url='http://localhost:5000/search'):
    vec = query_to_vector(query, encoder).tolist()  # Chuyển đổi mảng NumPy thành danh sách
    data = {
        'queries': [vec],  # Hãy chắc chắn rằng 'queries' là một danh sách các danh sách
        'k': k
    }
    response = requests.post(url, headers={'Content-Type': 'application/json'}, data=json.dumps(data))

    return response.json()

response = query("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", bioBERT_encoder)
print(response)

{'PMIDs': [[34567890, 12345678]], 'distances': [[20.790725708007812, 26.157302856445312]]}


In [8]:
PMIDs = response['PMIDs'][0]
distances = response['distances'][0]

print(f"Distances: {distances}")
print(f"PMIDs: {PMIDs}")

Distances: [20.790725708007812, 26.157302856445312]
PMIDs: [34567890, 12345678]


In [11]:
docs = get_docs_via_PMIDs(PMIDs)
print(docs)

{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}}


Now testing implemented classes

In [12]:
from bioBERT_retriever import BioBERTRetriever
retriever = BioBERTRetriever()

Retrieving 3 most relevant docs 

In [97]:
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 2)
print(response)

TlsError: TLS error caused by: TlsError(TLS error caused by: SSLError([Errno 2] No such file or directory))

Now the BM25 retriever

In [1]:
from bm25_retriever import BM25Retriever

retriever = BM25Retriever()

In [2]:
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 3)
print(response)

{
    "doc1": {
        "PMID": 28941848,
        "title": "Drug development against tuberculosis: Past, present and future.",
        "content": "Infection of Mycobacterium tuberculosis (MTB) was observed as early as 5000 years ago with evidence, which is a primeval enemy of the humanoid race. MTB is the pathogen which is responsible for causing the infectious disease tuberculosis; it remains a major cause of morbidity and mortality in poor low-income countries as well as in developing countries because of non-availability of reliable laboratory facilities. The current treatment for drug-resistant tuberculosis (TB) is lengthy, complex, and connected with severe harmful side effects and poor outcomes. The present cure against tuberculosis has substantial restrictions, in terms of their efficiency, side-effect outline, and complication of handling. Furthermore, the emergence of multi-drug resistant tuberculosis (MDR-TB) outbreaks during the 1990s and additionally in recent times the vas

Now trying the medCPT retriever without reranking

In [1]:
from medCPT_retriever import SemanticRetrieverMedCPT
retriever = SemanticRetrieverMedCPT(rerank=False)

In [2]:
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 3280912,
        "title": "Antituberculosis agents.",
        "content": "Tuberculosis, once considered a problem solved, is now dramatically on the rise. New approaches to chemotherapy will hopefully help to control this again serious problem. This article reviews the current status of tuberculosis chemotherapy, including the management of drug-resistant cases."
    },
    "doc2": {
        "PMID": 1640921,
        "title": "Management of persons exposed to multidrug-resistant tuberculosis.",
        "content": "Recent outbreaks of multidrug-resistant tuberculosis (MDR-TB) have posed challenges for the management of exposed persons. This report offers suggestions for evaluating and managing persons (i.e., contacts) who have been exposed to patients with infectious MDR-TB (TB due to strains of Mycobacterium tuberculosis resistant to both isoniazid [INH] and rifampin [RIF]), provides background information on alternative preventive therapy regimens with d

Now with reranking using the medCPT cross encoder

In [1]:
from medCPT_retriever import SemanticRetrieverMedCPT
retriever = SemanticRetrieverMedCPT(rerank=True)

In [2]:
response = retriever.retrieve_docs("Identify clinical trials investigating novel treatments for drug-resistant strains of tuberculosis.", 5)
print(response)

{
    "doc1": {
        "PMID": 1477244,
        "title": "Evaluation of new anti-infective drugs for the treatment and prevention of tuberculosis. Infectious Diseases Society of America and the Food and Drug Administration.",
        "content": "This guideline addresses the evaluation of new antimycobacterial drugs in the treatment and prevention (secondary prophylaxis) of infection by M. tuberculosis. Patients may be enrolled in clinical trials on the basis of clinical and/or microbiological criteria. A therapeutic regimen will likely include a combination of drugs; a randomized, active-control, comparative clinical trial is recommended. If appropriate samples can be obtained for culture during follow-up without placing the patient at unwarranted risk, the assessment of microbiological outcome is paramount. Prophylaxis will probably require a single drug, and a similar study design is preferred.",
        "score": 6.256978511810303
    },
    "doc2": {
        "PMID": 3280912,
      

In [1]:
from hybrid_retriever import HybridRetriever
retriever = HybridRetriever()

In [4]:
response = retriever.retrieve_docs("How to treat ADHD?", top_n=10, k=100)
print(response)

{
    "doc1": {
        "PMID": 28004618,
        "title": "Organisation of services for managing ADHD.",
        "content": "There is considerable variation in practice, both between and with different countries in the management of attention deficit hyperactivity disorder (ADHD). Whilst there is no one optimal model of service organisation there are general principles of care that can be introduced to reduce this variability. There are frequent debates and discussions about which professional group is best placed to manage ADHD at different points in the life cycle. Who delivers care is however less important than ensuring that training schemes provide adequate exposure, training and experience to both the core and non-core skills required to provide a comprehensive package of care. Most evidence-based guidelines recommend a multi-modal, multi-professional and multi-agency approach. Many also promote the use of both stepped care and shared care approaches for the management of ADHD. 

### RAG system

Now testing the combined RAG system using retriever number 1, semantic similarity search.

In [22]:
from med_rag import MedRAG

rag = MedRAG(retriever=1, question_type=1)

rag.get_answer("What deep learning in medical?")

ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))

KeyboardInterrupt: 

In [18]:
from med_rag import MedRAG

rag = MedRAG(retriever=2, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

TlsError: TLS error caused by: TlsError(TLS error caused by: SSLError([Errno 2] No such file or directory))

In [2]:
import os

print(os.getenv('OPENAI_API_KEY'))

None


In [1]:
from med_rag import MedRAG

rag = MedRAG(retriever=3, question_type=1)

rag.get_answer("What is the mode of inheritance of Wilson's disease?")

'{"response": "Wilson\'s disease is inherited in an autosomal recessive mode. This means that an individual must inherit two copies of the mutated gene (one from each parent) to develop the disease.", "used_PMIDs": ["26817129", "6109943"], "retrieved_PMIDs": [26817129, 6109943, 838566, 2724779, 6620327, 16810973, 20662462, 8186659, 11254776, 23518715], "retrieval_time": 1.5806100368499756, "generation_time": 1.6769413948059082}'

In [2]:
from med_rag import MedRAG

rag = MedRAG(retriever=3, question_type=2)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["24535059", "12711673", "2103444", "26382736", "17881586", "2691247", "17961216", "2010914"], "retrieved_PMIDs": [24535059, 12711673, 2103444, 21930924, 26382736, 17881586, 2207158, 2691247, 17961216, 2010914], "retrieval_time": 0.371307373046875, "generation_time": 1.5111398696899414}'

In [34]:
from med_rag import MedRAG

rag = MedRAG(retriever=4, question_type=1)

rag.get_answer("Is stop codon bypass possible?")

'{"response": "Stop codon bypass is possible through ribosomal frameshifting at hungry codons, allowing for readthrough of stop codons and continuation of translation in a shifted reading frame.", "used_PMIDs": [3199440, 1515416, 1731076, 3477671, 1779848], "retrieved_PMIDs": [3199440, 1515416, 1731076, 2253710, 1628840, 1689389, 2439408, 3477671, 1779848, 1814364]}'

In [1]:
from med_rag import MedRAG

rag = MedRAG(retriever=4, question_type=2)

In [2]:
rag.get_answer("Is stop codon bypass possible?")

'{"response": "yes", "used_PMIDs": ["1814364", "1731076", "1628840", "1689389", "3199440"], "retrieved_PMIDs": [1814364, 1731076, 1628840, 2439408, 1689389, 2253710, 1779848, 3199440, 1515416, 3477671], "retrieval_time": 3.8009798526763916, "generation_time": 1.5287399291992188}'

In [14]:
import google.generativeai as genai
genai.configure(api_key="AIzaSyCKSvOPlCW_P2BY2UlJlGYtX6KBxxSacu0")

for m in genai.list_models():
    print(m.name)


models/embedding-gecko-001
models/gemini-2.5-pro-preview-03-25
models/gemini-2.5-flash-preview-05-20
models/gemini-2.5-flash
models/gemini-2.5-flash-lite-preview-06-17
models/gemini-2.5-pro-preview-05-06
models/gemini-2.5-pro-preview-06-05
models/gemini-2.5-pro
models/gemini-2.0-flash-exp
models/gemini-2.0-flash
models/gemini-2.0-flash-001
models/gemini-2.0-flash-exp-image-generation
models/gemini-2.0-flash-lite-001
models/gemini-2.0-flash-lite
models/gemini-2.0-flash-preview-image-generation
models/gemini-2.0-flash-lite-preview-02-05
models/gemini-2.0-flash-lite-preview
models/gemini-2.0-pro-exp
models/gemini-2.0-pro-exp-02-05
models/gemini-exp-1206
models/gemini-2.0-flash-thinking-exp-01-21
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/gemini-2.5-flash-preview-tts
models/gemini-2.5-pro-preview-tts
models/learnlm-2.0-flash-experimental
models/gemma-3-1b-it
models/gemma-3-4b-it
models/gemma-3-12b-it
models/gemma-3-27b-it
models/gemma-3n-e4b-it
mo

In [1]:
from dotenv import dotenv_values

pass_config = dotenv_values("../pass.env")
print(pass_config["HF_KEY"])


AIzaSyCKSvOPlCW_P2BY2UlJlGYtX6KBxxSacu0


In [4]:
import google.generativeai as genai

API_KEY = pass_config["HF_KEY"]   # <-- đặt API key vào đây
model=genai.GenerativeModel("models/gemini-2.5-flash")
def test_gemini():
    try:
        genai.configure(api_key=API_KEY)

        response = model.generate_content(
            model="gemini-2.5-flash",
            contents="Hello! Test connection OK?"
        )

        print("=== Gemini API working ===")
        print(response.text)

    except Exception as e:
        print("=== Gemini API error ===")
        print(e)

test_gemini()

=== Gemini API error ===
GenerativeModel.generate_content() got an unexpected keyword argument 'model'
