In [None]:
import time
import pandas as pd
from Bio import Entrez
from tqdm import tqdm

# --- CONFIGURATION ---
# Contact address set on the Entrez module. The value below is a placeholder:
# replace it with your own address before executing the cell.
Entrez.email = "yourEmail@gmail.com"    # Add your email before executing
# Query string handed to search_pubmed() and forwarded to esearch as `term`.
SEARCH_TERM = "oral health [Title/Abstract]"
# Upper bound on the number of PMIDs requested (forwarded to esearch as `retmax`).
MAX_RESULTS = 1000

def search_pubmed(query, max_results=100):
    """Search PubMed and return the matching PMIDs.

    Args:
        query: Search expression passed to esearch as ``term``.
        max_results: Maximum number of ids to request (esearch ``retmax``).

    Returns:
        The ``IdList`` entry of the parsed esearch response.

    Raises:
        Whatever ``Entrez.esearch`` / ``Entrez.read`` raise; the response
        handle is closed in every case.
    """
    print(f"Searching PubMed for: {query}...")
    handle = Entrez.esearch(db="pubmed", term=query, retmax=max_results)
    try:
        # Parsing can fail on a truncated or malformed response; the handle
        # must still be released, hence the finally clause.
        record = Entrez.read(handle)
    finally:
        handle.close()
    return record["IdList"]

def fetch_details(id_list):
    """Fetch the full PubMed XML records for a batch of PMIDs.

    Args:
        id_list: Iterable of PMIDs (strings or integers).

    Returns:
        The structure produced by ``Entrez.read`` for the efetch response.

    Raises:
        Whatever ``Entrez.efetch`` / ``Entrez.read`` raise; the response
        handle is closed in every case.
    """
    # str() lets callers pass integer PMIDs without breaking the join.
    ids = ",".join(str(pmid) for pmid in id_list)
    handle = Entrez.efetch(db="pubmed", id=ids, retmode="xml")
    try:
        records = Entrez.read(handle)
    finally:
        # Release the connection even when the XML cannot be parsed.
        handle.close()
    return records

def parse_records(records):
    """Flatten parsed efetch output into a list of row dictionaries.

    Args:
        records: Mapping with a ``'PubmedArticle'`` list, as returned by
            ``fetch_details``.

    Returns:
        A list of ``{'pmid', 'title', 'abstract'}`` dicts holding plain
        strings. Articles without abstract text are left out, and so are
        articles whose abstract is only whitespace.
    """
    data = []
    skipped = 0
    for article in records['PubmedArticle']:
        try:
            medline = article['MedlineCitation']
            article_data = medline['Article']
            pmid = str(medline['PMID'])
            raw_title = article_data.get('ArticleTitle', '')
            abstract_raw = article_data.get('Abstract', {}).get('AbstractText', [])
        except (KeyError, TypeError, AttributeError):
            # Structurally malformed record: keep the run going, but count it
            # so the loss is reported instead of disappearing silently.
            skipped += 1
            continue

        title = '' if raw_title is None else str(raw_title)

        if isinstance(abstract_raw, (list, tuple)):
            # Structured abstracts arrive as one element per section.
            abstract = " ".join(str(part) for part in abstract_raw)
        else:
            abstract = str(abstract_raw) if abstract_raw else ""

        # Joining empty sections yields " ", which is truthy; strip first so
        # blank abstracts are not stored and embedded later.
        abstract = abstract.strip()
        if abstract:
            data.append({
                'pmid': pmid,
                'title': title,
                'abstract': abstract
            })

    if skipped:
        print(f"Skipped {skipped} malformed PubMed record(s).")
    return data

# --- EXECUTION ---
# Number of PMIDs requested per efetch call.
BATCH_SIZE = 100
# Fixed column layout so that an empty result still produces a frame (and a
# CSV header) that later cells can index by column name.
RECORD_COLUMNS = ["pmid", "title", "abstract"]

ids = search_pubmed(SEARCH_TERM, MAX_RESULTS)
all_data = []
for i in tqdm(range(0, len(ids), BATCH_SIZE)):
    # Every start offset is < len(ids), so the slice is never empty.
    batch_ids = ids[i:i + BATCH_SIZE]
    records = fetch_details(batch_ids)
    all_data.extend(parse_records(records))
    time.sleep(1)  # pause between consecutive efetch requests

df = pd.DataFrame(all_data, columns=RECORD_COLUMNS)
df.to_csv("pubmed_data_OralHealth.csv", index=False)
print(f"âœ… Saved {len(df)} oral health abstracts.")

Searching PubMed for: oral health [Title/Abstract]...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 10/10 [00:18<00:00,  1.84s/it]

âœ… Saved 933 oral health abstracts.





In [9]:
import numpy as np
import faiss
import ollama

print("ðŸš€ Building the Permanent FAISS Index...")

# Step 1: embed every abstract with the embedding model that the retrieval
# function also uses for queries, so both live in the same vector space.
abstract_vectors = [
    ollama.embeddings(model="nomic-embed-text:latest", prompt=abstract_text)['embedding']
    for abstract_text in tqdm(df['abstract'].tolist())
]
embeddings = np.array(abstract_vectors).astype('float32')

# Step 2: build a flat L2 index sized to the embedding width. Vectors are added
# in DataFrame row order, so index position i corresponds to df.iloc[i].
dimension = embeddings.shape[1]
vector_db_index = faiss.IndexFlatL2(dimension)
vector_db_index.add(embeddings)

print(f"âœ… FAISS Index successfully secured as 'vector_db_index' with {vector_db_index.ntotal} vectors.")

ðŸš€ Building the Permanent FAISS Index...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 933/933 [00:59<00:00, 15.70it/s]

âœ… FAISS Index successfully secured as 'vector_db_index' with 933 vectors.





In [None]:
import re
import pandas as pd
import ollama
from tqdm import tqdm

def extract_clinical_triples_master(text):
    """Ask the local LLM for ``Subject | Predicate | Object`` triples.

    Only the first 1200 characters of ``text`` are sent to the model. Each
    output line that splits into exactly three pipe-separated parts becomes a
    triple, unless its subject mentions the paper or its authors rather than
    a clinical entity.

    Args:
        text: Abstract text to mine.

    Returns:
        A list of ``{"source", "relation", "target"}`` dicts; an empty list
        when the model call or the parsing fails (best-effort by design).
    """
    MODEL_NAME = "llama3.2:latest"

    # Whole-word match: a plain substring test for "we" also rejects valid
    # subjects such as "Chewing Gum", "Sweetened Beverage" or "Power Toothbrush".
    meta_subject = re.compile(
        r"\b(?:stud(?:y|ies)|we|researchers?|participants?)\b", re.IGNORECASE
    )

    system_instruction = """ 
    Extract only DIRECT clinical/biological relationships.
    
    FORMAT: Subject | Predicate | Object
    
    STRICT RULES:
    1. NO META-TALK: Ignore "The study", "Researchers", "Results", "Data".
    2. SINGULAR ONLY: Use "Tooth", not "Teeth". "Caries", not "Carious lesions".
    3. LOGIC INVERSION: If text says "Lack of brushing causes decay", extract: "Brushing | prevents | Dental Caries".
    4. PREDICATES: Use only [causes, prevents, occurs_in, part_of, increases_risk_of, improves, treats].
    5. ANATOMY: Always try to link the disease to a structure (e.g., Enamel, Gingiva, Pulp).
    
    EXAMPLES:
    Input: "Periodontitis is a major risk factor for heart disease."
    Output: Periodontitis | increases_risk_of | Heart Disease
    """

    prompt = f"Extract high-fidelity OHD triples from: {text[:1200]}"

    try:
        response = ollama.chat(model=MODEL_NAME, messages=[
            {'role': 'system', 'content': system_instruction},
            {'role': 'user', 'content': prompt}
        ])

        raw_output = response['message']['content'].strip()
        triples = []

        for line in raw_output.split('\n'):
            if "|" not in line:
                continue
            parts = [p.strip() for p in line.split("|")]
            if len(parts) != 3:
                continue
            # Final sanity check: skip subjects that are meta-talk.
            if meta_subject.search(parts[0]):
                continue
            triples.append({"source": parts[0], "relation": parts[1], "target": parts[2]})
        return triples
    except Exception:
        # Deliberate best-effort: one failed abstract must not abort a long run.
        return []

# --- CLEANER ---
def clean_triple_part(text):
    """Strip list-marker prefixes that the LLM puts in front of a triple part.

    Removes leading bullets (``*``, ``-``, the bullet character) and
    enumerators such as ``1.`` or ``2)`` when followed by whitespace or the
    end of the string, then trims surrounding whitespace. Digits that belong
    to the term itself are kept.

    Examples:
        "* Type 1 Diabetes"    -> "Type 1 Diabetes"
        "1. Poor Oral Health"  -> "Poor Oral Health"
        "25-hydroxyvitamin D"  -> "25-hydroxyvitamin D"
    """
    # The enumerator branch needs a following space (or end of string) so that
    # "1.5 mg Fluoride" and "5-Fluorouracil" are not truncated.
    list_marker_prefix = r'^(?:\s*(?:[-*\u2022]+|\d+[.)](?=\s|$)))*\s*'
    return re.sub(list_marker_prefix, '', text).strip()

# --- THE FINAL PRODUCTION EXECUTION ---
FINAL_LIMIT = len(df)   # Max rows to process (E.g: 100 rows or abstracts) or set to len(df) for full dataset
final_ohd_kb = []

# Whole-word match for meta-talk subjects. A substring test for "we" would also
# discard clinical subjects such as "Chewing Gum" or "Power Toothbrush".
_META_SOURCE_RE = re.compile(r"\b(?:stud(?:y|ies)|we|researchers?)\b", re.IGNORECASE)
# Fixed layout so that an empty knowledge base still yields a CSV header.
_KB_COLUMNS = ["source", "relation", "target", "pmid"]

print(f"ðŸš€ EXECUTING ABSTRACT RUN...")

# head() caps at len(df); size the progress bar from the actual selection so it
# stays correct when FINAL_LIMIT is larger than the dataset.
abstracts_to_process = df.head(FINAL_LIMIT)

for index, row in tqdm(abstracts_to_process.iterrows(), total=len(abstracts_to_process)):
    raw_facts = extract_clinical_triples_master(row['abstract'])

    for f in raw_facts:
        source = clean_triple_part(f['source'])
        target = clean_triple_part(f['target'])

        # FINAL GATEKEEPER: the triple must have a non-trivial source, a
        # target, and a source that is not meta-talk.
        if not (source and target and len(source) > 2):
            continue
        if _META_SOURCE_RE.search(source):
            continue
        final_ohd_kb.append({
            "source": source,
            "relation": f['relation'],
            "target": target,
            "pmid": row['pmid']
        })

# --- SAVE RESULTS ---
kb_df = pd.DataFrame(final_ohd_kb, columns=_KB_COLUMNS)
kb_df.to_csv("FINAL_OHD_KNOWLEDGE_BASE.csv", index=False)

print(f"\n FINAL SUCCESS: Extracted {len(kb_df)} clean clinical triples.")

ðŸš€ EXECUTING ABSTRACT RUN...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 100/100 [08:54<00:00,  5.34s/it]


âœ¨ FINAL SUCCESS: Extracted 663 clean clinical triples.





In [None]:
# for i in range(0,10):
#     print(f"Sample Fact: {final_ohd_kb[i]['source']} --({final_ohd_kb[i]['relation']})--> {final_ohd_kb[i]['target']}")

In [11]:
import networkx as nx
from pyvis.network import Network
import pandas as pd

def visualize_ohd_graph(triples_list):
    """Write the extracted triples to an interactive PyVis HTML graph.

    Each triple becomes a directed edge from ``source`` to ``target``; the
    relation is attached as both the hover title and the visible edge label.
    The page is saved as ``ohd_graph_visualization.html`` in the working
    directory.

    Args:
        triples_list: Iterable of dicts with ``source``, ``relation`` and
            ``target`` keys.

    Note:
        The Network is created without ``notebook=True``, so the result is a
        standalone HTML file rather than inline notebook output.
    """
    output_path = "ohd_graph_visualization.html"

    # Build the directed graph from the triples.
    graph = nx.DiGraph()
    for fact in triples_list:
        subject, obj = fact['source'], fact['target']
        graph.add_edge(subject, obj, title=fact['relation'], label=fact['relation'])

    # Dark-themed canvas, full width, arrows enabled.
    canvas = Network(
        height='750px',
        width='100%',
        bgcolor='#222222',
        font_color='white',
        directed=True,
    )
    canvas.from_nx(graph)

    # Physics simulation on: nodes settle into a floating, organic layout.
    canvas.toggle_physics(True)

    canvas.save_graph(output_path)
    print(f"âœ… Visualization created: Open '{output_path}' in your browser to interact with the graph!")

# --- EXECUTION ---
# Render every triple collected in `final_ohd_kb` by the extraction cell; the
# graph therefore reflects however many abstracts that run processed.
visualize_ohd_graph(final_ohd_kb)

âœ… Visualization created: Open 'ohd_graph_visualization.html' in your browser to interact with the graph!


In [12]:
import ollama
import numpy as np

# Relations that get a ranking bonus once a fact has matched the query.
_BOOSTED_RELATIONS = frozenset(['improves', 'prevents', 'causes', 'increases_risk_of'])


def _rank_graph_evidence(query, kb_triples, top_n=8):
    """Return the ``top_n`` unique facts whose source/target mention a query word."""
    # PURE QUERY: just lowercase and split. No punctuation removal, no stop-words.
    raw_query_words = query.lower().split()

    graph_evidence = []
    seen_facts = set()
    for t in kb_triples:
        fact_text = (t['source'] + " " + t['target']).lower()

        # PURE MATCHING: 5 points for every raw query word found in the fact.
        score = sum(5 for word in raw_query_words if word in fact_text)
        if score == 0:
            # No keyword hit: the fact is unrelated to the query. The relation
            # bonus below must not be able to promote it on its own.
            continue

        if t['relation'] in _BOOSTED_RELATIONS:
            score += 3

        fact_str = f"{t['source']} {t['relation']} {t['target']}"
        if fact_str not in seen_facts:
            graph_evidence.append({"fact": fact_str, "pmid": t.get('pmid', 'N/A'), "score": score})
            seen_facts.add(fact_str)

    # sorted() is stable, so ties keep knowledge-base order.
    return sorted(graph_evidence, key=lambda x: x['score'], reverse=True)[:top_n]


def run_dual_comparison(query, kb_triples, faiss_index_obj, metadata_df):
    """Print a plain-LLM answer next to a hybrid (vector + graph) RAG answer.

    Args:
        query: Free-text question; its lowercase whitespace-split words are
            used for graph matching.
        kb_triples: List of ``{"source", "relation", "target"[, "pmid"]}`` dicts.
        faiss_index_obj: Index exposing ``search(vectors, k)``; result
            positions are used as row positions into ``metadata_df``.
        metadata_df: DataFrame with an ``abstract`` column, in the row order
            the index was built from.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n{'='*30} DUAL COMPARISON REPORT {'='*30}")
    print(f"QUERY: {query}\n")

    # --- PART 1: GENERAL KNOWLEDGE (No Data) ---
    general_prompt = f"Using your internal general knowledge, answer: {query}"
    general_res = ollama.chat(model="llama3.2:latest", messages=[{'role': 'user', 'content': general_prompt}])

    print("ðŸ§  [GENERAL AI KNOWLEDGE]")
    print("Source: LLM Internal Training")
    print("-" * 50)
    print(general_res['message']['content'])
    print("\n" + "*"*50 + "\n")

    # --- PART 2: HYBRID RAG (Vector + Graph) ---
    # A. Get Vector Context
    q_emb = np.array(ollama.embeddings(model="nomic-embed-text:latest", prompt=query)['embedding']).reshape(1, -1)
    _, indices = faiss_index_obj.search(q_emb.astype('float32'), k=3)
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # passing -1 to .iloc would silently pull in the LAST abstract instead.
    hit_rows = [int(i) for i in indices[0] if i >= 0]
    baseline_context = " ".join(metadata_df.iloc[hit_rows]['abstract'].tolist())

    # B. Knowledge-graph search: keep the top 8 keyword-matching facts.
    graph_evidence = _rank_graph_evidence(query, kb_triples, top_n=8)
    evidence_str = "\n".join([f"- {f['fact']} (PMID: {f['pmid']})" for f in graph_evidence])

    # C. Generate Hybrid Answer
    hybrid_prompt = f"""
    Answer as a Senior Clinician.
    TEXT CONTEXT: {baseline_context}
    GRAPH EVIDENCE: {evidence_str}
    QUERY: {query}
    INSTRUCTION: Use the GRAPH EVIDENCE to confirm specific biological relationships.
    """
    hybrid_res = ollama.chat(model="llama3.2:latest", messages=[{'role': 'user', 'content': hybrid_prompt}])

    print("âœ… [YOUR HYBRID SYSTEM]")
    print("Source: Vector DB + Pure KG Match")
    print("-" * 50)
    print(hybrid_res['message']['content'])

    print("\nðŸ”— [EVIDENCE USED BY HYBRID]")
    if not graph_evidence:
        print("â€¢ No direct graph evidence matched the pure query.")
    for e in graph_evidence:
        print(f"â€¢ {e['fact']} (Source: https://pubmed.ncbi.nlm.nih.gov/{e['pmid']}/)")

# --- EXECUTION ---
# Change `user_query` to test different clinical relationships.
# Note: graph matching lowercases the query and splits it on whitespace only,
# so use bare keywords; stop-words (e.g., "what is") and punctuation are
# matched literally against the facts.

user_query = "analyze oral pain alzheimers disease risk"

# Inputs: the triples from the extraction cell, the FAISS index built over
# df['abstract'], and df itself for looking up the retrieved abstracts.
run_dual_comparison(user_query, final_ohd_kb, vector_db_index, df)


QUERY: analyze oral pain alzheimers disease risk

ðŸ§  [GENERAL AI KNOWLEDGE]
Source: LLM Internal Training
--------------------------------------------------
Oral pain in Alzheimer's disease (AD) is a common symptom that can significantly impact the quality of life for individuals with the condition. Here's an analysis of the relationship between oral pain and Alzheimer's disease:

**Prevalence:** Studies suggest that up to 70% of individuals with Alzheimer's disease experience oral pain, which can manifest as dental problems, such as tooth decay, gum disease, or tooth loss.

**Causes:**

1. **Dental problems**: Poor dental hygiene, difficulty chewing and swallowing, and reduced salivary flow contribute to an increased risk of dental problems.
2. **Neuropathic pain**: AD can cause nerve damage, leading to neuropathic pain in the mouth, face, and head.
3. **Medication side effects**: Certain medications used to manage Alzheimer's symptoms, such as antipsychotics, opioids, and antidepr