In [1]:
!pip install pymupdf faiss-cpu transformers torch langchain


Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecti

In [6]:
import os
import fitz  # PyMuPDF for PDF extraction
import numpy as np
import faiss
import pickle
import logging
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
import torch

# Set up logging: request INFO level so the progress messages logged below are not filtered out
logging.basicConfig(level=logging.INFO)

# Paths to the remedies PDFs; every file listed here is passed to extract_text() below
PDF_FILES = [
    "/content/remedies1.pdf",
    "/content/remedies2.pdf",
    "/content/remedies4.pdf"
]
# Output artifacts: the FAISS index file and the pickled list of text chunks
INDEX_FILE = "remedies_index.faiss"
CHUNKS_FILE = "remedies_chunks.pkl"

# Function to extract text from a PDF file with per-page error handling
def extract_text(pdf_path):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all readable pages, each followed by a newline. Returns
        an empty string if the document cannot be opened. Pages that fail to
        extract are logged and skipped, so the result may be partial.
    """
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        logging.error(f"Error opening {pdf_path}: {e}")
        return ""

    # Collect page texts in a list and join once, instead of repeated string concatenation.
    page_texts = []
    try:
        for page_num in range(len(doc)):
            try:
                page = doc.load_page(page_num)
                page_texts.append(page.get_text("text") + "\n")
            except Exception as e:
                # Best effort: one bad page must not discard the rest of the document.
                logging.error(f"Error extracting text from page {page_num} in {pdf_path}: {e}")
    finally:
        # Always release the document handle, even if iteration fails unexpectedly.
        doc.close()
    return "".join(page_texts)

# Pull the text out of each PDF in turn and glue the pieces together with single spaces
all_text = " ".join(extract_text(pdf_file) for pdf_file in PDF_FILES)
logging.info("PDF texts extracted.")

# Function to clean and consolidate text
def clean_text(text, min_length=20):
    """Drop short lines and merge the remaining ones into a single string.

    The text is split on newlines and each line is stripped of surrounding
    whitespace. Only lines whose *stripped* length exceeds ``min_length`` are
    kept, so whitespace padding cannot carry a short fragment past the filter.

    Args:
        text: Raw text, typically the concatenated output of PDF extraction.
        min_length: A line is kept only if its stripped length is strictly
            greater than this value. Defaults to 20.

    Returns:
        The kept lines joined by single spaces; an empty string if none remain.
    """
    stripped_lines = (line.strip() for line in text.split("\n"))
    return " ".join(
        line for line in stripped_lines if line and len(line) > min_length
    )

# Reduce the combined PDF text to one cleaned string
clean_text_data = clean_text(all_text)
logging.info("Text cleaned.")

# Cut the cleaned text into overlapping chunks that are embedded and retrieved individually
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
text_chunks = splitter.split_text(clean_text_data)
logging.info(f"Text split into {len(text_chunks)} chunks.")

# Load the Bio_ClinicalBERT tokenizer and model from the same checkpoint name
EMBEDDING_MODEL_NAME = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME)
model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME)
model.eval()  # switch the model to evaluation mode before it is used for embedding

# Function to generate embeddings for a list of texts
def encode_texts(text_list):
    """Embed each text with the module-level tokenizer and model.

    Each text is tokenized on its own (truncated to at most 512 tokens), run
    through the model with gradients disabled, and reduced to a single vector
    by averaging the last hidden state over the positions selected by the
    attention mask.

    Args:
        text_list: Iterable of strings to embed.

    Returns:
        A list with one NumPy array per input text, in input order.
    """
    vectors = []
    with torch.no_grad():
        for item in text_list:
            encoded = tokenizer(
                item,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=512,
            )
            hidden_states = model(**encoded).last_hidden_state
            # Add a trailing axis so the mask broadcasts across the hidden dimension.
            mask = encoded["attention_mask"].unsqueeze(-1)
            masked_total = (hidden_states * mask).sum(dim=1)
            # Clamp the denominator to guard against division by zero.
            token_count = mask.sum(dim=1).clamp(min=1e-9)
            pooled = masked_total / token_count
            vectors.append(pooled.squeeze(0).cpu().numpy())
    return vectors

# Generate embeddings for all text chunks
embeddings = encode_texts(text_chunks)
if not embeddings:
    # Without this guard an empty chunk list surfaces later as an opaque
    # IndexError when reading the embedding dimension from the array shape.
    raise ValueError(
        "No text chunks were embedded; check that the files in PDF_FILES "
        "exist and contain extractable text."
    )
logging.info("Embeddings generated.")

# Create a FAISS index from the embeddings (one float32 row per chunk)
embeddings_np = np.asarray(embeddings, dtype="float32")
dim = embeddings_np.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings_np)
faiss.write_index(index, INDEX_FILE)

# Save the text chunks to a pickle file for later retrieval; list position i
# corresponds to row i of the index, which is how search results map back to text.
with open(CHUNKS_FILE, "wb") as f:
    pickle.dump(text_chunks, f)

logging.info("Vector database created and saved successfully.")

# Function to retrieve the top-k relevant chunks for a given query
def retrieve_similar(query, top_k=5):
    """Return the stored text chunks closest to a query.

    The query is embedded with ``encode_texts`` and searched against the
    module-level FAISS ``index``. The returned positions are mapped back to
    text through the chunk list pickled in ``CHUNKS_FILE``.

    Args:
        query: The query string.
        top_k: Maximum number of chunks to return. Defaults to 5.

    Returns:
        A list of at most ``top_k`` chunk strings, in the order returned by
        the index search. Fewer are returned when the index holds fewer than
        ``top_k`` vectors.
    """
    query_embedding = np.asarray([encode_texts([query])[0]], dtype="float32")
    _distances, indices = index.search(query_embedding, top_k)

    # NOTE(review): pickle.load executes arbitrary code from the file. This is
    # acceptable only because CHUNKS_FILE is written by this same notebook;
    # do not point it at an untrusted file.
    with open(CHUNKS_FILE, "rb") as f:
        stored_chunks = pickle.load(f)

    # FAISS pads the result with -1 when there are fewer than top_k hits.
    # A negative index would silently select the last chunk, so skip anything
    # outside the valid range.
    return [
        stored_chunks[i]
        for i in indices[0]
        if 0 <= i < len(stored_chunks)
    ]

# Example query: look up home remedies for eczema and print each hit, numbered from 1
query = "What are the home remedies for eczema?"
retrieved_chunks = retrieve_similar(query)
for rank, chunk in enumerate(retrieved_chunks, start=1):
    print(f"Result {rank}:\n{chunk}\n")


Result 1:
decreases pain. Approximately 900 grams  (g) of salt is added to 150 litres  (L) of tub water, that is, 6  g/L of water. Since the common household bucket size is 10–20 L, 60–120 g of salt is to D.	 Saline soaks: Saline‑soaked swab is kept on the lesion for 10–15 minutes to remove crusts from sensitive areas like the periorbital, perioral, face, and genital region. It can also be used to decrease pain and inflammation in lesions of hidradenitis suppurativa or pyoderma Onion juice is effective in the treatment of patchy alopecia areata. Its therapeutic effect is comparable to other available topical immunotherapeutic agents. Sharquie et  al.[45] reported significant regrowth of hair in 86.9% of patients  (20/23) after twice daily application of onion juice for 2  months. The suggested mechanism of action includes induction of an immunological reaction and antigenic competition which stimulates the regrowth of hair and causes irritant contact dermatitis due to its constituents,