In [1]:
import os
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened with ``fitz.open`` inside a ``with`` block. Each page is
    read by index with ``get_text()`` and the page texts are concatenated in
    page order without any separator.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [
            document[index].get_text()
            for index in range(document.page_count)
        ]
    return "".join(page_texts)

# Path to your folder containing PDFs
folder_path = "data"

# List all PDF files in the folder.
# os.listdir() returns entries in arbitrary order, so the paths are sorted to
# keep the "Document N" numbering below stable between runs. The extension is
# compared case-insensitively so files named like "paper.PDF" are not skipped.
pdf_files = sorted(
    os.path.join(folder_path, file)
    for file in os.listdir(folder_path)
    if file.lower().endswith('.pdf')
)

# Extract text from each PDF and store in a list (same order as pdf_files)
documents = [extract_text_from_pdf(pdf_path) for pdf_path in pdf_files]

# Display some text snippets for verification
for i, doc in enumerate(documents, 1):
    print(f"\nDocument {i} (First 300 characters):")
    print(doc[:300])  # Show a snippet of each document

print(f"\nExtracted text from {len(documents)} PDF files.")


Document 1 (First 300 characters):
iScience
Article
Direct targeting of mitochondria by cisplatin leads
to cytotoxicity in zebraﬁsh lateral-line hair cells
David S. Lee,
Angela Schrader,
Jiaoxia Zou, Wee
Han Ang, Mark E.
Warchol, Lavinia
Sheets
sheetsl@wustl.edu
Highlights
Hair cells with more
cumulative metabolic
activity are suscep

Document 2 (First 300 characters):
 
 
1 
 
Driving forces for condensation of synapsin are governed by 
sequence-encoded molecular grammars 
 
Christian Hoffmann a,1, Kiersten M. Ruff b,1, Irina A. Edu c, Min Kyung Shinn b, Johannes V. Tromm 
a, Matthew R. King b, Avnika Pant b, Hannes Ausserwöger c, Jennifer R. Morgan d, Tuomas P. 

Document 3 (First 300 characters):
See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/383108685
Advancements of Nano-biotechnology in Public Health Sector: Beneﬁts and
Challenges
Article · August 2024
DOI: 10.2174/0122106812316402240808101033
CITATIONS
0
READS
72
5 

In [2]:
# online version
# Required installations:
# pip install langchain chromadb pypdf sentence-transformers ollama
# ollama pull llama2
# example question: what methods can we use for DNA adductomics screening? 


import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.chains import RetrievalQA


def load_pdfs(directory):
   """Load the PDF files found under ``directory``.

   A ``DirectoryLoader`` is configured with the glob ``**/*.pdf`` and
   ``PyPDFLoader`` as the per-file loader class; the function returns whatever
   its ``load()`` call produces.
   """
   pdf_loader = DirectoryLoader(
       directory,
       glob="**/*.pdf",
       loader_cls=PyPDFLoader,
   )
   return pdf_loader.load()


def split_documents(documents, chunk_size=1000, chunk_overlap=200):
   """Split documents into chunks

   Args:
       documents: Documents handed to
           ``RecursiveCharacterTextSplitter.split_documents``.
       chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
           1000, the value that used to be hard-coded here.
       chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
           to 200, the value that used to be hard-coded here.

   Returns:
       The chunks returned by the splitter.
   """
   # The sizes are parameters so a different chunking can be tried without
   # copying and editing this function.
   text_splitter = RecursiveCharacterTextSplitter(
       chunk_size=chunk_size,
       chunk_overlap=chunk_overlap
   )
   return text_splitter.split_documents(documents)


def create_vector_store(chunks, model_name="all-MiniLM-L6-v2", persist_directory="db"):
   """Create vector store from document chunks

   Args:
       chunks: Document chunks passed to ``Chroma.from_documents``.
       model_name: Model name given to ``HuggingFaceEmbeddings``. Defaults to
           "all-MiniLM-L6-v2", the value that used to be hard-coded here.
       persist_directory: Directory given to Chroma as ``persist_directory``.
           Defaults to "db", the value that used to be hard-coded here.

   Returns:
       The vector store returned by ``Chroma.from_documents``.
   """
   embeddings = HuggingFaceEmbeddings(model_name=model_name)
   # NOTE(review): from_documents is called on every run against the same
   # persist_directory. Presumably this adds to whatever collection is already
   # stored there rather than replacing it -- confirm, and use a fresh
   # directory per experiment if duplicate chunks show up in the answers.
   vectorstore = Chroma.from_documents(
       documents=chunks,
       embedding=embeddings,
       persist_directory=persist_directory
   )
   return vectorstore


def setup_qa_chain(vectorstore):
   """Build a RetrievalQA chain backed by the Ollama "llama2" model.

   The chain uses the "stuff" chain type and retrieves context through
   ``vectorstore.as_retriever()`` with its default settings.
   """
   local_llm = Ollama(model="llama2")
   doc_retriever = vectorstore.as_retriever()
   return RetrievalQA.from_chain_type(
       llm=local_llm,
       chain_type="stuff",
       retriever=doc_retriever,
   )


def main():
   """Build the PDF question-answering pipeline and run an interactive prompt.

   Loads the PDFs under "data", splits them into chunks, indexes the chunks
   in a vector store, builds the QA chain, and then answers questions read
   with ``input()`` until the user types 'quit' (any case, surrounding
   whitespace ignored) or the input stream ends / is interrupted.
   """
   # Load documents
   documents = load_pdfs("data")
   print(f"Loaded {len(documents)} documents")

   # Split into chunks
   chunks = split_documents(documents)
   print(f"Created {len(chunks)} chunks")

   # Create vector store
   vectorstore = create_vector_store(chunks)
   print("Created vector store")

   # Setup QA chain
   qa_chain = setup_qa_chain(vectorstore)
   print("Setup QA chain")

   # Interactive query loop
   while True:
       try:
           query = input("\nEnter your question (or 'quit' to exit): ").strip()
       except (EOFError, KeyboardInterrupt):
           # A closed input stream or an interrupted kernel ends the session
           # cleanly instead of surfacing a traceback.
           break

       if query.lower() == 'quit':
           break
       if not query:
           # Do not spend a model call on an empty question; ask again.
           continue

       response = qa_chain.run(query)
       print("\nResponse:", response)


# Run the pipeline when this code is executed directly.
if __name__ == "__main__":
   main()



Loaded 362 documents
Created 2039 chunks


  embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
  from tqdm.autonotebook import tqdm, trange
  llm = Ollama(model="llama2")


Created vector store
Setup QA chain


  response = qa_chain.run(query)



Response: Wiener filtering and OPIE are both techniques used in noise reduction applications, including illumination microscopy. However, they differ in their approach and assumptions made about the noise structure.

Wiener filtering is a traditional method that incorporates the out-of-focus noise term into the random noise term. In other words, it assumes that the noise in the image is a combination of both additive white Gaussian noise (AWGN) and out-of-focus noise. The Wiener filter estimates the noise covariance matrix using the AWGN term and then applies a covariance-based filtering method to reduce the noise.

On the other hand, OPIE assumes that the noise in the image is primarily due to the out-of-focus signal and makes no assumptions about the AWGN term. Instead, it uses the information from the other layers in the sample space (Z1 and Z−1) to enhance the weak signal in the center layerZ0. OPIE performs 3D Wiener filtering under the assumption of a thin sample, which means th

In [3]:
# offline version
# test question: what methods can we use for DNA adductomics screening?
# no reuse of a saved chroma db, meaning every time you run the code, it 'trains' (re-embeds) the data from the beginning.


import os
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import Ollama
from langchain.chains import RetrievalQA
import time


def load_pdfs(directory):
   """Return the documents loaded from the PDFs matching ``**/*.pdf`` in ``directory``.

   Each matching file is handled by ``PyPDFLoader`` through a
   ``DirectoryLoader``; the result of ``load()`` is returned unchanged.
   """
   return DirectoryLoader(directory, glob="**/*.pdf", loader_cls=PyPDFLoader).load()


def split_documents(documents, chunk_size=500, chunk_overlap=50):
   """Split ``documents`` into chunks with ``RecursiveCharacterTextSplitter``.

   Args:
       documents: Documents handed to the splitter's ``split_documents``.
       chunk_size: Forwarded as the splitter's ``chunk_size``. Defaults to
           500, the value that used to be hard-coded here.
       chunk_overlap: Forwarded as the splitter's ``chunk_overlap``. Defaults
           to 50, the value that used to be hard-coded here.

   Returns:
       The chunks returned by the splitter.
   """
   # Exposed as parameters so chunking can be tuned from the call site
   # instead of by editing a copy of this function.
   text_splitter = RecursiveCharacterTextSplitter(
       chunk_size=chunk_size,
       chunk_overlap=chunk_overlap
   )
   return text_splitter.split_documents(documents)


def create_vector_store(
   chunks,
   model_name="sentence-transformers/all-MiniLM-L6-v2",
   persist_directory="db",
   cache_folder="./models",
):
   """Embed ``chunks`` and index them in a Chroma vector store.

   Args:
       chunks: Document chunks passed to ``Chroma.from_documents``.
       model_name: Model name given to ``HuggingFaceEmbeddings``. Defaults to
           the value that used to be hard-coded here.
       persist_directory: Directory given to Chroma as ``persist_directory``
           (default "db", as before).
       cache_folder: Folder given to ``HuggingFaceEmbeddings`` as
           ``cache_folder`` (default "./models", as before).

   Returns:
       The vector store returned by ``Chroma.from_documents``.
   """
   # The embedding model is requested on the CPU with normalisation disabled.
   model_kwargs = {'device': 'cpu'}
   encode_kwargs = {'normalize_embeddings': False}

   embeddings = HuggingFaceEmbeddings(
       model_name=model_name,
       model_kwargs=model_kwargs,
       encode_kwargs=encode_kwargs,
       cache_folder=cache_folder
   )

   # NOTE(review): from_documents is called on every run against the same
   # persist_directory. Presumably this adds to whatever collection is already
   # stored there rather than replacing it -- confirm, and use a fresh
   # directory per experiment if duplicate chunks show up in the answers.
   vectorstore = Chroma.from_documents(
       documents=chunks,
       embedding=embeddings,
       persist_directory=persist_directory
   )
   return vectorstore


def setup_qa_chain(vectorstore):
   """Assemble the RetrievalQA chain used to answer questions.

   The language model is Ollama's "llama2:7b" created with ``temperature=0``.
   The chain type is "stuff", and the retriever comes from
   ``vectorstore.as_retriever`` with ``search_kwargs={"k": 3}``.
   """
   return RetrievalQA.from_chain_type(
       llm=Ollama(model="llama2:7b", temperature=0),
       chain_type="stuff",
       retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
   )


def process_query(qa_chain, query):
   """Run ``query`` through ``qa_chain`` and time the call.

   Args:
       qa_chain: Object exposing ``run(query)``.
       query: Question text passed to ``qa_chain.run``.

   Returns:
       A ``(response, duration)`` tuple: the value returned by
       ``qa_chain.run`` and the elapsed time in seconds as a float.
   """
   # perf_counter() is a monotonic clock meant for measuring intervals, so
   # the duration cannot come out negative or skewed when the system clock is
   # adjusted during the call, which time.time() does not guarantee.
   start_time = time.perf_counter()
   response = qa_chain.run(query)
   duration = time.perf_counter() - start_time
   return response, duration


def main():
   """Build the PDF question-answering pipeline and run a timed prompt loop.

   Creates the "models" folder if needed, loads the PDFs under "data", splits
   them into chunks, indexes the chunks in a vector store and builds the QA
   chain, reporting how long the setup took. Questions are then read with
   ``input()`` and answered until the user types 'quit' (any case,
   surrounding whitespace ignored) or the input stream ends / is interrupted.
   """
   os.makedirs("models", exist_ok=True)

   # perf_counter() is the monotonic clock intended for measuring intervals.
   start_time = time.perf_counter()
   print("Starting document processing...")

   documents = load_pdfs("data")
   print(f"Loaded {len(documents)} documents")

   chunks = split_documents(documents)
   print(f"Created {len(chunks)} chunks")

   vectorstore = create_vector_store(chunks)
   print("Created vector store")

   qa_chain = setup_qa_chain(vectorstore)
   print(f"Setup complete in {time.perf_counter() - start_time:.2f} seconds")

   while True:
       try:
           query = input("\nEnter your question (or 'quit' to exit): ").strip()
       except (EOFError, KeyboardInterrupt):
           # A closed input stream or an interrupted kernel ends the session
           # cleanly instead of surfacing a traceback.
           break

       if query.lower() == 'quit':
           break
       if not query:
           # Do not spend a model call on an empty question; ask again.
           continue

       response, duration = process_query(qa_chain, query)
       print(f"\nResponse ({duration:.2f} seconds):")
       print(response)


# Run the pipeline when this code is executed directly.
if __name__ == "__main__":
   main()

quit

Starting document processing...
Loaded 362 documents
Created 3593 chunks
Created vector store
Setup complete in 53.26 seconds


In [4]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# not whichever pip happens to be first on the shell PATH.
%pip install langchain transformers sentence-transformers faiss-cpu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp311-cp311-macosx_11_0_arm64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [7]:
from transformers import AutoModel, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from PIL import Image
import torch