### Chat with your PDFs with Llama3 and Ollama

Adapted from the original code by Sascha Retter (https://blog.retter.jetzt/)

##### Chat with local Llama3 Model via Ollama in KNIME Analytics Platform — Also extract Logs into structured JSON Files
https://medium.com/p/aca61e4a690a

In [1]:
import os
import pandas as pd

# from langchain.document_loaders import WebBaseLoader
from langchain_community.document_loaders import WebBaseLoader

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# from langchain.vectorstores import Chroma
from langchain_community.vectorstores import Chroma

# from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.embeddings import OllamaEmbeddings

# from langchain.llms import Ollama
from langchain_community.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from langchain.chains import RetrievalQA

# Name of the Ollama model used by the cells below. It must already be available
# locally (for example, pulled by running 'ollama run llama3:instruct').
model = "llama3:instruct"

In [2]:
# Proxy configuration
# Template value: replace it with your proxy server and port to route traffic through a proxy.
proxy = "http://proxy.my-company.com:8080"
# The active value is the empty string, which overrides the template above.
proxy = ""
# Export the same value under both environment variable names.
os.environ.update({'http_proxy': proxy, 'https_proxy': proxy})

In [3]:
# Directory that is scanned for PDF files by list_pdfs / list_pdfs_with_path below
var_pdf_path = "../documents/"

In [4]:
def list_pdfs(directory, prefix="coffee"):
    """Return the names of the PDF files in ``directory`` that start with ``prefix``.

    Args:
        directory: Path of the directory to scan (sub-directories are not searched).
        prefix: Only file names starting with this string are returned.
            Defaults to ``"coffee"``; pass ``""`` to return every PDF.

    Returns:
        A sorted list of bare file names (without the directory part).

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be read.
    """
    # os.listdir returns entries in arbitrary order, so sort for a stable result.
    return sorted(
        name
        for name in os.listdir(directory)
        if name.startswith(prefix)
        and name.endswith('.pdf')
        # Keep regular files only: a sub-directory named like "x.pdf" is skipped.
        and os.path.isfile(os.path.join(directory, name))
    )

# Collect the matching PDF file names found in var_pdf_path and display them
pdf_files = list_pdfs(var_pdf_path)
print(pdf_files)


['coffee-machine-instruction-manual.pdf']


In [5]:
def list_pdfs_with_path(directory, prefix="coffee"):
    """Return the paths of the PDF files in ``directory`` whose name starts with ``prefix``.

    Args:
        directory: Path of the directory to scan (sub-directories are not searched).
        prefix: Only file names starting with this string are returned.
            Defaults to ``"coffee"``; pass ``""`` to return every PDF.

    Returns:
        A sorted list of paths, each built as ``os.path.join(directory, file_name)``.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be read.
    """
    # os.listdir returns entries in arbitrary order, so sort the names for a stable result.
    candidates = (
        os.path.join(directory, name)
        for name in sorted(os.listdir(directory))
        if name.startswith(prefix) and name.endswith('.pdf')
    )
    # Keep regular files only: a sub-directory named like "x.pdf" is skipped.
    return [path for path in candidates if os.path.isfile(path)]

# Collect the full paths of the matching PDFs in var_pdf_path and display them;
# `pdfs` is the list iterated by the loading loop further down
pdfs = list_pdfs_with_path(var_pdf_path)
print(pdfs)


['../documents/coffee-machine-instruction-manual.pdf']


In [6]:
# Inspect the container type returned by list_pdfs_with_path
type(pdfs)

list

In [7]:
# Question passed to the retrieval QA chain further down
# (plain string literal: there are no placeholders, so no f-string is needed)
question = "How can I clean the coffee machine?"

# Accumulates the chunk lists produced by the PDF loading loop below
all_splits = []
# Splitter configured with chunk_size=500 and chunk_overlap=50
# NOTE(review): presumably both are measured in characters — confirm against the splitter docs
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

In [8]:
# Load each PDF and split its content into chunks.
# Note: the result of split_documents is appended as a whole, so all_splits ends up
# holding one list of chunks per PDF (a list of lists), not a flat list of chunks.
for pdf in pdfs:
    print(f"processing: {pdf}")
    loader = PyPDFLoader(pdf)
    data = loader.load()
    all_splits.append(text_splitter.split_documents(data))

processing: ../documents/coffee-machine-instruction-manual.pdf


In [9]:
# Compute embeddings through the Ollama server at http://localhost:11434,
# using the model named in `model`
embedding_model = OllamaEmbeddings(base_url="http://localhost:11434", model=model)

In [None]:
# Alternative embedding backend: a sentence-transformers model loaded via HuggingFaceEmbeddings.
# Running this cell rebinds `embedding_model`, replacing the OllamaEmbeddings instance
# assigned in the cell before it.
# NOTE(review): the original comment here was cut off ("the standard embedding model for ...").
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = HuggingFaceEmbeddings(model_name=embedding_model_name)

In [11]:
# Create embeddings and store them in the vectordb.
# all_splits holds one list of chunks per PDF, so flatten it first; all chunks are
# then embedded and persisted by a single Chroma.from_documents call instead of one
# call (and one rebinding of `vectorstore`) per PDF.
documents = [chunk for pdf_chunks in all_splits for chunk in pdf_chunks]
if documents:
    vectorstore = Chroma.from_documents(documents=documents,
                                        embedding=embedding_model,
                                        persist_directory="../data/vectorstore/coffee_machine_pdf")
    print("Loaded documents")
else:
    # Nothing was loaded: do not claim success and leave the persisted store untouched.
    print("No document chunks found; vector store was not updated")

Loaded documents


In [12]:
# Inspect the type of the vector store object created above
type(vectorstore)

langchain_community.vectorstores.chroma.Chroma

In [13]:
# Load from disk: reopen the store persisted under ../data/vectorstore/coffee_machine_pdf,
# passing the same `embedding_model` as the embedding function
vectorstore = Chroma(persist_directory="../data/vectorstore/coffee_machine_pdf", embedding_function=embedding_model)

In [14]:
# Inspect the type of the vector store object reloaded from disk
type(vectorstore)

langchain_community.vectorstores.chroma.Chroma

In [15]:
# LLM: the Ollama-served model named in `model`, with generated tokens streamed to stdout.
# Handlers are passed through `callbacks`; the `callback_manager` argument is deprecated
# in LangChain in favour of `callbacks`.
llm = Ollama(model=model,
             verbose=True,
             callbacks=[StreamingStdOutCallbackHandler()])

print(f"Loaded LLM model {llm.model}")

Loaded LLM model llama3:instruct


In [17]:
# Expose the vector store as a retriever; k=3 sets the number of documents to retrieve
my_retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# Combine the LLM and the retriever into a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(llm, retriever=my_retriever)

# Run the query through 'invoke' rather than calling the chain object ('__call__')
result = qa_chain.invoke(dict(query=question))

According to the provided context, cleaning the outer housing and cup warming tray requires a soft, damp cloth, followed by polishing with a soft, dry cloth. Avoid using abrasive cleansers, pads, or cloths that could scratch the surface.

To clean the filter baskets and portafilter, rinse them under hot water immediately after use to remove residual coffee oils. If holes become blocked, use the pin on the end of the provided cleaning tool to unblock them. For stubborn blockages, dissolve a cleaning tablet in hot water, soak the filter basket and portafilter for about 20 minutes, and then rinse thoroughly.

Note that Single Wall filter baskets are not recommended for pre-ground coffee; instead, use Dual Wall filter baskets with pre-ground coffee.

In [None]:
# Print the object returned by qa_chain.invoke()
# NOTE(review): presumably a dict holding the query and the answer text — confirm
print(result)