In [None]:
!pip install langchain.schema

In [None]:
import os
from pprint import pprint

import lancedb
import pandas
from langchain import hub
from langchain.schema.runnable import RunnableLambda
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import LanceDB
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pydantic import BaseModel

In [None]:
# Llama 3.1 served by a local Ollama instance.
# No client-side device argument is passed: OllamaLLM has no `model_kwargs`
# field, and GPU offload is decided by the Ollama server. If offload needs
# tuning, `num_gpu` is the OllamaLLM field for it.
model = OllamaLLM(model="llama3.1")
# Quick smoke test (uncomment to check the model responds):
# model.invoke("Come up with 10 names for a song about parrots")

In [None]:
# Location of the PDF to index. Set the RAG_PDF_PATH environment variable to
# point the notebook at another file; when it is unset, the original
# hard-coded location is used so existing setups keep working.
pdf_location = os.environ.get(
    "RAG_PDF_PATH",
    "C:/Users/mario/Downloads/Black Hat Python ( PDFDrive ).pdf",
)

In [None]:
# Read the PDF at `pdf_location` into LangChain documents.
# `docs` is the input to the text-splitting cell further down.
loader = PyPDFLoader(pdf_location)
docs = loader.load()

In [None]:
# Chunking knobs: experiment with these for better retrieval results
# (an overlap of 100 is another value worth trying).
CHUNK_SIZE = 500
CHUNK_OVERLAP = 20

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded PDF documents into overlapping chunks.
documents = text_splitter.split_documents(docs)

# Embedding model handed to the vector store in a later cell.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

In [None]:
# Sanity check: how many chunks the splitter produced.
len(documents)

In [None]:
# Build the LanceDB vector store from the chunks, using `embeddings` to embed them.
vector_store = LanceDB.from_documents(documents, embeddings)

In [None]:
# vector_store.get_table().head()

In [None]:
# Prompt that asks the model to summarise whatever text is injected as {docs}.
SUMMARY_TEMPLATE = "Summarize the main themes in these retrieved docs: {docs}"
prompt = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)


def format_docs(docs):
    """Join the text of several documents into one string.

    Only each document's ``page_content`` is used; metadata is ignored.
    Consecutive documents are separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The concatenated page contents, or an empty string when ``docs``
        is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)


# Summary pipeline: flatten the incoming documents to text, fill the summary
# prompt, call the model and return the reply as a plain string.
chain = {"docs": format_docs} | prompt | model | StrOutputParser()

question = "What are some techniques used for pen testing?"

# Keep the search hits under their own name. Rebinding `docs` here would
# overwrite the pages loaded from the PDF, so re-running the splitting cell
# afterwards would chunk the search hits instead of the book.
retrieved_docs = vector_store.similarity_search(question)

chain.invoke(retrieved_docs)

In [None]:
# Question-answering prompt. It has two placeholders: {context} for the
# retrieved text and {question} for the user's question.
RAG_TEMPLATE = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

<context>
{context}
</context>

Answer the following question:

{question}"""

# Chat prompt built from the template above; reused by the chains below.
rag_prompt = ChatPromptTemplate.from_template(RAG_TEMPLATE)

# RAG pipeline fed with pre-retrieved documents: the "context" entry of the
# input dict is replaced by its flattened text, then the prompt is filled,
# the model is called and the reply is parsed to a plain string.
chain = (
    RunnablePassthrough.assign(
        context=lambda inputs: format_docs(inputs["context"])
    )
    | rag_prompt
    | model
    | StrOutputParser()
)

question = "What are some techniques used for pen testing?"

# Store the search hits under a dedicated name so the `docs` variable holding
# the loaded PDF pages is not overwritten; earlier cells then stay re-runnable.
retrieved_docs = vector_store.similarity_search(question)

# Run
chain.invoke({"context": retrieved_docs, "question": question})

In [None]:
# Retriever interface over the vector store.
retriever = vector_store.as_retriever()

# Sub-pipeline: fetch documents for the incoming question, then flatten them to text.
context_pipeline = retriever | format_docs

# End-to-end QA chain: takes the raw question string, retrieves its own
# context, fills the RAG prompt, calls the model and returns a plain string.
qa_chain = (
    {"context": context_pipeline, "question": RunnablePassthrough()}
    | rag_prompt
    | model
    | StrOutputParser()
)

In [None]:
# Question unrelated to the pen-testing queries used above.
# NOTE(review): presumably a probe of the "say that you don't know"
# instruction in RAG_TEMPLATE — confirm intent.
question = "Are there any good places to eat in Irving TX?"

# qa_chain performs retrieval itself, so only the question string is passed in.
res = qa_chain.invoke(question)
pprint(res)