In [None]:
%pip install pypdf sentence-transformers replicate

In [None]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HypotheticalDocumentEmbedder, HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.llms import OpenAI, Replicate
from langchain.chains import VectorDBQA, ConversationalRetrievalChain
from langchain.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from dotenv import load_dotenv
import os, sys

# Read variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Replicate API token needed by the
# Replicate LLM created further down — confirm against the .env file.
load_dotenv()

In [None]:
root_folder = 'H:/Meu Drive/Prog/IA/Daily Dose of Data Science - Archive.pdf'
persist_directory = './data/processed'

# Both the build path and the reload path use the default HuggingFaceEmbeddings
# model, so it is created once up front.
embedding = HuggingFaceEmbeddings()

# Reuse the persisted store only when the directory actually contains files.
# A bare os.path.exists() check also passes for an empty directory (e.g. one left
# behind by an interrupted run), which would silently open a store with no documents.
has_persisted_store = os.path.isdir(persist_directory) and bool(os.listdir(persist_directory))

if has_persisted_store:
    # Load the persisted database from disk and use it as normal.
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
else:
    # Load and process the text
    loader = PyPDFLoader(root_folder)
    documents = loader.load()

    # NOTE(review): per the LangChain docs, CharacterTextSplitter splits on a single
    # separator only, so chunks can exceed chunk_size when the extracted PDF text
    # lacks that separator. RecursiveCharacterTextSplitter (already imported) may be
    # a better fit — confirm before switching, since it changes the stored chunks.
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    texts = text_splitter.split_documents(documents)

    # Embed and store the texts
    vectordb = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=persist_directory)
    vectordb.persist()

In [None]:
# Llama 2 13B chat model served through Replicate. Setup follows
# https://medium.com/@woyera/how-to-chat-with-your-pdf-using-python-llama-2-41df80c4e674
replicate_model_id = "a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5"

# Generation settings forwarded to the hosted model.
replicate_model_kwargs = {"temperature": 0.75, "max_length": 3000}

llm = Replicate(model=replicate_model_id, model_kwargs=replicate_model_kwargs)

In [None]:
# Retriever over the vector store; each question pulls the 2 closest chunks.
retriever = vectordb.as_retriever(search_kwargs={'k': 2})

# Conversational retrieval chain that also returns the source documents
# alongside each answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [None]:
# Start chatting with the chatbot.
# Type "exit", "quit" or "q" to stop. Each (question, answer) pair is appended to
# chat_history and passed back to the chain on the next turn.
chat_history = []
while True:
    # Strip surrounding whitespace so inputs like " quit " are still recognised.
    query = input('Prompt: ').strip()
    if not query:
        # Nothing typed: ask again instead of sending an empty question to the model.
        continue
    if query.lower() in ("exit", "quit", "q"):
        print('Exiting')
        # break rather than sys.exit(): inside a notebook, sys.exit() raises
        # SystemExit, which shows up as an exception in the cell output instead of
        # a clean stop. Note that later cells can now run after the loop ends.
        break
    result = qa_chain({'question': query, 'chat_history': chat_history})
    print('Answer: ' + result['answer'] + '\n')
    chat_history.append((query, result['answer']))

In [None]:
# To clean up, you can delete the collection and persist that change.
# WARNING: this is destructive — run it deliberately, not as part of a routine
# top-to-bottom execution, and only when the stored embeddings are no longer needed.
# NOTE(review): nothing here removes `persist_directory` itself, and the load cell
# decides whether to rebuild based on that directory — so a later run will presumably
# reopen an empty store instead of re-embedding the PDF. Confirm, and delete the
# directory manually if a rebuild is wanted.
vectordb.delete_collection()
vectordb.persist()