In [1]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Together
from langchain_core.prompts import PromptTemplate
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Milvus

In [2]:
# Load the source PDF with PyMuPDF; the path is relative to the notebook's
# working directory.
loader = PyMuPDFLoader("../DocumentStore/The_Art_Of_War.pdf")
pages = loader.load()
# Display the second loaded element as a quick sanity check of the load.
pages[1]

In [3]:
# Split the loaded pages into overlapping chunks for embedding.
# Sizes are measured with `len`, i.e. in characters: chunk_size=1000 with
# 200 characters of overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    # Literal (non-regex) separators, listed from coarsest to finest.
    separators=[
        "\n\n",
        "\n",
        ".",
        ",",
        "\u200B",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        " ",
        "",
    ],
    is_separator_regex=False,
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(pages)

In [4]:
# Display the second chunk to eyeball the result of the split.
splits[1]

In [5]:
# Embedding model used for both indexing the chunks and embedding queries;
# the model is selected by name ("BAAI/bge-base-en-v1.5").
embedding_function = SentenceTransformerEmbeddings(model_name="BAAI/bge-base-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Embed the chunks and store them in a Milvus collection on the local server.
vectorstore = Milvus.from_documents(
    splits,
    embedding_function,
    connection_args={"host": "127.0.0.1", "port": "19530"},
    collection_name="part2_langchain",  # custom collection name
    # The index and the search must use the same metric, so the inner-product
    # metric is declared on the index as well as on the search parameters.
    index_params={
        "index_type": "HNSW",
        "metric_type": "IP",
        "params": {"M": 8, "efConstruction": 64},
    },
    # Milvus names the metric with the key "metric_type"; the previous
    # "metric" key is not a recognised search parameter, so the requested
    # inner-product metric was not actually being applied.
    # "ef" must be at least as large as the biggest k used below (k=10).
    search_params={"metric_type": "IP", "offset": 0, "params": {"ef": 64}},
    # Rebuild the collection on every run: without this, re-executing the cell
    # appends the same chunks again and searches return duplicates. It also
    # ensures the index is recreated with the metric declared above.
    drop_old=True,
)

In [14]:
# vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function, persist_directory="../chroma_db")

In [15]:
# Query the vector store directly: fetch the k most similar chunks for a
# short query. The results are stored in `results` but not displayed here.
query = "Sun Tzu"
k = 10
results = vectorstore.similarity_search(query, k=k)

In [16]:
# Wrap the vector store as a retriever that returns k results per query, then
# run one query through it.
# NOTE: this rebinds `k`, `query` and `results` from the previous cell.
k = 5
retriever = vectorstore.as_retriever(search_kwargs={"k":k})
query = "Who was Sun Tzu?"
results = retriever.invoke(query)

def format_docs(docs):
    """Join the text of the given documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values concatenated in order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

In [17]:
# Load environment variables (including TOGETHER_API_KEY, read below) from a
# local .env file so the key never has to be written into the notebook.
load_dotenv()

# Prompt with two placeholders: the retrieved context and the user's question.
prompt = PromptTemplate(
    template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep answers descriptive and mention the process.\nQuestion: {question} \nContext: {context} \nAnswer:",
    input_variables=["context", "question"],
)

# NOTE: despite its name, `response` is the LLM client, not a model reply.
# Indexing os.environ directly raises KeyError if the key is not set.
response = Together(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    temperature=0.3,
    max_tokens=512,
    together_api_key=os.environ["TOGETHER_API_KEY"],
)

# The incoming question is used twice: passed to the retriever (whose output
# is flattened to text by format_docs) and forwarded unchanged as "question".
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# retrieval + passthrough -> prompt -> LLM -> plain string
rag_chain = chain_inputs | prompt | response | StrOutputParser()

In [21]:
# Question to send through the RAG chain.
query = "What is the art of war governed by?"

# NOTE(review): `query_list` is not used anywhere in this cell, and its
# question does not appear to relate to this document — confirm whether it is
# still needed.
query_list = ["Describe the ledger statement data-table?", ]
# query = "Summarise the document for me in 500 words"
# Run retrieval + generation for `query` and print the chain's string output.
ls = rag_chain.invoke(query)
print(ls)

 The art of war is governed by the five constant factors, to be taken into account in one’s deliberations, when seeking to determine the conditions obtaining in the field. These are: (1) The Moral Law; (2) Heaven; (3) Earth; (4) The Commander; (5) Method and discipline.
