In [1]:
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.llms import Together
from langchain_core.prompts import PromptTemplate
import os
from dotenv import load_dotenv
from langchain_community.vectorstores import Milvus
from langchain_astradb import AstraDBVectorStore

In [2]:
# Load the source PDF; `pages` holds whatever the loader's load() call returns.
pdf_path = "../DocumentStore/The_Art_Of_War.pdf"
loader = PyMuPDFLoader(pdf_path)
pages = loader.load()

In [3]:
# Split the loaded pages into overlapping chunks.
# Sizes are measured with len(); the separators below are plain strings
# (is_separator_regex=False), listed from coarse (paragraph break) down to
# fine (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        ".",
        ",",
        "\u200B",  # Zero-width space
        "\uff0c",  # Fullwidth comma
        "\u3001",  # Ideographic comma
        "\uff0e",  # Fullwidth full stop
        "\u3002",  # Ideographic full stop
        " ",
        "",
    ],
    is_separator_regex=False,
    length_function=len,
    chunk_size=1000,
    chunk_overlap=200,
)
splits = text_splitter.split_documents(pages)

In [4]:
# splits[1]

In [4]:
# Embedding model that is handed to the vector store as its `embedding`.
embedding_function = SentenceTransformerEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
)

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Astra DB connection settings are read from the environment (optionally
# populated from a .env file). Required variables:
#   ASTRA_DB_API_ENDPOINT       - API endpoint of the database
#   ASTRA_DB_APPLICATION_TOKEN  - application token used to authenticate
#
# load_dotenv() runs here, before the first os.environ lookup, so this cell
# works on a fresh kernel; by default it does not override variables that are
# already set, so calling it again later is harmless.
load_dotenv()

ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
# Fix: the token used to be read from ASTRA_DB_API_ENDPOINT, which passed the
# endpoint URL as the credential.
ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]

# An empty (or whitespace-only) answer means "no explicit namespace": None is
# passed through to the store in that case.
desired_namespace = input("(optional) Namespace = ").strip()
ASTRA_DB_KEYSPACE = desired_namespace or None

# NOTE(review): nothing visible in this notebook adds `splits` to the
# collection; presumably "astra_vector_demo" was populated in an earlier
# session - confirm before relying on the search results below.
vectorstore = AstraDBVectorStore(
    embedding=embedding_function,
    collection_name="astra_vector_demo",
    api_endpoint=ASTRA_DB_API_ENDPOINT,
    token=ASTRA_DB_APPLICATION_TOKEN,
    namespace=ASTRA_DB_KEYSPACE,
)

In [17]:
# vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_function, persist_directory="../chroma_d")

In [26]:
# Direct similarity lookup on the vector store with a short keyword query.
k = 10
query = "Sun Tzu"
results = vectorstore.similarity_search(query=query, k=k)

In [27]:
# Wrap the vector store as a retriever limited to k results, then run a
# sample question through it. `retriever` is reused by the RAG chain.
query = "Who was Sun Tzu?"
k = 5
retriever = vectorstore.as_retriever(search_kwargs=dict(k=k))
results = retriever.invoke(query)

def format_docs(docs):
    """Concatenate the `page_content` of each item in `docs`.

    Args:
        docs: Iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string with the contents separated by a blank line
        (two newline characters); an empty string when `docs` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [28]:
# Load variables from a .env file (if any) so TOGETHER_API_KEY can be read
# from os.environ below.
load_dotenv()

# Question-answering prompt with two placeholders: {question} and {context}.
prompt = PromptTemplate(
    template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Keep answers descriptive and mention the process.\nQuestion: {question} \nContext: {context} \nAnswer:",
    input_variables=['context', 'question'],
)

# Note: despite its name, `response` holds the LLM object itself, not a reply.
response = Together(
    model="mistralai/Mistral-7B-Instruct-v0.1",
    temperature=0.3,
    max_tokens=512,
    together_api_key=os.environ["TOGETHER_API_KEY"],
)

# Chain: the input is passed to the retriever (whose results are joined by
# format_docs) for "context" and passed through unchanged for "question";
# the result is piped into the prompt, the LLM, and finally a string parser.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | response
    | StrOutputParser()
)

In [29]:
# `query_list` is defined here but not used by anything visible in this cell.
query_list = ["Describe the ledger statement data-table?"]

# NOTE(review): "goverened" is misspelled; it is kept verbatim because it is
# the exact text sent to the chain.
# Alternative prompt kept for reference:
# query = "Summarise the document for me in 500 words"
query = "What is the art of war goverened by?"

# Run the question through the RAG chain and print the answer.
ls = rag_chain.invoke(query)
print(ls)

 The art of war is governed by strategy. It is a concept that has been studied and practiced for centuries, with many different philosophers and military leaders contributing to its development. The art of war is not a specific set of rules or laws, but rather a set of principles and guidelines that can be applied in a variety of situations. These principles include understanding the strengths and weaknesses of oneself and one's opponent, being able to adapt to changing circumstances, and making decisions based on a clear understanding of the situation at hand. The art of war is a complex and constantly evolving field, and it is up to the individual to study and understand its principles in order to be effective in battle.
