# Exploring vector databases and retrievers
This notebook explores different ways to set up a vector database and configure retrievers on top of it.

# Set up

In [1]:
# Load the `dotenv` IPython extension, then read environment variables from ../.env.
# NOTE(review): presumably this supplies the OpenAI API key used by ChatOpenAI and
# OpenAIEmbeddings below — confirm against the contents of .env.
%load_ext dotenv
%dotenv ../.env

In [2]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS

from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

from langchain_core.runnables import RunnableParallel, RunnablePassthrough, RunnableSequence, RunnableAssign, RunnableLambda
from langchain_core.output_parsers import StrOutputParser


import os
from os.path import  join

In [3]:

# Chat model shared by every chain in this notebook; make_chain reads this global.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125")


In [4]:
# Build the FAISS index from the HTML files under ./text, or reload the copy
# cached on disk. Set need_to_recompile = True to force a rebuild after the
# source documents change.
# TODO: this needs more work
need_to_recompile = False
index_dir = "faiss_index"
embeddings = OpenAIEmbeddings()  # one instance shared by the build and load paths

if need_to_recompile or not os.path.exists(index_dir):
    path_to_docs = './text'
    docs = []
    # os.listdir returns entries in arbitrary order; sort them so that a
    # rebuild feeds documents to the index in a reproducible order.
    for f in sorted(os.listdir(path_to_docs)):
        file_path = join(path_to_docs, f)
        if os.path.isfile(file_path):  # skip sub-directories
            loader = UnstructuredHTMLLoader(file_path)
            docs.extend(loader.load())

    # Overlapping chunks so text cut at a chunk boundary still appears whole
    # in one of the neighbouring chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    splits = text_splitter.split_documents(docs)

    vectorstore = FAISS.from_documents(splits, embeddings)
    vectorstore.save_local(index_dir)
else:
    # NOTE(security): allow_dangerous_deserialization=True is required because
    # the saved index is pickle-backed. Only load an index directory that this
    # notebook created itself; never one obtained from an untrusted source.
    vectorstore = FAISS.load_local(index_dir, embeddings, allow_dangerous_deserialization=True)


In [5]:
# Single human-message prompt for retrieval-augmented QA. It expects two
# variables: 'question' (the user's question) and 'context' (the retrieved
# text, already flattened to a string).
rag_template = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\nQuestion: {question} "
    "\nContext: {context} "
    "\nAnswer:"
)

rag_human_message = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=['context', 'question'],
        template=rag_template,
    )
)

prompt = ChatPromptTemplate(
    input_variables=['context', 'question'],
    messages=[rag_human_message],
)

In [6]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents in input order, separated by a blank line ("\\n\\n").
        An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

In [7]:
def make_chain(retriever):
    """Build a retrieval-augmented QA chain around ``retriever``.

    Args:
        retriever: Runnable that is invoked with the question and returns the
            retrieved documents (objects exposing ``page_content``).

    Returns:
        A runnable. Invoking it with a question yields a dict with three keys:
        "context" (the retrieved documents, untouched), "question" (the input
        passed through) and "answer" (the model's reply as a string).
    """
    # Step 1: fan the input question out into
    # {"context": retriever.invoke(question), "question": question}.
    retrieve = RunnableParallel({
        "context": retriever,
        "question": RunnablePassthrough(),
    })

    # Step 2a: the prompt needs "context" as text, but at this point it is a
    # list of documents. RunnableAssign keeps the incoming dict and overwrites
    # only the "context" key with the formatted string. The mapper is wrapped
    # in RunnableParallel explicitly, matching the outer RunnableAssign below,
    # instead of relying on a bare dict being coerced.
    stringify_context = RunnableAssign(
        mapper=RunnableParallel(
            {"context": RunnableLambda(lambda x: format_docs(x["context"]))}
        )
    )

    # Step 2b: formatted dict -> prompt -> chat model -> plain string.
    answer = stringify_context | prompt | llm | StrOutputParser()

    # Step 3: RunnableAssign merges the mapper's output into its input, i.e.
    # {**step1_dict, "answer": <string>}, so the original documents stay
    # available to the caller alongside the answer.
    return retrieve | RunnableAssign(mapper=RunnableParallel({"answer": answer}))

In [8]:
def _preview(text, head=40, tail=30):
    """Shorten ``text`` to its first ``head`` and last ``tail`` characters.

    Text of at most ``head + tail`` characters is returned unchanged, so the
    two slices can never overlap and repeat part of the text.
    """
    if len(text) <= head + tail:
        return text
    return text[:head] + "..." + text[-tail:]


def ask_question(chain, question):
    """Run ``chain`` on ``question`` and print the answer and its sources.

    Args:
        chain: Object whose ``invoke(question)`` returns a dict with the keys
            "question", "answer" and "context", where "context" is an iterable
            of documents exposing ``page_content`` and ``metadata``.
        question: The question passed to ``chain.invoke``.

    Returns:
        None. The question, the answer, and one line per document (its source
        plus a preview of its text) are written to stdout.
    """
    ans = chain.invoke(question)
    print("question = ", ans["question"])
    print("answer = ", ans['answer'])
    print("Documents used:")
    for d in ans['context']:
        # Not every document is guaranteed to carry a 'source' entry; fall
        # back to a placeholder instead of raising KeyError mid-report.
        source = d.metadata.get('source', '<unknown>')
        print(f"\tsource: {source}\t{_preview(d.page_content)}")

## FYI: other ways to build the chain


Other ways to write the same chain:
```python
RunnableSequence(
    RunnableParallel({
        "context": retriever,
        "question": RunnablePassthrough()
    }),
    RunnableAssign(           
      mapper=RunnableParallel(
          {"answer": 
            RunnableAssign(
                mapper={"context": RunnableLambda(lambda x: format_docs(x['context']))}
              )
            | prompt
            | llm
            | StrOutputParser()
          }
      )
    )   
)
############
RunnableParallel({
    "context": retriever,
    "question": RunnablePassthrough()
}).assign(answer=RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
        | prompt
        | llm
        | StrOutputParser()
)


```

# Default

In [9]:
# Baseline: a retriever created with no search_type or search_kwargs, i.e.
# whatever defaults the vector store applies.
retriever  = vectorstore.as_retriever()
chain = make_chain(retriever)
ask_question(chain, "who was the first president")

question =  who was the first president
answer =  George Washington was the first president of the United States, taking office in 1789. The national capital moved to Philadelphia in 1790 and then to Washington, D.C., in 1800. Washington's presidency was marked by the creation of a federal government with a strong president and powers of taxation.
Documents used:
	source: ./text\history.html	Early republic (1793–1830)[edit]

Main a... in Washington, D.C., in 1800.
	source: ./text\history.html	Nationalists – most of them war veterans...residency of George Washington
	source: ./text\history.html	Main article: 

Presidency of Richard Ni...ng to the right-center.

[271]
	source: ./text\history.html	[471]

campaigning for the

2024 preside...education in the United States


In [10]:
# MMR search returning k=6 documents, aiming for more diverse results.
# Useful if your dataset has many similar documents.
# NOTE(review): lambda_mult=0.25 is presumably what pushes MMR toward diversity
# (lower values = more diverse, per the LangChain docs) — confirm.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 6, 'lambda_mult': 0.25}
)
chain = make_chain(retriever)
ask_question(chain, "describe the history of US and UK")

question =  describe the history of US and UK
answer =  The United States and Great Britain had generally satisfactory results from the war. Relations remained peaceful after the American Civil War, leading to close alliances in the 20th century due to world conflicts. The Rush-Bagot Treaty of 1817 demilitarized the Great Lakes and Lake Champlain, establishing a boundary that remains in effect today.
Documents used:
	source: ./text\war1812.html	The long-term results of the war were ge...
Royal Naval Dockyard, Bermuda
	source: ./text\econhistory.html	Toggle the table of contents

Economic h...inancial center

[1]

[2]

[3]
	source: ./text\history.html	United States began with the arrival of
...an in

Massachusetts in

1775.
	source: ./text\history.html	Bibliography of American history

Coloni...ted States factor

Notes[edit]
	source: ./text\history.html	Authority control databases : National F... clarification from March 2024
	source: ./text\war1812.html	Aprill, Alex (October 2015). "Ge

In [15]:
# Fetch more documents for the MMR algorithm to consider (fetch_k=9),
# but only return the top 3 (k=3)
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={'k': 3, 'fetch_k': 9}
)
chain = make_chain(retriever)
ask_question(chain, "describe the history of US and UK")

question =  describe the history of US and UK
answer =  The US and UK have a long history starting from the colonization of North America by the British Empire in the late 15th century. The Rush–Bagot Treaty between the US and Britain was enacted in 1817, establishing a demilitarized boundary that remains in effect today. Despite occasional border disputes and tensions, the two countries became close allies in the 20th century due to multiple world conflicts.
Documents used:
	source: ./text\war1812.html	The long-term results of the war were ge...
Royal Naval Dockyard, Bermuda
	source: ./text\history.html	Bibliography of American history

Coloni...ted States factor

Notes[edit]
	source: ./text\history.html	United States began with the arrival of
...an in

Massachusetts in

1775.


In [12]:
# Only retrieve documents that have a relevance score
# above a certain threshold (score_threshold=0.8).
# NOTE: in the recorded run below nothing is listed under "Documents used:",
# so the printed answer was produced with an empty context. A threshold of 0.8
# appears to be too strict for this question/index — consider lowering it.
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={'score_threshold': 0.8}
)
chain = make_chain(retriever)
ask_question(chain, "describe the history of US and UK")




question =  describe the history of US and UK
answer =  The US and UK have a long history of diplomatic relations, dating back to the 18th century. The two countries have been allies in many conflicts, including World War I and World War II. Despite occasional disagreements, the US and UK have maintained a strong partnership based on shared values and interests.
Documents used:


In [19]:
# Use a metadata filter to only retrieve chunks that came from one source file.
# The 'source' values printed by ask_question match the path handed to the
# loader when the index was built (join('./text', <file name>)), so the filter
# value is built the same way. A hard-coded backslash separator would only
# match an index built on Windows and would silently match nothing elsewhere.
retriever = vectorstore.as_retriever(
    search_kwargs={'filter': {'source': join('./text', 'war1812.html')}}
)
chain = make_chain(retriever)
ask_question(chain, "describe the history of US and UK")


question =  describe the history of US and UK
answer =  The United States and Great Britain had generally satisfactory long-term relations after occasional tensions and border disputes. The Rush-Bagot Treaty of 1817 demilitarized the Great Lakes and laid the basis for a peaceful boundary that remains in effect today. The two nations became close allies in the 20th century due to multiple world conflicts.
Documents used:
	source: ./text\war1812.html	The long-term results of the war were ge...
Royal Naval Dockyard, Bermuda
	source: ./text\war1812.html	The war is seldom remembered in the Unit...and a growing friendship.[366]
	source: ./text\war1812.html	The historian Donald Hickey maintains th... of Good Feelings ensued.[369]
	source: ./text\war1812.html	At the same time, the British public wer...itish maritime supremacy".[35]
