# Documentation chat bot
This notebook explores using an LLM to set up a chatbot for documentation.
* The chatbot should be able to answer basic questions and provide the documents that were used to generate its responses.

There are many good resources from LangChain about this application:
* https://python.langchain.com/v0.2/docs/tutorials/rag/
* https://python.langchain.com/v0.1/docs/use_cases/question_answering/citations/
* https://www.youtube.com/watch?v=Vw52xyyFsB8&list=PLfaIDFEXuae2LXbO1_PKyVJiQ23ZztA0x&index=4

In [7]:
%load_ext dotenv
%dotenv ../.env

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [8]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

### making model

In [9]:
from langchain_openai import ChatOpenAI

# Name of the OpenAI chat model that writes the final answers.
CHAT_MODEL_NAME = "gpt-3.5-turbo-0125"

llm = ChatOpenAI(model=CHAT_MODEL_NAME)



### making doc loader

In [10]:
from langchain_community.document_loaders import TextLoader

from os import listdir
from os.path import isfile, join

# Folder holding the plain-text documents the chatbot answers from.
mypath = r"./text"
docs = []

# listdir returns entries in arbitrary order; sorting keeps the document order
# (and everything derived from it, such as the chunks) reproducible across runs.
for f in sorted(listdir(mypath)):
    file_path = join(mypath, f)
    # Only regular files are loaded; sub-directories are skipped.
    if isfile(file_path):
        loader = TextLoader(file_path, encoding='utf8')
        docs.extend(loader.load())
print(len(docs))


11


In [11]:
# Splitter settings: target chunk size and the overlap shared by neighbouring chunks.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Break every loaded document into overlapping chunks.
splits = text_splitter.split_documents(docs)

In [12]:
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import TextLoader

import os
from os.path import  join



def get_retriever(path_to_docs="./text", force_reembed=False, index_path="faiss_index",
                  chunk_size=1000, chunk_overlap=200):
    """Return a retriever backed by a FAISS index of the documents in ``path_to_docs``.

    The index is built (and saved to ``index_path``) when it does not exist yet or
    when ``force_reembed`` is true; otherwise the saved index is loaded from disk.

    Unlike the previous version, the chunks that get embedded are derived from the
    files found in ``path_to_docs`` inside this function, not from a notebook-global
    ``splits`` variable, so the function honours its argument and works on a fresh
    kernel.

    Args:
        path_to_docs: Directory whose regular files are loaded as UTF-8 text.
        force_reembed: Rebuild and overwrite the saved index even if one exists.
        index_path: Directory the FAISS index is saved to / loaded from.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The result of ``vectorstore.as_retriever()``.

    Raises:
        ValueError: If an index has to be built but ``path_to_docs`` contains no
            loadable files.
    """
    embeddings = OpenAIEmbeddings()

    if force_reembed or not os.path.exists(index_path):
        docs = []
        # Sorted so the chunk order in the index does not depend on the
        # arbitrary order in which the OS lists directory entries.
        for f in sorted(os.listdir(path_to_docs)):
            file_path = os.path.join(path_to_docs, f)
            if os.path.isfile(file_path):
                loader = TextLoader(file_path, encoding='utf8')
                docs.extend(loader.load())

        # Fail early with a clear message instead of trying to embed nothing.
        if not docs:
            raise ValueError("no documents found in " + repr(path_to_docs))

        splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        chunks = splitter.split_documents(docs)

        vectorstore = FAISS.from_documents(chunks, embeddings)
        vectorstore.save_local(index_path)
    else:
        # NOTE(security): allow_dangerous_deserialization=True opts in to
        # deserializing the saved index; only load an index that this notebook
        # wrote itself, never one obtained from an untrusted source.
        vectorstore = FAISS.load_local(index_path, embeddings, allow_dangerous_deserialization=True)
    return vectorstore.as_retriever()

# Build the retriever with the default settings; a saved "faiss_index" is reused
# on later runs unless force_reembed=True is passed.
retriever = get_retriever()

In [13]:
prompt = hub.pull("rlm/rag-prompt") # RAG prompt pulled from the LangChain prompt hub; it takes 'context' and 'question' inputs


In [14]:
# Show the pulled prompt so its input variables and template text can be inspected.
print(prompt)

input_variables=['context', 'question'] metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [15]:
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, PromptTemplate

# Local, editable prompt with the same wording as the hub's "rlm/rag-prompt".
# NOTE(review): the chains in this notebook are built from `prompt`, not
# `my_prompt` - confirm which one is intended.
RAG_TEMPLATE = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "Use three sentences maximum and keep the answer concise."
    "\nQuestion: {question} "
    "\nContext: {context} "
    "\nAnswer:"
)

# Single human message that carries the whole instruction.
rag_human_message = HumanMessagePromptTemplate(
    prompt=PromptTemplate(
        input_variables=['context', 'question'],
        template=RAG_TEMPLATE,
    )
)

my_prompt = ChatPromptTemplate(
    input_variables=['context', 'question'],
    messages=[rag_human_message],
)

In [16]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects that expose a ``page_content`` string.

    Returns:
        The ``page_content`` values in order, separated by one blank line.
    """
    paragraph_break = "\n\n"
    page_texts = [document.page_content for document in docs]
    return paragraph_break.join(page_texts)


# Retrieve the chunks for a question and collapse them into one context string.
retrieved_context_text = retriever | format_docs

# The prompt needs both the retrieved context and the untouched question.
rag_prompt_inputs = {
    "context": retrieved_context_text,
    "question": RunnablePassthrough(),
}

# Plain question-answering chain: inputs -> prompt -> model -> answer string.
rag_chain = rag_prompt_inputs | prompt | llm | StrOutputParser()



## local and citing sources

In [17]:
from langchain_core.runnables import RunnableParallel

# Replace the list of retrieved documents under "context" with their joined text,
# leaving the other keys of the input untouched.
stringify_context = RunnablePassthrough.assign(
    context=(lambda chain_input: format_docs(chain_input["context"]))
)

# Answering chain that expects the documents to have been retrieved already.
rag_chain_from_docs = stringify_context | prompt | llm | StrOutputParser()

# Run retrieval and question pass-through side by side, then attach the answer,
# so the result holds "context" (the source documents), "question" and "answer".
retrieve_with_question = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
)
rag_chain_with_source = retrieve_with_question.assign(answer=rag_chain_from_docs)




In [18]:
def _preview(text, head=40, tail=30):
    """Return ``text`` unchanged when short, else its first ``head`` and last ``tail`` characters joined by '...'."""
    # Truncate only when head and tail cannot overlap; otherwise the "preview"
    # would repeat characters and be longer than the text itself.
    if len(text) > head + tail:
        return text[:head] + "..." + text[len(text) - tail:]
    return text


def ask_question(question):
    """Ask the source-citing RAG chain a question and print the result.

    Prints the question, the generated answer, and one line per retrieved
    document showing its ``source`` metadata and a short preview of its text.

    Args:
        question: The question string passed to ``rag_chain_with_source``.

    Returns:
        None. The result is only printed.
    """
    ans = rag_chain_with_source.invoke(question)
    print("question = ",ans["question"])
    print("answer = ", ans['answer'])
    print("Documents used:")
    for d in ans['context']:
        # Tolerate documents without a recorded source instead of raising KeyError.
        source = d.metadata.get('source', '<unknown source>')
        print("\tsource: "+source+"\t"+_preview(d.page_content))
# Example query; the answer and the documents it was drawn from are printed below.
ask_question("Which countries have a major service sector")

question =  Which countries have a major service sector
answer =  The United Kingdom, India, and Japan have major service sectors contributing a significant portion to their GDP. The service sector in the United Kingdom accounts for 82% of GDP, in India over 50% of GDP, and in Japan approximately 70% of GDP.
Documents used:
	source: ./text\econ_uk.txt	The service sector dominates, contributi...otland being the richest areas
	source: ./text\econ_india.txt	The service sector makes up more than 50... at 29.3% of GDP in 2022.[104]
	source: ./text\econ_japan.txt	Japan has a highly service-dominated eco...panies are based in Japan.[41]
	source: ./text\econ_uk.txt	The United Kingdom has one of the most g...oughly 24.5% of GDP.[4][38][3]


Of note: the US text only mentions that its economy dominates the services trade, not how large the service sector is relative to GDP, which is likely why the US does not appear in the answer.
> The U.S. not only has the largest internal market for goods, but also dominates the services trade.  

This could be fixed with more specific queries or more explicit documents.