# Setup

Instal

In [1]:
#! pip install pypdf
#! pip install langchain
#! pip install "langchain[docarray]"
#! pip install chromadb

Import

In [1]:
import os
import openai
import sys
from langchain_community.document_loaders import PyPDFLoader

# from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings

from langchain.vectorstores import Chroma
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

API KEY

In [2]:
sys.path.append("../..")

from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())  # read local .env file

openai.api_key = os.environ["OPENAI_API_KEY"]

# Load document

In [21]:
loaders = [
    PyPDFLoader("../pdf/EdAP 2020_EN.pdf"),
    PyPDFLoader("../pdf/SOF book-web-rev3d-hires.pdf"),
]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [22]:
len(docs)

878

In [25]:
page = docs[870]
print(f"Contenu de la page \n{page.page_content[0:500]}")
print(f"METADATA : {page.metadata}")

Contenu de la page 
440   |  Les forêts du bassin du Congo
BibliographieRoca T, Letouzé E. 2016. La révolution des données est-elle en marche ? Implications pour la statistique 
publique et la démocratie. Afrique contemporaine. 258(2):95-111.
Rosen GE, Smith KF. 2010. Summarizing the evidence on the international trade in illegal wildlife. 
EcoHealth. 7(1):24-32.
RRI (Rights and Resources Initiative). 2017. Securing community land rights : Priorities and Opportunities to 
advance climate and sustainable development
METADATA : {'source': '../pdf/SOF book-web-rev3d-hires.pdf', 'page': 466}


# Split document

In [27]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500, chunk_overlap=50, separators=["\n\n", "\n", "(?<=\. )", " ", ""]
)
splits = text_splitter.split_documents(docs)
print(len(splits))
print(len(docs))

2196
878


# Embed / Vector store

Setup Embedding and Vector store

In [44]:
embedding = OpenAIEmbeddings()
persist_directory = "../vectorstore/chroma/"


# Delete the persist_directory if you want to force the generatation of another vector store
#! rm -rf persist_directory

Generate new Vectore Store if foler Chroma is empty

In [45]:
# Check if the vector store already exists
if os.path.exists(persist_directory):
    # If the vector store exists, load it
    vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)
else:
    # If the vector store does not exist, generate it
    # Assuming 'splits' is a list of documents already defined elsewhere in your notebook
    vectordb = Chroma.from_documents(
        documents=splits, embedding=embedding, persist_directory=persist_directory
    )

Similarity search

In [49]:
# print(vectordb._collection.count())
question = "What is OFAC stand for ?"
answer_docs = vectordb.similarity_search(question, k=3)
len(answer_docs)
answer_docs[0].page_content
for d in answer_docs:
    print(d.metadata)

{'page': 158, 'source': '../pdf/EdAP 2020_EN.pdf'}
{'page': 216, 'source': '../pdf/SOF book-web-rev3d-hires.pdf'}
{'page': 387, 'source': '../pdf/EdAP 2020_EN.pdf'}


MMR search

In [50]:
answer_docs = vectordb.max_marginal_relevance_search(question, k=2, fetch_k=3)
len(answer_docs)
answer_docs[0].page_content
for d in answer_docs:
    print(d.metadata)

{'page': 158, 'source': '../pdf/EdAP 2020_EN.pdf'}
{'page': 387, 'source': '../pdf/EdAP 2020_EN.pdf'}


Filter on SOF (State of forest) only (doesn't work)

In [56]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should be one of EdAP 2020_EN.pdf or SOF book-web-rev3d-hires.pdf`",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the document",
        type="integer",
    ),
]

document_content_description = "Report of forest and protected area"
llm = OpenAI(model="gpt-3.5-turbo-instruct", temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm, vectordb, document_content_description, metadata_field_info, verbose=True
)

In [59]:
question = "what OFAC stand for, check answer in SOF document"
docs = retriever.get_relevant_documents(question)
for d in docs:
    print(d.metadata)

[]


Compression of retrieved data

In [64]:
# Guillaume comment :
# In this section, we will use a Langchain compressor to reduce the amount of text retrieve.
# By doing this, the summarized text produced is almost an answer


def pretty_print_docs(docs):
    print(
        f"\n{'-' * 100}\n".join(
            [f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]
        )
    )


# Wrap our vectorstore
llm = OpenAI(temperature=0, model="gpt-3.5-turbo-instruct")

compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=vectordb.as_retriever()
)

question = "what is the difference between OFAC and COMIFAC?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

OFAC and COMIFAC
----------------------------------------------------------------------------------------------------
Document 2:

- COMIFAC
- Central Africa
- sustainable and coordinated management of forest ecosystems
- orientation, harmonization and monitoring of forestry and environmental policies
- emerged from the commitments made in March 1999 by the Heads of State of Central Africa in the “Yaoundé Declaration”
- ten member countries of the subregion
- common natural heritage
- legal framework governed by the February 2005 treaty: “Treaty on the Conservation and Sustainable Management of Forest Ecosystems in Central Africa and to establish the Central African Forests Commission”
- Convergence Plan defines the shared ten-year intervention strategies of Central African States and development partners in the field of conservation and sustainable management of forest and savanna ecosystems
- second edition of this plan, covering the period 2015-2025
- Web site: www.comi

# Question answering

Stuff technics : all document are sent to the LLM to answer

In [71]:
# Here we are creating a answer chain with Langchain. We make a specific prompt.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

# Run chain
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [73]:
question = "What is the difference between OFAC and COMIFAC?"
result = qa_chain.invoke({"query": question})
result["result"]

"OFAC is a specialized unit of COMIFAC responsible for coordinating the Forest Observatory and disseminating information on Central Africa's forests and ecosystems. COMIFAC, on the other hand, is an organization responsible for harmonizing and monitoring forestry and environmental policies in Central Africa. Thanks for asking!"

Map reduce : it combine all retrieved data before sending to the LLM to answer

In [78]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm, retriever=vectordb.as_retriever(), chain_type="map_reduce"
)
question = (
    "What is the main expertise of OFAC ? How it contribute to reduce deforestation ?"
)
result = qa_chain.invoke({"query": question})
result["result"]

"OFAC's main expertise is in collecting and managing environmental data in Central Africa to support the sustainable management of forest ecosystems. It contributes to reducing deforestation by providing essential tools for steering and sharing knowledge for better governance and sustainable management of forest ecosystems in the region. Thanks for asking!"

Refine : it improve the answer with each different document

In [80]:
qa_chain_mr = RetrievalQA.from_chain_type(
    llm, retriever=vectordb.as_retriever(), chain_type="refine"
)
question = (
    "What is the main expertise of OFAC ? How it contribute to reduce deforestation ?"
)
result = qa_chain.invoke({"query": question})
result["result"]

"OFAC's main expertise is in collecting and managing environmental data in Central Africa to support the sustainable management of forest ecosystems. It contributes to reducing deforestation by providing essential tools for steering and sharing knowledge for better governance and sustainable management of forest ecosystems in the region. Thanks for asking!"

# Q/A with memory

In [81]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

question = (
    "What is the main expertise of OFAC ? How it contribute to reduce deforestation ?"
)
result = qa.invoke({"question": question})

In [82]:
result["answer"]

'The main expertise of OFAC (Central Africa Forest Observatory) lies in ensuring the availability of information to support the sustainable management of forest ecosystems in Central Africa. OFAC contributes to reducing deforestation by collecting and managing environmental data at different scales, conducting annual campaigns to collect reference data in its member states, and collaborating with various partners to harmonize and disseminate information. Additionally, OFAC plays a crucial role in promoting knowledge transfer and skills between countries and actors, ultimately providing essential tools for steering and sharing knowledge for better governance and sustainable management of forest ecosystems in the region.'

In [83]:
question = "Which difference with COMIFAC ?"  # Check if it understand it's difference between OFAC and COMIFAC
result = qa.invoke({"question": question})
result["answer"]

'OFAC and COMIFAC are both mentioned in the provided context, but the text does not explicitly outline the specific differences between the two entities. The text describes OFAC as playing a role in data storage, analysis, and transmission to support decision-making within the reach of managers, while COMIFAC is mentioned in the context of a Council of Ministers and holders of global issues. For more detailed differences between OFAC and COMIFAC, further information or sources specific to these organizations would be needed.'

# Chatbot with memory (doesn't work!!)

In [3]:
def load_db(file, chain_type, k):
    # load documents
    loader = PyPDFLoader(file)
    documents = loader.load()
    # split documents
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    docs = text_splitter.split_documents(documents)
    # define embedding
    embeddings = OpenAIEmbeddings()
    # create vector database from data
    db = DocArrayInMemorySearch.from_documents(docs, embeddings)
    # define retriever
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": k})
    # create a chatbot chain. Memory is managed externally.
    qa = ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name="gpt-3.5-turbo-instruct", temperature=0),
        chain_type=chain_type,
        retriever=retriever,
        return_source_documents=True,
        return_generated_question=True,
    )
    return qa

In [4]:
import panel as pn
import param


class cbfs(param.Parameterized):
    chat_history = param.List([])
    answer = param.String("")
    db_query = param.String("")
    db_response = param.List([])

    def __init__(self, **params):
        super(cbfs, self).__init__(**params)
        self.panels = []
        self.loaded_file = "../pdf/SOF book-web-rev3d-hires.pdf"
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        if count == 0 or file_input.value is None:  # init or no file specified :
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")
        else:
            file_input.save("temp.pdf")  # local copy
            self.loaded_file = file_input.filename
            button_load.button_style = "outline"
            self.qa = load_db("temp.pdf", "stuff", 4)
            button_load.button_style = "solid"
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        if not query:
            return pn.WidgetBox(
                pn.Row("User:", pn.pane.Markdown("", width=600)), scroll=True
            )
        result = self.qa({"question": query, "chat_history": self.chat_history})
        self.chat_history.extend([(query, result["answer"])])
        self.db_query = result["generated_question"]
        self.db_response = result["source_documents"]
        self.answer = result["answer"]
        self.panels.extend(
            [
                pn.Row("User:", pn.pane.Markdown(query, width=600)),
                pn.Row(
                    "ChatBot:",
                    pn.pane.Markdown(
                        self.answer, width=600, style={"background-color": "#F6F6F6"}
                    ),
                ),
            ]
        )
        inp.value = ""  # clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    @param.depends(
        "db_query ",
    )
    def get_lquest(self):
        if not self.db_query:
            return pn.Column(
                pn.Row(
                    pn.pane.Markdown(
                        "Last question to DB:", styles={"background-color": "#F6F6F6"}
                    )
                ),
                pn.Row(pn.pane.Str("no DB accesses so far")),
            )
        return pn.Column(
            pn.Row(
                pn.pane.Markdown("DB query:", styles={"background-color": "#F6F6F6"})
            ),
            pn.pane.Str(self.db_query),
        )

    @param.depends(
        "db_response",
    )
    def get_sources(self):
        if not self.db_response:
            return
        rlist = [
            pn.Row(
                pn.pane.Markdown(
                    "Result of DB lookup:", styles={"background-color": "#F6F6F6"}
                )
            )
        ]
        for doc in self.db_response:
            rlist.append(pn.Row(pn.pane.Str(doc)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends("convchain", "clr_history")
    def get_chats(self):
        if not self.chat_history:
            return pn.WidgetBox(
                pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True
            )
        rlist = [
            pn.Row(
                pn.pane.Markdown(
                    "Current Chat History variable",
                    styles={"background-color": "#F6F6F6"},
                )
            )
        ]
        for exchange in self.chat_history:
            rlist.append(pn.Row(pn.pane.Str(exchange)))
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        self.chat_history = []
        return

In [5]:
cb = cbfs()

file_input = pn.widgets.FileInput(accept=".pdf")
button_load = pn.widgets.Button(name="Load DB", button_type="primary")
button_clearhistory = pn.widgets.Button(name="Clear History", button_type="warning")
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput(placeholder="Enter text here…")

bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp)

jpg_pane = pn.pane.Image("./img/convchain.jpg")

tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation, loading_indicator=True, height=300),
    pn.layout.Divider(),
)
tab2 = pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources),
)
tab3 = pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
tab4 = pn.Column(
    pn.Row(file_input, button_load, bound_button_load),
    pn.Row(
        button_clearhistory,
        pn.pane.Markdown("Clears chat history. Can use to start a new topic"),
    ),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400)),
)
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown("# ChatWithYourData_Bot")),
    pn.Tabs(
        ("Conversation", tab1),
        ("Database", tab2),
        ("Chat History", tab3),
        ("Configure", tab4),
    ),
)
dashboard

ImportError: Could not import docarray python package. Please install it with `pip install "langchain[docarray]"`.