# Ingestion

In [None]:
% pip install unstructured rapidocr-onnxruntime

In [1]:
from langchain.document_loaders import PyPDFDirectoryLoader

# Point the loader at the 'docs' folder and read its contents into `documents`.
# The sample output further down shows each Document carrying 'source' and 'page' metadata.
# NOTE(review): 'docs' is a relative path, so it presumably resolves against the
# kernel's working directory — confirm.
loader = PyPDFDirectoryLoader('docs')
documents = loader.load()

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Recursive splitter: separators are listed from coarsest (paragraph break)
# to finest (empty string, i.e. split anywhere).
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    # The third entry is a regex lookbehind that splits after ". " (end of a
    # sentence). It is a raw string because "\." is not a valid Python escape.
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    # Without this flag the separators are escaped and matched literally, so
    # the lookbehind pattern above would never match any text.
    is_separator_regex=True,
)

# Split every loaded document into small overlapping chunks.
r_docs = text_splitter.split_documents(documents)

In [60]:
# Peek at the first five chunks produced by the recursive splitter.
r_docs[:5]

[Document(page_content='Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0}),
 Document(page_content='Volume 1: Long Papers , pages 6507 - 6522', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0}),
 Document(page_content='May 22-27, 2022 c⃝2022 Association for Computational Linguistics', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0}),
 Document(page_content='MemSum: Extractive Summarization of Long Documents Using', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0}),
 Document(page_content='Multi-Step Episodic Markov Decision Processes\nNianlong Gu\nInstitute of Neuroinformatics,', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0})]

In [3]:
from langchain.text_splitter import NLTKTextSplitter

# Second splitting strategy, this time with the NLTK-based splitter.
# chunk_size=200 is not a hard cap here: the log below reports several chunks
# that are longer than the specified 200.
nltk_splitter = NLTKTextSplitter(chunk_size=200)

# `docs` is the chunk list that the vector stores below are built from.
docs = nltk_splitter.split_documents(documents)

Created a chunk of size 501, which is longer than the specified 200
Created a chunk of size 322, which is longer than the specified 200
Created a chunk of size 352, which is longer than the specified 200
Created a chunk of size 244, which is longer than the specified 200
Created a chunk of size 260, which is longer than the specified 200
Created a chunk of size 305, which is longer than the specified 200
Created a chunk of size 550, which is longer than the specified 200
Created a chunk of size 267, which is longer than the specified 200
Created a chunk of size 321, which is longer than the specified 200
Created a chunk of size 250, which is longer than the specified 200
Created a chunk of size 293, which is longer than the specified 200
Created a chunk of size 209, which is longer than the specified 200
Created a chunk of size 205, which is longer than the specified 200
Created a chunk of size 241, which is longer than the specified 200
Created a chunk of size 270, which is longer tha

In [4]:
# Inspect a single chunk produced by the NLTK splitter.
docs[2]

Document(page_content='When MemSum iteratively selects sentences\ninto the summary, it considers a broad infor-\nmation set that would intuitively also be used\nby humans in this task: 1) the text content of\nthe sentence, 2) the global text context of the\nrest of the document, and 3) the extraction his-\ntory consisting of the set of sentences that have\nalready been extracted.', metadata={'source': 'docs\\2022.acl-long.450.pdf', 'page': 0})

In [5]:
# NOTE(review): GPT4All and Embed4All are not referenced in the visible cells —
# confirm whether this import is still needed.
from gpt4all import GPT4All, Embed4All
from langchain.embeddings import GPT4AllEmbeddings

# Embedding object used below to build the Chroma and FAISS indexes.
gpt4all_embd = GPT4AllEmbeddings()

In [32]:
from langchain.embeddings import HuggingFaceEmbeddings

# Alternative embedding object backed by the "all-MiniLM-L6-v2" model.
hf_embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

## Vectorstore

In [6]:
from langchain.vectorstores import Chroma

# Build a Chroma vector store from the NLTK chunks using the GPT4All embeddings.
# `chroma_db` is the store the self-query and compression retrievers use later.
chroma_db = Chroma.from_documents(docs, gpt4all_embd)


In [8]:
# Ask the Chroma store to persist itself.
# NOTE(review): `chroma_db` was created without a `persist_directory` argument —
# confirm that this call actually writes the index to disk.
chroma_db.persist()

In [28]:
from langchain.vectorstores import FAISS

# Build a FAISS index from the same chunks and embeddings as the Chroma store,
# then save it locally under "faiss_index".
faiss_db = FAISS.from_documents(docs, gpt4all_embd)
faiss_db.save_local("faiss_index")

In [34]:
# Reload the saved index, passing the same embedding object that was used to build it.
new_db = FAISS.load_local("faiss_index", gpt4all_embd)

In [48]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.storage import InMemoryStore

# Vector store backing the retriever. Reuse the `hf_embeddings` object created
# earlier in the notebook instead of constructing a second HuggingFaceEmbeddings
# for the same "all-MiniLM-L6-v2" model.
vectorstore = Chroma(
    collection_name="full_documents",
    embedding_function=hf_embeddings,
)
# The storage layer for the parent documents
store = InMemoryStore()
# `text_splitter` (the recursive splitter defined above) is used as the child splitter.
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=text_splitter,
)


In [49]:
# Add the chunks to the parent-document retriever; ids=None supplies no explicit ids.
# NOTE(review): `docs` holds the NLTK-split chunks rather than the full `documents`,
# although the collection is named "full_documents" — confirm this is intended.
retriever.add_documents(docs, ids=None)

In [45]:
# Query the retriever's underlying vector store directly.
sub_docs = vectorstore.similarity_search("what is RAG")

In [46]:
# Show the text of the first hit; this indexes [0], so it assumes at least one result.
print(sub_docs[0].page_content)

which RAG does not require.


# Retrieval


## Self Query

In [None]:
import os
# os.environ["LANGCHAIN_TRACING_V2"] = "true"
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.langchain.plus"
# os.environ["LANGCHAIN_API_KEY"] = "..." # replace dots with your api key

In [9]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [10]:
# Descriptions of the metadata fields attached to each chunk, given as
# (name, description, type) triples and turned into AttributeInfo objects.
metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type=field_type)
    for field_name, field_description, field_type in (
        ("source", "The content the chunk is from.", "string"),
        ("page", "The page from the document", "integer"),
    )
]

In [11]:
# Build the self-query retriever from an OpenAI LLM (temperature 0), the Chroma
# store, a short description of the document contents and the metadata field
# descriptions defined above.
document_content_description = "Papers and lecture slides"
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=chroma_db,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [12]:
# Query string passed to the self-query retriever in the next cell.
question = "what is RAG?"

In [13]:
# Run the self-query retriever on `question`.
# NOTE(review): this rebinds `docs`, which earlier held the NLTK-split chunks used
# to build the vector stores — re-running those cells after this one would pick up
# these retrieval results instead.
docs = retriever.get_relevant_documents(question)

In [14]:
# Display the documents returned by the self-query retriever.
docs

[Document(page_content='An interactive demo of RAG models can be found at https://huggingface.co/rag/\n2', metadata={'page': 1, 'source': 'docs\\RAG.pdf'}),
 Document(page_content='Our RAG models achieve state-of-the-art results\non open Natural Questions [ 29], WebQuestions [ 3] and CuratedTrec [ 2] and strongly outperform\nrecent approaches that use specialised pre-training objectives on TriviaQA [ 24].', metadata={'page': 1, 'source': 'docs\\RAG.pdf'}),
 Document(page_content='This said, RAG techniques may work well in these settings, and\ncould represent promising future work.', metadata={'page': 8, 'source': 'docs\\RAG.pdf'}),
 Document(page_content='Acknowledgments\nThe authors would like to thank the reviewers for their thoughtful and constructive feedback on this\npaper, as well as HuggingFace for their help in open-sourcing code to run RAG models.', metadata={'page': 9, 'source': 'docs\\RAG.pdf'})]

## Additional tricks: compression

Another approach for improving the quality of retrieved docs is compression.

Information most relevant to a query may be buried in a document with a lot of irrelevant text. 

Passing that full document through your application can lead to more expensive LLM calls and poorer responses.

Contextual compression is meant to fix this. 

In [15]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [16]:
def pretty_print_docs(docs):
    """Print the text of each document under a numbered header.

    Each entry is rendered as "Document N:" (N counted from 1), a blank line,
    and the document's ``page_content``. Consecutive entries are separated by
    a line of 100 dashes.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document {position}:\n\n" + doc.page_content)
    print(divider.join(sections))


In [17]:
# Wrap our vectorstore
llm = OpenAI(temperature=0)
compressor = LLMChainExtractor.from_llm(llm)

In [18]:
# Wrap the Chroma store's default retriever with the extractor-based compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=chroma_db.as_retriever()
)

In [19]:
# Query the compression retriever and print what it returns.
question = "what did they say about RAG?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

"similar concerns as for GPT-2 [ 50] are valid here, although arguably to a lesser extent, including that it might be used to generate abuse, faked or misleading content in the news or on social media; to impersonate others; or to automate the production of spam/phishing content [ 54]."
----------------------------------------------------------------------------------------------------
Document 2:

"RAG could be employed in a wide variety of scenarios with direct beneﬁt to society, for example by endowing it with a medical index and asking it open-domain questions on that topic, or by helping people be more effective at their jobs."
----------------------------------------------------------------------------------------------------
Document 3:

"RAG achieves an accuracy within 2.7% of this model, despite being supplied with only the claim and retrieving its own evidence."


## Combining various techniques

In [20]:
# Same compressor as before, but the base retriever is now created with
# search_type "mmr". This rebinds `compression_retriever`.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=chroma_db.as_retriever(search_type = "mmr")
)

In [21]:
# Ask the same question through the MMR-backed compression retriever and print the results.
question = "what did they say about RAG?"
compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)



Document 1:

"similar concerns as for GPT-2 [ 50] are valid here, although arguably to a lesser extent, including that it might be used to generate abuse, faked or misleading content in the news or on social media; to impersonate others; or to automate the production of spam/phishing content [ 54]."
----------------------------------------------------------------------------------------------------
Document 2:

"An interactive demo of RAG models can be found at https://huggingface.co/rag/"
----------------------------------------------------------------------------------------------------
Document 3:

RAG-T The middle ear is the portion of the ear internal to the eardrum. RAG-S The middle ear includes the tympanic cavity and the three ossicles.
----------------------------------------------------------------------------------------------------
Document 4:

RAG-Token performs better than RAG-Sequence on Jeopardy question generation


## Use LlamaIndex

# Synthesis