# RAG using Llamaindex


In [76]:
### Setup environment 

!python3 -m venv rag-pipeline -- quiet
!source rag/bin/activate --quiet

##### Install dependencies

!pip install -r requirements.txt --quiet

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpip subprocess to install build dependencies[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[1624 lines of output][0m
  [31m   [0m Ignoring numpy: markers 'python_version == "3.7" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.8" and (platform_machine != "arm64" or platform_system != "Darwin") and platform_machine != "aarch64"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.7" and platform_machine == "aarch64"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "aarch64"' don't match your environment
  [31m   [0m Ignoring numpy: markers 'python_version == "3.8" and platform_machine == "arm64" and platform_system == "D

### Loader

In [88]:
from pathlib import Path

from llama_index.readers.file import PyMuPDFReader

loader = PyMuPDFReader()
docs0 = loader.load_data(file_path=Path("data/State of AI Report 2023.pdf"), metadata=True)

In [89]:
print(f" docs is a {type(docs0)}, of length {len(docs0)}, where each element is a {type(docs0[0])} object")

 docs is a <class 'list'>, of length 163, where each element is a <class 'llama_index.core.schema.Document'> object


In [90]:
print ([k for k, v in docs0[94]])

print (docs0[94].get_content())

[v for k,v in docs0[94] if k=='metadata']

docs0[94]

['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'text', 'start_char_idx', 'end_char_idx', 'text_template', 'metadata_template', 'metadata_seperator']
    In Oct 2022, Shutterstock - a leading stock multimedia provider - announced it will work with OpenAI to bring 
DALL·E-powered content onto the platform. Then in July 2023, the two companies signed a 6-year content 
licensing agreement that would give OpenAI access to Shutterstock's image, video and music libraries and 
associated metadata for model training. Furthermore, Shutterstock will offer its customers indemniﬁcation for AI 
image creation. The company also entered into a content license with Meta for GenAI. This pro-GenAI stance is in 
stark contrast to Shutterstock’s competitor, Getty Images, which is profoundly against GenAI as evidenced by its 
ongoing lawsuit against Stability AI for copyright infringement ﬁled in Feb 2023. 
stateof.ai 2023
#stateofai | 95
 Int

Document(id_='53bd5f4e-ceb7-4836-946b-f89c316f5175', embedding=None, metadata={'total_pages': 163, 'file_path': 'data/State of AI Report 2023.pdf', 'source': '95'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text="    In Oct 2022, Shutterstock - a leading stock multimedia provider - announced it will work with OpenAI to bring \nDALL·E-powered content onto the platform. Then in July 2023, the two companies signed a 6-year content \nlicensing agreement that would give OpenAI access to Shutterstock's image, video and music libraries and \nassociated metadata for model training. Furthermore, Shutterstock will offer its customers indemniﬁcation for AI \nimage creation. The company also entered into a content license with Meta for GenAI. This pro-GenAI stance is in \nstark contrast to Shutterstock’s competitor, Getty Images, which is profoundly against GenAI as evidenced by its \nongoing lawsuit against Stability AI for copyright infringement ﬁled in Fe

## Clean text and add metadata

In [92]:
import re

def clean_slide_text(text:str) -> str: 
    """
    Cleans the provided slide by removing specific patterns and extra whitespace. 
    
    Parameters:

    Returns: 
    """
    # Remove the footer text
    text = text.replace("stateof.ai 2023", "")

    # Remove the header text
    text = text.replace("Introduction  | Research  | Industry  | Politics  | Safety  | Predictions", "")

    # Remove the pattern "#stateofai | n"
    text = re.sub(r"#stateofai(\s*\|\s*\d+)?", "", text)

    # Replace multiple consecutive spaces with a single space
    text = re.sub(r" +", " ", text)

    # Remove any leading or trailing whitespace
    text = text.strip()

    return text

In [94]:
def assign_section(document):
    """
    Assigns a section to the document based on its page number.

    The function updates the 'metadata' attribute of the document with a key 'section'
    that has a value corresponding to the section the page number falls into.

    Sections:
    - Page 1 through 10: Introduction
    - Page 11 through 68: Research
    - Page 69 through 120: Politics
    - Page 121 through 137: Safety
    - Pages 138 and beyond: Predictions

    Args:
    - document (Document): The Document object to be updated.

    Returns:
    None. The function updates the Document object in-place.
    """

    page_number = int(document.metadata['source'])

    if 1 <= page_number <= 10:
        document.metadata['section'] = 'Introduction'
    elif 11 <= page_number <= 68:
        document.metadata['section'] = 'Research'
    elif 69 <= page_number <= 120:
        document.metadata['section'] = 'Politics'
    elif 121 <= page_number <= 137:
        document.metadata['section'] = 'Safety'
    else:
        document.metadata['section'] = 'Predictions'

In [104]:
# Iterate through each Document object in docs0
for doc in docs0:
    # Update the metadata using assign_section
    assign_section(doc)

    # Metadata keys that are excluded from text for the embed model.
    doc.excluded_embed_metadata_keys=['file_name']

    # Apply clean_slide_text to the text attribute1
    doc.text = clean_slide_text(doc.text)
    print (doc.text)

State of AI Report
October 12, 2023
Nathan Benaich
Air Street Capital

stateof.ai
About the authors
 Introduction | Research | Industry | Politics | Safety | Predictions


Nathan is the General Partner of Air Street Capital, a 
venture capital ﬁrm investing in AI-ﬁrst technology 
and life science companies. He founded RAAIS and 
London.AI (AI community for industry and research), 
the RAAIS Foundation (funding open-source AI 
projects), and Spinout.fyi (improving university spinout 
creation). He studied biology at Williams College and 
earned a PhD from Cambridge in cancer research. 
Nathan Benaich
State of AI Report 2023 team

 Introduction | Research | Industry | Politics | Safety | Predictions

Othmane Sebbouh
Venture Fellow
Othmane is a Venture Fellow at Air 
Street Capital and ML PhD student at 
ENS Paris, CREST-ENSAE and CNRS. 
He holds an MsC in management 
from ESSEC Business School and a 
Master in Applied Mathematics from 
ENSAE and Ecole Polytechnique.
Alex Chalmers
Platfor

In [96]:
docs0[94].metadata

{'total_pages': 163,
 'file_path': 'data/State of AI Report 2023.pdf',
 'source': '95',
 'section': 'Politics'}

Two options here: 
1. Directly send the entire Document object to the index
    - Maintains entire document as a single unit 
    - Useful when documents are relatively short and contexts between different parts of the document is important 
2. Covert the Document into Node objects before sending them to the index
    - Practical when the documents are long and require breaking down into chunks (or nodes) before indexing
    - Useful to retrieve specific parts of a document than the entire document

## Convert Document object to Node: Node and NodeParser

- A Node represents a chunk of a source document 
- Node contain metadata and relationship information with other nodes
- Nodes are first-class citizens in LlamaIndex, this means Nodes and their attributes can be defined directly
- Every node derived from a Document will inherit the same metadata from that Document
- Alternatively, we can parse source Documents into Nodes using the NodeParser classes. 


**Chunk Size:** 

Choosing the optimal chunk_size provides optimal results 
- Smaller chunk_size provides granular chunks, but we risk that the essential information might not be be among the top retrived chunks
- Larger chunk size might contain all necessary infromation within the top chunks 
- Increase in chunk size directs more information into the LLM. This ensures a comprehensive context but might slow down the system. 


In [97]:
import re

# Define the pattern for bullet points and newlines
split_pattern = r"\n●|\n-|\n"

# Initialize lists to store the word counts of all chunks and entire texts across all documents
chunk_word_counts = []
entire_text_word_counts = []

# Initialize a dictionary to store word counts and slide counts by section
section_data = {}

# Iterate through each Document object in your list of documents
for doc in docs0:
    # Split the document's text into chunks based on the pattern
    chunks = re.split(split_pattern, doc.text)

    # Calculate the number of words in each chunk and store it
    chunk_word_counts.extend([len(chunk.split()) for chunk in chunks])

    # Calculate the number of words in the entire text and store it
    entire_word_count = len(doc.text.split())
    entire_text_word_counts.append(entire_word_count)

    # Update the word count and slide count for the section in the dictionary
    section = doc.metadata['section']
    if section in section_data:
        section_data[section]['word_count'] += entire_word_count
        section_data[section]['slide_count'] += 1
    else:
        section_data[section] = {'word_count': entire_word_count, 'slide_count': 1}

# Calculate the total word count across all sections
total_word_count = sum(data['word_count'] for data in section_data.values())

# Calculate the number of sections
num_sections = len(section_data)

# Calculate the average word count across all sections
average_word_count_across_sections = total_word_count / num_sections

# Calculate summary statistics for chunks
average_chunk_word_count = sum(chunk_word_counts) / len(chunk_word_counts)
max_chunk_word_count = max(chunk_word_counts)

# Calculate average word count for entire texts
average_entire_text_word_count = sum(entire_text_word_counts) / len(entire_text_word_counts)

print(f"Average word count for a slide: {average_entire_text_word_count}")
print(f"Average word count per bullet point: {average_chunk_word_count}")
print(f"Longest bullet point: {max_chunk_word_count}")
print(f"Average word count in a section: {average_word_count_across_sections:.2f}")

Average word count for a slide: 138.98773006134968
Average word count per bullet point: 9.577844311377245
Longest bullet point: 28
Average word count in a section: 4531.00


### Chunking Strategy

- *NodeParsers* are a simple abstraction that take a list of documents and chunk them into Node objects. 
Each *Node* is a specific chunk of the parent document.
- Strategy: Utilize smaller child chunks that refer to bigger parent chunks.
    - Use *SimpleNodeParser* with a *SentenceSplitter* to create "base nodes" aka parent chunks
    - Use *SentenceWindowNodeParser* to create child nodes that represent bullet points in the slide deck along with metadata

In [101]:
from llama_index.core.text_splitter import SentenceSplitter
from llama_index.core.node_parser import SimpleNodeParser
from pathlib import Path

# bullet_splitter = SentenceSplitter(paragraph_separator=r"\n●|\n-|\n", chunk_size=250)

# SentenceSplitter.from_defaults(separator: str = ' ', 
#             chunk_size: int = DEFAULT_CHUNK_SIZE, 
#             chunk_overlap: int = SENTENCE_CHUNK_OVERLAP, 
#             tokenizer: Optional[Callable] = None, 
#             paragraph_separator: str = DEFAULT_PARAGRAPH_SEP, 
#             chunking_tokenizer_fn: Optional[Callable[[str], 
#             List[str]]] = None, 
#             secondary_chunking_regex: str = CHUNKING_REGEX, 
#             callback_manager: Optional[CallbackManager] = None, 
#             include_metadata: bool = True, 
#             include_prev_next_rel: bool = True) -> SentenceSplitter


parser = SentenceSplitter.from_defaults(
                chunk_size=250,
                paragraph_separator=r"\n●|\n-|\n",
                include_metadata=True,
                include_prev_next_rel=True)

slides_nodes = parser.get_nodes_from_documents(docs0)

In [99]:
for i in slides_nodes[41]:
    print(i)

('id_', '34b9463c-a2c8-4659-9578-67bde3187a60')
('embedding', None)
('metadata', {'total_pages': 163, 'file_path': 'data/State of AI Report 2023.pdf', 'source': '10', 'section': 'Introduction'})
('excluded_embed_metadata_keys', ['file_name'])
('excluded_llm_metadata_keys', [])
('relationships', {<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='c666ee42-9bd9-4bdd-9df4-27630a8cf0a5', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'total_pages': 163, 'file_path': 'data/State of AI Report 2023.pdf', 'source': '10', 'section': 'Introduction'}, hash='b71ab0f7fadc480700519213697b1d5c4712a389f67184bd612b94dc9b856027'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='55da3ecc-7f5f-4034-a668-bc0855acf1c0', node_type=<ObjectType.TEXT: '1'>, metadata={'total_pages': 163, 'file_path': 'data/State of AI Report 2023.pdf', 'source': '10', 'section': 'Introduction'}, hash='88275d40a427417c42940298e250350022baff470a9532726b62ff303c48e276'), <NodeRelationship.NEXT: '3'>: RelatedNodeIn

In [102]:
clean_slide_text(slides_nodes[1].text)
# len(slides_nodes[42].text)

'About the authors\n Introduction | Research | Industry | Politics | Safety | Predictions\n\n\nNathan is the General Partner of Air Street Capital, a \nventure capital ﬁrm investing in AI-ﬁrst technology \nand life science companies. He founded RAAIS and \nLondon.AI (AI community for industry and research), \nthe RAAIS Foundation (funding open-source AI \nprojects), and Spinout.fyi (improving university spinout \ncreation). He studied biology at Williams College and \nearned a PhD from Cambridge in cancer research. \nNathan Benaich'