# Imports

In [1]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.schema import MetadataMode
import os
import re

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one;
# several cells below rely on this to show two or three results at once.
InteractiveShell.ast_node_interactivity = "all"

  from .autonotebook import tqdm as notebook_tqdm


# Documents loading

In [2]:
# Folder holding the input files (relative path).
path_input_data = '../../data/tmp'
# Read the files in that folder into `documents`, showing a progress bar while loading.
reader = SimpleDirectoryReader(input_dir=path_input_data)
documents = reader.load_data(show_progress=True)

Loading files: 100%|██████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.95s/it]


In [3]:
# Number of loaded documents. NOTE(review): the loader reported 1 file but this shows 50,
# presumably one document per PDF page (each carries a `page_label`) — confirm.
len(documents)
# Field names available on a document once it is serialised to a dict.
documents[0].to_dict().keys()

50

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'metadata_template', 'metadata_separator', 'text_resource', 'image_resource', 'audio_resource', 'video_resource', 'text_template', 'class_name', 'text'])

# Metadata handling

## Metadata filtering lists

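Each document carries two exclusion lists: `excluded_embed_metadata_keys` (fields hidden from the embedding model) and `excluded_llm_metadata_keys` (fields hidden from the LLM). Why aren’t the two lists always the same?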

Because **“good for retrieval” metadata ≠ “good for generation” metadata**.

A good mental model:
* Embedding metadata: “Will this help the vector land near the right queries?”
* LLM metadata: “Will this help answer correctly without hallucinating/biasing/leaking?”

In [4]:
# Metadata keys listed here are left out of the text built for the corresponding model.
documents[0].excluded_embed_metadata_keys  # these fields do not reach the embedding model.
documents[0].excluded_llm_metadata_keys  # these fields do not reach the LLM.

['file_name',
 'file_type',
 'file_size',
 'creation_date',
 'last_modified_date',
 'last_accessed_date']

['file_name',
 'file_type',
 'file_size',
 'creation_date',
 'last_modified_date',
 'last_accessed_date']

## Embeddings metadata

The embedding model receives a single string built from `.text_template`: the document content plus every metadata field except those listed in `.excluded_embed_metadata_keys`. It then creates one embedding for that entire string.

In [5]:
# Template that combines the metadata string and the content into the text sent to a model.
documents[0].text_template

'{metadata_str}\n\n{content}'

## Check which fields reach the embeddings model & LLM

Printing the content under each metadata mode shows exactly what text reaches each model, which makes it easy to tweak the exclusion lists.

The check is per document; the cells below inspect the first one.

In [6]:
# embeddings model input: content rendered with the metadata that is visible in EMBED mode.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

page_label: 1
file_path: /Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-Q
____________________________________ 
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the quarterly period ended September 30, 2022
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON.COM, INC.
(Exact name of registrant as specified in its charter)
 ____________________________________
Delaware  91-1646860
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
410 Terry Avenue North, Seattle, Washington 98109-5210(206) 266-1000(Address and telephone numb

In [7]:
# llm model input: content rendered with the metadata that is visible in LLM mode.
print(documents[0].get_content(metadata_mode=MetadataMode.LLM))

page_label: 1
file_path: /Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf

Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-Q
____________________________________ 
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the quarterly period ended September 30, 2022
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON.COM, INC.
(Exact name of registrant as specified in its charter)
 ____________________________________
Delaware  91-1646860
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
410 Terry Avenue North, Seattle, Washington 98109-5210(206) 266-1000(Address and telephone numb

## Metadata selection

In [8]:
for doc in documents:

    # The text handed to the embeddings model and to the llm is rendered from a
    # template that combines the file metadata with the file content.
    # Replace the default template with one that wraps each part in explicit tags.
    doc.text_template = "<metadata>\n{metadata_str}\n</metadata>\n\n<content>\n{content}\n</content>"

    # Apply the same selection to both exclusion lists (embeddings model first, then llm):
    # hide the page label and the file path, and expose the file name.
    for excluded_keys in (doc.excluded_embed_metadata_keys, doc.excluded_llm_metadata_keys):
        for hidden_key in ('page_label', 'file_path'):
            if hidden_key not in excluded_keys:
                excluded_keys.append(hidden_key)
        if 'file_name' in excluded_keys:
            excluded_keys.remove('file_name')

In [9]:
# this is the parsed doc after metadata filtering (for the case of the embeddings model):
# it uses the new tagged template and shows only the metadata keys that are not excluded.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

<metadata>
file_name: 2022 Q3 AMZN.pdf
</metadata>

<content>
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-Q
____________________________________ 
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the quarterly period ended September 30, 2022
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON.COM, INC.
(Exact name of registrant as specified in its charter)
 ____________________________________
Delaware  91-1646860
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
410 Terry Avenue North, Seattle, Washington 98109-5210(206) 266-1000(Address and telephone number, including area code, of registrant’s principal executive offices

## Metadata extraction

In [10]:
# File names are expected to look like "<year> <quarter> <company>.pdf",
# e.g. "2022 Q3 AMZN.pdf". Matching is case-insensitive.
filename_re = re.compile(
    r"^\s*(?P<year>\d{4})\s+(?P<quarter>Q[1-4])\s+(?P<company>.+?)\s*$",
    re.IGNORECASE,
)
pdf_suffix = '.pdf'

for d in documents:
    file_name = d.metadata.get('file_name') or ''

    # Remove the extension as a suffix. str.strip('.pdf') treats its argument as a set
    # of characters, so it also eats leading/trailing '.', 'p', 'd' and 'f' from the
    # name itself (e.g. '2022 Q3 amd.pdf' would yield the company 'am').
    file_stem = file_name
    if file_stem.lower().endswith(pdf_suffix):
        file_stem = file_stem[:-len(pdf_suffix)]

    m = filename_re.match(file_stem)
    if m is None:
        # Fail with an actionable message instead of an AttributeError on None.
        raise ValueError(
            f"Cannot extract year/quarter/company from file name {file_name!r}; "
            "expected something like '2022 Q3 AMZN.pdf'."
        )

    # Expose the parsed fields as metadata of their own.
    d.metadata['year'] = m.group('year')
    d.metadata['quarter'] = m.group('quarter')
    d.metadata['company'] = m.group('company')

    # The raw file name is now redundant, so hide it from both models.
    if 'file_name' not in d.excluded_embed_metadata_keys:
        d.excluded_embed_metadata_keys.append('file_name')
    if 'file_name' not in d.excluded_llm_metadata_keys:
        d.excluded_llm_metadata_keys.append('file_name')

In [11]:
# this is the parsed doc after metadata extraction (for the case of the embeddings model):
# the extracted year/quarter/company fields are shown and the raw file name is hidden.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

<metadata>
year: 2022
quarter: Q3
company: AMZN
</metadata>

<content>
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-Q
____________________________________ 
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the quarterly period ended September 30, 2022
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON.COM, INC.
(Exact name of registrant as specified in its charter)
 ____________________________________
Delaware  91-1646860
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
410 Terry Avenue North, Seattle, Washington 98109-5210(206) 266-1000(Address and telephone number, including area code, of registrant’s principal executiv

# Embeddings model

### OpenAI

In [29]:
# Embed a sample sentence with an OpenAI embedding model.
model_name = 'text-embedding-ada-002'

# The API key is read from the OPENAI_API_KEY environment variable
# (os.getenv returns None when it is not set).
embedding_model = OpenAIEmbedding(
    model_name=model_name,
    api_key=os.getenv('OPENAI_API_KEY'),
)

texts = [
    "Hi, my name is Charles"
]

# One embedding vector is returned per input text.
texts_embeddings = embedding_model.get_text_embedding_batch(texts)

# Number of embeddings, then the length (dimensionality) of the first vector.
len(texts_embeddings)
len(texts_embeddings[0])

1

1536

### HuggingFace

In [13]:
# Embed the same sample sentence with a HuggingFace embedding model.
# Note: this rebinds `model_name`, `embedding_model`, `texts` and `texts_embeddings`
# from the OpenAI cell.
model_name = 'BAAI/bge-small-en-v1.5'

embedding_model = HuggingFaceEmbedding(
    model_name=model_name,
)

texts = [
    "Hi, my name is Charles"
]

# One embedding vector is returned per input text.
texts_embeddings = embedding_model.get_text_embedding_batch(texts)

# Model configuration, number of embeddings, and length of the first vector.
embedding_model
len(texts_embeddings)
len(texts_embeddings[0])

Loading weights: 100%|█████| 199/199 [00:00<00:00, 2265.27it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


HuggingFaceEmbedding(model_name='BAAI/bge-small-en-v1.5', embed_batch_size=10, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x11bfec6b0>, num_workers=None, embeddings_cache=None, max_length=512, normalize=True, query_instruction=None, text_instruction=None, cache_folder=None, show_progress_bar=False)

1

384

# Transformations

These transformations include chunking, extracting metadata, and embedding each chunk.

In [14]:
# Chunking settings passed to the sentence splitter.
chunk_size = 100
chunk_overlap = 0

transformation_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
transformation_extractor = TitleExtractor()  # use an llm to extract a summary title for the doc & append it as metadata.

# Apply the transformations one after the other: split first, then extract titles
# from the resulting nodes.
# NOTE(review): the recorded outputs of the next two cells show the same node id, and a
# `document_title` already present on the splitter result, which suggests the extractor
# updates the nodes in place rather than returning copies — confirm.
documents_transformed_splitter = transformation_splitter(documents)
documents_transformed_extractor = transformation_extractor(documents_transformed_splitter)

100%|███████████████████████████████████████████████████████████████████| 50/50 [00:51<00:00,  1.04s/it]


In [15]:
# First node returned by the sentence splitter.
documents_transformed_splitter[0]

TextNode(id_='fc886033-2c62-47b0-bb5e-c93422ac7233', embedding=None, metadata={'page_label': '1', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN', 'document_title': 'Quarterly Report on Form 10-Q for the Period Ended September 30, [Year], Amazon.com, Inc. Transition Report for 2022'}, excluded_embed_metadata_keys=['file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'page_label', 'file_path', 'file_name'], excluded_llm_metadata_keys=['file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'page_label', 'file_path', 'file_name'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b503512d-ed4d-4759-a33b-ae871b4f7863', node_type=<Ob

In [16]:
# First node returned by the title extractor.
documents_transformed_extractor[0]

TextNode(id_='fc886033-2c62-47b0-bb5e-c93422ac7233', embedding=None, metadata={'page_label': '1', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN', 'document_title': 'Quarterly Report on Form 10-Q for the Period Ended September 30, [Year], Amazon.com, Inc. Transition Report for 2022'}, excluded_embed_metadata_keys=['file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'page_label', 'file_path', 'file_name'], excluded_llm_metadata_keys=['file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'page_label', 'file_path', 'file_name'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='b503512d-ed4d-4759-a33b-ae871b4f7863', node_type=<Ob

# Vector index creation

In [23]:
# Embedding backend: a HuggingFace model.
model_name = 'BAAI/bge-small-en-v1.5'
embeddings_model = HuggingFaceEmbedding(model_name=model_name)

# Chunking settings for the splitter.
chunk_size, chunk_overlap = 100, 0

# Pipeline steps, in order: split the documents into chunks, then embed each chunk.
# (Title extraction is deliberately left out of this pipeline.)
sentence_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
pipeline = IngestionPipeline(transformations=[sentence_splitter, embeddings_model])

# Run the documents through the pipeline to obtain the nodes.
nodes = pipeline.run(documents=documents)

Loading weights: 100%|█████| 199/199 [00:00<00:00, 2179.37it/s, Materializing param=pooler.dense.weight]
[1mBertModel LOAD REPORT[0m from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m
