# Imports

In [29]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.schema import MetadataMode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.core.vector_stores.types import (
    MetadataFilters, MetadataFilter, FilterOperator, FilterCondition
)
from qdrant_client import QdrantClient
import os
import re

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last
# one (cells below rely on this to show several results at once).
InteractiveShell.ast_node_interactivity = "all"

# Documents loading

In [3]:
# Load every file found under the input directory into `documents`.
path_input_data = '../../data/tmp'

reader = SimpleDirectoryReader(
    input_dir=path_input_data,
)
documents = reader.load_data(
    show_progress=True,  # renders the "Loading files" progress bar
)

Loading files: 100%|███| 1/1 [00:02<00:00,  2.01s/it]


In [4]:
# Sanity check: how many documents were loaded, and which fields a document
# exposes once serialised to a dict. Both values are displayed because
# ast_node_interactivity is set to "all" in the imports cell.
len(documents)
documents[0].to_dict().keys()

50

dict_keys(['id_', 'embedding', 'metadata', 'excluded_embed_metadata_keys', 'excluded_llm_metadata_keys', 'relationships', 'metadata_template', 'metadata_separator', 'text_resource', 'image_resource', 'audio_resource', 'video_resource', 'text_template', 'class_name', 'text'])

## Metadata selection

In [5]:
for d in documents:
    # Metadata is injected into the text that the embeddings model and the LLM
    # receive; that text is rendered from a template. Override the template so
    # metadata and file content end up in clearly delimited sections.
    d.text_template = "<metadata>\n{metadata_str}\n</metadata>\n\n<content>\n{content}\n</content>"

    # Apply the same visibility rules to the embeddings-side and the LLM-side
    # exclusion lists: hide 'page_label' and 'file_path', keep 'file_name'.
    for excluded_keys in (d.excluded_embed_metadata_keys, d.excluded_llm_metadata_keys):
        for hidden_key in ('page_label', 'file_path'):
            if hidden_key not in excluded_keys:
                excluded_keys.append(hidden_key)
        if 'file_name' in excluded_keys:
            excluded_keys.remove('file_name')

## Metadata extraction

In [6]:
# File names are expected to look like "<year> <quarter> <company>.<ext>",
# e.g. "2022 Q3 AMZN.pdf" -> year="2022", quarter="Q3", company="AMZN".
filename_re = re.compile(
    r"^\s*(?P<year>\d{4})\s+(?P<quarter>Q[1-4])\s+(?P<company>.+?)\s*$",
    re.IGNORECASE,
)

for d in documents:
    file_name = d.metadata.get('file_name') or ''

    # Drop the extension with splitext. str.strip('.pdf') is NOT a suffix
    # removal: it strips any of the characters '.', 'p', 'd', 'f' from both
    # ends, so "2022 Q3 Snap.pdf" would become "2022 Q3 Sna".
    file_stem = os.path.splitext(file_name)[0]

    m = filename_re.match(file_stem)
    if m is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            f"Cannot extract year/quarter/company from file name {file_name!r}; "
            "expected something like '2022 Q3 AMZN.pdf'."
        )

    # Values are stored as strings (regex groups), so metadata filters must
    # compare against strings as well.
    d.metadata['year'] = m.group('year')
    d.metadata['quarter'] = m.group('quarter')
    d.metadata['company'] = m.group('company')

    # The raw file name is now redundant with the extracted fields: hide it
    # from both the embeddings model and the LLM.
    if 'file_name' not in d.excluded_embed_metadata_keys:
        d.excluded_embed_metadata_keys.append('file_name')
    if 'file_name' not in d.excluded_llm_metadata_keys:
        d.excluded_llm_metadata_keys.append('file_name')

## Visualise Metadata

In [7]:
# Render the first document the way the embeddings model receives it after
# metadata extraction: template applied, embed-excluded metadata keys left out.
first_doc_embed_view = documents[0].get_content(metadata_mode=MetadataMode.EMBED)
print(first_doc_embed_view)

<metadata>
year: 2022
quarter: Q3
company: AMZN
</metadata>

<content>
Table of Contents
UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
 ____________________________________
FORM 10-Q
____________________________________ 
(Mark One)
☒ QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the quarterly period ended September 30, 2022
or
☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF1934
For the transition period from            to             .
Commission File No. 000-22513
____________________________________
AMAZON.COM, INC.
(Exact name of registrant as specified in its charter)
 ____________________________________
Delaware  91-1646860
(State or other jurisdiction ofincorporation or organization)  (I.R.S. EmployerIdentification No.)
410 Terry Avenue North, Seattle, Washington 98109-5210(206) 266-1000(Address and telephone number, including area code, of registrant’s principal executiv

# Vector index creation

In [8]:
# Embeddings model (HuggingFace checkpoint), reused later for querying.
model_name = 'BAAI/bge-small-en-v1.5'
embeddings_model = HuggingFaceEmbedding(model_name=model_name)

# Chunking parameters for the sentence splitter.
chunk_size = 100
chunk_overlap = 0
sentence_splitter = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)

# Transformation pipeline: split documents into chunks, then embed each chunk.
# (TitleExtractor() is currently disabled.)
pipeline = IngestionPipeline(
    transformations=[
        sentence_splitter,
        embeddings_model,
    ]
)

# Run the pipeline over the loaded documents to obtain the nodes to index.
nodes = pipeline.run(documents=documents)

Loading weights: 100%|█| 199/199 [00:00<00:00, 2445.7
[1mBertModel LOAD REPORT[0m from: BAAI/bge-small-en-v1.5
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


In [9]:
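# Not executed: presumably an alternative (hierarchical) chunking setup kept for reference.
# NOTE: HierarchicalNodeParser is not imported in the imports cell of this notebook.
# HierarchicalNodeParser.from_defaults(
#     chunk_sizes=[2048, 512, 128],
# )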

In [56]:
# Qdrant endpoint: taken from the QDRANT_URL environment variable, falling
# back to a local instance.
qdrant_url = os.environ.get("QDRANT_URL", "http://localhost:6333")
qdrant_client = QdrantClient(url=qdrant_url)
collection_name = 'data'

vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    enable_hybrid=True,  # enable hybrid search
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

# build index
# Pass the embeddings model explicitly so the index is tied to the same model
# that produced the node embeddings, instead of falling back to the globally
# configured default embedding model.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embeddings_model,
)

  qdrant_client = QdrantClient(url=qdrant_url)


## Semantic search

In [57]:
# Dense (semantic) retrieval: the query is embedded with the same embeddings
# model that was used in the ingestion pipeline.
query = 'what is amazon ticker?'
top_k = 4

semantic_retriever_kwargs = dict(
    embed_model=embeddings_model,
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT,  # semantic
    similarity_top_k=top_k,
)
retriever = index.as_retriever(**semantic_retriever_kwargs)
results = retriever.retrieve(query)

# Print score, chunk text and metadata for every retrieved node.
separator = "-" * 80
for r in results:
    node = r.node
    print("#### score:", r.score)
    print("#### text:", node.get_content())
    print("#### meta:", node.metadata)
    print(separator)

#### score: 0.6859825
#### text: For
certain payment methods, including credit and debit cards, we pay interchange and other fees, which may increase over time and raise our operating costs and
lower profitability. We rely on third parties to provide certain Amazon-branded payment methods and payment processing services, including the processing of
credit cards, debit cards, electronic checks, and promotional financing.
#### meta: {'page_label': '41', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN'}
--------------------------------------------------------------------------------
#### score: 0.68125635
#### text: Table of Contents
AMAZON.COM, INC.
CONSOLIDATED BALANCE SHEETS
(in millions, except per share data) 
Dec

## Keyword search

In [58]:
# Sparse (keyword) retrieval over the same index.
query = 'what is amazon ticker?'
top_k = 4

keyword_retriever_kwargs = dict(
    embed_model=embeddings_model,
    vector_store_query_mode=VectorStoreQueryMode.SPARSE,  # keyword
    similarity_top_k=top_k,
)
retriever = index.as_retriever(**keyword_retriever_kwargs)
results = retriever.retrieve(query)

# Print score, chunk text and metadata for every retrieved node.
separator = "-" * 80
for r in results:
    node = r.node
    print("#### score:", r.score)
    print("#### text:", node.get_content())
    print("#### meta:", node.metadata)
    print(separator)

#### score: 11.523385
#### text: 31.2 Certification of Brian T. Olsavsky, Senior Vice President and Chief Financial Officer of Amazon.com, Inc., pursuant to Rule 13a-14(a)
under the Securities Exchange Act of 1934.
32.1 Certification of Andrew R. Jassy, President and Chief Executive Officer of Amazon.com, Inc., pursuant to 18 U.S.C. Section 1350.
#### meta: {'page_label': '45', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN'}
--------------------------------------------------------------------------------
#### score: 11.315675
#### text: The People’s Republic of China (“PRC”) and India regulate Amazon’s and its affiliates’ businesses and operations in country through regulations and
license requirements that may re

## Metadata filter

In [59]:
# Only consider nodes whose metadata has company == "AMZN" AND year == '2022'.
# The year is compared as a string because the extraction cell stores the
# regex group (a str) in the metadata.
company_filter = MetadataFilter(key="company", value="AMZN", operator=FilterOperator.EQ)
year_filter = MetadataFilter(key="year", value='2022', operator=FilterOperator.EQ)
filters = MetadataFilters(
    filters=[company_filter, year_filter],
    condition=FilterCondition.AND,
)

# Semantic retrieval restricted by the metadata filters; `query` is reused
# from the previous search cell.
filtered_retriever_kwargs = dict(
    embed_model=embeddings_model,
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT,  # semantic
    similarity_top_k=5,
    filters=filters,  # metadata
)
retriever = index.as_retriever(**filtered_retriever_kwargs)
results = retriever.retrieve(query)

# Print score, chunk text and metadata for every retrieved node.
separator = "-" * 80
for r in results:
    node = r.node
    print("#### score:", r.score)
    print("#### text:", node.get_content())
    print("#### meta:", node.metadata)
    print(separator)

#### score: 0.6859825
#### text: For
certain payment methods, including credit and debit cards, we pay interchange and other fees, which may increase over time and raise our operating costs and
lower profitability. We rely on third parties to provide certain Amazon-branded payment methods and payment processing services, including the processing of
credit cards, debit cards, electronic checks, and promotional financing.
#### meta: {'page_label': '41', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN'}
--------------------------------------------------------------------------------
#### score: 0.68125635
#### text: Table of Contents
AMAZON.COM, INC.
CONSOLIDATED BALANCE SHEETS
(in millions, except per share data) 
Dec

## Semantic + keyword + metadata search

In [66]:
# Hybrid retrieval: semantic + keyword results are fused, with the metadata
# filters applied as well. `query` and `filters` are reused from the previous
# cells.
top_k_final = 3
top_k_each = 5
alpha = .5

hybrid_retriever_kwargs = dict(
    embed_model=embeddings_model,
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,  # semantic + keyword
    similarity_top_k=top_k_final,  # controls the final number of returned nodes (after fusion).
    sparse_top_k=top_k_each,  # how many nodes will be retrieved from each dense and sparse query.
    alpha=alpha,  # by default applies relative_score_fusion
    filters=filters,  # metadata
)
retriever = index.as_retriever(**hybrid_retriever_kwargs)
results = retriever.retrieve(query)

# Print fused score, chunk text and metadata for every retrieved node.
separator = "-" * 80
for r in results:
    node = r.node
    print("#### score:", r.score)
    print("#### text:", node.get_content())
    print("#### meta:", node.metadata)
    print(separator)

#### score: 0.5
#### text: For
certain payment methods, including credit and debit cards, we pay interchange and other fees, which may increase over time and raise our operating costs and
lower profitability. We rely on third parties to provide certain Amazon-branded payment methods and payment processing services, including the processing of
credit cards, debit cards, electronic checks, and promotional financing.
#### meta: {'page_label': '41', 'file_name': '2022 Q3 AMZN.pdf', 'file_path': '/Users/lautaro.quiroz/Documents/lqrz/personal/llms/notebooks/llamaindex/../../data/tmp/2022 Q3 AMZN.pdf', 'file_type': 'application/pdf', 'file_size': 501892, 'creation_date': '2026-02-19', 'last_modified_date': '2026-02-19', 'year': '2022', 'quarter': 'Q3', 'company': 'AMZN'}
--------------------------------------------------------------------------------
#### score: 0.5
#### text: 31.2 Certification of Brian T. Olsavsky, Senior Vice President and Chief Financial Officer of Amazon.com, Inc., pursu