# Imports

In [None]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.schema import MetadataMode
import os
import re

# Display the value of every top-level expression in a cell, not only the last one
# (several cells below rely on this to show more than one result).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Documents loading

In [None]:
# Read the files under the input directory into `documents`.
# NOTE(review): the path is relative, so it resolves against the notebook's working directory.
path_input_data = '../../data/tmp'
reader = SimpleDirectoryReader(input_dir=path_input_data)
documents = reader.load_data(show_progress=True)

In [None]:
# Sanity check: how many documents were loaded, and which fields a document exposes.
len(documents)
documents[0].to_dict().keys()

# Metadata handling

## Metadata filtering lists

Why aren’t the two lists always the same?

Because **“good for retrieval” metadata ≠ “good for generation” metadata**.

A good mental model:
* Embedding metadata: “Will this help the vector land near the right queries?”
* LLM metadata: “Will this help answer correctly without hallucinating/biasing/leaking?”

In [None]:
# Inspect the exclusion lists of the first document as loaded, before any changes.
documents[0].excluded_embed_metadata_keys  # metadata keys hidden from the embedding model.
documents[0].excluded_llm_metadata_keys  # metadata keys hidden from the LLM.

## Embeddings metadata

The embedding model receives a single string built from `.text_template`, which combines the document content with every metadata field not listed in `.excluded_embed_metadata_keys`, and it creates one embedding for that entire string.

In [None]:
# Template used to combine the metadata string and the content into one text.
documents[0].text_template

## Check which fields reach the embeddings model & LLM

This is useful to check what is reaching each model, and tweak the metadata selectors.

This is done for each document.

In [None]:
# embeddings model input: the first document's text rendered in EMBED metadata mode
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

In [None]:
# llm model input: the first document's text rendered in LLM metadata mode
print(documents[0].get_content(metadata_mode=MetadataMode.LLM))

## Metadata selection

In [None]:
# Metadata is rendered into the text handed to the embeddings model and the LLM
# through a per-document template, so both the template and the key exclusion
# lists are adjusted here.
for d in documents:
    # wrap the metadata block and the file content in explicit tags.
    d.text_template = "<metadata>\n{metadata_str}\n</metadata>\n\n<content>\n{content}\n</content>"

    # Same policy for both views (embedding list first, then the LLM list):
    # hide 'page_label' and 'file_path', expose 'file_name'.
    for excluded_keys in (d.excluded_embed_metadata_keys, d.excluded_llm_metadata_keys):
        for hidden_key in ('page_label', 'file_path'):
            if hidden_key not in excluded_keys:
                excluded_keys.append(hidden_key)
        if 'file_name' in excluded_keys:
            excluded_keys.remove('file_name')

In [None]:
# this is the parsed doc after metadata filtering (for the case of the embeddings model);
# compare with the EMBED output printed before the exclusion lists were changed.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

## Metadata extraction

In [None]:
# File names are expected to look like "<year> <quarter> <company>.<ext>",
# e.g. "2023 Q1 Acme Corp.pdf". The three parts are promoted to their own
# metadata fields, after which the raw file name is hidden from both models.
filename_re = re.compile(
    r"^\s*(?P<year>\d{4})\s+(?P<quarter>Q[1-4])\s+(?P<company>.+?)\s*$",
    re.IGNORECASE,
)

for d in documents:
    file_name = d.metadata.get('file_name') or ''

    # Drop the extension with splitext rather than str.strip('.pdf'): strip
    # removes any of the characters '.', 'p', 'd', 'f' from BOTH ends, so a
    # name such as "2022 Q3 Gap.pdf" would lose part of the company ("Ga").
    file_stem = os.path.splitext(file_name)[0]

    m = filename_re.match(file_stem)
    if m is None:
        # Fail with the offending name instead of an AttributeError on None.
        raise ValueError(
            f"file name {file_name!r} does not match '<year> <Q1-Q4> <company>'"
        )

    d.metadata['year'] = m.group('year')
    d.metadata['quarter'] = m.group('quarter')
    d.metadata['company'] = m.group('company')

    # The file name is now redundant with the extracted fields, so hide it.
    if 'file_name' not in d.excluded_embed_metadata_keys:
        d.excluded_embed_metadata_keys.append('file_name')
    if 'file_name' not in d.excluded_llm_metadata_keys:
        d.excluded_llm_metadata_keys.append('file_name')

In [None]:
# this is the parsed doc after metadata extraction (for the case of the embeddings model);
# the year / quarter / company fields added above should now appear in the metadata block.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

# Embeddings model

### OpenAI

In [None]:
# Embed a sample sentence with an OpenAI embedding model.
model_name = 'text-embedding-ada-002'

embedding_model = OpenAIEmbedding(
    model_name=model_name,
    api_key=os.getenv('OPENAI_API_KEY'),  # None when the environment variable is unset
)

texts = [
    "Hi, my name is Charles"
]

texts_embeddings = embedding_model.get_text_embedding_batch(texts)

len(texts_embeddings)  # number of embeddings returned
len(texts_embeddings[0])  # length of the first embedding vector

### HuggingFace

In [None]:
# Embed the same sample sentence with a HuggingFace model.
# Note: this rebinds `model_name` and `embedding_model` from the OpenAI cell.
model_name = 'BAAI/bge-small-en-v1.5'

embedding_model = HuggingFaceEmbedding(
    model_name=model_name,
)

texts = [
    "Hi, my name is Charles"
]

texts_embeddings = embedding_model.get_text_embedding_batch(texts)

embedding_model  # show the model's configuration
len(texts_embeddings)  # number of embeddings returned
len(texts_embeddings[0])  # length of the first embedding vector

# Transformations

These transformations include chunking, extracting metadata, and embedding each chunk.

In [None]:
# Run the transformations by hand, outside a pipeline, so each step's output can be inspected.
chunk_size = 100
chunk_overlap = 0

transformation_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
transformation_extractor = TitleExtractor()  # use an llm to extract a summary title for the doc & append it as metadata.

# The splitter runs on the documents; its output is then fed to the extractor.
documents_transformed_splitter = transformation_splitter(documents)
documents_transformed_extractor = transformation_extractor(documents_transformed_splitter)

In [None]:
# First element produced by the splitter step.
documents_transformed_splitter[0]

In [None]:
# First element after the title extractor ran on the splitter output.
documents_transformed_extractor[0]

# Vector index creation

In [None]:
# instantiate HuggingFace embedding model
# (bound to `embeddings_model`, a different name from the `embedding_model` used in the demo cells)
model_name = 'BAAI/bge-small-en-v1.5'
embeddings_model = HuggingFaceEmbedding(
    model_name=model_name,
)

# instantiate transformation pipeline
chunk_size = 100
chunk_overlap = 0

# Transformations are listed in the order split -> embed; the title extractor is left disabled.
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap),
        # TitleExtractor(),
        embeddings_model,
    ]
)
# transform documents
nodes = pipeline.run(documents=documents)