# Imports

In [None]:
import logging
import os
import re

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, StorageContext
from llama_index.core.ingestion import IngestionPipeline, IngestionCache
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.extractors import TitleExtractor
from llama_index.core.schema import MetadataMode
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.vector_stores.types import VectorStoreQueryMode
from llama_index.core.vector_stores.types import (
    MetadataFilters, MetadataFilter, FilterOperator, FilterCondition
)
from qdrant_client import QdrantClient
from qdrant_client.http.exceptions import UnexpectedResponse
from qdrant_client.http.models import FieldCondition, Filter, MatchValue

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Documents loading

In [None]:
# Directory holding the source files to ingest (a relative path).
path_input_data = '../../data/tmp'
reader = SimpleDirectoryReader(input_dir=path_input_data)
# Read the directory into document objects, showing progress while loading.
documents = reader.load_data(show_progress=True)

In [None]:
# Sanity check: how many documents were loaded and which fields the first one exposes.
len(documents)
documents[0].to_dict().keys()

## Metadata selection

In [None]:
for d in documents:

    # Metadata is injected into the text handed to the embeddings model and the
    # LLM through this template, so redefine it to wrap the metadata and the
    # file content in explicit tags.
    d.text_template = "<metadata>\n{metadata_str}\n</metadata>\n\n<content>\n{content}\n</content>"

    # Same visibility rules for the embedding-side list first and the LLM-side
    # list second: hide 'page_label' and 'file_path', keep 'file_name' visible.
    for hidden_keys in (d.excluded_embed_metadata_keys, d.excluded_llm_metadata_keys):
        for key in ('page_label', 'file_path'):
            if key not in hidden_keys:
                hidden_keys.append(key)
        if 'file_name' in hidden_keys:
            hidden_keys.remove('file_name')

## Metadata extraction

In [None]:
# Expected file name layout (extension removed): "<year> <quarter> <company>",
# e.g. "2022 Q3 AMZN". Matching is case-insensitive.
filename_re = re.compile(
    r"^\s*(?P<year>\d{4})\s+(?P<quarter>Q[1-4])\s+(?P<company>.+?)\s*$",
    re.IGNORECASE,
)
# Matches only a trailing ".pdf" extension.
pdf_suffix_re = re.compile(r"\.pdf$", re.IGNORECASE)

for d in documents:
    file_name = d.metadata.get('file_name') or ''

    # str.strip('.pdf') removes any of the characters '.', 'p', 'd', 'f' from
    # both ends rather than the suffix ("2022 Q4 Gap.pdf" -> "2022 Q4 Ga"),
    # so the extension is removed explicitly instead.
    m = filename_re.match(pdf_suffix_re.sub('', file_name))
    if m is None:
        # Leave the document untouched: no year/quarter/company is added and
        # 'file_name' stays visible as the only identifying metadata.
        logging.warning(
            "File name %r does not match '<year> <quarter> <company>'; "
            "skipping metadata extraction.",
            file_name,
        )
        continue

    d.metadata['year'] = m.group('year')
    d.metadata['quarter'] = m.group('quarter')
    d.metadata['company'] = m.group('company')

    # The structured fields replace the raw file name in the text sent to the
    # embeddings model and the LLM.
    if 'file_name' not in d.excluded_embed_metadata_keys:
        d.excluded_embed_metadata_keys.append('file_name')
    if 'file_name' not in d.excluded_llm_metadata_keys:
        d.excluded_llm_metadata_keys.append('file_name')

## Visualise Metadata

In [None]:
# Print the first document rendered with MetadataMode.EMBED, i.e. the parsed
# doc after metadata extraction as prepared for the embeddings model.
print(documents[0].get_content(metadata_mode=MetadataMode.EMBED))

# Vector index creation

In [None]:
def is_document_in_db(client: QdrantClient, collection_name: str, file_name: str, key: str = 'file_name') -> bool:
    """Check if document is already in the vector db.

    Args:
        client: Qdrant client used to query the collection.
        collection_name: Name of the collection to search.
        file_name: Value the payload field must equal.
        key: Payload field compared against ``file_name``.

    Returns:
        True if at least one stored point has payload ``key`` equal to
        ``file_name``, otherwise False.
    """
    scroll_filter = Filter(
        must=[FieldCondition(key=key, match=MatchValue(value=file_name))]
    )

    # scroll() returns (points, next_page_offset). The offset is None whenever
    # there is no further page, so it says nothing about whether anything
    # matched; the returned points are what must be inspected.
    points, _ = client.scroll(
        collection_name=collection_name,
        scroll_filter=scroll_filter,
        limit=1,  # a single hit is enough to answer the question
        with_payload=False,
        with_vectors=False,
    )

    return len(points) > 0


def collection_exists(client: QdrantClient, name: str) -> bool:
    """Check if collection exists in the vector db.

    Args:
        client: Qdrant client used to look up the collection.
        name: Name of the collection.

    Returns:
        True if ``get_collection`` succeeds, False if the server answers 404.

    Raises:
        UnexpectedResponse: For any HTTP error other than 404, so that a
            failing server is not mistaken for a missing collection.
    """
    try:
        client.get_collection(name)
    except UnexpectedResponse as e:
        # Only "not found" means the collection is absent.
        if e.status_code == 404:
            return False
        raise
    return True


# get qdrant client
qdrant_url = "http://localhost:6333"
qdrant_client = QdrantClient(url=qdrant_url)
collection_name = 'data'

# Drop documents whose source file is already stored, so that re-running the
# notebook does not insert the same file twice.
if collection_exists(client=qdrant_client, name=collection_name):
    # Several documents can share one file name (presumably one document per
    # page, given the 'page_label' metadata), so query the db once per file
    # name and log once per file rather than once per document.
    already_stored = {}
    new_documents = []
    for d in documents:
        file_name = d.metadata['file_name']
        if file_name not in already_stored:
            already_stored[file_name] = is_document_in_db(
                client=qdrant_client, collection_name=collection_name, file_name=file_name
            )
            if already_stored[file_name]:
                # Skipping a known file is expected, hence a warning, not an error.
                logging.warning(f'File: {file_name} already in vector db. Skipping...')
            else:
                logging.info(f'Keeping file: {file_name}')
        if not already_stored[file_name]:
            new_documents.append(d)
    # Replace the contents in place so the existing list object is still the one updated.
    documents[:] = new_documents


vector_store = QdrantVectorStore(
    client=qdrant_client,
    collection_name=collection_name,
    enable_hybrid=True,  # enable hybrid search
)

storage_context = StorageContext.from_defaults(vector_store=vector_store)

In [None]:
# HuggingFace model used to embed the chunks.
model_name = 'BAAI/bge-small-en-v1.5'
embeddings_model = HuggingFaceEmbedding(model_name=model_name)

# Chunking parameters handed to the sentence splitter.
chunk_size = 100
chunk_overlap = 0
sentence_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Transformations are listed in order: split into chunks, then embed each chunk.
pipeline = IngestionPipeline(transformations=[sentence_splitter, embeddings_model])

# Run the documents through the pipeline to obtain the nodes.
nodes = pipeline.run(documents=documents)

In [None]:
# HierarchicalNodeParser.from_defaults(
#     chunk_sizes=[2048, 512, 128],
# )

In [None]:
# build index and insert nodes
# Pass the same embeddings model as the pipeline and the retrievers, so the
# index does not fall back to the globally configured default embedding model.
index = VectorStoreIndex(
    nodes,
    storage_context=storage_context,
    embed_model=embeddings_model,
)

## Semantic search

In [None]:
query = 'what is amazon ticker?'

# Retriever in the default (semantic) query mode, using embeddings_model.
top_k = 4
retriever = index.as_retriever(
    similarity_top_k=top_k,
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT,  # semantic
    embed_model=embeddings_model,
)

results = retriever.retrieve(query)

# Dump every hit: score, node text and node metadata, then a separator line.
for r in results:
    print(f"#### score: {r.score}")
    print(f"#### text: {r.node.get_content()}")
    print(f"#### meta: {r.node.metadata}")
    print("-" * 80)

## Keyword search

In [None]:
query = 'what is amazon ticker?'

# Retriever in the sparse (keyword) query mode, using embeddings_model.
top_k = 4
retriever = index.as_retriever(
    similarity_top_k=top_k,
    vector_store_query_mode=VectorStoreQueryMode.SPARSE,  # keyword
    embed_model=embeddings_model,
)

results = retriever.retrieve(query)

# Dump every hit: score, node text and node metadata, then a separator line.
for r in results:
    print(f"#### score: {r.score}")
    print(f"#### text: {r.node.get_content()}")
    print(f"#### meta: {r.node.metadata}")
    print("-" * 80)

## Metadata filter

In [None]:
# Both conditions must hold (AND): company equals "AMZN" and year equals '2022'.
company_filter = MetadataFilter(key="company", value="AMZN", operator=FilterOperator.EQ)
year_filter = MetadataFilter(key="year", value='2022', operator=FilterOperator.EQ)
filters = MetadataFilters(
    filters=[company_filter, year_filter],
    condition=FilterCondition.AND,
)

# Semantic retrieval restricted by the metadata filters.
retriever = index.as_retriever(
    filters=filters,  # metadata
    similarity_top_k=5,
    vector_store_query_mode=VectorStoreQueryMode.DEFAULT,  # semantic
    embed_model=embeddings_model,
)

results = retriever.retrieve(query)

# Dump every hit: score, node text and node metadata, then a separator line.
for r in results:
    print(f"#### score: {r.score}")
    print(f"#### text: {r.node.get_content()}")
    print(f"#### meta: {r.node.metadata}")
    print("-" * 80)

## Semantic + keyword + metadata search

In [None]:
top_k_final = 3
top_k_each = 5
alpha = .5

# Hybrid retrieval (semantic + keyword) restricted by the metadata filters.
retriever = index.as_retriever(
    filters=filters,  # metadata
    alpha=alpha,  # by default applies relative_score_fusion
    sparse_top_k=top_k_each,  # how many nodes will be retrieved from each dense and sparse query.
    similarity_top_k=top_k_final,  # controls the final number of returned nodes (after fusion).
    vector_store_query_mode=VectorStoreQueryMode.HYBRID,  # semantic + keyword
    embed_model=embeddings_model,
)

results = retriever.retrieve(query)

# Dump every hit: score, node text and node metadata, then a separator line.
for r in results:
    print(f"#### score: {r.score}")
    print(f"#### text: {r.node.get_content()}")
    print(f"#### meta: {r.node.metadata}")
    print("-" * 80)