# Test indexing functions

This notebook demonstrates setting up and editing an OpenSearch index using the Haystack framework. You must have Docker Desktop installed before using OpenSearch.

To install necessary packages, run `pip install -e '.[search_backend]'`.

Before running this notebook, start an OpenSearch container (see docker-compose.yml) by running:
```
docker compose up localstack
```
Alternatively, follow the instructions here: https://docs.haystack.deepset.ai/v2.0/docs/opensearchbm25retriever

You will also need the JSON file dummy-products-20241015.json. This is [kept on the wiki](https://dsdmoj.atlassian.net/wiki/spaces/AN/pages/5214503074/Dummy+data) for privacy purposes. Copy it into the same directory as this notebook.

In [None]:
import json

from haystack import Document

from search_backend.indexing_pipeline import IndexingPipeline
from prep_data import format_doc_dict

## Read data

In [None]:
# Load the dummy product records from the JSON file next to this notebook.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which can mis-decode a UTF-8 JSON file
# (for example on Windows).
with open('dummy-products-20241015.json', encoding="utf-8") as f:
    doc_list = json.load(f)

# Print the parsed content as a quick check that the file loaded.
print(doc_list)

In [None]:
# Wrap the entries at positions 0, 1 and 2 in their own single-item lists so
# each one can be formatted and indexed separately in the cells below.
doc0, doc1, doc2 = [doc_list[0]], [doc_list[1]], [doc_list[2]]

In [None]:
# Display the single-item list holding the entry at index 0 of doc_list.
doc0

In [None]:
# Run every entry of doc0 through format_doc_dict (second argument
# "description") and keep only the results that are not None.
dataset_part0 = []
for record in doc0:
    formatted = format_doc_dict(record, "description")
    if formatted is not None:
        dataset_part0.append(formatted)

In [None]:
# Print how many formatted entries were kept, then display them.
print(len(dataset_part0))
dataset_part0

## Connect to OpenSearch container

In [None]:
# Connect to the OpenSearch document store used by the rest of this notebook.
# NOTE(review): create_index=True presumably creates the "document" index when
# it is missing, so the index need not exist beforehand — confirm against the
# OpenSearchDocumentStore documentation.
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore

query_document_store = OpenSearchDocumentStore(
    # Port 4566 with an /opensearch/<region>/<domain> path looks like the
    # endpoint exposed by the `localstack` compose service — TODO confirm.
    # NOTE(review): 0.0.0.0 is normally a bind address; connecting to it may
    # fail on some platforms (e.g. Windows) — consider "localhost".
    hosts="http://0.0.0.0:4566/opensearch/eu-west-2/rd-demo",
    use_ssl=False,  # plain HTTP endpoint
    verify_certs=False,
    http_auth=("localstack", "localstack"),  # local development credentials only
    create_index=True,
    index="document",
)

## Initialise docstore and write first document

In [None]:
# Build one Haystack Document per formatted entry, using the entry's keys as
# the Document constructor's keyword arguments.
docs = [Document(**fields) for fields in dataset_part0]

# Create the indexing pipeline on top of the document store and run the
# first batch of documents through it.
indexer = IndexingPipeline(
    query_document_store,
    dense_embedding_model="snowflake/snowflake-arctic-embed-xs",
)
indexer.index_docs(docs)

### Check what's in the docstore

Count how many docs are currently in the docstore:

In [None]:
# Display the number of documents currently in the store.
query_document_store.count_documents()

Check the contents of the docstore:

In [None]:
# Display the stored documents; no filters are passed to the call.
query_document_store.filter_documents()

## Try adding another document

In [None]:
# Run every entry of doc1 through format_doc_dict (second argument
# "description") and keep only the results that are not None.
dataset_part1 = []
for record in doc1:
    formatted = format_doc_dict(record, "description")
    if formatted is not None:
        dataset_part1.append(formatted)

In [None]:
# Build Documents from the second batch of formatted entries and pass them
# to the indexer created earlier.
docs = [Document(**fields) for fields in dataset_part1]
indexer.index_docs(docs)

In [None]:
# Display the document count after indexing the second batch.
query_document_store.count_documents()

In [None]:
# Display the stored documents after indexing the second batch (no filters).
query_document_store.filter_documents()

In [None]:
# Run every entry of doc2 through format_doc_dict (second argument
# "description") and keep only the results that are not None.
dataset_part2 = []
for record in doc2:
    formatted = format_doc_dict(record, "description")
    if formatted is not None:
        dataset_part2.append(formatted)


In [None]:
# Build Documents from the third batch of formatted entries and pass them
# to the indexer created earlier.
docs = [Document(**fields) for fields in dataset_part2]
indexer.index_docs(docs)

In [None]:
# Display the document count after indexing the third batch.
query_document_store.count_documents()

In [None]:
# Display the stored documents after indexing the third batch (no filters).
query_document_store.filter_documents()

## Check behaviour when trying to add a duplicate doc

In [None]:
# Submit the same `docs` list again; when the cells are run in order this is
# the batch built from dataset_part2, so it is a repeat of the last indexing call.
indexer.index_docs(docs)

In [None]:
# Display the count again; comparing it with the count shown before the
# repeated indexing call reveals whether the duplicate was stored.
query_document_store.count_documents()

## Try removing a document

In [None]:
# Select documents whose meta.db_id is one of the listed values, then delete
# them from the store by their document ids.
filters = {
    "field": "meta.db_id",
    "operator": "in",
    "value": ["D28"],
}

results = query_document_store.filter_documents(filters=filters)
# A set removes any repeated ids before they are passed to the delete call.
doc_ids = {result.id for result in results}

# Guard the empty case: indexing results[0] raises IndexError when nothing
# matches, which happens for instance when this cell is re-run after the
# matching documents have already been deleted.
if results:
    print(results[0])
    print(doc_ids)

    # Print the count before and after the delete so the effect is visible.
    print(query_document_store.count_documents())
    query_document_store.delete_documents(list(doc_ids))
    print(query_document_store.count_documents())
else:
    print("No documents matched the filter; nothing to delete.")

In [None]:
# Delete via the indexing pipeline instead of calling the document store directly.
# NOTE(review): the preceding cell filtered on meta.db_id with the string "D28",
# while this call passes the integer 28 with id_metafield="id" — confirm that the
# two are intended to address different metadata fields/values.
indexer.delete_docs(document_ids=[28], id_metafield="id")

In [None]:
# Display the document count after the delete_docs call.
query_document_store.count_documents()

In [None]:
# Display whatever documents remain in the store (no filters).
query_document_store.filter_documents()