# Test indexing functions

This notebook demonstrates setting up and editing an OpenSearch index using the Haystack framework. Docker Desktop must be installed before you can run the OpenSearch container.

To install necessary packages, run `pip install -e '.[search_backend]'`.

Before running this notebook, start the OpenSearch container (see docker-compose.yml) by running:
```
docker compose up localstack
```
Alternatively, follow the instructions here: https://docs.haystack.deepset.ai/v2.0/docs/opensearchbm25retriever

In [None]:
import json

from search_backend.api.lib.indexing_functions import run_indexing_pipeline, delete_docs
from prep_data import format_doc_dict

## Read data

In [None]:
# Load the dummy documents. JSON text is UTF-8 by specification, so the
# encoding is given explicitly instead of relying on the platform's locale
# default (which differs between operating systems).
with open('dummy_data.json', encoding='utf-8') as f:
    doc_list = json.load(f)

print(doc_list)

In [6]:
# Wrap each of the first three records in its own single-item list so that
# each one can be formatted and indexed separately.
doc0, doc1, doc2 = [doc_list[0]], [doc_list[1]], [doc_list[2]]

In [None]:
# Display the first record (still wrapped in its single-item list).
doc0

In [8]:
# Run every record in doc0 through format_doc_dict (passing "description")
# and keep only the results that are not None.
dataset_part0 = [
    formatted
    for formatted in (format_doc_dict(doc, "description") for doc in doc0)
    if formatted is not None
]

In [None]:
# Print how many formatted entries were kept, then display them.
print(len(dataset_part0))
dataset_part0

## Connect to OpenSearch container

In [2]:
# Connect to an existing OpenSearch document store (presumably the one served
# by the container started with `docker compose up localstack` — confirm
# against docker-compose.yml). The commented-out line below is an alternative
# way of obtaining the store.
# query_document_store = SERVICES["querydocumentstore"]
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore


query_document_store = OpenSearchDocumentStore(
    # Use localhost rather than 0.0.0.0: 0.0.0.0 is a "listen on all
    # interfaces" address, not a destination, and connecting to it fails on
    # Windows.
    hosts="http://localhost:4566/opensearch/eu-west-2/rd-demo",
    # Plain HTTP with no certificate checks and fixed credentials: suitable
    # for a local development container only.
    use_ssl=False,
    verify_certs=False,
    http_auth=("localstack", "localstack"),
    create_index=True,
    index="document",
)

## Initialise docstore and write first document

In [None]:
# Run the indexing pipeline on the first formatted document, targeting the
# connected document store.
run_indexing_pipeline(dataset_part0, query_document_store)

### Check what's in the docstore

Count how many docs are currently in the docstore:

In [None]:
# Number of documents currently held in the store.
query_document_store.count_documents()

Check the contents of the docstore:

In [None]:
# No filters are passed, so this displays what the store returns unfiltered.
query_document_store.filter_documents()

## Try adding another document

In [17]:
# Run every record in doc1 through format_doc_dict (passing "description")
# and keep only the results that are not None.
dataset_part1 = [
    formatted
    for formatted in (format_doc_dict(doc, "description") for doc in doc1)
    if formatted is not None
]

In [None]:
# Run the indexing pipeline on the second formatted document, targeting the
# same document store.
run_indexing_pipeline(dataset_part1, query_document_store)

In [None]:
# Document count after indexing the second document.
query_document_store.count_documents()

In [None]:
# Display the store contents (no filters) after indexing the second document.
query_document_store.filter_documents()

In [25]:
# Run every record in doc2 through format_doc_dict (passing "description")
# and keep only the results that are not None.
dataset_part2 = [
    formatted
    for formatted in (format_doc_dict(doc, "description") for doc in doc2)
    if formatted is not None
]


In [None]:
# Run the indexing pipeline on the third formatted document, targeting the
# same document store.
run_indexing_pipeline(dataset_part2, query_document_store)

In [None]:
# Document count after indexing the third document.
query_document_store.count_documents()

In [None]:
# Display the store contents (no filters) after indexing the third document.
query_document_store.filter_documents()

## Check behaviour when trying to add a duplicate doc

In [None]:
# Re-run the pipeline with the same input as the previous indexing call to
# observe how a duplicate document is handled.
run_indexing_pipeline(dataset_part2, query_document_store)

In [None]:
# Count again after the duplicate attempt; compare with the previous count.
query_document_store.count_documents()

## Try removing a document

In [3]:
# Remove one document from the store. The id is presumably matched against
# the "reference_id" metadata field — confirm against delete_docs.
delete_docs(
    document_store=query_document_store,
    document_ids=["0001"],
    id_metafield="reference_id",
)

In [None]:
# Document count after the delete call.
query_document_store.count_documents()

In [None]:
# Display the store contents (no filters) after the delete call.
query_document_store.filter_documents()