# Test search functions

The notebook demos basic search functionality using OpenSearch and the Haystack framework. You must have Docker Desktop installed and be a part of the [MoJ Docker org](https://user-guide.operations-engineering.service.justice.gov.uk/documentation/services/dockerhub.html#docker) (so that you're covered by a licence) prior to using OpenSearch.

To install necessary packages, run `pip install -e '.[search_backend, dev]'`.

Before running this notebook, start an OpenSearch container (provided by the `localstack` service in docker-compose.yml) by running:
```
docker compose up localstack
```
Or alternatively follow instructions here: https://docs.haystack.deepset.ai/v2.0/docs/opensearchbm25retriever

Hybrid search was introduced in OpenSearch v2.11. It is not yet clear whether Haystack can properly use a version this recent, and proper hybrid search with OpenSearch has not yet been enabled in Haystack.

You will also need the JSON file dummy-products-20241015.json. This is [kept on the wiki](https://dsdmoj.atlassian.net/wiki/spaces/AN/pages/5214503074/Dummy+data) for privacy purposes. Copy it into the same directory as this notebook and save it as `dummy_data.json`, which is the filename the notebook reads.

In [None]:
import json

from haystack import Document
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore
from search_backend.config import get_config
from search_backend.indexing_pipeline import IndexingPipeline
from search_backend.retrieval_pipeline import RetrievalPipeline
from search_backend.search import Search
from prep_data import replace_newlines, prep_project_data

# Load the search backend configuration. Later cells read the "embedding_dim",
# "dense_embedding_model" and "rerank_model" entries from it.
cfg = get_config()

## Read data

In [None]:
# Read the dummy project records from the JSON file next to this notebook.
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('dummy_data.json', encoding='utf-8') as f:
    project_list = json.load(f)

# Show only how many records were loaded. Printing the whole file would store
# the full (privacy-sensitive) data in the notebook output; a single record is
# inspected in a later cell instead.
print(len(project_list))

In [None]:
# Replace newlines in the loaded records, as they interfere with the matching.
# The cleaned records are stored back under the same name, so the cells below
# work on the cleaned version.
project_list = replace_newlines(project_list)

In [None]:
# Display the first project record as a quick sanity check of the data.
project_list[0]

In [None]:
# Prepare the project records for indexing. Each item of the result is later
# unpacked as keyword arguments into a haystack Document.
dataset = prep_project_data(project_list)

In [None]:
# Print the number of prepared items, then display the dataset as the cell output.
# NOTE(review): this displays every item; consider showing only a sample if the
# dataset grows or contains anything that should not be stored in the notebook.
print(len(dataset))
dataset

## Set up OpenSearch

In [None]:
# Connect to the OpenSearch instance served by the local container.
# - The host uses "localhost" rather than "0.0.0.0": 0.0.0.0 is a bind address,
#   and connecting to it as a destination is not portable across platforms.
# - recreate_index=True asks the document store to recreate the "document"
#   index, so re-running this cell is expected to start from an empty index
#   rather than reuse previously indexed documents.
# - The http_auth values are placeholders for the local container, not real
#   secrets (assumed; confirm against docker-compose.yml).
query_document_store = OpenSearchDocumentStore(
    hosts="http://localhost:4566/opensearch/eu-west-2/rd-demo",
    use_ssl=False,
    verify_certs=False,
    http_auth=("localstack", "localstack"),
    embedding_dim=cfg["embedding_dim"],
    recreate_index=True,
    index="document",
)

In [None]:
# Turn every prepared record into a haystack Document by unpacking its fields.
docs = [Document(**record) for record in dataset]

# Build the indexing pipeline on top of the document store and write the documents.
# NOTE(review): semantic=True presumably enables dense embeddings at index time;
# confirm against IndexingPipeline.
indexer = IndexingPipeline(
    query_document_store,
    dense_embedding_model=cfg["dense_embedding_model"],
    semantic=True,
)
indexer.index_docs(docs)

## Run BM25 search

In [None]:
# Create the retrieval helper for the document store and build its BM25 pipeline in one step.
bm25_pipeline = RetrievalPipeline(query_document_store).setup_bm25_pipeline()

In [None]:
# Run a keyword (BM25) query and keep the three best matches.
test_query = "rehabilitation"
bm25_search_init = Search(bm25_pipeline)
results = bm25_search_init.bm25_search(test_query, top_k=3)

# For each hit print a separator, "<name> - Score: <score>", the content and trailing blank lines.
for doc in results:
    print(
        '-----------------------------------',
        f'{doc.meta["name"]} - Score: {doc.score}',
        doc.content,
        "\n",
        sep="\n",
    )

In [None]:
# Display the metadata attached to the first returned BM25 result.
results[0].meta

## Run semantic search

In [None]:
# Create the retrieval helper with the configured embedding and rerank models,
# then build its semantic pipeline in one step.
semantic_pipeline = RetrievalPipeline(
    query_document_store,
    dense_embedding_model=cfg['dense_embedding_model'],
    rerank_model=cfg['rerank_model'],
).setup_semantic_pipeline()

In [None]:
# Run a semantic query and keep the three best matches.
# NOTE(review): threshold=0.00001 looks like a minimum score cut-off; confirm
# its meaning against Search.semantic_search.
test_query = "project relating to law"
semantic_search_init = Search(semantic_pipeline)
results = semantic_search_init.semantic_search(
    test_query,
    top_k=3,
    threshold=0.00001,
)

# For each hit print a separator, "<name> - Score: <score>", the content and trailing blank lines.
for doc in results:
    print(
        '-----------------------------------',
        f'{doc.meta["name"]} - Score: {doc.score}',
        doc.content,
        "\n",
        sep="\n",
    )

## Hybrid search

In [None]:
# Create the retrieval helper with the configured embedding and rerank models,
# then build its hybrid pipeline in one step.
hybrid_pipeline = RetrievalPipeline(
    query_document_store,
    dense_embedding_model=cfg['dense_embedding_model'],
    rerank_model=cfg['rerank_model'],
).setup_hybrid_pipeline()

In [None]:
# Run a hybrid query and keep the three best matches.
test_query = "improved service quality"
hybrid_search_init = Search(hybrid_pipeline)
results = hybrid_search_init.hybrid_search(test_query, top_k=3)

# For each hit print a separator, "<name> - Score: <score>", the content and trailing blank lines.
for doc in results:
    print(
        '-----------------------------------',
        f'{doc.meta["name"]} - Score: {doc.score}',
        doc.content,
        "\n",
        sep="\n",
    )