# Test search functions

This notebook demonstrates basic search functionality using OpenSearch and the Haystack framework. Before using OpenSearch, you must have Docker Desktop installed and be a member of the [MoJ Docker org](https://user-guide.operations-engineering.service.justice.gov.uk/documentation/services/dockerhub.html#docker) (so that you're covered by a licence).

To install necessary packages, run `pip install -e '.[search_backend, dev]'`.

Before running this notebook, set up an OpenSearch container (see `docker-compose.yml`) by running:
```
docker compose up localstack
```
Or alternatively follow instructions here: https://docs.haystack.deepset.ai/v2.0/docs/opensearchbm25retriever

Hybrid search was introduced in OpenSearch v2.11. It is not clear whether Haystack can properly use a version this recent; proper hybrid search with OpenSearch has not yet been enabled in Haystack.

In [None]:
import json

from search_backend.api.lib.config import get_config
from search_backend.api.lib.indexing_functions import run_indexing_pipeline
from search_backend.api.lib.retrievalservice import RetrievalPipeline
from search_backend.api.lib.searchservice import Search
from prep_data import replace_newlines, prep_project_data

# Load the project configuration. Later cells read the "embedding_dim",
# "dense_embedding_model" and "rerank_model" entries from it.
cfg = get_config()

## Read data

In [None]:
# Load the raw project catalogue from disk. JSON text is UTF-8 by
# specification, so the encoding is stated explicitly instead of relying on
# the platform default (which is not UTF-8 on every system).
with open('ai_catalogue.json', encoding='utf-8') as f:
    project_list = json.load(f)

# Dump the parsed catalogue as a quick sanity check of what was loaded.
print(project_list)

In [3]:
# Replace newlines as they interfere with the matching.
# The exact replacement is defined by prep_data.replace_newlines; the cleaned
# result is bound back to the same name, so the raw catalogue is no longer
# available after this cell runs.
project_list = replace_newlines(project_list)

In [None]:
# Display the first project record to eyeball the cleaned data.
project_list[0]

In [6]:
# Build the dataset that is later handed to the indexing pipeline.
# NOTE(review): the shape of the result is defined by
# prep_data.prep_project_data — confirm there before relying on it.
dataset = prep_project_data(project_list)

In [None]:
# Report how many entries were prepared, then display the dataset itself
# (the bare expression is rendered as the cell output).
print(len(dataset))
dataset

## Set up OpenSearch

In [9]:
# Connect to an existing OpenSearch document store
# query_document_store = SERVICES["querydocumentstore"]
from haystack_integrations.document_stores.opensearch import OpenSearchDocumentStore


# Build the document store used by every retrieval pipeline in this notebook.
# The host, port and credentials match the `localstack` service started with
# `docker compose up localstack`.
# NOTE(review): 0.0.0.0 is not a valid connect address on every platform;
# "localhost" may be more portable — confirm before changing.
# NOTE(review): recreate_index=True presumably drops and rebuilds the
# "document" index each time this cell runs — confirm against the Haystack
# OpenSearchDocumentStore documentation.
query_document_store = OpenSearchDocumentStore(
    hosts="http://0.0.0.0:4566/opensearch/eu-west-2/rd-demo",
    use_ssl=False,
    verify_certs=False,
    http_auth=("localstack", "localstack"),
    embedding_dim=cfg["embedding_dim"],
    recreate_index=True,
    index="document",
)

In [None]:
# Index the prepared dataset into the document store using the loaded config.
# NOTE(review): semantic=True presumably also generates embeddings so that the
# semantic and hybrid pipelines below can query them — confirm in
# search_backend.api.lib.indexing_functions.
run_indexing_pipeline(dataset, query_document_store, cfg, semantic=True)

## Run BM25 search

In [11]:
# Create a retrieval pipeline on the document store and configure it for
# BM25 search in one step.
bm25_pipeline = RetrievalPipeline(query_document_store).setup_bm25_pipeline()

In [None]:
# Run a BM25 query against the indexed documents, requesting top_k=3.
test_query = "improved service quality"
bm25_query = Search(test_query, bm25_pipeline, top_k=3)
results = bm25_query.bm25_search()

# Print each returned document: a separator, the project name with its score,
# the content, and blank lines between hits.
bm25_documents = results["bm25_retriever"]['documents']
for doc in bm25_documents:
    print('-----------------------------------')
    print(f'{doc.meta["project_name"]} - Score: {doc.score}')
    print(doc.content, end="\n\n\n")

In [None]:
# Display the metadata attached to the first document returned by the BM25
# retriever.
results["bm25_retriever"]['documents'][0].meta

## Run semantic search

In [13]:
# Create a retrieval pipeline with the configured embedding and rerank models,
# then configure it for semantic search.
semantic_pipeline = RetrievalPipeline(
    query_document_store,
    dense_embedding_model=cfg['dense_embedding_model'],
    rerank_model=cfg['rerank_model'],
).setup_semantic_pipeline()

In [None]:
# Run a semantic query against the indexed documents, requesting top_k=3.
test_query = "project relating to law"
semantic_query = Search(test_query, semantic_pipeline, top_k=3)
results = semantic_query.semantic_search()

# Print each document from the ranker output: a separator, the project name
# with its score, the content, and blank lines between hits.
ranked_documents = results["ranker"]['documents']
for doc in ranked_documents:
    print('-----------------------------------')
    print(f'{doc.meta["project_name"]} - Score: {doc.score}')
    print(doc.content, end="\n\n\n")

## Hybrid search

In [16]:
# Create a retrieval pipeline with the configured embedding and rerank models,
# then configure it for hybrid search.
hybrid_pipeline = RetrievalPipeline(
    query_document_store,
    dense_embedding_model=cfg['dense_embedding_model'],
    rerank_model=cfg['rerank_model'],
).setup_hybrid_pipeline()

In [None]:
# Run a hybrid query against the indexed documents, requesting top_k=3.
test_query = "improved service quality"
hybrid_query = Search(test_query, hybrid_pipeline, top_k=3)
results = hybrid_query.hybrid_search()

# Print each document from the ranker output: a separator, the project name
# with its score, the content, and blank lines between hits.
ranked_documents = results["ranker"]['documents']
for doc in ranked_documents:
    print('-----------------------------------')
    print(f'{doc.meta["project_name"]} - Score: {doc.score}')
    print(doc.content, end="\n\n\n")