# Lab 4 - Hybrid Search of title embeddings and full-text fields in OpenSearch

In [None]:
from opensearchpy import OpenSearch,helpers
from sentence_transformers import SentenceTransformer, util as STutil
from tqdm.notebook import tqdm
from datetime import datetime
from IPython.display import display, HTML
import numpy as np
import pickle

In [None]:
# Connect to the lab's OpenSearch node and print the distribution and version it reports.
# Client usage guide: https://github.com/opensearch-project/opensearch-py/blob/main/USER_GUIDE.md
host = 'ai-search-opensearch-node'
port = 9200
client = OpenSearch(hosts = [{'host': host, 'port': port}])
# Fetch the cluster info once; only the version fields are used for the greeting below.
info = client.info()
print(f"Welcome to {info['version']['distribution']} {info['version']['number']}!")

## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2`, and every query must be prefixed with `query: ` (including the trailing space) before it is encoded.

In [None]:
# The E5 models expect a 'query: ' or 'passage: ' prefix on every input text.
# The model is created once at module level and reused by get_embeddings.
model = SentenceTransformer('intfloat/e5-small-v2')
def get_embeddings(texts, prefix="query: "):
    """Encode one text or a list of texts with the module-level E5 model.

    Any value that is not a list is treated as a single text. `prefix` is
    prepended to each text before encoding, because the E5 models expect
    either a 'query: ' or a 'passage: ' prefix.

    Returns whatever `model.encode` produces for the prefixed batch, with one
    entry per input text.
    """
    batch = texts if isinstance(texts, list) else [texts]
    return model.encode([prefix + item for item in batch], show_progress_bar=False)

## Define different query types
Here we define OpenSearch query bodies for:
 - BM25
 - KNN
 - Hybrid

In [None]:
def get_bm25_body(querystring):
    """Build the lexical (BM25) search body for `querystring`.

    The query is a `bool`/`should` of three `multi_match` clauses, one per
    text field, each with its own boost: description (1.0), title (1.1) and
    title_exactish (1.2). The `title_embedding` field is excluded from the
    returned `_source`.
    """
    # (field, boost) pairs, in the order the clauses appear in the query.
    boosted_fields = (
        ("description", 1.0),
        ("title", 1.1),
        ("title_exactish", 1.2),
    )
    clauses = [
        {
            "multi_match": {
                "query": querystring,
                "type": "cross_fields",
                "fields": [field],
                "boost": boost,
            }
        }
        for field, boost in boosted_fields
    ]
    return {
        "query": {"bool": {"should": clauses}},
        "_source": {"exclude": ["title_embedding"]},
    }

In [None]:
def get_knn_body(querystring):
    """Build the vector (KNN) search body for `querystring`.

    The query string is embedded with `get_embeddings` and the first
    embedding is used as the query vector against the `title_embedding`
    field with k=20. The `title_embedding` field is excluded from the
    returned `_source`.
    """
    query_vector = get_embeddings(querystring)[0]
    knn_clause = {
        "knn": {
            "title_embedding": {
                "vector": query_vector,
                "k": 20,
            }
        }
    }
    return {
        "query": {"bool": {"should": [knn_clause]}},
        "_source": {"exclude": ["title_embedding"]},
    }

## Hybrid Query

In [None]:
def get_hybrid_body(querystring):
    """Build the hybrid (lexical + vector) search body for `querystring`.

    The `hybrid` query holds two sub-queries: first a `bool`/`should` of
    boosted `multi_match` clauses over description (1.0), title (1.1) and
    title_exactish (1.2), then a KNN clause on `title_embedding` with k=100
    using the first embedding of the query string. The `title_embedding`
    field is excluded from the returned `_source`.
    """
    query_vector = get_embeddings(querystring)[0]

    lexical_clauses = [
        {
            "multi_match": {
                "query": querystring,
                "type": "cross_fields",
                "fields": [field],
                "boost": boost,
            }
        }
        for field, boost in (
            ("description", 1.0),
            ("title", 1.1),
            ("title_exactish", 1.2),
        )
    ]
    lexical_query = {"bool": {"should": lexical_clauses}}
    vector_query = {
        "knn": {
            "title_embedding": {
                "vector": query_vector,
                "k": 100,
            }
        }
    }

    # Lexical first, vector second: the same order as the
    # [bm25_weight, knn_weight] list the pipelines in this file are built with.
    return {
        "query": {"hybrid": {"queries": [lexical_query, vector_query]}},
        "_source": {"exclude": ["title_embedding"]},
    }

In [None]:
def serps(querystring,resp,k=5,show=True):
    """Render the top `k` hits of a search response as an HTML ordered list.

    Args:
        querystring: Query text shown in the heading.
        resp: Search response dict. Reads `resp["hits"]["total"]["value"]`,
            `resp["hits"]["hits"]`, and each hit's `_score` and `_source`
            (`title`, `url`, `description`, `text`).
        k: Maximum number of hits to render.
        show: When true, display the HTML in the notebook and return None;
            otherwise return the HTML string.

    Returns:
        The HTML string when `show` is false, otherwise None.
    """
    # Local import so the cell works without touching the notebook's import cell.
    from html import escape

    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    # The heading reports the total hit count, not the number of rows rendered.
    parts = [f"<h4>Showing {count} Results for <em>{escape(str(querystring))}</em></h4><ol>"]

    for result in results[:k]:
        source = result["_source"]
        score = result.get("_score")
        title = source.get("title") or "No title"

        description = source.get("description")
        if description:
            snippet = description
        else:
            # Fall back to the start of the body text; only mark it as
            # truncated when something was actually cut off.
            text = source.get("text") or ""
            snippet = text[:140] + ("..." if len(text) > 140 else "")

        # Index content and the query are untrusted text, so escape them
        # before interpolating into markup.
        url = source.get("url")
        if url:
            link = f'<a href="{escape(str(url), quote=True)}">{escape(str(url))}</a>'
        else:
            # No link at all rather than an anchor pointing at placeholder text.
            link = "No URL"

        parts.append(
            f'<li><b>{escape(str(title))}</b>({score})<br>{escape(str(snippet))}<br>'
            f'<span style="font-size:0.8em">{link}</span></li>'
        )

    parts.append("</ol>")
    html_str = "".join(parts)

    # Display the HTML in the Jupyter Notebook
    if show:
        display(HTML(html_str))
    else:
        return html_str

In [None]:
def make_normalization_pipeline(name, bm25_weight=0.5, knn_weight=0.5):
    """Create or replace the search pipeline `name` and print the response.

    The pipeline has a single `normalization-processor` phase-results
    processor: scores are normalized with `min_max` and combined with
    `arithmetic_mean`, using the weights `[bm25_weight, knn_weight]`.
    The definition is sent with a PUT to `/_search/pipeline/<name>` through
    the module-level `client`.
    """
    normalization_processor = {
        "normalization": {"technique": "min_max"},
        "combination": {
            "technique": "arithmetic_mean",
            "parameters": {"weights": [bm25_weight, knn_weight]},
        },
    }
    pipeline_definition = {
        "description": f"Post processor for hybrid search with bm25={bm25_weight} and knn={knn_weight}",
        "phase_results_processors": [
            {"normalization-processor": normalization_processor},
        ],
    }
    response = client.transport.perform_request(
        method="PUT",
        url="/_search/pipeline/" + name,
        body=pipeline_definition,
    )
    print(response)

In [None]:
# Register three pipelines that differ only in how BM25 and KNN scores are weighted.
for _pipeline_name, _bm25_weight, _knn_weight in (
    ("nlp-search-pipeline-equal", 0.5, 0.5),
    ("nlp-search-pipeline-bm25-heavy", 0.6, 0.4),
    ("nlp-search-pipeline-knn-heavy", 0.4, 0.6),
):
    make_normalization_pipeline(_pipeline_name, bm25_weight=_bm25_weight, knn_weight=_knn_weight)

In [None]:
def search(querystring, body, pipeline="nlp-search-pipeline-equal", show=True):
    """Run `body` against the "ai-search" index through the search pipeline `pipeline`.

    When `show` is true the hits are rendered in the notebook via `serps`.
    The raw response is returned in either case.
    """
    response = client.search(
        body=body,
        index="ai-search",
        params={"search_pipeline": pipeline},
    )
    if not show:
        return response
    serps(querystring, response)
    return response
def search_bm25(querystring, show=True):
    """Run the BM25 query for `querystring` via `search` (default pipeline) and return the response."""
    return search(querystring, get_bm25_body(querystring), show=show)
def search_knn(querystring, show=True):
    """Run the KNN query for `querystring` via `search` (default pipeline) and return the response."""
    return search(querystring, get_knn_body(querystring), show=show)
def search_hybrid(querystring, pipeline="nlp-search-pipeline-equal", show=True):
    """Run the hybrid query for `querystring` via `search` with the given pipeline and return the response."""
    return search(querystring, get_hybrid_body(querystring), pipeline=pipeline, show=show)

In [None]:
# Hybrid search with the default equal-weight pipeline; the top hits are rendered below the cell.
resp = search_hybrid("crypto scandal")

In [None]:
# BM25-only search for "crypto scandal"; the top hits are rendered below the cell.
resp = search_bm25("crypto scandal")

In [None]:
# KNN-only search for "crypto scandal"; the top hits are rendered below the cell.
resp = search_knn("crypto scandal")

In [None]:
def search_compare(querystring, pipeline="nlp-search-pipeline-equal"):
    """Display BM25, KNN and hybrid results for `querystring` side by side.

    Each search is run without rendering, its top 5 hits are turned into HTML
    with `serps`, and the three fragments are placed in one table row.
    `pipeline` is passed to the hybrid search and shown in its column header.
    """
    def top_hits_html(response):
        # Same rendering for every column: top 5 hits, returned rather than displayed.
        return serps(querystring, response, k=5, show=False)

    bm25_column = top_hits_html(search_bm25(querystring, show=False))
    knn_column = top_hits_html(search_knn(querystring, show=False))
    hybrid_column = top_hits_html(search_hybrid(querystring, pipeline=pipeline, show=False))

    comparison_table = f"""
        <style>
            .compare li {{overflow-x:hidden;width:320px!important;text-align:left;height:200px;border-bottom:1px solid #333;}}
        </style>
        <table class="compare">
            <tr><td>BM25</td><td>KNN</td><td>Hybrid ({pipeline})</td></tr>
            <tr><td>{bm25_column}</td><td>{knn_column}</td><td>{hybrid_column}</td></tr>
        </table>"""
    display(HTML(comparison_table))

In [None]:
# Compare the three approaches with the hybrid column using the equal-weight (0.5 / 0.5) pipeline.
search_compare("property market",pipeline="nlp-search-pipeline-equal")

In [None]:
# Same query, hybrid column using the BM25-heavy (0.6 / 0.4) pipeline.
search_compare("property market",pipeline="nlp-search-pipeline-bm25-heavy")

In [None]:
# Same query, hybrid column using the KNN-heavy (0.4 / 0.6) pipeline.
search_compare("property market",pipeline="nlp-search-pipeline-knn-heavy")

In [None]:
# Side-by-side comparison for "crypto scandal" with the default (equal-weight) hybrid pipeline.
search_compare("crypto scandal")

In [None]:
# Side-by-side comparison for "US economic recovery" with the default (equal-weight) hybrid pipeline.
search_compare("US economic recovery")