# Lab 3 - Indexing and Searching Embeddings in OpenSearch

In [None]:
from opensearchpy import OpenSearch,helpers
from sentence_transformers import SentenceTransformer, util as STutil
from tqdm.notebook import tqdm
from datasets import Dataset,load_dataset,concatenate_datasets
from datetime import datetime
import numpy as np
import pickle

In [None]:
# https://github.com/opensearch-project/opensearch-py/blob/main/USER_GUIDE.md
# Connect to the OpenSearch node used by this lab; `client` is reused by the
# indexing and search cells below.
host = 'ai-search-opensearch-node'
port = 9200
client = OpenSearch(hosts = [{'host': host, 'port': port}])
# Ask the cluster for its info and print the distribution and version as a
# quick connectivity check.
info = client.info()
print(f"Welcome to {info['version']['distribution']} {info['version']['number']}!")

## First, look at the schema

Right-click the `schema.json` file in the file tree and open it with the Editor.

## Create the OpenSearch Index

In [None]:
def create_index(name="ai-search",filename="schema.json",delete=False):
    """Create an OpenSearch index from a schema file and print the response.

    Args:
        name: Name of the index to create.
        filename: Path of the file holding the index body; its text is sent
            to OpenSearch unchanged.
        delete: When true, delete any existing index called ``name`` before
            creating it.

    Uses the module-level ``client``. Errors from the create call (and any
    delete error other than "index not found") propagate to the caller.
    """
    with open(filename, encoding="utf-8") as fd:
        index_body = fd.read()

    if delete:
        # Only a missing index (HTTP 404) is an expected outcome here; other
        # failures (e.g. connection problems) should surface, not be hidden.
        client.indices.delete(index=name, ignore=[404])

    response = client.indices.create(index=name, body=index_body)

    print(response)

In [None]:
# Rebuild the index from scratch: delete=True removes an existing index with
# the default name before creating it again from schema.json.
create_index(delete=True)
#create_index()

## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2` and we need to prefix any query with 'query:'

In [None]:
#The E5 models expect 'query:' and 'passage:' prefixes
# Load the model once at module level so get_embeddings can reuse it on every call.
model = SentenceTransformer('intfloat/e5-small-v2')
def get_embeddings(texts,prefix="query: "):
    """Encode one text or a list of texts with the module-level ``model``.

    Args:
        texts: A single text or a list of texts; anything that is not a
            list is wrapped in a one-element list.
        prefix: String prepended to every text before encoding (the E5
            models expect either a 'query:' or a 'passage:' prefix).

    Returns:
        Whatever ``model.encode`` returns for the prefixed texts.
    """
    batch = texts if isinstance(texts, list) else [texts]
    return model.encode(
        [prefix + item for item in batch],
        show_progress_bar=False,
    )

## Get our dataset and title_embeddings

In [None]:
# Load the first 50k records of the 'cc_news' train split from Hugging Face
dataset = load_dataset("cc_news",split='train[0:50000]')

In [None]:
#Load the title_embeddings we generated in 02-sentence-transformers
# NOTE: pickle.load can execute arbitrary code, so only load a pickle file
# that you generated yourself or otherwise trust.
title_embeddings = []
with open('cc_news_title_embeddings_50000.pkl','rb') as fd:
    title_embeddings = pickle.load(fd)

In [None]:
#Add the title embeddings as a new column in our dataset
# Wrap the embeddings in a single-column Dataset, then join it to the news
# records with axis=1 (presumably requires both to have the same number of
# rows, in the same order -- verify if the pickle was generated differently).
title_embeddings_dataset = Dataset.from_dict({"title_embedding": title_embeddings})
records_dataset = concatenate_datasets([dataset, title_embeddings_dataset], axis=1)
#dataset = dataset.add_column("title_embedding",title_embeddings)

In [None]:
# Sanity check: print the title of the record at index 2 and its embedding.
print(records_dataset.select([2])['title'])
print(records_dataset.select([2])['title_embedding'][0])

## Index the data in bulk into our OpenSearch index

In [None]:
def format_date(date_string):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to ISO 8601 ('YYYY-MM-DDTHH:MM:SS').

    Args:
        date_string: The date text to convert. May be empty or None.

    Returns:
        The ISO 8601 string, or None when the input is None, not a string,
        or does not match the expected format.
    """
    try:
        parsed = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
    except (TypeError, ValueError):
        # ValueError: text in the wrong format (including "").
        # TypeError: a missing value (None) -- treat it like an unparseable
        # date instead of aborting the whole indexing run.
        return None
    return parsed.isoformat(timespec='seconds')

def get_document(idx,records):
    """Build the document to index for the record at position ``idx``.

    Args:
        idx: Integer position of the record.
        records: Collection of records indexable by integer position, where
            each record maps the column names used below to values.

    Returns:
        A dict with the title, text, domain, ISO-formatted date (or None
        when the date cannot be parsed), description, url, image_url and
        title_embedding of the record.
    """
    # Fetch the row once by position instead of building a one-row
    # sub-dataset and reading every column from it separately.
    row = records[idx]
    return {
        'title': row['title'],
        'text': row['text'],
        'domain': row['domain'],
        'date': format_date(row['date']),
        'description': row['description'],
        'url': row['url'],
        'image_url': row['image_url'],
        'title_embedding': row['title_embedding'],
    }

In [None]:
# Build the document for the first record and print it to inspect its fields.
first_doc = get_document(0,records_dataset)
print(first_doc)

In [None]:
def index_one(document, index_name="ai-search"):
    """Index a single document using the module-level ``client``.

    Args:
        document: Dict to index; its 'url' value is used as the document id,
            so indexing the same URL again overwrites the earlier copy.
        index_name: Target index. Defaults to "ai-search".
    """
    client.index(index=index_name, id=document['url'], body=document)

In [None]:
# Index the single document built above before moving on to bulk indexing.
index_one(first_doc)

In [None]:
# Select records 100..199 to see what a 100-record batch looks like.
records_dataset.select(list(range(100,200)))

In [None]:
# Total number of records available for indexing.
records_dataset.num_rows

In [None]:
def index_bulk(records_dataset,batch_size=100,index="ai-search"):
    """Index every record of ``records_dataset`` in batches.

    Args:
        records_dataset: Dataset to index; must expose ``num_rows`` and be
            accepted by ``get_document``.
        batch_size: Number of documents sent per bulk request.
        index: Target index. Defaults to "ai-search".

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Uses the module-level ``client`` and shows a tqdm progress bar with one
    tick per batch.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    count = records_dataset.num_rows
    for left in tqdm(range(0,count,batch_size)):
        # The last batch may be shorter than batch_size.
        right = min(left+batch_size,count)
        documents = []
        for idx in range(left,right):
            document = get_document(idx,records_dataset)
            # Bulk-action metadata: target index, and the URL as document id.
            document['_index'] = index
            document['_id'] = document['url']
            documents.append(document)
        helpers.bulk(client,documents,max_retries=3)

In [None]:
# Index the whole dataset using the default batch size.
index_bulk(records_dataset)

# Time to search!

In [None]:
def get_knn_body(querystring, k=20):
    """Build the search request body for a k-NN query on ``title_embedding``.

    Args:
        querystring: Query text; it is embedded with ``get_embeddings``
            (default prefix) and the first embedding is used as the vector.
        k: Value of the knn clause's "k" setting. Defaults to 20.

    Returns:
        A dict usable as the ``body`` of a search request. The stored
        ``title_embedding`` field is excluded from the returned ``_source``
        to keep the response small.
    """
    embeddings = get_embeddings(querystring)
    return {
      "query": {
        "bool": {
          "should": [
            {
              "knn": {
                "title_embedding": {
                  "vector": embeddings[0],
                  "k": k
                }
              }
            }
          ]
        }
      },
      "_source": {"exclude":["title_embedding"]}
    }

In [None]:
from IPython.display import display, HTML
def serps(querystring,resp,k=5,show=True):
    """Render the first ``k`` hits of a search response as an HTML list.

    Args:
        querystring: The query text, echoed in the heading.
        resp: Search response dict; ``resp["hits"]["total"]["value"]`` and
            ``resp["hits"]["hits"]`` are read.
        k: Maximum number of hits to render.
        show: When true, display the HTML in the notebook and return None;
            otherwise return the HTML string without displaying it.

    Returns:
        The HTML string when ``show`` is false, else None.
    """
    # Local import: the indexed articles are untrusted text, so everything
    # interpolated into the markup is HTML-escaped first.
    from html import escape

    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    parts = [f"<h4>Showing {count} Results for <em>{escape(str(querystring))}</em></h4><ol>"]

    for result in results[:k]:
        source = result["_source"]
        score = result.get("_score")
        # `or` also covers fields that are present but null/empty.
        title = source.get("title") or "No title"
        url = source.get("url") or ""
        description = source.get("description")
        text = source.get("text") or ""
        # Fall back to the start of the article text when no description exists.
        snippet = description if description else text[:140]+"..."

        if url:
            safe_url = escape(str(url))
            link = f'<a href="{safe_url}">{safe_url}</a>'
        else:
            link = "No URL"

        parts.append(
            f'<li><b>{escape(str(title))}</b>({score})<br>{escape(str(snippet))}<br>'
            f'<span style="font-size:0.8em">{link}</span></li>'
        )

    parts.append("</ol>")
    html_str = "".join(parts)

    if show:
        display(HTML(html_str))
        return None
    return html_str

In [None]:
def search(querystring):
    """Run a k-NN title search for ``querystring`` and display the results.

    Builds the request with ``get_knn_body``, sends it to the "ai-search"
    index through the module-level ``client``, and renders the response
    with ``serps``.
    """
    resp = client.search(
        body=get_knn_body(querystring),
        index="ai-search",
    )
    serps(querystring, resp)

In [None]:
# Example query: results are ranked by title-embedding similarity.
search("Ballet event")

In [None]:
# A second example query on a different topic.
search("housing market")