In [1]:
#!pip install opensearch-py==2.3.1

In [1]:
import html
import pickle
from datetime import datetime

import numpy as np
from datasets import Dataset,load_dataset,concatenate_datasets
from opensearchpy import OpenSearch,helpers
from sentence_transformers import SentenceTransformer, util as STutil
from tqdm.notebook import tqdm

In [2]:
# https://github.com/opensearch-project/opensearch-py/blob/main/USER_GUIDE.md
# Build the OpenSearch client that every later cell reuses via the global `client`.
host = 'ai-search-opensearch-node'
port = 9200
client = OpenSearch(hosts = [{'host': host, 'port': port}])
# Fetch the server info and print its distribution/version fields as a quick
# check that the client is working.
info = client.info()
print(f"Welcome to {info['version']['distribution']} {info['version']['number']}!")

Welcome to opensearch 2.11.0!


## First, look at the schema

Right-click the `schema.json` file in the file tree and open it with the Editor.

## Create the OpenSearch Index

In [3]:
def create_index(name="ai-search",filename="schema.json",delete=False):
    """Create an OpenSearch index whose body is read from a schema file.

    Uses the module-level ``client`` and prints the create response.

    Args:
        name: Name of the index to create.
        filename: Path of the file holding the index body; its text is sent
            to OpenSearch as-is.
        delete: When True, delete any existing index called ``name`` first.
            An index that does not exist yet is not treated as an error.

    Raises:
        OSError: If ``filename`` cannot be opened or read.
        Exception: Any client error other than "index not found" raised while
            deleting, and any client error raised while creating, propagates
            to the caller instead of being silently swallowed.
    """
    with open(filename, encoding="utf-8") as fd:
        index_body = fd.read()

    if delete:
        # ignore=[404] tolerates only the "index does not exist" case; connection
        # or permission failures still surface here rather than being hidden.
        client.indices.delete(index=name, ignore=[404])

    response = client.indices.create(index=name, body=index_body)

    print(response)

In [4]:
# Drop and recreate the index so this cell can be re-run from a clean state.
create_index(delete=True)
# Alternative that creates the index without deleting an existing one first:
#create_index()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ai-search'}


## Use the same model and method to get the query embedding, with some defaults changed
Remember, the model is `intfloat/e5-small-v2`, and every query must be prefixed with `query: `.

In [27]:
#The E5 models expect 'query:' and 'passage:' prefixes
# Loaded once at module level; get_embeddings() below reads this global.
model = SentenceTransformer('intfloat/e5-small-v2')
def get_embeddings(texts,prefix="query: "):
    """Encode one or more texts with the module-level ``model``.

    Args:
        texts: A single string, or an iterable of strings (list, tuple, ...).
            A bare string is treated as one text, not as a sequence of
            characters.
        prefix: Text prepended to every input before encoding. The E5 models
            expect either a 'query: ' or a 'passage: ' prefix.

    Returns:
        The result of ``model.encode`` for the prefixed texts; callers index
        it by input position (e.g. ``embeddings[0]``).
    """
    # Only a lone string needs wrapping. Checking for `list` instead would wrap
    # tuples and other iterables as a single "text" and fail on concatenation.
    if isinstance(texts, str):
        texts = [texts]
    prefixed = [prefix + text for text in texts]
    return model.encode(prefixed, show_progress_bar=False)

## Get our dataset and title_embeddings

In [6]:
# Load the first 50,000 records of the 'cc_news' train split from Hugging Face
dataset = load_dataset("cc_news",split='train[0:50000]')

Downloading readme:   0%|          | 0.00/7.57k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/211M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/234M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/245M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/215M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/708241 [00:00<?, ? examples/s]

In [7]:
#Load the title_embeddings we generated in 02-sentence-transformers
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# a pickle that you produced yourself.
title_embeddings = []
with open('cc_news_title_embeddings_50000.pkl','rb') as fd:
    title_embeddings = pickle.load(fd)

In [8]:
#Add the title embeddings as a new column in our dataset
# Wrap the embeddings in a one-column Dataset, then join it column-wise (axis=1)
# with the news records.
# NOTE(review): assumes title_embeddings has one entry per dataset row, in the
# same order — confirm against the notebook that generated the pickle.
title_embeddings_dataset = Dataset.from_dict({"title_embedding": title_embeddings})
records_dataset = concatenate_datasets([dataset, title_embeddings_dataset], axis=1)
# Alternative single-call form, kept for reference:
#dataset = dataset.add_column("title_embedding",title_embeddings)

In [9]:
# Spot-check the row at index 2: print its title and its title embedding.
print(records_dataset.select([2])['title'])
print(records_dataset.select([2])['title_embedding'][0])

['Watch Pennsylvania Ballet & Boston Ballet Face Off for the Super Bowl']
[-0.07843358814716339, 0.01670742779970169, 0.005107884760946035, -0.016091743484139442, -0.0279251579195261, 0.04059495031833649, 0.05747104063630104, -0.045327845960855484, 0.021599523723125458, 0.027876192703843117, -0.012411370873451233, 0.024207158014178276, -0.02011098898947239, 0.03666096553206444, 0.021458571776747704, -0.06887411326169968, -0.021738534793257713, 0.048546548932790756, -0.06475327908992767, 0.028987271711230278, -0.0002391615998931229, -0.0532710999250412, -2.8766125979018398e-05, -0.000542056281119585, 0.012278076261281967, 0.013491272926330566, 0.075719453394413, 0.06790968775749207, -0.059126097708940506, -0.11319344490766525, -0.048528704792261124, -0.04630454257130623, -0.05932382121682167, -0.013018528930842876, 0.04307679831981659, -0.0577373281121254, -0.01128420326858759, 0.041460562497377396, 0.029161149635910988, 0.09307586401700974, -0.061578139662742615, -0.023966027423739433,

## Index the data in bulk into our OpenSearch index

In [10]:
def format_date(date_string):
    """Convert a 'YYYY-MM-DD HH:MM:SS' timestamp to 'YYYY-MM-DDTHH:MM:SS'.

    Args:
        date_string: Timestamp text in the '%Y-%m-%d %H:%M:%S' format.

    Returns:
        The reformatted timestamp string, or None when ``date_string`` is
        missing (None or any non-string value) or does not match the
        expected format.
    """
    try:
        date_obj = datetime.strptime(date_string, '%Y-%m-%d %H:%M:%S')
        return date_obj.strftime('%Y-%m-%dT%H:%M:%S')
    except (TypeError, ValueError):
        # TypeError: non-string input such as None (a record without a date).
        # ValueError: a string that is empty or in a different format.
        return None

def get_document(idx,records):
    """Build the document dict for row ``idx`` of ``records``.

    Args:
        idx: Integer position of the row.
        records: Dataset providing the columns 'title', 'text', 'domain',
            'date', 'description', 'url', 'image_url' and 'title_embedding'.

    Returns:
        A dict with those eight keys. 'date' is passed through
        ``format_date``; every other value is copied unchanged.
    """
    # Integer indexing returns the row as a {column: value} mapping in one
    # lookup, instead of building a one-row dataset with select() and then
    # reading each column of it separately for every document.
    row = records[idx]
    return {
        'title': row['title'],
        'text': row['text'],
        'domain': row['domain'],
        'date': format_date(row['date']),
        'description': row['description'],
        'url': row['url'],
        'image_url': row['image_url'],
        'title_embedding': row['title_embedding']
    }

In [11]:
# Build the document for the first record (row 0) and print it for inspection.
first_doc = get_document(0,records_dataset)
print(first_doc)

{'title': 'Daughter Duo is Dancing in The Same Company', 'text': 'There\'s a surprising twist to Regina Willoughby\'s last season with Columbia City Ballet: It\'s also her 18-year-old daughter Melina\'s first season with the company. Regina, 40, will retire from the stage in March, just as her daughter starts her own career as a trainee. But for this one season, they\'re sharing the stage together.\nPerforming Side-By-Side In The Nutcracker\nRegina and Melina are not only dancing in the same Nutcracker this month, they\'re onstage at the same time: Regina is doing Snow Queen, while Melina is in the snow corps, and they\'re both in the Arabian divertissement. "It\'s very surreal to be dancing it together," says Regina. "I don\'t know that I ever thought Melina would take ballet this far."\nLeft: Regina and Melina with another company member post-snow scene in 2003. Right: The pair post-snow scene in 2017 (in the same theater)\nKeep reading at dancemagazine.com.', 'domain': 'www.pointema

In [12]:
def index_one(document, index_name="ai-search"):
    """Index a single document through the module-level ``client``.

    Args:
        document: Dict to index. Its 'url' value is used as the document id,
            so indexing another document with the same URL targets the same id.
        index_name: Target index; defaults to the notebook's "ai-search" index.

    Raises:
        KeyError: If ``document`` has no 'url' key.
    """
    client.index(index=index_name, id=document['url'], body=document)

In [13]:
# Index the single document built above before moving on to the bulk load.
index_one(first_doc)

In [14]:
# Example: select a contiguous slice of rows (indices 100-199) as a new Dataset.
records_dataset.select(list(range(100,200)))

Dataset({
    features: ['title', 'text', 'domain', 'date', 'description', 'url', 'image_url', 'title_embedding'],
    num_rows: 100
})

In [15]:
# Total number of rows; index_bulk() uses this value as its upper bound.
records_dataset.num_rows

50000

In [16]:
def index_bulk(records_dataset,batch_size=100,index="ai-search"):
    """Index every row of ``records_dataset`` in batches via ``helpers.bulk``.

    Each row is converted with ``get_document`` and sent through the
    module-level ``client``. A tqdm progress bar advances once per batch.

    Args:
        records_dataset: Dataset to index; must expose ``num_rows`` and be
            accepted by ``get_document``.
        batch_size: Number of documents sent per ``helpers.bulk`` call.
        index: Target index; defaults to the notebook's "ai-search" index.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently index
        # nothing; fail loudly instead.
        raise ValueError("batch_size must be a positive integer")

    count = records_dataset.num_rows
    for left in tqdm(range(0, count, batch_size)):
        right = min(left + batch_size, count)
        documents = []
        for idx in range(left, right):
            document = get_document(idx, records_dataset)
            # '_index' and '_id' ride along in the same dict as the document
            # fields; the URL doubles as the document id.
            document['_index'] = index
            document['_id'] = document['url']
            documents.append(document)
        helpers.bulk(client, documents, max_retries=3)

In [17]:
# Index all records using the default batch size.
index_bulk(records_dataset)

  0%|          | 0/500 [00:00<?, ?it/s]

# Time to search!

In [18]:
def get_knn_body(querystring, k=20):
    """Build the search request body for a k-NN query on ``title_embedding``.

    Args:
        querystring: Query text; it is embedded with ``get_embeddings``.
        k: Value of the ``k`` setting in the knn clause (defaults to 20).

    Returns:
        A dict usable as a search body. The ``title_embedding`` field is
        excluded from the returned ``_source`` so hits do not carry the
        vector back.
    """
    query_vector = get_embeddings(querystring)[0]
    # Send a plain list of floats so the body is JSON-serializable without
    # relying on the client's serializer understanding array types.
    if hasattr(query_vector, "tolist"):
        query_vector = query_vector.tolist()

    return {
        "query": {
            "bool": {
                "should": [
                    {
                        "knn": {
                            "title_embedding": {
                                "vector": query_vector,
                                "k": k
                            }
                        }
                    }
                ]
            }
        },
        # "excludes" is the documented key name for source filtering.
        "_source": {"excludes": ["title_embedding"]}
    }

In [24]:
from IPython.display import display, HTML
def serps(querystring,resp,k=5,show=True):
    """Render the first ``k`` hits of a search response as an HTML list.

    Args:
        querystring: The query text, shown in the heading.
        resp: Search response dict; reads ``resp["hits"]["total"]["value"]``
            and ``resp["hits"]["hits"]``.
        k: Maximum number of hits to render.
        show: When True, display the HTML in the notebook and return None.
            When False, return the HTML string instead.

    Returns:
        The HTML string when ``show`` is False, otherwise None.
    """
    count = resp["hits"]["total"]["value"]
    results = resp["hits"]["hits"]

    # All query and document text is escaped before interpolation: it comes
    # from crawled pages and user input and must not be treated as markup.
    parts = [f"<h4>Showing {count} Results for <em>{html.escape(str(querystring))}</em></h4><ol>"]

    for result in results[:k]:
        source = result["_source"]
        score = result.get("_score")
        title = html.escape(str(source.get("title") or "No title"))
        url = source.get("url") or ""
        description = source.get("description")
        # `text` may be stored as null; fall back to "" so slicing cannot fail.
        text = source.get("text") or ""
        snippet = html.escape(str(description) if description else text[:140] + "...")

        # Only emit a link when the hit actually has a URL.
        if url:
            safe_url = html.escape(str(url))
            link = f'<a href="{safe_url}">{safe_url}</a>'
        else:
            link = ""

        # Format each result as an HTML list item (every tag is closed).
        parts.append(
            f'<li><b>{title}</b>({score})<br>{snippet}<br>'
            f'<span style="font-size:0.8em">{link}</span></li>'
        )

    parts.append("</ol>")
    html_str = "".join(parts)

    # Display the HTML in the Jupyter Notebook
    if show:
        display(HTML(html_str))
    else:
        return html_str

In [28]:
def search(querystring, k=5, index="ai-search"):
    """Run a k-NN title search for ``querystring`` and display the results.

    Args:
        querystring: Query text to embed and search with.
        k: Maximum number of hits to render; no more hits than the response
            contains can be shown.
        index: Index to search; defaults to the notebook's "ai-search" index.
    """
    body = get_knn_body(querystring)
    resp = client.search(body=body, index=index)
    serps(querystring, resp, k=k)

In [29]:
# Example query: k-NN search over the indexed title embeddings.
search("Ballet event")