## OpenSearch API

This API provides the configuration to setup an opensearch-py client for the OpenSearch cluster.

## Scope

* Create and configure opensearch-py client
* Create a new index
* List indices
* Add a document
* Search for document
* Delete document
* Delete index
* Simple k-NN recommendation example

In [1]:
import hopsworks

In [2]:
connection = hopsworks.connection()

Connected. Call `.close()` to terminate connection gracefully.


In [3]:
project = connection.get_project()

In [4]:
opensearch_api = project.get_opensearch_api()

### Configure the OpenSearch client

In [5]:
from opensearchpy import OpenSearch

In [6]:
client = OpenSearch(**opensearch_api.get_default_py_config())

### Create a simple index

In [7]:
index_name = opensearch_api.get_project_index("demo_simple_index")
print(index_name)

demo_ml_meb10000_demo_simple_index


In [8]:
import json
# Create an index with non-default settings.
index_body = {
  'settings': {
    'index': {
      'number_of_shards': 1
    }
  }
}

response = client.indices.create(index_name, body=json.dumps(index_body))

2022-05-06 13:27:21,071 INFO: PUT https://10.0.2.15:9200/demo_ml_meb10000_demo_simple_index [status:200 request:0.396s]


### List the indices

In [9]:
print(client.cat.indices(index_name))

2022-05-06 13:27:21,087 INFO: GET https://10.0.2.15:9200/_cat/indices/demo_ml_meb10000_demo_simple_index [status:200 request:0.007s]
yellow open demo_ml_meb10000_demo_simple_index SSjDX0uZQRyjyHYRKMsgeg 1 1 0 0 208b 208b



### Add a document

In [10]:
# Add a document to the index.
document = {
  'title': 'Moneyball',
  'director': 'Bennett Miller',
  'year': '2011'
}
id = '1'

response = client.index(
    index = index_name,
    body = document,
    id = id,
    refresh = True
)

print('\nAdded document: {}'.format(response))

2022-05-06 13:27:21,236 INFO: PUT https://10.0.2.15:9200/demo_ml_meb10000_demo_simple_index/_doc/1?refresh=true [status:201 request:0.136s]

Added document: {'_index': 'demo_ml_meb10000_demo_simple_index', '_type': '_doc', '_id': '1', '_version': 1, 'result': 'created', 'forced_refresh': True, '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}


### Search for document

In [11]:
# Search for the document.
q = 'miller'
query = {
  'size': 5,
  'query': {
    'multi_match': {
      'query': q,
      'fields': ['title^2', 'director']
    }
  }
}

response = client.search(
    body = query,
    index = index_name
)
print('\nSearch results: {}'.format(response))

2022-05-06 13:27:21,265 INFO: POST https://10.0.2.15:9200/demo_ml_meb10000_demo_simple_index/_search [status:200 request:0.018s]

Search results: {'took': 12, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 0.2876821, 'hits': [{'_index': 'demo_ml_meb10000_demo_simple_index', '_type': '_doc', '_id': '1', '_score': 0.2876821, '_source': {'title': 'Moneyball', 'director': 'Bennett Miller', 'year': '2011'}}]}}


### Delete the document

In [12]:
# Delete the document.
response = client.delete(
    index = index_name,
    id = id
)

print('\nDeleting document: {}'.format(response))
print(response)

2022-05-06 13:27:21,296 INFO: DELETE https://10.0.2.15:9200/demo_ml_meb10000_demo_simple_index/_doc/1 [status:200 request:0.017s]

Deleting document: {'_index': 'demo_ml_meb10000_demo_simple_index', '_type': '_doc', '_id': '1', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}
{'_index': 'demo_ml_meb10000_demo_simple_index', '_type': '_doc', '_id': '1', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1}


### Delete the index

In [13]:
# Delete the index.
response = client.indices.delete(
    index = index_name
)

print('\nDeleting index: {}'.format(response))

2022-05-06 13:27:21,411 INFO: DELETE https://10.0.2.15:9200/demo_ml_meb10000_demo_simple_index [status:200 request:0.103s]

Deleting index: {'acknowledged': True}


## k-NN recommendation example

### Create index

In [14]:
knn_index_name = opensearch_api.get_project_index("demo_knn_index")
print(index_name)

demo_ml_meb10000_demo_simple_index


In [15]:
index_body = {
  "settings": {
      "knn": True,
      "knn.algo_param.ef_search": 100,
  },
  "mappings": {
    "properties": {
      "my_vector1": {
        "type": "knn_vector",
        "dimension": 2
      }
    }
  }
}

response = client.indices.create(knn_index_name, body=index_body)
print(response)

2022-05-06 13:27:21,774 INFO: PUT https://10.0.2.15:9200/demo_ml_meb10000_demo_knn_index [status:200 request:0.337s]
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'demo_ml_meb10000_demo_knn_index'}


### List the indices

In [16]:
print(client.cat.indices(knn_index_name))

2022-05-06 13:27:21,792 INFO: GET https://10.0.2.15:9200/_cat/indices/demo_ml_meb10000_demo_knn_index [status:200 request:0.009s]
yellow open demo_ml_meb10000_demo_knn_index nKckPRMTTnGKBdSdkbGcTA 1 1 0 0 208b 208b



### Add vectors bulk

In [17]:
from opensearchpy.helpers import bulk
import random, time
    
    
actions = [
  {
    "_index": knn_index_name,     
    "_id": count,
    "_source": {
        "my_vector1": [random.uniform(0, 10), random.uniform(0, 10)],   
    }
  }
  for count in range(0, 10)
]

bulk(
    client,
    actions,
)

time.sleep(5)

2022-05-06 13:27:21,828 INFO: POST https://10.0.2.15:9200/_bulk [status:200 request:0.021s]


### Get most similar vector

In [18]:
import pprint 
query = {
  "size": 10,
  "query": {
    "knn": {
      "my_vector1": {
        "vector": [2.5, 3],
        "k": 10
      }
    }
  }
}

response = client.search(
    body = query,
    index = knn_index_name
)

2022-05-06 13:27:26,857 INFO: POST https://10.0.2.15:9200/demo_ml_meb10000_demo_knn_index/_search [status:200 request:0.013s]


#### The highest score corresponds to the vector that is most similar to [2.5, 3]

In [19]:
pp = pprint.PrettyPrinter()
pp.pprint(response)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '0',
                    '_index': 'demo_ml_meb10000_demo_knn_index',
                    '_score': 0.24409702,
                    '_source': {'my_vector1': [4.212482946839061,
                                               2.5948657343424273]},
                    '_type': '_doc'},
                   {'_id': '5',
                    '_index': 'demo_ml_meb10000_demo_knn_index',
                    '_score': 0.212302,
                    '_source': {'my_vector1': [4.3354166798851566,
                                               2.415605665484335]},
                    '_type': '_doc'},
                   {'_id': '8',
                    '_index': 'demo_ml_meb10000_demo_knn_index',
                    '_score': 0.18577491,
                    '_source': {'my_vector1': [4.4112345303168645,
                                               2.1455753911685904]},
                    '_type': '_do

### Delete the index

In [20]:
response = client.indices.delete(
    index = knn_index_name
)

print('\nDeleting index: {}'.format(response))

2022-05-06 13:27:26,995 INFO: DELETE https://10.0.2.15:9200/demo_ml_meb10000_demo_knn_index [status:200 request:0.097s]

Deleting index: {'acknowledged': True}
