# Introduction to OpenSearch

A server is available on the cluster for this course. If you really need to you can set up your own server in your local machine. I advise you to use docker: https://opensearch.org/docs/latest/opensearch/install/docker/

## CURL Connection to server


In [2]:
import pprint as pp
import requests

host = '10.10.255.202'
port = 8200
index_name = 'user220'
auth = ('user220', '2Ic{*Ul>/4c4/)pXn+J*B!') # For testing only. Don't store credentials in code.

s = requests.Session()
s.auth = auth

#auth = (index_name, 'zya*xJ!4]n') # For testing only. Don't store credentials in code.
ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA.
server_uri = 'https://' + host + ':' + str(port)

# function for the cURL requests
def opensearch_curl(uri = '/' , body='', verb='get'):
    # pass header option for content type if request has a
    # body to avoid Content-Type error in Elasticsearch v6.0
    
    uri = server_uri + uri
    print(uri)
    headers = {
        'Content-Type': 'application/json',
    }

    try:
        # make HTTP verb parameter case-insensitive by converting to lower()
        if verb.lower() == "get":
            resp = s.get(uri, json=body, headers=headers, verify=False)
        elif verb.lower() == "post":
            resp = s.post(uri, json=body, headers=headers, verify=False)
        elif verb.lower() == "put":
            resp = s.put(uri, json=body, headers=headers, verify=False)
        elif verb.lower() == "del":
                resp = s.delete(uri, json=body, headers=headers, verify=False)
        elif verb.lower() == "head":
                resp = s.head(uri, json=body, headers=headers, verify=False)

        # read the text object string
        try:
            resp_text = json.loads(resp.text)
        except:
            resp_text = resp.text

        # catch exceptions and print errors to terminal
    except Exception as error:
        print ('\nelasticsearch_curl() error:', error)
        resp_text = error

    # return the Python dict of the request
    return resp_text


## OpenSearch Python API

A short introduction is available here:
https://opensearch.org/docs/1.1/clients/python/


In [4]:
import pprint as pp
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Optional client certificates if you don't want to use HTTP basic authentication.
# client_cert_path = '/full/path/to/client.pem'
# client_key_path = '/full/path/to/client-key.pem'

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    # client_cert = client_cert_path,
    # client_key = client_key_path,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
    #, ca_certs = ca_certs_path
)

if client.indices.exists(index_name):

    client.indices.open(index = index_name)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    index_settings = {
        "settings":{
          "index":{
             "refresh_interval" : "1s"
          }
       }
    }
    client.indices.put_settings(index = index_name, body = index_settings)
    settings = client.indices.get_settings(index = index_name)
    pp.pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client.indices.get_mapping(index = index_name)
    pp.pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client.count(index = index_name))
    


----------------------------------------------------------------------------------- INDEX SETTINGS
{'user220': {'settings': {'index': {'creation_date': '1647384285322',
                                    'knn': 'true',
                                    'number_of_replicas': '0',
                                    'number_of_shards': '4',
                                    'provided_name': 'user220',
                                    'refresh_interval': '1s',
                                    'uuid': 'xEy7XhsRQXu008Supzqjag',
                                    'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user220': {'mappings': {'properties': {'contents': {'similarity': 'BM25',
                                                      'type': 'text'},
                                         'id': {'type': 'keyword'},
                                         'sentence_embedding': {'mod

# Index creation and configuration

## Create an index with your own settings


In [30]:

index_body = {
   "settings":{
      "index":{
         "number_of_replicas":0,
         "number_of_shards":4,
         "refresh_interval" : "-1",
         "knn" : "true",
      },
   },
   "mappings":{
      "properties":{
         "id":{
            "type":"keyword"
         },
         "contents":{
            "type":"text",
            "similarity":"BM25"
         },
         "sentence_embedding": {
           "type": "knn_vector",
           "model_id": "model_kwiz"
         }
      }
   }
}

if not client.indices.exists(index_name):
    response = client.indices.create(index_name, body=index_body)
    print('\nCreating index:')
    print(response)



Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'kwiz'}


## Check the indexes, settings and mappings


In [5]:
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
index_settings = {
    "settings":{
      "index":{
         "refresh_interval" : "1s"
      }
   }
}
client.indices.put_settings(index = index_name, body = index_settings)
settings = client.indices.get_settings(index = index_name)
pp.pprint(settings)

print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index = index_name)
pp.pprint(mappings)

print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
print(client.count(index = index_name))



----------------------------------------------------------------------------------- INDEX SETTINGS
{'user220': {'settings': {'index': {'creation_date': '1647384285322',
                                    'knn': 'true',
                                    'number_of_replicas': '0',
                                    'number_of_shards': '4',
                                    'provided_name': 'user220',
                                    'refresh_interval': '1s',
                                    'uuid': 'xEy7XhsRQXu008Supzqjag',
                                    'version': {'created': '135238227'}}}}}

----------------------------------------------------------------------------------- INDEX MAPPINGS
{'user220': {'mappings': {'properties': {'contents': {'similarity': 'BM25',
                                                      'type': 'text'},
                                         'id': {'type': 'keyword'},
                                         'sentence_embedding': {'mod

## Delete the index if you want to replace it

In [6]:
be absolutely sure that you want to comment this line _and_ DELETE the index!!!

if client.indices.exists(index=index_name):
    # Delete the index.
    response = client.indices.delete(
        index = index_name,
        timeout = "600s"
    )
    print('\nDeleting index:')
    print(response)

SyntaxError: invalid syntax (382793999.py, line 1)

# Document Processing

## HTML text extraction



## Text tokenization

In [27]:
import spacy
from spacy import displacy
from pathlib import Path

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

save_figures = False

print("token".ljust(10), "lemma".ljust(10), "pos".ljust(6), "tag".ljust(6), "dep".ljust(10),
            "shape".ljust(10), "alpha", "stop")
print("------------------------------------------------------------------------------")
for token in doc:
    print(token.text.ljust(10), token.lemma_.ljust(10), token.pos_.ljust(6), token.tag_.ljust(6), token.dep_.ljust(10),
            token.shape_.ljust(10), token.is_alpha, token.is_stop)


token      lemma      pos    tag    dep        shape      alpha stop
------------------------------------------------------------------------------
Apple      Apple      PROPN  NNP    nsubj      Xxxxx      True False
is         be         AUX    VBZ    aux        xx         True True
looking    look       VERB   VBG    ROOT       xxxx       True False
at         at         ADP    IN     prep       xx         True True
buying     buy        VERB   VBG    pcomp      xxxx       True False
U.K.       U.K.       PROPN  NNP    dobj       X.X.       False False
startup    startup    VERB   VBD    dep        xxxx       True False
for        for        ADP    IN     prep       xxx        True True
$          $          SYM    $      quantmod   $          False False
1          1          NUM    CD     compound   d          False False
billion    billion    NUM    CD     pobj       xxxx       True False


## Named entity recognition

In [25]:
import spacy
from spacy import displacy
from pathlib import Path

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for ent in doc.ents:
    print(ent.text.ljust(12), ent.label_.ljust(10), ent.start_char, ent.end_char)

html_ent = displacy.render(doc, style="ent", jupyter=True)


Apple        ORG        0 5
U.K.         GPE        27 31
$1 billion   MONEY      44 54


# Document indexing

## Simple Document indexing



In [13]:
doc = {
    'id': '52345',
    'contents': 'joana dias horas minutos segundos',
}

resp = client.index(index=index_name, id=1, body=doc)
print(resp['result'])


updated


# Search

## Text-based Search

In [16]:
qtxt = 'joana'

query_bm25 = {
  'size': 5,
  'fields': ['contents'],
#  'fields': ['id', 'contents'],
#  'fields': ['id', 'contents', 'sentence_embedding'],
  '_source': '',
  'query': {
    'multi_match': {
      'query': qtxt,
      'fields': ['contents']
    }
  }
}

response = client.search(
    body = query_bm25,
    index = index_name
)

print('\nSearch results:')
pp.pprint(response)



Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [{'_id': '1',
                    '_index': 'user220',
                    '_score': 0.4091398,
                    '_source': {},
                    '_type': '_doc',
                    'fields': {'contents': ['joana dias horas minutos '
                                            'segundos']}}],
          'max_score': 0.4091398,
          'total': {'relation': 'eq', 'value': 1}},
 'timed_out': False,
 'took': 11}


# Close the index and refresh it

In [17]:

# NOT SURE IF THIS IS NEEDED WITH FAISS

index_settings = {
    "settings":{
      "index":{
         "refresh_interval" : "1s"
      }
   }
}

client.indices.close(index = index_name, timeout="600s")
client.indices.put_settings(index = index_name, body = index_settings)

{'acknowledged': True}