# Introduction to OpenSearch

A server is available on the cluster for this course. If you really need to, you can set up your own server on your local machine. In that case, I advise you to use Docker: https://opensearch.org/docs/latest/opensearch/install/docker/

## CURL Connection to server


In [None]:
import os
import pprint as pp

import requests

# Connection settings for the course OpenSearch server.
host = '10.10.255.202'
port = 8200
index_name = 'user220'

# Credentials for HTTP basic authentication. The password is read from the
# OPENSEARCH_PASSWORD environment variable when it is set, so a real password
# never has to be written (or left commented out) in the notebook. The
# placeholder is only a fallback for quick testing.
auth = ('user220', os.environ.get('OPENSEARCH_PASSWORD', 'password here'))

# Shared HTTP session: every request sent through it carries the credentials.
s = requests.Session()
s.auth = auth

ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA.
server_uri = f'https://{host}:{port}'

# function for the cURL requests
def opensearch_curl(uri='/', body='', verb='get'):
    """Send a cURL-style HTTP request to the OpenSearch server.

    Args:
        uri: Path (and optional query string) appended to ``server_uri``.
        body: Request payload, sent as JSON. The default empty string
            means "no body".
        verb: HTTP verb, case-insensitive: ``get``, ``post``, ``put``,
            ``del`` / ``delete`` or ``head``.

    Returns:
        The decoded JSON response when the server answers with JSON,
        otherwise the raw response text. If the request fails or the verb
        is not supported, the error is printed and the exception object is
        returned instead of being raised.
    """
    # Imported locally because the notebook's import cell does not import json.
    import json

    uri = server_uri + uri
    print(uri)

    # Pass the content type header so that requests carrying a body avoid
    # the Content-Type error (Elasticsearch v6.0).
    headers = {
        'Content-Type': 'application/json',
    }

    # The default '' stands for "no body": send nothing at all instead of
    # serializing it to the JSON string "".
    payload = None if isinstance(body, str) and not body else body

    try:
        # Map each supported verb to the matching method of the shared session.
        senders = {
            'get': s.get,
            'post': s.post,
            'put': s.put,
            'del': s.delete,
            'delete': s.delete,
            'head': s.head,
        }

        # Make the HTTP verb parameter case-insensitive.
        send = senders.get(verb.lower())
        if send is None:
            raise ValueError('unsupported HTTP verb: ' + repr(verb))

        # NOTE: verify=False disables TLS certificate verification.
        resp = send(uri, json=payload, headers=headers, verify=False)
    except Exception as error:
        # Best effort: report the problem and hand the error back to the caller.
        print('\nopensearch_curl() error:', error)
        return error

    # Decode JSON when possible; fall back to the raw text otherwise
    # (for example the empty body of a HEAD response).
    try:
        return json.loads(resp.text)
    except ValueError:
        return resp.text


## OpenSearch Python API 

A short introduction is available here:
https://opensearch.org/docs/latest/clients/python/

In [None]:
import pprint as pp
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Optional client certificates if you don't want to use HTTP basic authentication.
# client_cert_path = '/full/path/to/client.pem'
# client_key_path = '/full/path/to/client-key.pem'

# Build the client: TLS is enabled, certificate and hostname checks are disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=auth,
    http_compress=True,  # enables gzip compression for request bodies
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    # Optional settings, currently unused:
    # client_cert=client_cert_path,
    # client_key=client_key_path,
    # ca_certs=ca_certs_path,
)

# If the index already exists: open it, set its refresh interval to one
# second, and print its settings, mappings and document count.
# The index name is passed by keyword, like in every other client call here.
if client.indices.exists(index=index_name):

    client.indices.open(index=index_name)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    index_settings = {
        "settings": {
            "index": {
                "refresh_interval": "1s"
            }
        }
    }
    client.indices.put_settings(index=index_name, body=index_settings)
    settings = client.indices.get_settings(index=index_name)
    pp.pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client.indices.get_mapping(index=index_name)
    pp.pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client.count(index=index_name))
    

# Index creation and configuration

## Create an index with your own settings


In [None]:

# Index-level options: no replicas, four shards, refresh interval "-1",
# and the k-NN flag set.
index_options = {
    "number_of_replicas": 0,
    "number_of_shards": 4,
    "refresh_interval": "-1",
    "knn": "true",
}

# Custom analysis chain: standard tokenizer, lowercasing, then edge n-grams
# of 1 to 20 characters.
analysis_options = {
    "filter": {
        "edge_ngram_filter": {
            "type": "edge_ngram",
            "min_gram": 1,
            "max_gram": 20,
        },
    },
    "analyzer": {
        "my_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase", "edge_ngram_filter"],
        },
    },
}

# Field definitions: a keyword id, a BM25-scored text field and a
# 768-dimensional k-NN vector field.
field_properties = {
    "doc_id": {"type": "keyword"},
    "contents": {
        "type": "text",
        "analyzer": "standard",
        # "analyzer": "my_analyzer",
        "similarity": "BM25",
    },
    "sentence_embedding": {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
            "name": "hnsw",
            "space_type": "innerproduct",
            "engine": "faiss",
            "parameters": {"ef_construction": 256, "m": 48},
        },
    },
}

# Complete body passed to the index creation request.
index_body = {
    "settings": {
        "index": index_options,
        "analysis": analysis_options,
    },
    "mappings": {
        "properties": field_properties,
    },
}

# Create the index only when it is missing; an existing index is left untouched.
# The index name is passed by keyword, consistent with the other client calls.
if client.indices.exists(index=index_name):
    print("Index already existed. Nothing to be done.")
else:
    response = client.indices.create(index=index_name, body=index_body)
    print('\nCreating index:')
    print(response)


## Check the indexes, settings and mappings


In [None]:
# Set the refresh interval to one second, then show the resulting settings.
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
index_settings = {"settings": {"index": {"refresh_interval": "1s"}}}
client.indices.put_settings(body=index_settings, index=index_name)
settings = client.indices.get_settings(index=index_name)
pp.pprint(settings)

# Show the field mappings of the index.
print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index=index_name)
pp.pprint(mappings)

# Show the result of the count request for the index.
print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
doc_count = client.count(index=index_name)
print(doc_count)


## Delete the index if you want to replace it

In [None]:
# Safety switch: be absolutely sure before setting this to True, because the
# cell then really deletes the index. It is off by default so that running
# every cell of the notebook does not drop the index by accident.
confirm_index_deletion = False

if not confirm_index_deletion:
    print('\nIndex deletion skipped: set confirm_index_deletion = True to delete the index.')
elif client.indices.exists(index=index_name):
    # Delete the index.
    response = client.indices.delete(
        index=index_name,
        timeout="600s",
    )
    print('\nDeleting index:')
    print(response)

# Document Processing


## Text tokenization

In [None]:
import spacy
from spacy import displacy
from pathlib import Path

# Run the small English pipeline over one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

save_figures = False

# Header row: one padded column per token attribute printed below.
header = (
    "token".ljust(10),
    "lemma".ljust(10),
    "pos".ljust(6),
    "tag".ljust(6),
    "dep".ljust(10),
    "shape".ljust(10),
    "alpha",
    "stop",
)
print(*header)
print("------------------------------------------------------------------------------")

# One row per token, with the same column widths as the header.
for tok in doc:
    row = (
        tok.text.ljust(10),
        tok.lemma_.ljust(10),
        tok.pos_.ljust(6),
        tok.tag_.ljust(6),
        tok.dep_.ljust(10),
        tok.shape_.ljust(10),
        tok.is_alpha,
        tok.is_stop,
    )
    print(*row)


## Named entity recognition

In [None]:
import spacy
from spacy import displacy
from pathlib import Path

# Run the small English pipeline over one example sentence.
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# One line per entity: text, label, then start and end character offsets.
for entity in doc.ents:
    text_column = entity.text.ljust(12)
    label_column = entity.label_.ljust(10)
    print(text_column, label_column, entity.start_char, entity.end_char)

# Render the entities with displaCy in notebook mode.
html_ent = displacy.render(doc, style="ent", jupyter=True)


## Dual-Encoders

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take average of all tokens
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, weighted by the mask.

    Args:
        model_output: Model result exposing ``last_hidden_state``, which
            holds the token embeddings.
        attention_mask: Mask that is expanded to the shape of the token
            embeddings and multiplied into them, so positions with value 0
            do not contribute.

    Returns:
        The masked sum over dimension 1 divided by the mask total. The
        total is clamped to at least 1e-9 to avoid a division by zero.
    """
    hidden = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand_as(hidden).float()
    weighted_total = (hidden * weights).sum(dim=1)
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / weight_total


#Encode text
def encode(texts):
    """Turn text into L2-normalized sentence embeddings.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Input passed straight to the tokenizer.

    Returns:
        The mean-pooled model output, normalized with the L2 norm along
        dimension 1.
    """
    # Tokenize with padding and truncation, returning PyTorch tensors.
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**batch, return_dict=True)

    # Pool the token embeddings using the attention mask, then normalize.
    pooled = mean_pooling(outputs, batch['attention_mask'])
    return F.normalize(pooled, p=2, dim=1)


# Load the tokenizer and the model for the same checkpoint from the HuggingFace Hub
checkpoint = "sentence-transformers/msmarco-distilbert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Example documents and their sentence embeddings
docs = [
    "Around 9 Million people live in London",
    "London is known for its financial district",
]
doc_emb = encode(docs)


In [None]:
# Embedding at index 0 of doc_emb, converted to a NumPy array.
doc_emb[0].numpy()

## Training Dual-Encoders

      https://www.sbert.net/docs/training/overview.html
    

In [None]:
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Define the model, either from scratch or by loading a pre-trained model.
# It is stored under its own name so that it does not replace the global
# `model` that encode() calls when embedding documents and queries.
st_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Define your train examples. You need more than just two examples...
train_examples = [
    InputExample(texts=['My first sentence', 'My second sentence'], label=0.8),
    InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3),
]

# Define your train dataset, the dataloader and the train loss
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(st_model)

# Tune the model
st_model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=100)

# Document indexing

## Simple Document indexing



In [None]:
# Index the two example documents under ids 1 and 2, each with its label,
# its text and its sentence embedding.
for position, label in enumerate(('documentA', 'documentB')):
    doc = {
        'doc_id': label,
        'contents': docs[position],
        'sentence_embedding': doc_emb[position].numpy(),
    }
    resp = client.index(index=index_name, id=position + 1, body=doc)
    print(resp['result'])


## Deletion of documents and index

In [None]:
# Delete the document.
# Pick the id of the document to remove; the documents indexed in this
# notebook use the ids 1 and 2. Index it again afterwards if it is needed
# for the search examples.
doc_id_to_delete = 2

response = client.delete(
    index=index_name,
    id=doc_id_to_delete,
)

print('\nDeleting document:')
print(response)


# Search

OpenSearch implements a query DSL (domain-specific language) that supports a wide range of search options.

     https://opensearch.org/docs/latest/opensearch/query-dsl/index/


## Text-based Search

The text-based search documentation is available here:

     https://opensearch.org/docs/latest/opensearch/query-dsl/full-text/



In [None]:
query = "How many people live in London?"

# Match query on the 'contents' field, returning at most five hits.
# Other '_source' choices that can be swapped in:
#   ['doc_id', 'contents', 'sentence_embedding'], ['doc_id'] or ''
query_bm25 = {
    'size': 5,
    '_source': ['doc_id', 'contents'],
    'query': {'match': {'contents': query}},
}

response = client.search(index=index_name, body=query_bm25)

print('\nSearch results:')
pp.pprint(response)


## Embedding Spaces Search

In [None]:
# Embed the query with the same encoder used for the documents
query = "How many people live in London?"
query_emb = encode(query)

# k-NN clause on the 'sentence_embedding' field: query vector and k = 2.
knn_clause = {
    "vector": query_emb[0].numpy(),
    "k": 2,
}

# Other '_source' choices that can be swapped in:
#   ['doc_id', 'contents', 'sentence_embedding'] or ['doc_id', 'contents']
query_denc = {
    'size': 5,
    '_source': ['doc_id'],
    "query": {"knn": {"sentence_embedding": knn_clause}},
}

response = client.search(index=index_name, body=query_denc)

print('\nSearch results:')
pp.pprint(response)


# Close the index and refresh it

In [None]:

# NOT SURE IF THIS IS NEEDED WITH FAISS

# Settings payload: sets the index refresh interval to "1s".
index_settings = {
    "settings":{
      "index":{
         "refresh_interval" : "1s"
      }
   }
}

# Close the index (with a "600s" timeout), then send the settings above.
# NOTE(review): nothing in this cell reopens the index afterwards; the only
# visible reopen is the client.indices.open() call in the client setup cell.
# Confirm that leaving the index closed here is intended.
# NOTE(review): the settings update is sent while the index is closed;
# verify that the server accepts "refresh_interval" in that state.
client.indices.close(index = index_name, timeout="600s")
client.indices.put_settings(index = index_name, body = index_settings)