In [None]:
# default_exp search

In [None]:
#export
from elasticsearch import Elasticsearch, helpers
from sentence_transformers import SentenceTransformer
import string

In [None]:
#export
def index_data(d, es, model, INDEX_NAME='my_index_name', BATCH_SIZE=5000):
    '''Recreate the `INDEX_NAME` index and bulk-index every row of `d` with its embedding.

    Any existing index named `INDEX_NAME` is deleted first, then created again
    with a mapping holding the `text`, `name` and `library` text fields plus a
    768-dimensional `text_vector` dense vector. Rows are embedded one by one
    with `embed_text` and sent to Elasticsearch in bulk requests of at most
    `BATCH_SIZE` documents.

    Parameters
    ----------
    d : object exposing `.index` and the columns `.text`, `.name`, `.library`,
        each addressable by index label (presumably a pandas DataFrame --
        confirm against callers).
    es : Elasticsearch client used for every request made by this function.
    model : encoder forwarded to `embed_text`.
    INDEX_NAME : name of the index to (re)create and fill.
    BATCH_SIZE : number of documents per bulk request.

    Returns
    -------
    None. Progress is reported with `print`.
    '''

    print("Creating the " + INDEX_NAME + " index.")

    # Start from a clean slate; a missing index (HTTP 404) is not an error.
    es.indices.delete(index=INDEX_NAME, ignore=[404])

    settings = {
        "settings": {
            "number_of_shards": 2,
            "number_of_replicas": 1
        },
        "mappings": {
            "dynamic": "true",
            "_source": {
                "enabled": "true"
            },
            "properties": {
                "text": {
                    "type": "text"
                },
                "name": {
                    "type": "text"
                },
                "library": {
                    "type": "text"
                },
                "text_vector": {
                    "type": "dense_vector",
                    "dims": 768
                }
            }
        }
    }

    es.indices.create(index=INDEX_NAME, body=settings)

    def _flush(batch):
        # Send one bulk request using the client and index passed to
        # index_data, instead of relying on module-level globals. Each action
        # is a copy so the collected documents are not mutated.
        actions = [
            {**doc, "_op_type": "index", "_index": INDEX_NAME}
            for doc in batch
        ]
        helpers.bulk(es, actions)

    docs = []
    count = 0

    for ind in d.index:
        doc = {'_id': ind,
               'text': d.text[ind],
               'name': d.name[ind],
               'library': d.library[ind],
               'text_vector': embed_text(d.text[ind], model)}
        docs.append(doc)
        count += 1

        if count % BATCH_SIZE == 0:
            _flush(docs)
            docs = []
            print("Indexed {} documents.".format(count))

    # Send whatever is left over after the last full batch.
    if docs:
        _flush(docs)
        print("Indexed {} documents.".format(count))

In [None]:
#export
def index_batch(docs, es=None, INDEX_NAME=None):
    '''Send `docs` to Elasticsearch in a single bulk "index" request.

    Parameters
    ----------
    docs : iterable of dicts, one per document. They are not modified; each
        bulk action is a copy with `_op_type` and `_index` added.
    es : Elasticsearch client. When omitted, a module-level `es` is used
        (the behaviour of the original one-argument signature).
    INDEX_NAME : target index. When omitted, a module-level `INDEX_NAME`
        is used.

    Raises
    ------
    NameError
        If the client or the index name is neither passed in nor defined at
        module level.
    '''
    print("Starting embeddings")

    # Fall back to module globals only for arguments the caller left out, so
    # existing `index_batch(docs)` calls keep working where those globals exist.
    module_globals = globals()
    client = es if es is not None else module_globals.get("es")
    index = INDEX_NAME if INDEX_NAME is not None else module_globals.get("INDEX_NAME")
    if client is None or index is None:
        raise NameError(
            "index_batch needs an Elasticsearch client and an index name; "
            "pass es=... and INDEX_NAME=..."
        )

    requests = [
        {**doc, "_op_type": "index", "_index": index}
        for doc in docs
    ]

    helpers.bulk(client, requests)

In [None]:
#export
def embed_text(text, model):
    """Normalize `text` and return `model.encode` of the result.

    Normalization lowercases the string and removes every character found in
    `string.punctuation`; the cleaned string is the only argument passed to
    `model.encode`, whose return value is handed back unchanged.
    """
    # Mapping a character to None in a translation table deletes it.
    drop_punctuation = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = text.lower().translate(drop_punctuation)
    return model.encode(cleaned)

In [None]:
#export
def query(text, size, es, model, INDEX_NAME='my_index_name', field=None):
    '''Rank the documents of `INDEX_NAME` by cosine similarity to `text`.

    `text` is embedded with `embed_text` and every document is scored with
    `cosineSimilarity(query_vector, text_vector) + 1.0` through a
    `script_score` query over `match_all`.

    Parameters
    ----------
    text : query string.
    size : number of hits to request.
    es : Elasticsearch client used for the search.
    model : encoder forwarded to `embed_text`.
    INDEX_NAME : index to search.
    field : optional `_source` field name. When given, only that field's
        value is returned for each hit.

    Returns
    -------
    The raw search response when `field` is None; otherwise a list with
    `hit['_source'][field]` for each hit, in the order returned.
    '''

    query_vector = embed_text(text, model)
    # Array-like embeddings are turned into plain lists so the request body
    # can be JSON-encoded without relying on the client's serializer.
    if hasattr(query_vector, "tolist"):
        query_vector = query_vector.tolist()

    # The requested field must be part of the source filter, otherwise the
    # lookup below fails for anything other than 'name' or 'text'.
    source_fields = ['name', 'text']
    if field is not None and field not in source_fields:
        source_fields.append(field)

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": 
                """
                    cosineSimilarity(params.query_vector, 'text_vector') + 1.0
                """, 
                "params": {"query_vector": query_vector}
            }
        }
    }

    response = es.search(
        index=INDEX_NAME,
        body={
            "size": size,
            "query": script_query,
            "_source": {"includes": source_fields}  # source fields to appear in response
        }
    )

    if field is not None:
        return [hit['_source'][field] for hit in response['hits']['hits']]

    return response

In [None]:
import os

# Connection secrets are read from the environment so they are never stored
# in the notebook. Set ELASTIC_PASSWORD and ELASTIC_CLOUD_ID before running.
# NOTE(review): the password previously committed in this cell should be
# treated as compromised and rotated.
ELASTIC_PASSWORD = os.environ["ELASTIC_PASSWORD"]

CLOUD_ID = os.environ["ELASTIC_CLOUD_ID"]

es = Elasticsearch(
    cloud_id=CLOUD_ID,
    http_auth=("elastic", ELASTIC_PASSWORD)
)


# Sentence encoder used for both indexing and querying.
model_name = 'sentence-transformers/all-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [None]:
# Demo: `name` field of the 10 best-scoring documents in the 'bert' index for a natural-language question.
query(text='How to drop a column in pandas?', size=10, es=es, model=model, INDEX_NAME='bert', field='name')

['pandas.DataFrame.drop', 'pandas.Series.drop', 'pandas.DataFrame.dropna', 'pyarrow.Table.drop', 'pandas.DataFrame.droplevel', 'pandas.Index.dropna', 'pandas.Series.droplevel', 'pandas.Series.dropna', 'pyarrow.Table.remove_column', 'pandas.MultiIndex.dropna']


In [None]:
# Demo: print the `text` field of the single best-scoring document for the same question.
print(query(text='How to drop a column in pandas?', size=1, es=es, model=model, INDEX_NAME='bert', field='text')[0])

pandas.DataFrame.drop. function drop in module pandas.core.frame

ddrroopp(self, labels=None, axis: 'Axis' = 0, index=None, columns=None, level: 'Level | None' = None, inplace: 'bool' = False, errors: 'str' = 'raise')
    Drop specified labels from rows or columns.
    
    Remove rows or columns by specifying label names and corresponding
    axis, or by specifying directly index or column names. When using a
    multi-index, labels on different levels can be removed by specifying
    the level. See the `user guide <advanced.shown_levels>`
    for more information about the now unused levels.
    
    Parameters
    ----------
    labels : single label or list-like
        Index or column labels to drop. A tuple will be used as a single
        label and not treated as a list-like.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Whether to drop labels from the index (0 or 'index') or
        columns (1 or 'columns').
    index : single label or list-like
        Alter