In [1]:
import numpy as np
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

In [2]:
client = Elasticsearch('localhost:9200')

# Create Index

In [11]:
mappings = {
    'properties': {
        'id': {'type': 'integer'},
        'label': {'type': 'text'},
        'group': {'type': 'integer'},
        'feature': {'type': 'dense_vector', 'dims': 128}
    }
}

In [12]:
if client.indices.exists(index='matching'):
    client.indices.delete(index='matching', ignore=[404])

In [13]:
client.indices.create(index='matching', body={'mappings': mappings})

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'matching'}

# Create Documents

In [15]:
n_data = 10000

In [16]:
features = np.random.uniform(size=(n_data, 128))

In [17]:
client.index('matching', body={'id': 0, 'label': 'foo', 'feature': list(np.random.uniform(size=(128, )))})

{'_index': 'matching',
 '_type': '_doc',
 '_id': 'ufGyAHQBaU0E8SR2SZc9',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [20]:
def generate_data(features):
    for i, f in enumerate(features):
        yield {
            '_index': 'matching',
            'id': i + 1,
            'feature': list(f),
            'group': i % 10,
            'label' : f'name={i:04d}'
        }

In [23]:
bulk(client, generate_data(features))

(10000, [])

# Index Documents

In [44]:
query_feature = np.random.uniform(size=(128,))

In [45]:
res = client.search(
    index='matching',
    body={
        'query': {
            'script_score': {
                'query': {
                    'match_all': {}
                },
                'script': {
                    'source': "cosineSimilarity(params.query_vec, doc['feature'])",
                    'params': {
                        'query_vec': query_feature.tolist()
                    }
                }
            }
        }
    }
)

In [46]:
top_object = res['hits']['hits'][0]['_source']

In [47]:
top_object

{'id': 9708,
 'feature': [0.6760291229599595,
  0.2966148815240065,
  0.33944093098883066,
  0.5644720934573715,
  0.3657261997900604,
  0.6305739591241626,
  0.8164859460292493,
  0.6964902409197422,
  0.18530467313462684,
  0.8433154239842243,
  0.4165636664090644,
  0.026351048177662317,
  0.9429093215887854,
  0.5352761032789485,
  0.13043730095431938,
  0.9196826431641023,
  0.9507341352345094,
  0.28326186253587016,
  0.21258749042521796,
  0.07897531521002654,
  0.8701521613502181,
  0.9763373839089314,
  0.05488923418754432,
  0.3446566899678125,
  0.06237134057417426,
  0.565305107476396,
  0.23725395455096943,
  0.975236636898188,
  0.3747267254206045,
  0.29445549775539714,
  0.44448025903355415,
  0.7837316900029357,
  0.4079884787222843,
  0.7885345703340448,
  0.7106741977881398,
  0.469352085422033,
  0.7031680864037874,
  0.31027291552448355,
  0.4891222434485505,
  0.9668464527170834,
  0.8781687150651835,
  0.6184862313165623,
  0.4468476730690627,
  0.753128938023998

In [39]:
res = client.search(
    index='matching',
    body={
        'query': {'match_all': {}},
        "_source": ["account_number", "balance"]
    }
)

In [43]:
res['hits']['hits'][0]

{'_index': 'matching',
 '_type': '_doc',
 '_id': 'ufGyAHQBaU0E8SR2SZc9',
 '_score': 1.0,
 '_source': {}}

In [4]:
x = {'test1': 1, 'test2': 2}

In [5]:
x.pop('test1')

1

In [6]:
x

{'test2': 2}