In [1]:
from elasticsearch import Elasticsearch
from tqdm import tqdm_notebook as tqdm

In [2]:
# Client for the Elasticsearch node reachable under the hostname 'elasticsearch'
# (presumably a docker-compose service name -- TODO confirm).
es = Elasticsearch(hosts=[dict(host='elasticsearch')])

In [3]:
# Field definitions for the index: three scalar/text fields plus a
# 128-dimensional dense vector used for similarity scoring.
mappings = dict(
    properties=dict(
        id=dict(type='integer'),
        label=dict(type='text'),
        group=dict(type='integer'),
        feature=dict(type='dense_vector', dims=128),
    )
)

In [None]:
# Start from a clean slate: drop the 'matching' index if it is already there.
matching_exists = es.indices.exists(index='matching')
if matching_exists:
    es.indices.delete(index='matching')

In [None]:
# Create the 'matching' index using the field mappings defined above.
index_body = {'mappings': mappings}
es.indices.create(index='matching', body=index_body)

### データの作成

データ定義にしたがってデータを登録します。  
`feature` は真面目に作っても良いのですが、少し手間がかかるので numpy で生成したランダムなベクトルで代用します。

In [None]:
import numpy as np

In [None]:
# Number of random documents to generate and index.
n_data = 100_000

In [None]:
# Seed NumPy's global RNG so that Restart & Run All regenerates the same
# vectors (and therefore the same search results) every time.
np.random.seed(0)
# One 128-dimensional vector per document, sampled uniformly from [0, 1).
features = np.random.uniform(size=(n_data, 128))

一件だけ登録

このとき、特徴量 (numpy 配列) は list に変換して Python のオブジェクトにしておく必要があります。(おそらく serializer をカスタマイズすれば毎回 list に変換しなくても済むはずなので、あとで調べる)

In [None]:
# Index a single document. `.tolist()` converts the vector to native Python
# floats (the same conversion the search cells apply to the query vector),
# so the payload does not depend on how the serializer treats NumPy scalars.
es.index(index='matching', body={ 'id': 0, 'label': 'foo', 'feature': np.random.uniform(size=(128,)).tolist() })

まとめてデータを登録するときは `elasticsearch.helpers.bulk` を使うと良さそうです。

https://elasticsearch-py.readthedocs.io/en/master/helpers.html#bulk-helpers

iterator を渡すと各要素をドキュメントとして登録してくれるみたい

```python
def gendata():
    mywords = ['foo', 'bar', 'baz']
    for word in mywords:
        yield {
            "_index": "mywords",
            "_type": "document",
            "doc": {"word": word},
        }

bulk(es, gendata())
```

In [None]:
def generate_data(features, index='matching'):
    """Yield one bulk-index action per feature vector, with a progress bar.

    Parameters
    ----------
    features : sized sequence of array-like
        Feature vectors to index; ``len(features)`` is used as the progress
        bar total.
    index : str, optional
        Name of the target index written to ``'_index'``. Defaults to
        ``'matching'``.

    Yields
    ------
    dict
        An action with the keys ``'_index'``, ``'id'`` (1-based position),
        ``'feature'`` (list of native Python numbers), ``'group'``
        (0-based position modulo 10) and ``'label'`` (``'name='`` followed by
        the zero-padded 0-based position).
    """
    for i, f in tqdm(enumerate(features), total=len(features)):
        yield {
            '_index': index,
            'id': i + 1,
            # .tolist() produces native Python floats for any NumPy dtype;
            # list(f) would keep NumPy scalars, which e.g. for float32 are
            # not plain `float` instances.
            'feature': np.asarray(f).tolist(),
            'group': i % 10,
            'label': f'name={i:04d}'
        }

In [None]:
from elasticsearch.helpers import bulk

In [None]:
# Index all generated documents in bulk.
bulk_result = bulk(es, generate_data(features))
# Newly indexed documents only become searchable after an index refresh;
# refresh explicitly so the search cells below see every document even when
# the notebook is executed with Run All.
es.indices.refresh(index='matching')
# Show the value returned by the bulk helper as the cell output.
bulk_result

In [None]:
# Random 128-dimensional query vector, sampled uniformly from [0, 1).
query_feature = np.random.uniform(low=0.0, high=1.0, size=128)

In [None]:
# Display the query vector.
query_feature

In [None]:
%%time
res = es.search(index='matching', body={
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vec, doc['feature'])",
        "params": {
          "query_vec": query_feature.tolist()
        }
      }
    }
  }
})

In [None]:
%%time
res = es.search(index='matching', body={
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vec, doc['feature'])",
        "params": {
          "query_vec": query_feature.tolist()
        }
      }
    }
  }
})

In [None]:
# Stored document ('_source') of the first hit in the search response.
top_hit = res['hits']['hits'][0]
top_object = top_hit['_source']

In [None]:
import pandas as pd

In [None]:
# Tabulate the returned hits, one row per hit.
top_df = pd.DataFrame(data=res['hits']['hits'])

In [None]:
# Display the table of returned hits.
top_df

In [None]:
# Show the id and label of the first hit as a tuple.
tuple(top_object[field] for field in ('id', 'label'))

答え合わせ

In [None]:
from scipy.spatial.distance import cosine

In [None]:
%%time
cos_sim = [1 - cosine(x, query_feature) for x in features]
idx = np.argmax(cos_sim)
cos_sim[idx]

In [None]:
from joblib import Parallel, delayed

In [None]:
%%time
Parallel(n_jobs=-1)([delayed(cosine)(x, query_feature) for x in features])