In [1]:
from elasticsearch import Elasticsearch
from tqdm import tqdm_notebook as tqdm

In [2]:
# Client for the Elasticsearch node addressed by the host name "elasticsearch".
es = Elasticsearch(hosts=[dict(host='elasticsearch')])

In [3]:
# Field definitions used when the "matching" index is created:
# scalar metadata (id, label, group) plus a 128-dimensional dense vector.
mappings = dict(
    properties=dict(
        id=dict(type='integer'),
        label=dict(type='text'),
        group=dict(type='integer'),
        feature=dict(type='dense_vector', dims=128),
    ),
)

In [4]:
# Start from a clean slate: drop the "matching" index if an earlier run left one behind.
stale_index_exists = es.indices.exists(index='matching')
if stale_index_exists:
    es.indices.delete(index='matching')

In [5]:
# Create the "matching" index with the field definitions held in `mappings`.
es.indices.create(
    index='matching',
    body=dict(mappings=mappings),
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'matching'}

### データの作成

データ定義にしたがってデータを登録します。  
`feature` を真面目に作っても良いのですが少しめんどいので numpy のランダムなベクトルで代用

In [6]:
import numpy as np

In [7]:
# Number of random feature vectors to generate and index.
n_data = 100_000

In [8]:
# One row per document: 128 values drawn uniformly from [0, 1).
features = np.random.uniform(low=0.0, high=1.0, size=(n_data, 128))

一件だけ登録

この時特徴量を list にして python object に変換する必要あり。(たぶん serializer をカスタムすればいちいち list にしなくても済むはずなのであとで調べる)

In [9]:
# Index a single sample document (id 0) into "matching".
# `.tolist()` converts the NumPy vector into a list of native Python floats;
# plain `list(...)` would keep NumPy scalar objects as the elements.
es.index(index='matching', body={
    'id': 0,
    'label': 'foo',
    'feature': np.random.uniform(size=(128,)).tolist(),
})

{'_index': 'matching',
 '_type': '_doc',
 '_id': 'qp_fkW0BPBfgrcvw5zEL',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

まとめてデータを作るときは `elasticsearch.helpers.bulk` をつかうとよさ気

https://elasticsearch-py.readthedocs.io/en/master/helpers.html#bulk-helpers

iterator を渡すと各要素をドキュメントとして登録してくれるみたい

```python
def gendata():
    mywords = ['foo', 'bar', 'baz']
    for word in mywords:
        yield {
            "_index": "mywords",
            "_type": "document",
            "doc": {"word": word},
        }

bulk(es, gendata())
```

In [10]:
def generate_data(features):
    """Yield one bulk action per feature vector, showing a tqdm progress bar.

    Parameters
    ----------
    features : sized iterable of 1-D vectors
        Must support ``len()`` (used as the progress-bar total). Each element
        may be a NumPy array or any plain iterable of numbers.

    Yields
    ------
    dict
        An action targeting the ``matching`` index with the document fields
        ``id``, ``feature``, ``group`` and ``label``.
    """
    for i, f in tqdm(enumerate(features), total=len(features)):
        yield {
            '_index': 'matching',
            # 1-based id; id 0 is used by the sample document indexed on its own
            # elsewhere in this notebook (presumably the reason for the offset).
            'id': i + 1,
            # Native Python floats for NumPy rows; fall back to list() for
            # inputs that have no .tolist().
            'feature': f.tolist() if hasattr(f, 'tolist') else list(f),
            'group': i % 10,
            # NOTE(review): the label is built from the 0-based position, so it
            # is one less than 'id' (id=4634 -> 'name=4633') - confirm intended.
            'label': f'name={i:04d}'
        }

In [11]:
from elasticsearch.helpers import bulk

In [12]:
# Stream every generated action to Elasticsearch through the bulk helper;
# the helper's return value is shown as the cell output.
bulk(client=es, actions=generate_data(features))

HBox(children=(IntProgress(value=0, max=100000), HTML(value='')))




(100000, [])

In [13]:
# Random 128-dimensional query vector drawn uniformly from [0, 1).
query_feature = np.random.uniform(low=0.0, high=1.0, size=128)

In [14]:
# Display the query vector (the cell's last expression is rendered as its output).
query_feature

array([0.81802499, 0.12571864, 0.27289382, 0.6986188 , 0.29066465,
       0.11050923, 0.8501511 , 0.51986054, 0.61685199, 0.83095645,
       0.75356709, 0.51062607, 0.26170096, 0.88077694, 0.70502994,
       0.98665565, 0.06174718, 0.02887512, 0.38458062, 0.77097599,
       0.75536054, 0.21685613, 0.60844107, 0.78031337, 0.95986356,
       0.97189258, 0.62521225, 0.84465395, 0.45734556, 0.85585512,
       0.23036291, 0.35699732, 0.68213987, 0.29794184, 0.23592959,
       0.93629309, 0.25856664, 0.21658585, 0.26004643, 0.64287907,
       0.69259503, 0.98923838, 0.4644076 , 0.12685969, 0.65329357,
       0.21355726, 0.7435358 , 0.48377971, 0.46569942, 0.5708957 ,
       0.58220555, 0.55059325, 0.12480346, 0.89269312, 0.18475965,
       0.93917319, 0.19996934, 0.08402065, 0.55516532, 0.72775439,
       0.65667649, 0.38978366, 0.63088646, 0.49233538, 0.60895785,
       0.68365275, 0.95476001, 0.09304078, 0.73913519, 0.73272903,
       0.61453181, 0.57471983, 0.90273075, 0.09222085, 0.35184

In [28]:
%%timeit
res = es.search(index='matching', body={
  "query": {
    "script_score": {
      "query": {
        "match_all": {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vec, doc['feature'])",
        "params": {
          "query_vec": query_feature.tolist()
        }
      }
    }
  }
})

43.7 ms ± 661 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [17]:
# Stored document (`_source`) of the first hit in the search response.
search_hits = res['hits']['hits']
top_object = search_hits[0]['_source']

In [18]:
import pandas as pd

In [19]:
# Tabulate the returned hits: one row per hit, one column per top-level key.
top_df = pd.DataFrame(data=res['hits']['hits'])

In [20]:
# Display the table of hits (the cell's last expression is rendered as its output).
top_df

Unnamed: 0,_index,_type,_id,_score,_source
0,matching,_doc,xJ_fkW0BPBfgrcvw8ENd,0.838863,"{'id': 4634, 'feature': [0.40823221917686825, ..."
1,matching,_doc,Up_fkW0BPBfgrcvw_npy,0.832677,"{'id': 18600, 'feature': [0.88734911344409, 0...."
2,matching,_doc,IqDgkW0BPBfgrcvwKT-F,0.832089,"{'id': 68984, 'feature': [0.6641028393123313, ..."
3,matching,_doc,wp_gkW0BPBfgrcvwBJbn,0.829431,"{'id': 25880, 'feature': [0.9886763523358413, ..."
4,matching,_doc,6KDgkW0BPBfgrcvwOoww,0.828593,"{'id': 88894, 'feature': [0.5354224545349886, ..."
5,matching,_doc,P6DgkW0BPBfgrcvwOYrK,0.828202,"{'id': 88213, 'feature': [0.5285779289050021, ..."
6,matching,_doc,dKDgkW0BPBfgrcvwKDhC,0.828133,"{'id': 67274, 'feature': [0.5625189876179224, ..."
7,matching,_doc,VqDgkW0BPBfgrcvwHQMW,0.827646,"{'id': 53676, 'feature': [0.951265664307382, 0..."
8,matching,_doc,uZ_gkW0BPBfgrcvwEtAu,0.827254,"{'id': 40719, 'feature': [0.3307656211310287, ..."
9,matching,_doc,FKDgkW0BPBfgrcvwM2_0,0.826668,"{'id': 81258, 'feature': [0.929954851122463, 0..."


In [21]:
# Show the id and label of the best-scoring document as a tuple.
(top_object['id'], top_object['label'])

(4634, 'name=4633')

答え合わせ

In [22]:
from scipy.spatial.distance import cosine

In [29]:
%%timeit
# Baseline for comparison: cosine similarity of the query against every row of
# `features`, computed one row at a time in a Python-level loop.
# scipy's `cosine` returns a distance, hence `1 - ...` to get a similarity.
cos_sim = [1 - cosine(x, query_feature) for x in features]
# Position (0-based row of `features`) of the most similar vector.
idx = np.argmax(cos_sim)
# NOTE: inside a %%timeit cell this value is neither displayed nor kept.
cos_sim[idx]

3.27 s ± 17.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
from joblib import Parallel, delayed

In [31]:
%%timeit
# Same per-row computation dispatched through joblib with n_jobs=-1.
# Note this collects cosine *distances* (no `1 - ...`), and the result is discarded.
# In the recorded run this was slower than the serial loop (5.2 s vs 3.27 s).
_ = Parallel(n_jobs=-1)([delayed(cosine)(x, query_feature) for x in features])

5.2 s ± 108 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
