参考　https://blog.imind.jp/entry/2019/03/08/185935

In [None]:
import elasticsearch

In [None]:
from elasticsearch import helpers  # bulk データ投入用

In [None]:
import io
import requests
import pandas as pd

In [None]:
URL = ""  # URL of the Google spreadsheet / Drive file (fill in before running)


def load_data_from_google_drive(URL, index_col=None):
    """Download a CSV file shared via Google Drive and load it as a DataFrame.

    Args:
        URL: Sharing URL of the file. The first ``open`` in the URL is
            rewritten to ``uc`` before the request is made.
        index_col: Forwarded to ``pandas.read_csv`` to choose the index column.

    Returns:
        pandas.DataFrame parsed from the downloaded CSV bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Rewrite only the first occurrence so that an "open" substring appearing
    # later in the URL (e.g. inside the file id) is left untouched.
    download_url = URL.replace("open", "uc", 1)
    r = requests.get(download_url)
    # Fail loudly instead of feeding an HTML error page to the CSV parser.
    r.raise_for_status()
    df = pd.read_csv(io.BytesIO(r.content), index_col=index_col)
    return df

In [None]:
# Load the table, using the first CSV column as the row index
df = load_data_from_google_drive(URL, index_col=0)

In [None]:
# Preview the first rows of the loaded table
df.head()

# ESに接続

In [None]:
# The scheme is written out explicitly: newer Elasticsearch Python clients
# reject host strings without one, and older clients accept both forms.
es = elasticsearch.Elasticsearch("http://elasticsearch:9200")
# es = elasticsearch.Elasticsearch("http://localhost:9200")  # use this one when connecting from the local machine (outside the container)

# インデックスの生成

`expression2`という名称でindexを作成  
下記のデータ形式を想定
```
{ gene_id: gene,
  tpm: {sample_1: 1,
        sample_2: 2,
  }
}
```

In [None]:
# Name of the Elasticsearch index used by the cells below
index_name = "expression2"

In [None]:
# No explicit schema (mapping) is defined for now.
# NOTE(review): the draft below declares a `sample_id` field and a scalar float `tpm`,
# whereas create_data() stores `tpm` as a dict of per-sample values — revise before enabling.
# mapping = {
#     "mappings" : {
#             "properties" : {
#                 "gene_id": {"type":"text"},
#                 "sample_id": {"type":"text"},
#                 "tpm": {"type":"float"}
#             }
#     }
# }

In [None]:
# Create the index (if it already exists, delete it first so it is recreated from scratch)
# es.indices.create(index=index_name, body=mapping)
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name)

In [None]:
# Confirm that the index exists
es.indices.exists(index=index_name)

# バルクデータの投入

In [None]:
def create_data(URL, index_name=index_name):
    """Generate one bulk-index action per gene from the CSV found at ``URL``.

    The CSV is loaded with its first column as the row index. Each row label
    is treated as a gene ID and the row's values become that gene's ``tpm``
    dict, keyed by column label.

    Args:
        URL: Location of the CSV, passed to ``load_data_from_google_drive``.
        index_name: Target index written into every action's ``_index``.

    Yields:
        dict with ``_index``, ``_id`` (the gene ID as a string,
        e.g. 'Mp1g00010') and ``_source`` (``gene_id`` plus ``tpm``).
    """
    table = load_data_from_google_drive(URL, index_col=0)
    # Transposing first makes to_dict() key the outer dict by row label (gene).
    tpm_by_gene = table.T.to_dict()
    for gene_id in tpm_by_gene:
        document = {"gene_id": gene_id, "tpm": tpm_by_gene[gene_id]}
        action = {
            "_index": index_name,
            "_id": f"{gene_id}",
            "_source": document,
        }
        yield action

19234件のデータ投入

In [None]:
%%time
# Send the generated actions to Elasticsearch with the bulk helper, 1000 actions per chunk
elasticsearch.helpers.bulk(
        es,
        actions=create_data(URL),  # a generator is passed here, but any iterable should do, so a list would presumably work as well
        chunk_size=1000)

__`gene_id`指定してデータ取得__

In [None]:
# Fetch one document by its _id (the gene ID). Keyword arguments are used because
# newer Elasticsearch clients no longer accept these parameters positionally.
ret = es.get(index=index_name, id="Mp2g00130")

In [None]:
# Show the document ID, the gene ID and the stored TPM values
print(f'ID: {ret["_id"]}, gene_id: {ret["_source"]["gene_id"]}\nTPM:\n{ret["_source"]["tpm"]}')

__searchでデータ取得__ (ID指定の方が速い)

In [None]:
%%time
query = {"query": 
           {
            "match": {"gene_id": "Mp1g00010"}
           }
        }
query_body = {"query": query}
result = es.search(index=index_name, body=query)
# get_all_docs(index=index_name, query=query, size=10000)

In [None]:
# Show the first hit returned by the search
print(result["hits"]["hits"][0])

# データ全件取得 (scrollを使用)

In [None]:
def get_all_docs(index, query=None, scroll="2m", size=10000, request_timeout=150):
    """Collect every hit for ``query`` in ``index`` by paging with the scroll API.

    Args:
        index: Name of the index to search.
        query: Query clause placed under the ``"query"`` key of the request
            body, i.e. without the outer ``{"query": ...}`` wrapper.
            Defaults to ``{"match_all": {}}``.
        scroll: Scroll keep-alive value sent with every request.
        size: Number of hits requested per page.
        request_timeout: Timeout forwarded to each search/scroll call.

    Returns:
        list of the raw hit dicts (the ``hits.hits`` entries of every page).
    """
    if query is None:
        query = {'match_all': {}}
    query_body = {'query': query}
    # Search the index the caller asked for, not the notebook-level index_name.
    data = es.search(index=index, body=query_body, scroll=scroll, size=size, request_timeout=request_timeout)
    s_id = data['_scroll_id']
    result = []
    try:
        hits = data['hits']['hits']
        # An empty page signals that the scroll is exhausted.
        while hits:
            result.extend(hits)
            data = es.scroll(scroll_id=s_id, scroll=scroll, request_timeout=request_timeout)
            s_id = data['_scroll_id']
            hits = data['hits']['hits']
    finally:
        # Release the server-side scroll context instead of letting it linger
        # until the keep-alive expires.
        es.clear_scroll(scroll_id=s_id)
    return result

In [None]:
%%time
# Retrieve all documents (no query given, so get_all_docs falls back to match_all)
result = get_all_docs(index=index_name)

In [None]:
# Number of documents retrieved
len(result)

__データを発現量テーブルの形に整形__

In [None]:
%time
tmp_result = [x["_source"] for x in result]
S = [pd.Series(s["tpm"], name=s["gene_id"]) for s in tmp_result]
df_result = pd.DataFrame(S)

In [None]:
# Preview the reshaped expression table
df_result.head()