<div class="alert alert-block alert-success">
    <b><center>Elasticsearch 7.7</center></b>
    <b><center>데이터 모델링</center></b>
</div>

In [15]:
import json, time
import numpy as np

from elasticsearch import Elasticsearch, helpers
import tensorflow as tf
import tensorflow_hub as hub

# TF 모델 테스트

In [2]:
# Load the Universal Sentence Encoder (version 4) module from TensorFlow Hub.
# The loaded object is called directly on a list of strings by embed_text.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4" 
model = hub.load(module_url)

In [3]:
def embed_text(input):
    """Embed a batch of texts with the module-level ``model``.

    Args:
        input: Value handed straight to ``model``; the callers in this
            notebook pass a list of strings.

    Returns:
        A list with one entry per item produced by the model, each
        converted with its ``.numpy()`` method.
    """
    # NOTE(review): the parameter name shadows the built-in ``input``; it is
    # kept because it is part of the function's signature.
    embeddings = []
    for tensor in model(input):
        embeddings.append(tensor.numpy())
    return embeddings

In [4]:
# Three sample inputs of increasing length: a word, a sentence, a paragraph.
messages = [
    "Elephant",
    "I am a sentence for which I would like to get its embedding.",
    (
        "Universal Sentence Encoder embeddings also support short paragraphs. "
        "There is no hard limit on how long the paragraph is. Roughly, the longer "
        "the more 'diluted' the embedding will be."
    ),
]
# Keep the individual names available alongside the list.
word, sentence, paragraph = messages

In [5]:
# Embed the sample messages, then print each message with the length of its
# vector and the first three components.
message_embeddings = embed_text(messages)

embedding_rows = np.array(message_embeddings).tolist()
for index in range(len(embedding_rows)):
    components = embedding_rows[index]
    leading_values = [str(value) for value in components[:3]]
    print("Message: {}".format(messages[index]))
    print("Embedding size: {}".format(len(components)))
    print("Embedding: [{}, ...]\n".format(", ".join(leading_values)))

Message: Elephant
Embedding size: 512
Embedding: [0.008344489149749279, 0.0004808177181985229, 0.06595245748758316, ...]

Message: I am a sentence for which I would like to get its embedding.
Embedding size: 512
Embedding: [0.0508086159825325, -0.016524311155080795, 0.015737785026431084, ...]

Message: Universal Sentence Encoder embeddings also support short paragraphs. There is no hard limit on how long the paragraph is. Roughly, the longer the more 'diluted' the embedding will be.
Embedding size: 512
Embedding: [-0.028332680463790894, -0.0558621883392334, -0.012941470369696617, ...]



# Elasticsearch Indices 생성

In [6]:
# Name of the Elasticsearch index that this notebook recreates, fills and searches.
INDEX_NAME = "posts"
# File whose contents are sent as the body of the index-creation request.
INDEX_FILE = "posts_index.json"
# File read line by line; each line is parsed as one JSON document.
DATA_FILE = "posts_data.json"

# Number of documents accumulated before each bulk-indexing call.
BATCH_SIZE = 50
# Passed as "size" in the search request.
SEARCH_SIZE = 5
# NOTE(review): not referenced anywhere in the visible notebook — confirm whether it is still needed.
GPU_LIMIT = 0.5

In [7]:
# Shared client used by the index-management, bulk-indexing and search cells.
# NOTE(review): host and port are hard-coded here — confirm they match the target cluster.
es = Elasticsearch(hosts="elastic.rsnet", port=80)

In [8]:
# Drop any existing index so it is always recreated from INDEX_FILE.
# The index name is passed by keyword because newer elasticsearch-py
# releases no longer accept it positionally.
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

# Read the index definition and close the file before calling the cluster.
with open(INDEX_FILE, encoding="utf-8") as index_file:
    source = index_file.read().strip()
es.indices.create(index=INDEX_NAME, body=source)

## Stack Overflow 데이터 입력

In [9]:
docs = []
count = 0


def index_batch(docs):
    """Embed the titles of ``docs`` and bulk-index them into ``INDEX_NAME``.

    Each document is copied before the bulk metadata (``_op_type``,
    ``_index``) and the ``title_vector`` field are added, so the
    dictionaries passed in by the caller are left untouched.

    Args:
        docs: List of dicts, each with a ``"title"`` key.

    Raises:
        KeyError: If a document has no ``"title"`` key.
    """
    if not docs:
        # Nothing to embed or send.
        return

    titles = [doc["title"] for doc in docs]
    title_vectors = embed_text(titles)

    requests = []
    for i, doc in enumerate(docs):
        # Shallow copy: the added keys must not leak into the caller's dict.
        request = dict(doc)
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME
        request["title_vector"] = title_vectors[i]
        requests.append(request)
    helpers.bulk(es, requests)

In [10]:
# Start from a clean slate so that re-running this cell neither re-sends
# leftover documents nor continues a count from a previous run.
docs = []
count = 0

with open(DATA_FILE, encoding="utf-8") as data_file:
    for line in data_file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue

        doc = json.loads(line)
        if doc["type"] != "question":
            # Only questions are indexed; every other record is skipped.
            continue

        docs.append(doc)
        count += 1

        if count % BATCH_SIZE == 0:
            index_batch(docs)
            docs = []
            print("Indexed {} documents.".format(count))

    # Flush the final, partially filled batch.
    if docs:
        index_batch(docs)
        print("Indexed {} documents.".format(count))

Indexed 50 documents.
Indexed 100 documents.
Indexed 150 documents.
Indexed 200 documents.
Indexed 250 documents.
Indexed 300 documents.
Indexed 350 documents.
Indexed 400 documents.
Indexed 450 documents.
Indexed 500 documents.
Indexed 550 documents.
Indexed 600 documents.
Indexed 650 documents.
Indexed 700 documents.
Indexed 750 documents.
Indexed 800 documents.
Indexed 850 documents.
Indexed 900 documents.
Indexed 950 documents.
Indexed 1000 documents.
Indexed 1050 documents.
Indexed 1100 documents.
Indexed 1150 documents.
Indexed 1200 documents.
Indexed 1250 documents.
Indexed 1300 documents.
Indexed 1350 documents.
Indexed 1400 documents.
Indexed 1450 documents.
Indexed 1500 documents.
Indexed 1550 documents.
Indexed 1600 documents.
Indexed 1650 documents.
Indexed 1700 documents.
Indexed 1750 documents.
Indexed 1800 documents.
Indexed 1850 documents.
Indexed 1900 documents.
Indexed 1950 documents.
Indexed 2000 documents.
Indexed 2050 documents.
Indexed 2100 documents.
Indexed 2150

# Title Vector로 검색

In [12]:
def run_query_loop():
    """Call ``handle_query`` repeatedly until a ``KeyboardInterrupt`` occurs.

    The interrupt is absorbed and the function returns ``None``; any other
    exception raised by ``handle_query`` propagates to the caller.
    """
    try:
        while True:
            handle_query()
    except KeyboardInterrupt:
        pass

In [13]:
def handle_query():
    """Prompt for a query, rank documents by title similarity and print the hits.

    The query text is embedded with ``embed_text``. Every document in
    ``INDEX_NAME`` (``match_all``) is then scored with
    ``cosineSimilarity(...) + 1.0`` against its ``title_vector`` field, and
    up to ``SEARCH_SIZE`` hits are printed together with the time spent on
    embedding and on the search request.
    """
    query = input("Enter query: ")

    # perf_counter is a monotonic, high-resolution clock, which makes it the
    # appropriate choice for timing short intervals.
    embedding_start = time.perf_counter()
    embedded_query = embed_text([query])[0]
    embedding_time = time.perf_counter() - embedding_start

    # Send plain Python floats so the request body does not depend on the
    # client's serializer knowing how to encode array objects.
    query_vector = np.asarray(embedded_query).tolist()

    script_query = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # Cosine similarity lies in [-1, 1]; adding 1.0 keeps the
                # resulting score non-negative.
                "source": "cosineSimilarity(params.query_vector, doc['title_vector']) + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }

    search_start = time.perf_counter()
    response = es.search(
        index=INDEX_NAME,
        body={
            "size": SEARCH_SIZE,
            "query": script_query,
            # Only these two fields are returned for each hit.
            "_source": {"includes": ["title", "body"]}
        }
    )
    search_time = time.perf_counter() - search_start

    print()
    print("{} total hits.".format(response["hits"]["total"]["value"]))
    print("embedding time: {:.2f} ms".format(embedding_time * 1000))
    print("search time: {:.2f} ms".format(search_time * 1000))
    for hit in response["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"])
        print()

In [None]:
# Start the interactive prompt; it keeps asking for queries until a
# KeyboardInterrupt (e.g. interrupting the kernel) ends the loop.
run_query_loop()