In [None]:
# run-but-hidden

# Put the parent directory (presumably the project root) at the front of
# the list of paths where Python searches for modules.
# That way elastipy is found even if it's not installed via setup.py
import sys
sys.path.insert(0, "..")

In [None]:
# run-but-hidden
# we create some data to display in the quickref
# CAREFUL! This destroys an index called "world" in your elasticsearch!

import random
import datetime
from elastipy import Exporter

class WorldExporter(Exporter):
    # Exporter for the throw-away "world" index that the aggregation
    # examples of this quick reference search in.
    # name of the elasticsearch index
    INDEX_NAME = "world"
    # field types of the exported documents
    MAPPINGS = {
        "properties": {
            "timestamp": {"type": "date"},
            "occasion": {"type": "keyword"},
            "excuse": {"type": "keyword"},
            "conversation_length": {"type": "float"},
        }
    }
    
def iter_documents(count=200):
    """
    Generate ``count`` reproducible fake "dinner" documents.

    All documents except the last three pick their excuse at random from
    the common excuses, the last three alternate between the two rare ones.
    Timestamps lie between the 1st and the 21st of January 2000.
    The random generator has a fixed seed, so every run yields the same
    sequence of documents.
    """
    common_excuses = ("tastes awful", "too salty", "smells repellent")
    rare_excuses = ("my mouth is too dry", "i can't reach the spoon")
    rng = random.Random(98374934)

    for index in range(count):
        # the order of the three rng calls determines the generated
        # values, so it has to stay: day offset, excuse, length
        first_day = datetime.datetime(2000, 1, 1)
        day_offset = rng.randint(0, 20)
        if index < count - 3:
            excuse = rng.choice(common_excuses)
        else:
            excuse = rare_excuses[(index - count) % 2]
        length = rng.randint(5, 200)

        yield {
            "timestamp": first_day + datetime.timedelta(days=day_offset),
            "occasion": "dinner",
            "excuse": excuse,
            "conversation_length": length,
        }
        
# (re-)create the "world" index from scratch:
# drop the old index first, then export the generated documents
exporter = WorldExporter()
exporter.delete_index()
count, errors = exporter.export_list(iter_documents(), refresh=True)
# fail loudly and show what went wrong (a bare assert gives no hint
# and is skipped completely when python runs with -O)
if errors:
    raise RuntimeError(f"export to index 'world' failed: {errors}")

In [None]:
# run-but-hidden
# we create some data to display in the quickref
# CAREFUL! This destroys an index called "prog-world" in your elasticsearch!

import random
import datetime
from elastipy import Exporter

class ProgWorldExporter(Exporter):
    # Exporter for the throw-away "prog-world" index that the query
    # example of this quick reference searches in.
    # name of the elasticsearch index
    INDEX_NAME = "prog-world"
    # field types of the exported documents, all of them are keywords
    MAPPINGS = {
        "properties": {
            "category": {"type": "keyword"},
            "usage": {"type": "keyword"},
            "topic": {"type": "keyword"},
            "country": {"type": "keyword"},
            "language": {"type": "keyword"},
        }
    }
    
def iter_documents(count=200):
    """
    Generate ``count`` reproducible fake "programming" documents.

    Category and usage are the same for every document; topic, country
    and language are picked at random. The random generator has a fixed
    seed, so every run yields the same sequence of documents.
    """
    topics = ("yet-another-api", "yet-another-operator-overload")
    countries = ("ES", "US", "IT")
    languages = ("Python", "C++", "PHP")
    rng = random.Random(343984)

    for _ in range(count):
        # the order of the rng calls determines the generated values,
        # so it has to stay: topic, country, language
        topic = rng.choice(topics)
        country = rng.choice(countries)
        language = rng.choice(languages)

        yield {
            "category": "programming",
            "usage": "widely-used",
            "topic": topic,
            "country": country,
            "language": language,
        }
        
# (re-)create the "prog-world" index from scratch:
# drop the old index first, then export the generated documents
exporter = ProgWorldExporter()
exporter.delete_index()
count, errors = exporter.export_list(iter_documents(), refresh=True)
# fail loudly and show what went wrong (a bare assert gives no hint
# and is skipped completely when python runs with -O)
if errors:
    raise RuntimeError(f"export to index 'prog-world' failed: {errors}")

### configuration 

By default an [elasticsearch](https://www.elastic.co/guide/en/elasticsearch/reference/current/elasticsearch-intro.html) host is expected at `localhost:9200`. There are currently two ways 
to specify a different connection.

In [None]:
from elasticsearch import Elasticsearch
from elastipy import Search

# Use an explicit Elasticsearch client (or compatible class),
# here with the default host and port plus some credentials
client = Elasticsearch(
    hosts=[{"host": "localhost", "port": 9200}], 
    http_auth=("user", "pwd")
)

# create a Search using the specified client
s = Search(index="bla", client=client)

# the client can also be set later
# (note that the result of .client() is assigned back to s)
s = s.client(client)

Check the Elasticsearch [API reference](https://elasticsearch-py.readthedocs.io/en/v7.10.1/api.html#elasticsearch) for all the parameters.

We can also set a default client at the program start:  

In [None]:
from elastipy import connections

# register the existing client under the alias "default"
connections.set("default", client)

# .. or pass the parameters, they get converted to an Elasticsearch client
connections.set("default", {"hosts": [{"host": "localhost", "port": 9200}]})

# get the client that is registered for an alias
connections.get("default")

Different connections can be specified with the *alias* name:

In [None]:
# register another connection under the alias "special"
connections.set("special", {"hosts": [{"host": "special", "port": 1234}]})

# the alias name is passed instead of a client instance
s = Search(client="special")
s.get_client()

### aggregations

More details can be found in the [tutorial](https://elastipy.readthedocs.io/en/latest/tutorial.html).

In [None]:
# get a search object
s = Search(index="world")

# create an Aggregation object connected to the Search
agg = s.agg_date_histogram(calendar_interval="1w")
# (for date-specific aggregations we can leave out the 'field' parameter,
#  it falls back to Search.timestamp_field which is "timestamp" by default)

# submit the whole request
s.execute()

# access the response through the aggregation object

list(agg.keys())

In [None]:
# the values of the same aggregation, collected into a list
list(agg.values())

Without a [metric](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics.html) these numbers are the document counts.

The above example as a one-liner:

In [None]:
# create the search, attach the aggregation, submit the request
# and convert the result - all in one chained expression
Search(index="world").agg_date_histogram(calendar_interval="1w").execute().to_dict()

### nested aggregations and metrics

In [None]:
s = Search(index="world")

# the first parameter is the name of the aggregation
#   (if omitted it will be "a0", "a1", and so on)
agg = s \
    .agg_terms("occasion", field="occasion") \
    .agg_rare_terms("rare-excuses", field="excuse", max_doc_count=2) \
    .metric_avg("avg-length", field="conversation_length") \
    .metric_max("max-length", field="conversation_length") \
    .execute()

The `rare_terms` aggregation is nested into the `terms` aggregation and 
the metrics are siblings nested inside `rare_terms`.

`keys()`, `values()`, `items()` and `to_dict()` all operate on the current aggregation.
For bucket aggregations they typically show the `doc_count` value.

In [None]:
# shows the current aggregation only
agg.to_dict()

The `rows()`, `dict_rows()` and `dump.table()` methods operate on the whole aggregation branch:

In [None]:
# the rows of the whole aggregation branch, collected into a list
list(agg.dict_rows())

In [None]:
# dump the whole aggregation branch as a table (colors switched off)
agg.dump.table(colors=False)

### queries

In [None]:
from elastipy import query

s = Search(index="prog-world")

# chaining means AND
s = s \
    .term(field="category", value="programming") \
    .term("usage", "widely-used")

# operators can be used as well (here `&` and `|`)
s = s & (
    query.Term("topic", "yet-another-api") 
    | query.Term("topic", "yet-another-operator-overload")
)

# .query() replaces the current query
# (so the terms from above are dropped again)
s = s.query(query.MatchAll())

# a terms aggregation on "language" chained onto one on "country",
# followed by submitting the request
languages_per_country = s.agg_terms(field="country").agg_terms(field="language").execute()

languages_per_country.to_dict()

### exporting

There is a small helper to export stuff to elasticsearch.

In [None]:
# run-but-hidden
# 1000 minimal example objects for the exporter:
# the text field is empty and the group is the same for all of them,
# only the id differs
a_lot_of_objects = [
    dict(some_field="", id=number, group="group")
    for number in range(1000)
]

In [None]:
from elastipy import Exporter

class MyExporter(Exporter):
    # name of the elasticsearch index
    INDEX_NAME = "my-index"
    
    # the mapping can be defined here,
    # it will be sent to elasticsearch before the first document is exported
    MAPPINGS = {
        "properties": {
            "some_field": {"type": "text"},
        }       
    }   

# export all objects, we get back the number of exported objects
# and the errors that occurred
count, errors = MyExporter().export_list(a_lot_of_objects)

print(f"exported {count} objects, errors: {errors}")

It uses bulk requests and is very fast, supports document transformation and
control over id and sub-index of documents.

In [None]:
import datetime

class MyExporter(Exporter):
    # the "*" gets replaced by the group of each document,
    # see get_document_index() below
    INDEX_NAME = "my-index-*"
    MAPPINGS = {
        "properties": {
            "some_field": {"type": "text"},
            "group": {"type": "keyword"},
            "id": {"type": "keyword"},
            "timestamp": {"type": "date"},
        }       
    }   

    # if each document has a unique id value we can use it
    # as the elasticsearch id as well. That way we do not
    # create documents twice when exporting them again.
    # Their data just gets updated.
    def get_document_id(self, es_data):
        return es_data["id"]
    
    # we can bucket documents into separate indices,
    # here we get one index per value of the "group" field
    def get_document_index(self, es_data):
        return self.index_name().replace("*", es_data["group"])
    
    # here we can adjust or add some data before it gets exported.
    # it's also possible to split the data into several documents
    #   by yielding or returning a list
    def transform_document(self, data):
        # work on a copy so the object that was passed in stays untouched
        data = data.copy()
        # NOTE(review): datetime.now() is a naive local time - confirm
        #   how the timezone is handled on export
        data["timestamp"] = datetime.datetime.now()
        return data

# export the same objects again, this time through the customized exporter
MyExporter().export_list(a_lot_of_objects)

If we are tired enough we can call:

In [None]:
# delete whatever matches the INDEX_NAME of the exporter
MyExporter().delete_index()

This will actually delete all sub-indices because there's this wildcard `*` in the `INDEX_NAME`.


In [None]:
# run-but-hidden

# finally, the notebook is rendered and we remove the temporary
# example indices "world" and "prog-world" again

WorldExporter().delete_index()
# presumably the assignment to _ keeps the return value of the last
# expression from showing up as cell output
_ = ProgWorldExporter().delete_index()