In [None]:
# import rxiv_rest_api

In [None]:
# curl --cacert http_ca.crt -u elastic:worksfine https://es:9200

## Retrieve data from a JSON file and return its keys and values in JSON format

**First we need to ensure that the data is indexed in Elasticsearch. This process involves reading the JSON file, parsing its contents, and then indexing those contents in Elasticsearch. After the data is indexed, we can use the Elasticsearch client (es) to perform search operations.**

### Step 1: Index JSON Data into Elasticsearch
**Before we can search, the data must be indexed in Elasticsearch.** 
**Here's a simplified function to read the JSON file and index its contents.** 
**This example assumes that your JSON data is an array of objects, each representing a document to be indexed.**

In [25]:
import json
import os
import ssl
from typing import Optional

from elasticsearch import Elasticsearch

def index_json_data(
    es: Elasticsearch,
    file_path: str,
    index_name: str,
    id_field: Optional[str] = None,
) -> None:
    """
    Reads data from a JSON file and indexes it into Elasticsearch.

    The file may contain either a JSON array (one document per element) or a
    single JSON object (indexed as one document).

    Parameters:
    es (Elasticsearch): An Elasticsearch client instance.
    file_path (str): The path to the JSON file.
    index_name (str): The name of the Elasticsearch index where data will be stored.
    id_field (str, optional): Key whose value is used as the Elasticsearch
        document ID. When omitted, Elasticsearch generates an ID for every
        call, so indexing the same file twice stores every document twice.
        With an explicit ID, re-indexing overwrites the existing document.

    Raises:
    TypeError: If the top-level JSON value is neither an array nor an object.
    KeyError: If `id_field` is given and a document does not contain that key.
    """
    # Load JSON data from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    if isinstance(data, dict):
        # A single object: iterating it directly would yield its keys,
        # not documents, so wrap it in a list.
        data = [data]
    elif not isinstance(data, list):
        raise TypeError(
            f"Expected a JSON array or object in {file_path!r}, "
            f"got {type(data).__name__}"
        )

    # Index each document with one request per document
    for doc in data:
        if id_field is None:
            es.index(index=index_name, document=doc)
        else:
            # Document IDs are strings in Elasticsearch
            es.index(index=index_name, id=str(doc[id_field]), document=doc)

# Example usage
# Connection settings and paths can be overridden through environment
# variables; the fallbacks are the values of the local development setup.
ca_cert_path = os.environ.get(
    'ES_CA_CERT',
    '/workspace/repos/osl/rxiv-restapi/containers/esconfig/ca/certs/http_ca.crt',
)
ssl_context = ssl.create_default_context(cafile=ca_cert_path)
es = Elasticsearch(
    [os.environ.get('ES_URL', 'https://es:9200')],
    basic_auth=(
        os.environ.get('ES_USER', 'elastic'),
        # Set ES_PASSWORD outside the notebook for any non-local cluster.
        os.environ.get('ES_PASSWORD', 'worksfine'),
    ),
    ssl_context=ssl_context
)
file_path = os.environ.get(
    'MUNICIPALITIES_JSON',
    '/workspace/repos/osl/rxiv-restapi/docs/notebooks/data/municipalities.json',
)
index_name = 'geocodes_and_cities_by_uf'
# NOTE: documents are indexed without explicit IDs, so re-running this cell
# against an already populated index adds every document again.
index_json_data(es, file_path, index_name)


### Step 2: Create a Search Function

**After indexing the data, we can create a function to perform searches using the Elasticsearch client.**

In [51]:
def search_data(es: Elasticsearch, index_name: str, query: dict, size: int = 10) -> list:
    """
    Performs a search query in an Elasticsearch index and returns only the document data.

    Parameters:
    es (Elasticsearch): An Elasticsearch client instance.
    index_name (str): The name of the Elasticsearch index to search in.
    query (dict): The search query in Elasticsearch Query DSL format.
    size (int): Maximum number of hits to return. Defaults to 10, which is
        also Elasticsearch's own default, so existing calls behave as before.
        Raise it to retrieve more matches (e.g. every municipality of a state).

    Returns:
    list: A list of documents from the search results, each represented as a dictionary.
        An empty list is returned both when nothing matches and when the
        search fails; in the latter case the error is printed.
    """
    try:
        response = es.search(index=index_name, query=query, size=size)
        # Extract only the '_source' from each hit
        return [hit['_source'] for hit in response['hits']['hits']]
    except Exception as e:
        # Best-effort by design: report the failure and let callers
        # iterate over an empty result instead of crashing.
        print(f"Search failed: {e}")
        return []

## Example Usage


### 1. Simple Text Search on Name Field

**To search for municipalities by name:**

In [56]:
%%time

# `match` query against the `name` field.
query = {"match": {"name": "Curitiba"}}

# Run the query, then print every returned document on its own line.
results = search_data(es, index_name, query)
for municipality in results:
    print(municipality)

{'geocode': 4106902, 'name': 'Curitiba', 'uf': 'PR'}
CPU times: user 6.37 ms, sys: 215 µs, total: 6.59 ms
Wall time: 9.79 ms


### 2. Exact Match on Geocode

**To find a municipality by its exact geocode:**

In [50]:
%%time

# `term` query: exact-value lookup on the `geocode` field.
query = {"term": {"geocode": 3304557}}

# Run the query, then print every returned document on its own line.
results = search_data(es, index_name, query)
for municipality in results:
    print(municipality)

{'geocode': 3304557, 'name': 'Rio de Janeiro', 'uf': 'RJ'}
CPU times: user 0 ns, sys: 3.06 ms, total: 3.06 ms
Wall time: 6.08 ms


### 3. Search by State (UF)
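**To list municipalities in a specific state (Elasticsearch returns only the first 10 hits by default, so the output below is a sample rather than the full list):**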

In [54]:
%%time

# Assuming `search_data` is already defined as before and returns the list of documents
# `match` query against the `uf` (state) field; relies on the
# `search_data` helper defined in an earlier cell.
query = {"match": {"uf": "SC"}}

# Run the query, then print every returned document on its own line.
results = search_data(es, index_name, query)
for municipality in results:
    print(municipality)

{'geocode': 4214102, 'name': 'Presidente Nereu', 'uf': 'SC'}
{'geocode': 4214151, 'name': 'Princesa', 'uf': 'SC'}
{'geocode': 4214201, 'name': 'Quilombo', 'uf': 'SC'}
{'geocode': 4214300, 'name': 'Rancho Queimado', 'uf': 'SC'}
{'geocode': 4214409, 'name': 'Rio das Antas', 'uf': 'SC'}
{'geocode': 4214508, 'name': 'Rio do Campo', 'uf': 'SC'}
{'geocode': 4214607, 'name': 'Rio do Oeste', 'uf': 'SC'}
{'geocode': 4214805, 'name': 'Rio do Sul', 'uf': 'SC'}
{'geocode': 4214706, 'name': 'Rio dos Cedros', 'uf': 'SC'}
{'geocode': 4214904, 'name': 'Rio Fortuna', 'uf': 'SC'}
CPU times: user 6.6 ms, sys: 3.37 ms, total: 9.97 ms
Wall time: 15.3 ms


### 4. Combining Conditions
**To combine conditions — for instance, searching for a municipality by name within a specific state — we can use a `bool` query:**

In [55]:
%%time

# `bool` query: every clause under `must` has to match.
name_clause = {"match": {"name": "Florianópolis"}}
uf_clause = {"match": {"uf": "SC"}}
query = {"bool": {"must": [name_clause, uf_clause]}}

# Run the query, then print every returned document on its own line.
results = search_data(es, index_name, query)
for municipality in results:
    print(municipality)

{'geocode': 4205407, 'name': 'Florianópolis', 'uf': 'SC'}
CPU times: user 2.97 ms, sys: 4.02 ms, total: 6.99 ms
Wall time: 10.6 ms


---

### Notes

- The `search_data` function takes a `query` parameter: a dictionary formatted according to Elasticsearch's Query DSL (Domain Specific Language). This allows for flexible search queries.
- Elasticsearch queries can be simple (e.g., match, term) or complex (e.g., bool with must, should, must_not conditions).
- The exact structure of your query will depend on your specific search requirements and the data schema.
- Ensure your Elasticsearch mappings are appropriately set for the fields you are querying. For text searches, Elasticsearch should analyze the field as `text`. For exact matches with a `term` query (like `geocode`), the field should be a non-analyzed type such as `keyword` or a numeric type.
- Always test your queries directly in Kibana Dev Tools or another Elasticsearch interface to refine them before implementing in code.


#### The JSON file used in this notebook contains Brazilian census data from the Brazilian Institute of Geography and Statistics (IBGE). This data includes information about various municipalities in Brazil, such as their geocode, name, and corresponding state (UF).
*docs/notebooks/data/municipalities.json*

---