# Initialise Elasticsearch Client

In [68]:
from elasticsearch import Elasticsearch
from datetime import date, datetime, timedelta
from IPython.display import display, Markdown
import pandas as pd
import json
import os

# --- CONFIG ---
# Connection settings are taken from environment variables when they are set, so
# real credentials never have to be saved inside the notebook. The fallbacks are
# the placeholder values this notebook originally shipped with.
ES_HOST = os.environ.get("ES_HOST", "https://192.168.1.101:9200")
ES_USERNAME = os.environ.get("ES_USERNAME", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "changeme")
ES_INDEX = os.environ.get("ES_INDEX", "gkg")

# Certificate verification stays off by default (self-signed certs, not recommended
# for prod). Set ES_VERIFY_CERTS=true to turn it on without editing the notebook.
ES_VERIFY_CERTS = os.environ.get("ES_VERIFY_CERTS", "false").strip().lower() in {"1", "true", "yes"}

# Create the client with authentication
es = Elasticsearch(
    ES_HOST,
    basic_auth=(ES_USERNAME, ES_PASSWORD),
    verify_certs=ES_VERIFY_CERTS,
)

  _transport = transport_class(


# Run Elasticsearch Query

This query selects the documents published from `start_date` to `end_date` (inclusive) and groups them by day. Within each day, it finds every unique entity name in the `V21AllNames.Name.keyword` field and counts the documents in which that entity appears. For each day, it returns the top `max_entities` most frequent entities together with their counts.

In [69]:
# --- INPUT ---
start_date = "2025-04-01"
end_date = "2025-04-02"  # inclusive: every document published on this day is counted
entity_field = "V21AllNames.Name.keyword"
date_field = "V2ExtrasXML.PubTimestamp"
max_entities = 10000  # upper limit on entities returned per daily bucket


# --- Query ---
# end_date on its own is treated as 00:00:00 of that day, so the exclusive upper
# bound is pushed to the following day to keep all of end_date in range.
end_date_es = (date.fromisoformat(end_date) + timedelta(days=1)).isoformat()

# gte = greater than or equal to, lt = less than
date_window = {"range": {date_field: {"gte": start_date, "lt": end_date_es}}}

# One bucket per calendar day; days without documents are still returned.
daily_histogram = {"field": date_field, "calendar_interval": "1d", "min_doc_count": 0}

# Most frequent entity names inside each daily bucket.
top_entities_agg = {"terms": {"field": entity_field, "size": max_entities}}

query = {
    "size": 0,  # only the aggregations are needed, not the matching documents
    "query": date_window,
    "aggs": {
        "entities_over_time": {
            "date_histogram": daily_histogram,
            "aggs": {"top_entities": top_entities_agg},
        }
    },
}

# --- Execute ---
response = es.search(index=ES_INDEX, body=query)


# Pretty-print the raw search response so its structure can be inspected.
# `json` is already imported in the first cell of the notebook.

# The ObjectApiResponse is turned into a plain dict first so json can serialise it.
response_dict = dict(response)

# Two-space indentation keeps the nested aggregation buckets readable.
pretty_json_string = json.dumps(response_dict, indent=2)

print(pretty_json_string)




{
  "took": 2003,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": {
      "value": 10000,
      "relation": "gte"
    },
    "max_score": null,
    "hits": []
  },
  "aggregations": {
    "entities_over_time": {
      "buckets": [
        {
          "key_as_string": "2025-04-01T00:00:00.000Z",
          "key": 1743465600000,
          "doc_count": 8710,
          "top_entities": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 22725,
            "buckets": [
              {
                "key": "United States",
                "doc_count": 527
              },
              {
                "key": "President Donald Trump",
                "doc_count": 521
              },
              {
                "key": "United Kingdom",
                "doc_count": 452
              },
              {
                "key": "Donald Trump",
                "doc_count": 

# Visualise Buckets as Dataframe

In [70]:
# Flatten the nested aggregation response into one row per (date, entity) pair.
buckets = response['aggregations']['entities_over_time']['buckets']

data = []
for bucket in buckets:
    # The bucket key is epoch milliseconds at UTC midnight (see "key_as_string").
    # Convert it in UTC: a local-time conversion would label the bucket with the
    # previous day on any machine whose timezone is behind UTC.
    date_value = pd.to_datetime(bucket['key'], unit="ms", utc=True).strftime("%Y-%m-%d")
    for entity in bucket.get("top_entities", {}).get("buckets", []):
        row = {
            "date": date_value,
            "top_entity": entity["key"],
            "count": entity["doc_count"]
        }
        data.append(row)

# Create the DataFrame. Naming the columns explicitly keeps the schema stable even
# when the query matched nothing, so later cells do not fail on a missing column.
df = pd.DataFrame(data, columns=["date", "top_entity", "count"])
df

Unnamed: 0,date,top_entity,count
0,2025-04-01,United States,527
1,2025-04-01,President Donald Trump,521
2,2025-04-01,United Kingdom,452
3,2025-04-01,Donald Trump,304
4,2025-04-01,New York,292
...,...,...,...
19995,2025-04-02,Gun Owners,3
19996,2025-04-02,Gunnhild Johnsen Hjetland,3
19997,2025-04-02,Gurbanguly Berdimuhamedov,3
19998,2025-04-02,Gurmeet Chahal,3


# Pivot the Dataframe

In [71]:
# Reshape to wide form: one row per date, one column per entity, each cell holding
# that entity's count. Entities that are absent on a date are filled with 0.
pivot_df = pd.pivot_table(
    df,
    index="date",
    columns="top_entity",
    values="count",
    fill_value=0,
)
pivot_df

top_entity,A'Marion Peterson,A-Rated Debt,A-Z Animals,A-list Filipino,Aadhaar Card,Aakash Doshi,Aakash Institute,Aakash Shrivastava,Aam Aadmi Party,Aamir Khan,...,Zoning Commission,Zonta Art Contest,Zonta Club,Zoo Miami,Zoological Society,Zoran Radovanovic,Zubair Ahmad Wani,Zubeda Hamid,Zuberi The,Zum Zwecke
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-04-01,0.0,1.0,1.0,1.0,0.0,0.0,1.0,3.0,1.0,1.0,...,0.0,2.0,2.0,2.0,0.0,0.0,0.0,3.0,2.0,2.0
2025-04-02,4.0,0.0,0.0,0.0,3.0,5.0,0.0,0.0,0.0,5.0,...,5.0,0.0,0.0,0.0,4.0,4.0,4.0,0.0,0.0,0.0


# Apply Mann-Kendall Test