# Investigating conversion of Elastic ingest pipelines into Cribl worker mappers

## Install prerequisite packages

In [15]:
!pip install elasticsearch google-generativeai ipywidgets tqdm

[0m

## Create a connection to Elasticsearch

In [6]:
from elasticsearch import Elasticsearch
import os

# Password for the 'elastic' user generated by Elasticsearch
ELASTIC_PASSWORD = os.getenv('ELASTIC_PASSWORD')

# Create the client instance
client = Elasticsearch(
    "https://es01:9200",
    basic_auth=("elastic", ELASTIC_PASSWORD),
    ca_certs="/certs/ca/ca.crt"
)

# Successful response!
body = client.info().body
# {'name': 'instance-0000000000', 'cluster_name': ...}

body

{'name': 'es01',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'yqxqbj8fQ1qwpZVzo2LICQ',
 'version': {'number': '8.15.1',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': '253e8544a65ad44581194068936f2a5d57c2c051',
  'build_date': '2024-09-02T22:04:47.310170297Z',
  'build_snapshot': False,
  'lucene_version': '9.11.1',
  'minimum_wire_compatibility_version': '7.17.0',
  'minimum_index_compatibility_version': '7.0.0'},
 'tagline': 'You Know, for Search'}

## Retrieve and manipulate the ingest pipeline

In [8]:
PIPELINE_NAME = 'logs-crowdstrike.fdr-1.40.0'

pipeline = client.ingest.get_pipeline(id=PIPELINE_NAME).body

In [9]:
import json

processors = pipeline[PIPELINE_NAME]['processors']

processors_json = json.dumps(processors)

## Connect to Gemini API

In [48]:
system_instruction="""
You are an expert in Elasticsearch ingest pipelines. 

You will receive a JSON representation of an ingest pipeline fragment containing processors. 
Your task is to explain this pipeline step-by-step, describing each processor in the order they appear. 

For each processor:
- Identify the processor type.
- Explain its purpose. Describe how it modifies or interacts with the document being processed. If it is a script processor, explain what the code block is doing in details.
- Specify the link to it's documentation page on elastic.co documentation site. Provide link as text only.
- Mention any conditions applied to the processor.
- Specify the event tags, if the event has tags.

Provide a new paragraph when describing each processor without numbering the paragraph. 
Preserve the order of processor appearance in the pipeline fragment. 
Identify processors one at a time and don't group processors.

If you encounter any unfamiliar processors or need additional information, consult the official Elasticsearch documentation to ensure accuracy. 
Provide a clear, concise explanation that a developer familiar with Elasticsearch could understand.
"""

prompt = """
 Describe the processors in the following ingest pipeline fragment:
"""

In [49]:
import google.generativeai as genai
import os

genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [50]:
# Response JSON schema
import typing_extensions as typing


class Processor(typing.TypedDict):
    processor_type: str
    purpose: str
    documentation_link: str
    conditions: list[str]
    tags: str
    

In [54]:
import json
from tqdm.notebook import tqdm

ARRAY_LENGTH=5

def split_array(arr, X):
    return [arr[i:i + X] for i in range(0, len(arr), X)]

model_parameters = {
    "system_instruction": system_instruction
}

model = genai.GenerativeModel('gemini-1.5-flash-latest', **model_parameters)

generation_config=genai.types.GenerationConfig(
    response_mime_type="application/json",
    response_schema=list[Processor],
    temperature=0,
    
)

combined_response = []
processor_chunks =  split_array(processors, ARRAY_LENGTH)
progress_bar = tqdm(total=len(processor_chunks))

for i in range(len(processor_chunks)):
    response = model.generate_content(prompt + json.dumps(processor_chunks[i]), generation_config=generation_config)
    combined_response.extend(json.loads(response.text))
    progress_bar.update(1)

  0%|          | 0/42 [00:00<?, ?it/s]

ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.

In [None]:
response.text

In [None]:
from IPython.display import display, JSON

display(JSON(combined_response))

In [None]:
print(json.dumps(combined_response, indent=4))