In [None]:
!pip install dotenv

In [None]:
import requests
from dotenv import load_dotenv
import os

# Export the API key to an environment variable.
# The environment file is downloaded once and cached next to the notebook;
# later runs reuse the cached copy.
if not os.path.exists('.env.instruqt'):
    env_response = requests.get('http://kubernetes-vm:9000/env', timeout=30)
    # Stop here on an HTTP error instead of caching an error page as the env file.
    env_response.raise_for_status()
    env_text = env_response.text
    with open('.env.instruqt', 'w') as f:
        f.write(env_text)
load_dotenv('.env.instruqt')

openai_api_key = os.environ.get("LLM_APIKEY")
url = os.environ.get("LLM_PROXY_URL")

# Fail with a clear message rather than a TypeError on the os.environ
# assignment below, or a silently wrong "https://None" base URL.
missing = [name for name, value in (("LLM_APIKEY", openai_api_key), ("LLM_PROXY_URL", url)) if not value]
if missing:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing)
        + ". Check the contents of .env.instruqt."
    )

openai_api_base = f"https://{url}"

# The OpenAI client picks up its key and base URL from these two variables.
os.environ["OPENAI_API_KEY"] = openai_api_key
os.environ["OPENAI_BASE_URL"] = openai_api_base

In [None]:
# Elasticsearch connection settings; each is None when the variable is not set.
es_host = os.environ.get("ELASTICSEARCH_URL")
es_api_key = os.environ.get("ELASTICSEARCH_APIKEY")

In [1]:
from elasticsearch import Elasticsearch

In [4]:
#Function for pretty_printing JSON
def jsn(x):
    """Print ``x`` as JSON with 2-space indentation and keys sorted.

    ``x`` is first converted with ``dict(x)``, so it may be a mapping or any
    iterable of key/value pairs. Nothing is returned.
    """
    from json import dumps

    as_mapping = dict(x)
    rendered = dumps(as_mapping, sort_keys=True, indent=2)
    print(rendered)

In [5]:
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Create the Elasticsearch client, then print the cluster info to verify
# that the host and API key are accepted.
es = Elasticsearch(hosts=[f"{es_host}"], api_key=es_api_key)
cluster_info = es.info()
jsn(cluster_info)

{
  "cluster_name": "elasticsearch",
  "cluster_uuid": "0IM995ZESGyT53Q-GPUQRw",
  "name": "Marks-MacBook-Pro.local",
  "tagline": "You Know, for Search",
  "version": {
    "build_date": "2025-04-08T15:13:46.049795831Z",
    "build_flavor": "default",
    "build_hash": "112859b85d50de2a7e63f73c8fc70b99eea24291",
    "build_snapshot": false,
    "build_type": "tar",
    "lucene_version": "10.1.0",
    "minimum_index_compatibility_version": "8.0.0",
    "minimum_wire_compatibility_version": "8.18.0",
    "number": "9.0.0"
  }
}


# Run searches on Elasticsearch #

In [None]:
#function to run a simple query == match
def retrieve_documents(query, top_n=2):
    """Run a ``match`` query on the ``content`` field and print the top hits.

    Args:
        query: Text to match against the ``content`` field.
        top_n: Number of documents to print.

    Returns:
        None. The ``content`` of each hit is printed, with a blank line
        between documents.

    Note:
        A later cell in this notebook defines another function with this
        same name; once that cell runs, it replaces this one.
    """
    search_query = {
        # Request exactly top_n hits. Without an explicit size the search
        # returns only the default page of 10, so a larger top_n was capped.
        "size": top_n,
        "query": {
            "match": {
                "content": query
            }
        }
    }
    response = es.search(index="elastic_blogs-full-embeddings_e5", body=search_query)
    top_docs = [hit["_source"]["content"] for hit in response["hits"]["hits"][:top_n]]
    line_separated = "\n\n".join(top_docs)
    print(line_separated)

In [None]:
retrieve_documents("Kibana for data analytics",top_n=3)

That was a simple "match" search. We want to run a more sophisticated search on Elasticsearch so that we can pass highly relevant documents to the LLM (the retrieval step of RAG).

In the cells below, the variable `create_response` holds the result of calling a search template (called a search application in newer Elasticsearch versions).
The search application runs a hybrid search - lexical and semantic - with the two result sets combined using RRF (reciprocal rank fusion).

In [None]:
# Dry run: `render_query` returns the query the search application would
# execute, so we can inspect the hybrid search and confirm that the
# parameters were substituted.

app_name = "RAG_application"  # search application built in the Kibana Console
params1 = dict(query_string="My first query", size=2)

create_response = es.search_application.render_query(
    name=app_name,
    params=params1,
)

print("The render_query shows the search code is a bool and semantic search combined by RRF: \n")
jsn(create_response)

In [None]:
# Same search application, but this time execute the search on Elasticsearch
# instead of only rendering the query.

app_name = "RAG_application"
params1 = {
    "query_string": "My first query",
    "size": 3,
}  # template parameters as a dictionary of key: value pairs

create_response = es.search_application.search(params=params1, name=app_name)

print("Documents from running the query: ")
jsn(create_response)

In [None]:
# retrieve_documents runs a search template/application and returns the text of the hits
def retrieve_documents(query,  top_n=2, search_template="RAG_application"):
    """Query a search application and return the ``content`` of the top hits.

    Args:
        query: Passed to the search application as ``query_string``.
        top_n: Passed as ``size`` and also used to cap the number of hits kept.
        search_template: Name of the search application to call.

    Returns:
        The ``content`` fields of the kept hits joined by single newlines.
    """
    template_params = {"query_string": query, "size": top_n}
    result = es.search_application.search(name=search_template, params=template_params)
    kept_hits = result["hits"]["hits"][:top_n]
    return "\n".join([doc["_source"]["content"] for doc in kept_hits])

Aside:
Later, consider adding parameters to constrain the time range, e.g. `"search_date": {"start": "now-13y", "end": "now"}`, <br />
and `"from"` to page through results.

In [None]:
#unit test
query = "How can I secure my networks between elasticsearch nodes?"
retrieved_documents = retrieve_documents(query)
print("Retrieved Documents:", retrieved_documents)

# Interact with LLM

In [7]:
# LLM is from OpenAI 
from openai import OpenAI

In [8]:
#Start with a simple, one-pass interaction with the LLM. The function call2llm takes a systems_prompt, which is the
#persona the system assumes in the interaction, and "users_prompt" which is the input from the user chatting with the LLM

def call2llm(systems_prompt, users_prompt):
    """Send one system + user message pair to the LLM and return its reply.

    Args:
        systems_prompt: Content of the ``system`` message (the persona).
        users_prompt: Content of the ``user`` message.

    Returns:
        The message content of the first choice in the completion.
    """
    # No Python variable named OPENAI_API_KEY exists in this notebook; the setup
    # cell only exports it to the environment. OpenAI() reads OPENAI_API_KEY and
    # OPENAI_BASE_URL from the environment on its own.
    client = OpenAI()
    response = client.chat.completions.create(
        messages=[
            {"role": "system", "content": systems_prompt},
            {"role": "user", "content": users_prompt}
        ],
        model="gpt-4.1",
        temperature=0.000001  # low means consistent LLM responses (high means more creative)
    )
    return response.choices[0].message.content

In [9]:
#test
llm_answer = call2llm("You're a helpful assistant", "What is 2+2?")
print(llm_answer)

2 + 2 = 4


In [10]:
llm_answer2 = call2llm("You're a helpful assistant", "What did we just sum?")
print(llm_answer2)

It looks like you’re referring to a previous sum or calculation, but I don’t have any prior context in this conversation. Could you clarify what you’re referring to or provide more details? I’m here to help!


`call2llm` has no memory of what happened previously: each call sends only the current system and user prompts.

#### Implement instead as a python class, which will help in adding conversational memory.  

In [11]:
class ChatWithLlm:
    """Chat session that keeps conversational memory by resending the history.

    ``history`` is the list of chat messages sent on every call. It starts with
    the system message and grows by one user and one assistant message per
    successful call.
    """

    def __init__(self,systems_prompt="assistant",model="gpt-4.1"):
        """Start a conversation.

        Args:
            systems_prompt: Content of the initial ``system`` message.
            model: Model name used for every completion in this conversation.
        """
        self.systems_prompt = systems_prompt
        self.model = model
        self.history = [{"role":"system",
                         "content":systems_prompt}]          #history helps us "keep memory" of what happened before

    def call2llm(self, users_prompt, temperature=0.00001):   #low temperature means consistent LLM responses (high means more creative)
        """Send ``users_prompt`` together with the stored history and return the reply.

        Args:
            users_prompt: Content of the new ``user`` message.
            temperature: Sampling temperature passed to the completion call.

        Returns:
            The assistant's reply as a string.
        """
        # No Python variable named OPENAI_API_KEY exists in this notebook; OpenAI()
        # reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment on its own.
        client = OpenAI()
        user_turn = {"role": "user", "content": users_prompt}   #user role prompts the LLM
        response = client.chat.completions.create(
            messages=self.history + [user_turn],
            model=self.model,
            temperature=temperature,
        )
        response_llm = str(response.choices[0].message.content)
        # Record the exchange only after the call succeeded, so a failed request
        # does not leave an unanswered user message in the history.
        self.history.extend([user_turn, {"role": "assistant", "content": response_llm}])
        return response_llm

# This class uses "old style" conversational memory technique.

In [12]:
#test with an instance of the ChatWithLlm class
chat = ChatWithLlm("You're a helpful assistant")
llm_answer =  chat.call2llm("What is 2 + 2?")
print(llm_answer)

2 + 2 = 4


In [13]:
llm_answer =  chat.call2llm("What did I just ask you?")
print(llm_answer)

You just asked, "What is 2 + 2?"


In [14]:
llm_answer =  chat.call2llm("How did you remember what was asked?")
print(llm_answer)

I remembered what you asked because, as an AI assistant, I have access to the ongoing conversation context within this chat session. This allows me to reference previous messages and provide relevant, coherent responses. I don't have memory beyond this session or access to information from other conversations, but within our current chat, I can "see" and respond to your earlier questions.


## RAG solution

In [15]:
import json
from openai import OpenAI

`Elastic_rag` both queries Elasticsearch and feeds the retrieved docs to the LLM in a prompt.

In [16]:
class Elastic_rag:
    """Retrieval-augmented chat: fetch documents from Elasticsearch and send them to the LLM.

    Conversation memory is kept by storing the id of the latest response and
    passing it as ``previous_response_id`` on the next request.
    """

    def __init__(self, systems_prompt="You are a helpful assistant."):
        """Start a conversation.

        Args:
            systems_prompt: Instructions sent to the model with every request.
        """
        # Previously this argument was accepted but never stored or sent.
        self.systems_prompt = systems_prompt
        self.previous_response_id = None                                  #no previous response_id the first time through

    #retrieve documents from Elasticsearch
    def retrieve(self, query,  top_n=2, search_template="RAG_application"):
        """Query a search application and return the ``content`` of the top hits.

        Args:
            query: Passed to the search application as ``query_string``.
            top_n: Passed as ``size`` and also used to cap the number of hits kept.
            search_template: Name of the search application to call.

        Returns:
            The ``content`` fields of the kept hits joined by single newlines.
        """
        params = {"query_string": query, "size": top_n}
        response = es.search_application.search(name=search_template, params=params)
        top_docs = [hit["_source"]["content"] for hit in response["hits"]["hits"][:top_n]]
        return "\n".join(top_docs)

    #combine user's query, conversation history, and docs from Elasticsearch to send to LLM
    def augment (self, query, temperature=0.00001, model="gpt-4.1"):
        """Retrieve documents for ``query``, send both to the LLM, and return its answer.

        Args:
            query: The user's question.
            temperature: Sampling temperature passed to the model.
            model: Model name used for this request.

        Returns:
            The ``output_text`` of the response.
        """
        # No Python variable named OPENAI_API_KEY exists in this notebook; OpenAI()
        # reads OPENAI_API_KEY and OPENAI_BASE_URL from the environment on its own.
        client = OpenAI()
        # Use this instance's retrieve() rather than building a throwaway
        # Elastic_rag(), so subclass overrides and instance state are honored.
        retrieved = self.retrieve(query)
        prompt = ( "This is the query: "  +  query +  " Here are supporting document. Do not summarize the documents. " + retrieved)
        response = client.responses.create(
            # Passed on every call; NOTE(review): per the OpenAI docs, instructions
            # are not carried over via previous_response_id - confirm.
            instructions=self.systems_prompt,
            input=[{"role": "user", "content": prompt}],
            model=model,
            temperature=temperature,
            previous_response_id=self.previous_response_id              #set the previous_id to the current; for memory of conversation
        )
        self.previous_response_id=response.id                           #update conversation memory with current response.id
        return response.output_text

# This class uses "newer style" conversational memory technique - response_id

In [17]:
if __name__ == "__main__":
    conversation = Elastic_rag()   # an instance of a conversation

    # Adding responses
    print(conversation.augment("What is Kibana good for?"))

**What is Kibana good for?**

Based on the provided documents, Kibana is particularly good for:

- **Data Visualization:** Kibana allows users to create a wide variety of charts and graphs from data stored in Elasticsearch. This makes it possible to visually explore and analyze large datasets, such as identifying trends, spikes, or anomalies (e.g., a spike in car models or taxi cabs with over 1 million miles).

- **Making Complex Data Understandable:** Even for non-technical users, Kibana provides tools to break down and visualize complex event data, making it easier to draw meaningful conclusions from large and varied datasets.

- **Aggregations and Analysis:** With Kibana, users can perform aggregations on their data, such as calculating averages, totals, or breaking down data into buckets (e.g., sales data by day of the week). This helps in understanding not just individual data points, but also broader patterns and metrics.

- **Interactive Exploration:** Kibana supports interactiv

In [18]:
print(conversation.augment("Can I run Kibana in a Docker container?"))

**Can I run Kibana in a Docker container?**

Yes, you can run Kibana in a Docker container.

The supporting documents describe how Elastic provides Docker images and Docker Compose examples that allow you to deploy the full Elastic Stack—including Kibana, Elasticsearch, and Beats—using Docker containers. The process is designed to be as simple as possible, even allowing new users to deploy the entire stack with a single command using Docker Compose. The architecture includes containers for Elasticsearch, Kibana, and various Beats modules, and can be customized or extended as needed.

To deploy the stack (including Kibana) using Docker Compose, you would:

1. Download and extract the provided archive, which contains Docker Compose files and configuration for each Elastic Stack component.
2. Ensure Docker is installed on your system.
3. Run the appropriate Docker Compose command for your operating system, such as:
   ```
   docker-compose -f docker-compose-osx.yml up
   ```
   or
   ```


In [19]:
print(conversation.augment("What was the first question that I asked you?"))

The first question that you asked me was:  
**"What is Kibana good for?"**


In [20]:
print(conversation.augment("How do you know what I asked you prior?"))

I know what you asked me prior because I keep track of the sequence of interactions and questions in our conversation. Each time you ask a question, it is recorded as part of the ongoing context of our exchange. This allows me to reference previous questions and answers, maintain continuity, and provide relevant responses based on the history of our conversation. This process is similar to how structured log data is parsed and analyzed—by keeping track of the sequence and structure, I can refer back to earlier points and maintain context throughout our interaction.


Here is a conversation where a user fills in an input box.

In [21]:
# Interactive session: print usage guidance once, then answer questions with a
# single Elastic_rag conversation until the user types "exit".
print ("Please ask your questions about the Elastic Stack.  Be sure to share the background or context of your question. "
    " For example, mention what you want to achieve with Elasticsearch. (e.g., searching, indexing, analytics). "
    "Instead of general inquiries, ask specific questions. For instance, 'How do I set up an Elasticsearch index?' "
    "or 'What are the best practices for querying in Elasticsearch?' If applicable, provide examples of what "
    # The trailing space below was missing, which printed "situation.Indicate".
    "you're working on or the challenges you're facing. This helps me tailor my responses to your situation. "
    "Indicate how detailed you want the response to be. Are you looking for a brief overview or an in-depth "
    "explanation? Let me know if my previous responses were helpful or if there are areas where you need more "
    "clarity.")

instance=Elastic_rag()

while True:
    print ("\n\nPlease ask your questions about the Elastic Stack.  Type exit to stop.")
    # Strip surrounding whitespace so that e.g. " exit" still ends the session.
    user_input = input().strip()

    if user_input.lower() == 'exit':
        print("Conversation ended.")
        break

    # Nothing to search for or to ask the LLM; prompt again.
    if not user_input:
        continue

    print("\nResponse: \n")
    llm_answer=instance.augment(user_input)
    print(llm_answer)

Please ask your questions about the Elastic Stack.  Be sure to share the background or context of your question.  For example, mention what you want to achieve with Elasticsearch. (e.g., searching, indexing, analytics). Instead of general inquiries, ask specific questions. For instance, 'How do I set up an Elasticsearch index?' or 'What are the best practices for querying in Elasticsearch?' If applicable, provide examples of what you're working on or the challenges you're facing. This helps me tailor my responses to your situation.Indicate how detailed you want the response to be. Are you looking for a brief overview or an in-depth explanation? Let me know if my previous responses were helpful or if there are areas where you need more clarity.


Please ask your questions about the Elastic Stack.  Type exit to stop.


 exit


Conversation ended.
