In [11]:
!pip install langchain langchain-google-genai faiss-cpu gitpython langchain-text-splitters langchain-huggingface sentence-transformers

Collecting langchain-huggingface
  Downloading langchain_huggingface-1.0.1-py3-none-any.whl.metadata (2.1 kB)
INFO: pip is looking at multiple versions of langchain-huggingface to determine which version is compatible with other requirements. This could take a while.
  Downloading langchain_huggingface-1.0.0-py3-none-any.whl.metadata (2.1 kB)
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


In [15]:
import os
from collections import defaultdict
from google.colab import userdata
import json

# Read the Gemini API key from Colab's user secrets and expose it through the
# process environment (presumably where langchain_google_genai looks it up —
# confirm), so the key never has to be hard-coded in the notebook.
os.environ["GOOGLE_API_KEY"] = userdata.get('GOOGLE_API_KEY')

from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel, Field

# RAG imports
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

In [16]:
# Define the pydantic model for our structured output.
# NOTE(review): this class is defined a second time, field for field, at the top
# of the next cell; once both cells have run, the later definition is the one in
# effect. The Field descriptions are runtime strings (presumably surfaced to the
# LLM through the parser's format instructions — confirm), so treat them as code.
class LogAnalysis(BaseModel):
    # Severity label for the log line.
    severity: str = Field(description="The log's severity. (e.g., 'INFO', 'WARNING', 'ERROR', 'CRITICAL')")
    # Short human-readable description of what the log line reports.
    summary: str = Field(description="A brief, human-readable summary of the log entry.")
    # Key identifiers mentioned in the log line.
    entities: list[str] = Field(description="A list of key entities, such as IPs, user IDs, or error codes.")
    # Single recommended next step for an administrator.
    suggested_action: str = Field(description="A one-sentence suggested action for an administrator.")

# Chat model used for the analysis.
# A fast model is sufficient because the task is simple extraction.
# NOTE(review): the next cell constructs `llm` again with identical arguments.
llm = ChatGoogleGenerativeAI(
    model="gemini-flash-latest",
    temperature=0,
)

In [17]:
# 1. Define the Pydantic model describing the structured analysis.
# This model is handed to JsonOutputParser below. The Field descriptions are
# runtime strings (presumably surfaced to the LLM through the parser's format
# instructions — confirm), so treat them as code rather than documentation.
class LogAnalysis(BaseModel):
    # Severity label for the log line.
    severity: str = Field(description="The log's severity. (e.g., 'INFO', 'WARNING', 'ERROR', 'CRITICAL')")
    # Short human-readable description of what the log line reports.
    summary: str = Field(description="A brief, human-readable summary of the log entry.")
    # Key identifiers mentioned in the log line.
    entities: list[str] = Field(description="A list of key entities, such as IPs, user IDs, or error codes.")
    # Single recommended next step for an administrator.
    suggested_action: str = Field(description="A one-sentence suggested action for an administrator.")

# 2. Chat model shared by the chains in this notebook
llm = ChatGoogleGenerativeAI(
    model="gemini-flash-latest",
    temperature=0,
)

# 3. JSON parser built around the LogAnalysis model, plus the instructions it
#    generates for the model's reply format
parser = JsonOutputParser(pydantic_object=LogAnalysis)
format_instructions = parser.get_format_instructions()

# 4. Two-message prompt: a system role description and the user's log entry
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are an expert system administrator and log analyst. Your job is to analyze log entries and return a structured JSON analysis. {format_instructions}",
        ),
        ("user", "{log_entry}"),
    ]
)

# 5. Pre-fill the format instructions so callers only have to pass `log_entry`
prompt_with_instructions = prompt.partial(format_instructions=format_instructions)

# 6. Analyzer pipeline: prompt -> model -> parser
analyzer_chain = prompt_with_instructions | llm | parser

# 7. Path of the JSON file used by load_memory / save_memory
MEMORY_FILE = "log_memory.json"

print("Analyzer chain and memory file path are ready.")

Analyzer chain and memory file path are ready.


In [22]:
# 1. Create your sample runbook/SOP documents.
# Each entry is a free-form, markdown-flavoured snippet keyed by an error code.
# The entries are not uniform: the 503 entry has no "Solution" line and the 401
# entry has no "Action" line. The text (including its indentation) is embedded
# verbatim, so edits here change what the retriever indexes.
runbook_texts = [
    """
    **Error Code:** (1146) Table Not Found
    **Symptom:** Log entry shows "Table '...' doesn't exist."
    **Cause:** This is a database schema error, usually after a bad migration.
    **Solution:** The database schema is out of sync with the application.
    **Action:** Immediately run the database migration script: `python /scripts/run_migrations.py`
    """,
    """
    **Error Code:** (503) Service Unavailable
    **Symptom:** Kubernetes ingress reports 503 errors.
    **Cause:** The backend pods are crashing or not responding to health checks.
    **Action:** Check pod status with `kubectl get pods`. If in a CrashLoop, check logs with `kubectl logs <pod_name>`. You may need to roll back the latest deployment.
    """,
    """
    **Error Code:** (401) Unauthorized
    **Symptom:** User sees 'Unauthorized' or 'Invalid API Key'.
    **Cause:** The provided API key is missing, expired, or incorrect.
    **Solution:** Ask the user to verify their API key. Check the API Gateway logs to see if the key is being received correctly.
    """
]

# 2. Wrap every runbook text in a LangChain Document
runbook_docs = [
    Document(page_content=runbook_text)
    for runbook_text in runbook_texts
]

# 3. Embedding model: a HuggingFace sentence-embedding model
#    (not Google embeddings; only the chat model `llm` comes from Gemini)
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# 4. Split the documents into chunks (chunk_size=500, no overlap)
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(runbook_docs)

# 5. Build the FAISS vector store from the chunks
runbook_vector_store = FAISS.from_documents(splits, embeddings)

# 6. RAG chain: retrieve runbook chunks, stuff them into the prompt, ask `llm`
rag_prompt = ChatPromptTemplate.from_template("""
Answer the user's question based only on the provided context:
<context>
{context}
</context>

Question: {input}
""")

document_chain = create_stuff_documents_chain(llm, rag_prompt)
rag_chain = create_retrieval_chain(
    runbook_vector_store.as_retriever(),
    document_chain,
)

print("Specialist RAG Runbook Agent (rag_chain) is ready.")

Specialist RAG Runbook Agent (rag_chain) is ready.


In [25]:
def load_memory():
    """Loads the entity memory from the JSON file at ``MEMORY_FILE``.

    Returns:
        defaultdict: maps an entity key to ``{'count': int, 'history': list}``.
        Looking up an unknown entity creates an entry with a count of 0 and an
        empty history, so callers can increment without checking first.

    A missing file yields an empty memory. A file that is not valid JSON (for
    example one left empty by an interrupted write) or whose top-level value is
    not a JSON object is reported with a printed warning and also yields an
    empty memory instead of raising.
    """
    def _new_entry():
        # Fresh record for an entity that has not been seen before.
        return {'count': 0, 'history': []}

    try:
        with open(MEMORY_FILE, 'r') as f:
            stored = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been persisted yet.
        return defaultdict(_new_entry)
    except json.JSONDecodeError as e:
        print(f"Memory file '{MEMORY_FILE}' is not valid JSON ({e}); starting with empty memory.")
        return defaultdict(_new_entry)

    if not isinstance(stored, dict):
        # defaultdict(factory, <non-mapping>) would raise, so reject it up front.
        print(f"Memory file '{MEMORY_FILE}' does not contain a JSON object; starting with empty memory.")
        return defaultdict(_new_entry)

    return defaultdict(_new_entry, stored)

def save_memory(memory_data):
    """Saves the entity memory to the JSON file at ``MEMORY_FILE``.

    Args:
        memory_data: JSON-serializable mapping of entity -> record.

    Raises:
        TypeError / ValueError: if ``memory_data`` cannot be serialized. In that
        case the existing memory file is left untouched.

    The data is serialized in full before any file is opened, then written to a
    sibling ``.tmp`` file and moved into place with ``os.replace``. Opening
    ``MEMORY_FILE`` directly in 'w' mode would truncate it first, so a failed
    serialization or an interrupted write would destroy the previous memory.
    """
    serialized = json.dumps(memory_data, indent=4)

    tmp_path = f"{MEMORY_FILE}.tmp"
    with open(tmp_path, 'w') as f:
        f.write(serialized)

    # Swap the finished file into place in a single step.
    os.replace(tmp_path, MEMORY_FILE)

In [23]:
# --- Stateful analysis entry point: memory -> optional runbook RAG -> augmented LLM pass ---

def analyze_log_statefully(log_entry):
    """
    Analyzes a log entry using a stateful, multi-agent process
    that includes a RAG-based runbook.

    Steps:
      1. Load the persisted entity memory.
      2. Run ``analyzer_chain`` on the raw log entry to get severity, summary
         and entities.
      3. Bump the count/history of the first entity (if any) in memory.
      4. If the log is CRITICAL and that entity was just seen for the first
         time, ask ``rag_chain`` for a runbook solution.
      5. Run ``analyzer_chain`` again on a prompt that combines the historical
         context, the runbook suggestion and the original log entry.
      6. Persist the memory and return the second analysis.

    Args:
        log_entry: The raw log line to analyze.

    Returns:
        The parsed result of the second ``analyzer_chain`` call, or ``None`` if
        either analyzer call raised. Memory is only saved on success.
    """

    # --- 1. Load Memory ---
    memory = load_memory()

    # --- 2. Call 1: Initial Analysis ---
    try:
        initial_analysis = analyzer_chain.invoke({"log_entry": log_entry})
        # The model's JSON may carry nulls or unexpected types, so normalise
        # each field instead of trusting it.
        entities = initial_analysis.get('entities') or []
        if not isinstance(entities, (list, tuple)):
            # A bare value would otherwise be indexed character by character.
            entities = [entities]
        severity = str(initial_analysis.get('severity') or 'INFO')
        summary = initial_analysis.get('summary') or ''
    except Exception as e:
        print(f"Error during initial analysis: {e}")
        return None

    # --- 3. Python Logic: Update Memory ---
    context_string = ""
    most_relevant_entity = ""
    is_first_sighting = False
    if entities:
        # Just use the first one for simplicity. Keys are stored as strings
        # because JSON object keys are always strings; a non-string key would
        # not match itself after the memory is saved and reloaded.
        most_relevant_entity = str(entities[0])

        # Update the count and history for this entity
        entity_record = memory[most_relevant_entity]
        entity_record['count'] += 1
        entity_record['history'].append(summary)
        is_first_sighting = entity_record['count'] == 1

        # Create a context summary for the *next* prompt
        context_string = (
            f"Historical Context: Entity '{most_relevant_entity}' "
            f"has been seen {entity_record['count']} time(s)."
        )

    # --- 4. RAG-lite Triage ---
    runbook_solution = "None" # Default

    # Only consult the runbook for a critical error whose entity was just seen
    # for the first time. The memory is deliberately NOT indexed here when no
    # entity was found: doing so would create (and later persist) an empty-key
    # entry in the defaultdict.
    if severity.upper() == "CRITICAL" and is_first_sighting:
        print("[Triage Router]: CRITICAL new error! Querying RAG Runbook...")
        try:
            rag_input = f"What is the solution for this log summary: {summary}"
            rag_response = rag_chain.invoke({"input": rag_input})
            runbook_solution = rag_response['answer']
        except Exception as e:
            # Best effort: a failed lookup must not abort the analysis.
            print(f"RAG Runbook failed: {e}")
            runbook_solution = "RAG search failed."

    # --- 5. Call 2: Augmented Analysis ---
    augmented_prompt = f"""
    {context_string}
    Runbook Suggestion: {runbook_solution}

    Log Entry: {log_entry}
    """

    print("--- [Stateful Agent] ---")
    print(f"Augmented Prompt: {augmented_prompt}")

    try:
        # We ask the agent to synthesize everything into a final action
        stateful_analysis = analyzer_chain.invoke({"log_entry": augmented_prompt})
    except Exception as e:
        print(f"Error during stateful analysis: {e}")
        return None

    # --- 6. Save Memory & Return ---
    save_memory(memory)

    return stateful_analysis

In [26]:
# --- Test Cell 1 (CRITICAL Error) ---
# Run this cell ONLY ONCE: analyze_log_statefully persists entity counts to
# MEMORY_FILE, and the runbook lookup only fires the first time an entity is
# seen, so a second run takes the "already seen" path instead.

critical_log = "[2025-11-15 17:01:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22"

print("--- ANALYZING NEW LOG ---")
print(f"Input: {critical_log}\n")

final_response = analyze_log_statefully(critical_log)

if final_response:
    # Read keys defensively: the reply is whatever JSON the model produced, so
    # a missing field should not abort the demo with a KeyError.
    print("\n--- FINAL STATEFUL RESPONSE ---")
    print(f"Severity: {final_response.get('severity', 'N/A')}")
    print(f"Summary: {final_response.get('summary', 'N/A')}")
    print(f"**Suggested Action**: {final_response.get('suggested_action', 'N/A')}")

--- ANALYZING NEW LOG ---
Input: [2025-11-15 17:01:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22

[Triage Router]: CRITICAL new error! Querying RAG Runbook...
--- [Stateful Agent] ---
Augmented Prompt: 
    Historical Context: Entity '1146' has been seen 1 time(s).
    Runbook Suggestion: The solution is that the database schema is out of sync with the application.

The required action is to immediately run the database migration script: `python /scripts/run_migrations.py`.
    
    Log Entry: [2025-11-15 17:01:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22
    

--- FINAL STATEFUL RESPONSE ---
Severity: CRITICAL
Summary: A critical database error occurred because the required table 'production.users_auth' does not exist, indicating the database schema is out of sync with the application.
**Suggested Action**: Immediately run the database migration script: `python /scripts/run_migrations.py` to synch

In [27]:
# --- Test Cell 2 (Same Error) ---
# Run this cell right after the one above: the entity is then already in the
# persisted memory, so the runbook lookup is skipped and only the historical
# count is added to the augmented prompt.

critical_log = "[2025-11-15 17:05:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22"

print("--- ANALYZING NEW LOG (AGAIN) ---")
print(f"Input: {critical_log}\n")

final_response = analyze_log_statefully(critical_log)

if final_response:
    # Read keys defensively: the reply is whatever JSON the model produced, so
    # a missing field should not abort the demo with a KeyError.
    print("\n--- FINAL STATEFUL RESPONSE ---")
    print(f"Severity: {final_response.get('severity', 'N/A')}")
    print(f"Summary: {final_response.get('summary', 'N/A')}")
    print(f"**Suggested Action**: {final_response.get('suggested_action', 'N/A')}")

--- ANALYZING NEW LOG (AGAIN) ---
Input: [2025-11-15 17:05:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22

--- [Stateful Agent] ---
Augmented Prompt: 
    Historical Context: Entity '1146' has been seen 2 time(s).
    Runbook Suggestion: None
    
    Log Entry: [2025-11-15 17:05:00] [CRITICAL] (1146) Table 'production.users_auth' doesn't exist. Request from 10.0.1.22
    

--- FINAL STATEFUL RESPONSE ---
Severity: CRITICAL
Summary: A critical database operation failed because the required table 'production.users_auth' does not exist.
**Suggested Action**: Immediately verify the database connection, schema integrity, and ensure the 'production.users_auth' table is present and accessible to the application.**
