# Cybersecurity RAG LLM Assistant Project 

## Install Libraries

In [2]:
# Install the notebook's dependencies into the SAME interpreter the kernel runs
# on. A bare `pip` on PATH may belong to a different environment, and the shell
# escape never fails the cell, so the success message used to print even when
# the installation had failed.
# TODO(review): versions are unpinned; pin them once a known-good set is agreed.
import subprocess
import sys

required_packages = [
    "langchain-openai", "openai", "chromadb", "gradio", "python-dotenv",
    "tiktoken", "langchain-community", "langchain-classic", "langchain-chroma",
    "langchain-text-splitters", "pypdf",
]

print("Installing necessary libraries...")
install_result = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q", *required_packages],
    capture_output=True,
    text=True,
)

if install_result.returncode != 0:
    # Surface pip's own diagnostics inside the notebook before failing the cell.
    print(install_result.stdout)
    print(install_result.stderr)
    raise RuntimeError(
        f"pip install failed with exit code {install_result.returncode}"
    )

print("Libraries installed successfully!")


Installing necessary libraries...
Libraries installed successfully!


## OpenAI Setup

In [3]:
# Step 2: OpenAI client setup
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import os

# Locate the .env file and show which one was picked up
env_path = find_dotenv()
print("Env path:", env_path)

# Load it; override=True is passed so values from the file replace variables
# that are already set in the process environment
load_dotenv(env_path, override=True)

# Retrieve key
cyber_api_key = os.getenv("OPENAI_API_KEY")

# Fail fast: every later cell needs a usable key
if not cyber_api_key:
    raise ValueError("OPENAI_API_KEY not found. Check .env file format.")

# Init client
cyber_openai_client = OpenAI(api_key=cyber_api_key)

print("✅ OpenAI client successfully configured")
# Do not echo key material: cell outputs are stored in the notebook file and
# travel with it when it is committed or shared. Only a masked marker is shown.
print("Key preview:", cyber_api_key[:3] + "*" * 8)


Env path: /workspaces/CyberRAG-LLM-Assistant/.env
✅ OpenAI client successfully configured
Key preview: [REDACTED]


## Import LangChain Components

In [4]:
# LLM + Embeddings
# NOTE: this rebinds the name `OpenAI`, which the OpenAI setup cell imported
# from the `openai` SDK. After this cell runs, `OpenAI` refers to the
# langchain_openai class instead.
from langchain_openai import OpenAI, OpenAIEmbeddings

# Vector store
from langchain_chroma import Chroma

# Text splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Document loaders
from langchain_community.document_loaders import TextLoader, PyPDFLoader

# Chains (imported from the langchain-classic package)
from langchain_classic.chains import RetrievalQAWithSourcesChain
print("Langchain and Chroma imported successfully")


Langchain and Chroma imported successfully


## Define Cybersecurity Data File Path

In [5]:
# Relative path of the text file that holds the cybersecurity knowledge base

CYBER_DATA_FILE_PATH = "cybersecurity_knowledge_base.txt"

print("Cybersecurity data file path set to: {}".format(CYBER_DATA_FILE_PATH))


Cybersecurity data file path set to: cybersecurity_knowledge_base.txt


## Load Cybersecurity Documents

In [6]:
# Announce the source file, then read it as UTF-8 text into document objects
print("Attempting to load cybersecurity data from: {}".format(CYBER_DATA_FILE_PATH))

loader = TextLoader(CYBER_DATA_FILE_PATH, encoding="utf-8")
cyber_raw_documents = loader.load()

loaded_document_count = len(cyber_raw_documents)
print(f"Successfully loaded {loaded_document_count} cybersecurity document(s).")


Attempting to load cybersecurity data from: cybersecurity_knowledge_base.txt
Successfully loaded 1 cybersecurity document(s).


## Sanity Check

In [7]:
# Preview the opening 500 characters of the first loaded document

first_document_preview = cyber_raw_documents[0].page_content[:500]
print(f"{first_document_preview}...")


CYBERSECURITY KNOWLEDGE BASE FOR RAG Q&A SYSTEM

This file contains structured cybersecurity knowledge organized by topics.
Each section includes detailed information that can be used for question-answering.

SECTION 1: ZERO TRUST ARCHITECTURE

Source: NIST SP 800-207 Zero Trust Architecture
Category: Architecture & Frameworks
Date: 2020-08
Author:...


## Document Chunking

In [8]:
print("\nSplitting cybersecurity intelligence into chunks...")

# Splitter configured with chunk_size=1000 and chunk_overlap=150
cyber_text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
cyber_documents = cyber_text_splitter.split_documents(cyber_raw_documents)

# An empty result would make every later retrieval step meaningless, so stop here
if len(cyber_documents) == 0:
    raise ValueError("Error: No chunks created.")

chunk_total = len(cyber_documents)
print(f"Cybersecurity document split into {chunk_total} chunks.")



Splitting cybersecurity intelligence into chunks...
Cybersecurity document split into 80 chunks.


## Inspect Chunks

In [9]:
# Show one example chunk together with its metadata.
# The third chunk (index 2) is preferred, but a short knowledge base may yield
# fewer than three chunks; clamp the index so the cell still runs instead of
# raising IndexError. (The chunking cell already guarantees at least one chunk.)
example_chunk_index = min(2, len(cyber_documents) - 1)
example_chunk = cyber_documents[example_chunk_index]

print("\n--- Example Cybersecurity Chunk ---")
print(example_chunk.page_content)

print("\n--- Metadata ---")
print(example_chunk.metadata)



--- Example Cybersecurity Chunk ---
Core Components of Zero Trust:

- Policy Engine (PE): Makes access decisions based on enterprise policy and external sources
- Policy Administrator (PA): Establishes and shuts down communication paths between subjects and resources
- Policy Enforcement Point (PEP): Enables, monitors, and terminates connections between subjects and enterprise resources

Zero Trust Network Access (ZTNA) replaces traditional VPNs by creating secure, identity-based access to applications without placing users on the network. This prevents lateral movement and reduces the attack surface.

Implementation Strategies:
- Start with identity and access management (IAM)
- Implement multi-factor authentication across all systems
- Deploy endpoint detection and response (EDR) solutions
- Use network segmentation and micro-segmentation
- Deploy security information and event management (SIEM)
- Implement continuous monitoring and analytics

--- Metadata ---
{'source': 'cybersecurity_knowledge_base.txt'}

## Embeddings & Vector Store

In [10]:
print("Initializing OpenAI embeddings for cybersecurity data...")

cyber_embeddings = OpenAIEmbeddings(
    openai_api_key=cyber_api_key
)

print("Embeddings initialized.")

print("\nCreating cybersecurity vector store...")

# Give every chunk a stable, position-based id. Without explicit ids each run
# generates fresh ones, so re-running this cell in the same kernel can append a
# second copy of every chunk to the same in-memory collection and skew
# retrieval. With fixed ids a re-run overwrites the existing entries instead.
# NOTE(review): if the knowledge base later shrinks, entries with higher
# indices from an earlier run remain until the kernel is restarted.
cyber_chunk_ids = [f"cyber-chunk-{index}" for index in range(len(cyber_documents))]

cyber_vector_store = Chroma.from_documents(
    documents=cyber_documents,
    embedding=cyber_embeddings,
    ids=cyber_chunk_ids,
)

# `_collection` is a private attribute of the vector store; it is read here
# only to confirm how many items were stored.
vector_count = cyber_vector_store._collection.count()

print(f"Cyber vector store contains {vector_count} items.")

if vector_count == 0:
    raise ValueError("Vector store empty.")


Initializing OpenAI embeddings for cybersecurity data...
Embeddings initialized.

Creating cybersecurity vector store...
Cyber vector store contains 80 items.


## Inspect Stored Embedding

In [11]:
# Fetch a single stored record together with its text and embedding vector
stored_data = cyber_vector_store._collection.get(
    limit=1,
    include=["embeddings", "documents"],
)

first_chunk_text = stored_data["documents"][0]
print("First stored cyber chunk:\n", first_chunk_text)

first_embedding = stored_data["embeddings"][0]
print("\nEmbedding vector length:", len(first_embedding))


First stored cyber chunk:
 CYBERSECURITY KNOWLEDGE BASE FOR RAG Q&A SYSTEM

This file contains structured cybersecurity knowledge organized by topics.
Each section includes detailed information that can be used for question-answering.

SECTION 1: ZERO TRUST ARCHITECTURE

Source: NIST SP 800-207 Zero Trust Architecture
Category: Architecture & Frameworks
Date: 2020-08
Author: National Institute of Standards and Technology

CONTENT:
Zero Trust Architecture (ZTA) is a cybersecurity paradigm that shifts defense from static, network-based perimeters to focus on users, assets, and resources. The core principle is "never trust, always verify" - no user or device is trusted by default, even if they're already inside the network perimeter.

Key Principles of Zero Trust:

Embedding vector length: 1536


## Test Retrieval

### Similarity Search

In [12]:
print("\n--- Testing Cybersecurity Similarity Search ---")

test_query = "Explain recent ransomware attack techniques."

# Ask the vector store for the two chunks most similar to the query
similar_docs = cyber_vector_store.similarity_search(test_query, k=2)

for position, doc in enumerate(similar_docs, start=1):
    content_preview = doc.page_content[:700]
    source_label = doc.metadata.get("source", "Unknown")

    print(f"\n--- Document {position} ---")
    print(f"{content_preview}...")
    print("Source:", source_label)



--- Testing Cybersecurity Similarity Search ---

--- Document 1 ---
UNDERSTANDING THE THREAT

Ransomware Attack Chain:
1. Initial Access (phishing email, RDP brute force, vulnerability exploitation)
2. Execution (malicious payload runs)
3. Persistence (creates scheduled tasks, registry keys)
4. Defense Evasion (disables antivirus, deletes shadow copies)
5. Discovery (scans network, identifies valuable data)
6. Lateral Movement (spreads to other systems using compromised credentials)
7. Collection (identifies and stages data for encryption)
8. Exfiltration (steals data before encryption - double extortion)
9. Impact (encrypts files, displays ransom note)

Common Ransomware Families:
- LockBit: Ransomware-as-a-Service (RaaS), fast encryption, targets enterpris...
Source: cybersecurity_knowledge_base.txt

--- Document 2 ---
SECTION 4: RANSOMWARE DEFENSE STRATEGY

Source: CISA Ransomware Guide & FBI Internet Crime Report
Category: Threat Defense
Date: 2023-11
Author: Cybersecurity and Inf

## Build RAG Chain

### Configure Retriever + LLM

In [13]:
# The bare name `OpenAI` is bound twice in this notebook: first to the `openai`
# SDK client class, later to the langchain_openai class. Which one is live
# depends on the order cells were last run, so import the LangChain class under
# an explicit alias here and make this cell independent of that order.
from langchain_openai import OpenAI as LangChainOpenAI

# Retriever: return the 3 best-matching chunks for each query
cyber_retriever = cyber_vector_store.as_retriever(
    search_kwargs={"k": 3}
)

print("Cyber retriever configured.")

# LLM: temperature=0 is passed to the model wrapper
cyber_llm = LangChainOpenAI(
    temperature=0,
    openai_api_key=cyber_api_key
)

print("Cyber LLM initialized.")


Cyber retriever configured.
Cyber LLM initialized.


### Create RAG Chain

In [14]:
# Settings for the question-answering chain, gathered in one place.
# return_source_documents=True asks for the retrieved documents in the result;
# verbose=True makes the chain log when it starts and finishes.
chain_options = {
    "llm": cyber_llm,
    "chain_type": "stuff",
    "retriever": cyber_retriever,
    "return_source_documents": True,
    "verbose": True,
}

cyber_qa_chain = RetrievalQAWithSourcesChain.from_chain_type(**chain_options)

print("Cybersecurity RAG chain created.")


Cybersecurity RAG chain created.


### Test Full Chain

In [15]:
print("\n--- Testing Cybersecurity RAG Chain ---")

chain_test_query = "What are modern ransomware mitigation strategies?"

# End-to-end smoke test: retrieval plus generation for a single question
result = cyber_qa_chain.invoke({"question": chain_test_query})

# Print the answer first, then the sources the chain reported for it
for section_heading, result_key in (
    ("\n--- Answer ---", "answer"),
    ("\n--- Sources ---", "sources"),
):
    print(section_heading)
    print(result.get(result_key))



--- Testing Cybersecurity RAG Chain ---


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m

--- Answer ---
 Modern ransomware mitigation strategies include identifying the ransomware variant, checking for decryption tools, determining the initial infection vector, mapping affected systems and data, assessing data exfiltration risk, documenting the timeline of events, preserving evidence, ensuring the threat is eradicated before recovery, rebuilding affected systems, restoring data from backups, resetting passwords and credentials, applying security patches, enhancing security controls, monitoring for re-infection, and validating restored systems. 


--- Sources ---
CISA Ransomware Guide, FBI Internet Crime Report, cybersecurity_knowledge_base.txt


## Gradio Cybersecurity Interface

In [16]:
import gradio as gr


### Assistant Function

In [17]:
def ask_cybersecurity_assistant(user_query):
    """Answer a cybersecurity question using the notebook's RAG chain.

    Args:
        user_query: The question text coming from the UI. ``None`` and
            whitespace-only values are treated as "no question entered".

    Returns:
        A ``(answer, sources)`` tuple of strings. For an empty question the
        answer is a prompt to enter one and the sources string is empty. If
        the chain raises, the answer carries the error message and the sources
        string is ``"Execution failure"``.
    """
    print(f"\nProcessing query: {user_query}")

    # A cleared input may arrive as None rather than "", so check for it before
    # calling .strip(); this also avoids a chain call for blank questions.
    if user_query is None or not str(user_query).strip():
        return "Please enter a cybersecurity question.", ""

    try:
        result = cyber_qa_chain.invoke(
            {"question": user_query}
        )

        # `or` (rather than a .get default) also covers keys that are present
        # but empty, so the UI never shows a blank box.
        answer = str(result.get("answer") or "").strip() or "No intelligence found."
        sources = result.get("sources") or "No sources identified."

        if isinstance(sources, (list, tuple, set)):
            # dict.fromkeys removes duplicates while keeping first-seen order;
            # a plain set() would shuffle the order between runs.
            unique_sources = dict.fromkeys(str(source) for source in sources)
            sources = ", ".join(unique_sources) or "No sources identified."

        return answer, sources

    except Exception as e:
        # UI boundary: report the failure in the answer box instead of letting
        # the exception escape from the event handler.
        return f"Error: {e}", "Execution failure"


### Build Interface

In [18]:
# Build the Gradio UI: a question box, answer/sources boxes, submit and clear
# buttons, and a few example questions.
# NOTE(review): the saved output shows a warning raised at this `gr.Blocks(`
# call (its message is truncated). Possibly one of these constructor arguments
# is deprecated in the installed Gradio version; confirm before changing.
with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Cybersecurity RAG Assistant"
) as cyber_demo:

    gr.Markdown(
        """
        # 🛡️ Cybersecurity AI Threat Intelligence Assistant

        Ask questions about:

        • Ransomware  
        • Phishing campaigns  
        • CVEs & exploits  
        • Zero-day threats  
        • Nation-state attacks  

        Answers include cited sources from uploaded intelligence.
        """
    )

    question_input = gr.Textbox(
        label="Cybersecurity Question",
        placeholder="e.g., Latest LockBit ransomware tactics?",
        lines=2,
    )

    with gr.Row():
        answer_output = gr.Textbox(
            label="Answer",
            interactive=False,
            lines=6,
        )

        sources_output = gr.Textbox(
            label="Sources",
            interactive=False,
            lines=2,
        )

    with gr.Row():
        submit_button = gr.Button(
            "Ask Cyber AI 🤖",
            variant="primary",
        )

        # Resets the question and both result boxes in one click
        clear_button = gr.ClearButton(
            components=[
                question_input,
                answer_output,
                sources_output,
            ]
        )

    # Clicking an example only fills the question box (cache_examples=False)
    gr.Examples(
        examples=[
            "Recent ransomware groups in 2025",
            "How do phishing kits operate?",
            "Explain zero-day vulnerabilities",
            "Top mitigation strategies for SOC teams",
        ],
        inputs=question_input,
        cache_examples=False,
    )

    # Run the same handler for the button click and for the textbox's own
    # submit event, so the question can also be sent from the keyboard.
    for query_trigger in (submit_button.click, question_input.submit):
        query_trigger(
            fn=ask_cybersecurity_assistant,
            inputs=question_input,
            outputs=[
                answer_output,
                sources_output,
            ],
        )

print("Launching Cybersecurity Gradio App...")
# NOTE(review): share=True publishes a public gradio.live URL (see the saved
# output). Every query sent through it calls the chain with the configured
# OpenAI key, so only share the link deliberately.
cyber_demo.launch(share=True)


  with gr.Blocks(


Launching Cybersecurity Gradio App...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://09f45d5775e00f112f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)





Processing query: Recent ransomware groups in 2025


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m

Processing query: Explain zero-day vulnerabilities


[1m> Entering new RetrievalQAWithSourcesChain chain...[0m

[1m> Finished chain.[0m
