In [1]:
# Standard libs
import os
import uuid
import random
import warnings
from typing import List, Dict
import json
import xattr

# Faker for PERSON/ORG/GPE fillers
from faker import Faker
fake = Faker()

# LlamaCpp & LangChain
from llama_cpp import Llama
from langchain_community.llms import LlamaCpp
from langchain.chains import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler

# Embeddings & Vector Store
from sentence_transformers import SentenceTransformer
import chromadb
from langchain_community.vectorstores import Chroma

# Silence LangChain deprecation warnings
from langchain._api import LangChainDeprecationWarning
warnings.simplefilter("ignore", category=LangChainDeprecationWarning)

In [2]:
# Cell 2 — Templates & Value Pools
# ─────────────────────────────────
templates: List[
    tuple[str, Dict[str,str]]
] = [
    (
        "The {vuln} vulnerability in {product} was exploited by {apt}.",
        {"vuln":"VULNERABILITY","product":"PRODUCT","apt":"APT_GROUP"},
    ),
    (
        "A {attack} targeted {org} using the {malware} backdoor.",
        {"attack":"ATTACK_TYPE","org":"ORG","malware":"MALWARE"},
    ),
    (
        "{person} from {org} discovered a zero-day {vuln}.",
        {"person":"PERSON","org":"ORG","vuln":"VULNERABILITY"},
    ),
    (
        "A new strain of {malware} was seen in {geo}.",
        {"malware":"MALWARE","geo":"GPE"},
    ),
    (
        "{org} alerted its users of a potential {attack} yesterday.",
        {"org":"ORG","attack":"ATTACK_TYPE"},
    ),
]

values: Dict[str,List[str]] = {
    "vuln":[f"CVE-{random.randint(2000,2023)}-{random.randint(1000,9999)}" for _ in range(200)],
    "product":["OpenSSL","Linux kernel","Windows 10","WordPress","Apache HTTPD"],
    "apt":["APT28","Lazarus Group","APT29","Sandworm Team","FIN7"],
    "attack":["phishing attack","DDoS","ransomware campaign","SQL injection"],
    "malware":["Emotet","TrickBot","Zeus","Ryuk","LockBit"],
    "person":[fake.name() for _ in range(200)],
    "org":[fake.company() for _ in range(100)],
    "geo":[fake.country() for _ in range(50)],
}

In [3]:
# 4) Spin up the LlamaCpp LLM via LangChain
model_path = "model/llama-2-7b-chat.Q4_K_M.gguf"  # ← your gguf on-disk
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

llm = LlamaCpp(
    model_path=model_path,
    n_ctx=2048,             # total context window if supported
    max_tokens=1000,        # allow up to 1000 new tokens per call
    temperature=0.3,        # more deterministic sampling
    top_p=0.8,              # narrower nucleus
    repetition_penalty=1.1,
    do_sample=True,
    callback_manager=callback_manager,
    verbose=False,          # suppress llama_cpp logs
)

                repetition_penalty was transferred to model_kwargs.
                Please confirm that repetition_penalty is what you intended.
  if await self.run_code(code, result, async_=asy):
                do_sample was transferred to model_kwargs.
                Please confirm that do_sample is what you intended.
  if await self.run_code(code, result, async_=asy):
llama_context: n_batch is less than GGML_KQ_MASK_PAD - increasing to 64
llama_context: n_ctx_per_seq (2048) < n_ctx_train (4096) -- the full capacity of the model will not be utilized


In [4]:
# Cell 4 — Build a single few-shot ChatPromptTemplate
# ──────────────────────────────────────────────────────
system = SystemMessagePromptTemplate.from_template(
    """You are a senior cybersecurity analyst.
You will receive:
  1) A one-sentence template with placeholders.
  2) A list of possible values for each placeholder.

Your job:
  • Fill in the template with actual values.
  • Then write a structured incident report, divided into four sections:
      A. Background & Context
      B. Vulnerability & Exploit Details
      C. Observed Impact & Affected Systems
      D. Remediation & Recommendations

For each section, write **at least two complete sentences** (3–4 sentences preferred). Use concrete, professional language. Do NOT include the words “Template:” or “Values:” in your final output."""
)

human = HumanMessagePromptTemplate.from_template(
    # ——— Few-shot Example ———
    "Example:\n"
    "Template: The CVE-2023-3695 vulnerability in Windows Print Spooler was exploited by UNC1151.\n"
    "Values:\n"
    "  vuln: CVE-2023-3695\n"
    "  product: Windows Print Spooler\n"
    "  apt: UNC1151\n\n"
    "FILL + REPORT:\n"
    "The CVE-2023-3695 vulnerability in Windows Print Spooler was exploited by UNC1151.\n"
    "A. Background & Context: In mid-July 2023, Microsoft disclosed CVE-2023-3695—a remote code execution flaw in the Windows Print Spooler service. The flaw has existed in various Windows Server and client OS versions for over three years before discovery. Security researchers observed UNC1151 conducting reconnaissance activities on vulnerable print servers in multiple financial institutions.\n"
    "B. Vulnerability & Exploit Details: UNC1151 leveraged a crafted print job that overflowed a buffer within the Print Spooler’s RPC interface. The attackers then deployed a custom shellcode to gain SYSTEM-level privileges. They used this foothold to drop a signed malicious driver, ensuring persistence across reboots.\n"
    "C. Observed Impact & Affected Systems: Affected systems included dozens of domain-joined servers and workstations across at least three banking networks. Sensitive files—account ledgers and transaction logs—were encrypted, causing two days of service outages. Forensic logs show lateral movement to back-office file shares.\n"
    "D. Remediation & Recommendations: Organizations should immediately apply Microsoft’s July 2023 security update to all print spooler endpoints. Network segmentation should isolate print servers from sensitive directories. Additionally, implement least-privilege printing controls and monitor RPC calls for anomalous buffer-overflow patterns.\n\n"
    "# Now it’s your turn:\n"
    "Template: {template}\n"
    "Values:\n"
    "{values}\n\n"
    "FILL + REPORT:"
)

# simply drop input_variables – it will infer ["template","values"] because those are the only un-escaped braces
prompt = ChatPromptTemplate.from_messages([system, human])  

chain = LLMChain(llm=llm, prompt=prompt, verbose=False)
#print("ℹ️  Re-created `chain` in this kernel session.")

In [7]:
# Cell 5 — Generate, write .txt, add xattr if DoD, embed & store in Chroma
# ────────────────────────────────────────────────────────────────────────

n_docs     = 25
output_dir = "../../llm/synthetic_data"
os.makedirs(output_dir, exist_ok=True)

# Prepare 30 metadata tag names
METADATA_TAGS = [
    "classification",
    "classification_level",
    "originator_id",
    "data_type",
    "sensitivity",
    "transfer_control",
    "data_owner",
    "data_steward",
    "dissemination_control",
    "release_authority",
    "document_type",
    "teaming_partner",
    "handling_descriptor",
    "distribution_statement",
    "sci_control",
    "caveats",
    "ps_percent_complete",
    "point_of_contact",
    "report_version",
    "system_of_record",
    "citation",
    "related_doc",
    "platform",
    "environment",
    "operation_name",
    "mission_area",
    "geo_scope",
    "cyber_domain",
    "policy_impact",
    "audit_trail",
]

# init embedder + Chroma
embedder      = SentenceTransformer("all-MiniLM-L6-v2")
chromaclient  = chromadb.Client()
try:
    collection = chromaclient.create_collection(name="cyber_synth2")
except Exception as e:
    if "already exists" in str(e):
        collection = chromaclient.get_collection(name="cyber_synth2")
    else:
        raise

for i in range(n_docs):
    tpl, slot2label = random.choice(templates)

    # build a real values block
    vals_str = "\n".join(
        f"  {slot}: {random.choice(values[slot])}"
        for slot in slot2label.keys()
    )

    # generate
    report = chain.predict(
        template=tpl,
        values=vals_str,
        max_tokens=1000
    ).strip()

    if not report:
        print(f"⚠️  Empty response for doc {i}, retrying…")
        continue

    # write .txt
    uid       = f"doc_{i}_{uuid.uuid4().hex[:8]}"
    file_path = os.path.join(output_dir, uid + ".txt")
    with open(file_path, "w", encoding="utf-8") as fp:
        fp.write(report)

       # --- NEW: attach 30 synthetic xattr tags to every file ---
    xattrs = xattr.xattr(file_path)
    for tag in METADATA_TAGS:
        value_obj = {
            "doc_id": uid,
            "tag": tag,
            "value": fake.word()
        }
        xattrs[f"user.{tag}"] = json.dumps(value_obj).encode("utf-8")

    # embed & add to Chroma
    emb = embedder.encode(report)
    collection.add(
        documents=[report],
        embeddings=[emb],
        ids=[uid],
    )

    print(f"✅ [{i+1}/{n_docs}] wrote {uid}.txt")

print("Wrote file and set 30 xattrs → now verify on the CLI:")
print(f"  getfattr -d {file_path}")


A new strain of Zeus malware was seen in Sweden.✅ [1/25] wrote doc_0_96543947.txt

A DDoS attack targeted Preston, Burns and Williams using the Ryuk backdoor.
A. Background & Context: On August 15, 2023, Preston, Burns and Williams reported a distributed denial-of-service (DDoS) attack that lasted for three hours. The company’s IT team observed suspicious network traffic originating from an external IP address. Upon further investigation, they discovered the Ryuk backdoor had been installed on their network.
B. Vulnerability & Exploit Details: The Ryuk malware was deployed through a phishing email campaign. Attackers used a malicious Word document to trick employees into opening it, allowing the payload to execute and establish persistence. Once inside, the attackers leveraged the DDoS capabilities of the Ryuk backdoor to overwhelm Preston, Burns and Williams’ network infrastructure.
C. Observed Impact & Affected Systems: The DDoS attack resulted in a complete outage of the company’s 