In [3]:
import feedparser
import requests
import json
import os
import numpy as np
from datetime import datetime
from typing import List, Dict, Any
from dotenv import load_dotenv
from llama_index.embeddings.openai import OpenAIEmbedding, OpenAIEmbeddingModelType
from llama_index.llms.openai import OpenAI
from llama_index.core import VectorStoreIndex, Document, StorageContext
from llama_index.core import Settings
from constants import embed_model, llm_model, INDEX_PERSIST_PATH, ENHANCED_RAG_TEMPLATE, CISA_ICS_RSS_URL, MAX_ADVISORIES 

In [None]:
def fetch_cisa_advisories() -> List[Dict[str, Any]]:
    """
    Fetch ICS security advisories from the CISA RSS feed.

    Parses ``CISA_ICS_RSS_URL`` with feedparser and converts at most
    ``MAX_ADVISORIES`` entries into plain dictionaries.

    Returns:
        A list of advisory dicts with the keys ``id``, ``title``, ``summary``,
        ``link``, ``published`` and ``content`` (the entry's summary and title
        joined by a single space). An empty list is returned when the feed
        yields no entries or when an error occurs; the error is printed.
    """
    try:
        # Parse RSS feed
        feed = feedparser.parse(CISA_ICS_RSS_URL)

        # feedparser signals fetch/parse problems through the ``bozo`` flag
        # instead of raising, so without this check an unreachable feed would
        # look like "zero advisories" with no explanation. A bozo feed that
        # still produced entries is processed normally.
        if getattr(feed, 'bozo', False) and not feed.entries:
            reason = getattr(feed, 'bozo_exception', 'unknown feed error')
            print(f"Error fetching CISA advisories: {reason}")
            return []

        # Fallback timestamp for entries without a publication date; computed
        # once so every such entry in this batch carries the same value.
        fetched_at = str(datetime.now())

        advisories = []
        for i, entry in enumerate(feed.entries[:MAX_ADVISORIES]):
            advisory = {
                'id': entry.get('id', f'advisory_{i}'),
                'title': entry.get('title', 'No Title'),
                'summary': entry.get('summary', 'No Summary'),
                'link': entry.get('link', ''),
                'published': entry.get('published', fetched_at),
                # Text used downstream for embedding / LLM mapping. Missing
                # fields contribute an empty string here, not the
                # human-readable placeholders used above.
                'content': entry.get('summary', '') + ' ' + entry.get('title', '')
            }
            advisories.append(advisory)
            print(f"Fetched advisory {i+1}: {advisory['title']}")

        return advisories

    except Exception as e:
        # Best-effort by design: callers receive an empty list, not a crash.
        print(f"Error fetching CISA advisories: {e}")
        return []

In [4]:
def create_mitre_embeddings(embed_model, mitre_path: str = "assets/mitre-ics.json"):
    """
    Create embeddings for all MITRE ATT&CK techniques in a JSON file.

    Args:
        embed_model: Object exposing ``get_text_embedding(text)``; called once
            per technique.
        mitre_path: Path to the techniques JSON file. Defaults to the bundled
            ``assets/mitre-ics.json``. The file is iterated item by item and
            each item must provide the keys ``name``, ``description``,
            ``tactics`` and ``Id``.

    Returns:
        Dict keyed by each item's ``Id`` whose values hold ``embedding`` (the
        model output), ``text`` (the string that was embedded) and ``details``
        (the original item).
    """
    with open(mitre_path, "r", encoding="utf-8") as f:
        mitre_data = json.load(f)

    mitre_embeddings = {}
    for item in mitre_data:
        # Searchable text combines name, description and tactics. ``tactics``
        # is interpolated as-is, so a list value is embedded in its Python
        # repr form.
        technique_text = f"{item['name']} {item['description']} {item['tactics']}"
        technique_id = item['Id']
        # Generate embedding
        embedding = embed_model.get_text_embedding(technique_text)
        mitre_embeddings[technique_id] = {
            'embedding': embedding,
            'text': technique_text,
            'details': item
        }
        print(f"Generated embedding for {technique_id}")

    return mitre_embeddings

In [5]:
def find_similar_mitre_techniques(advisory_content: str, mitre_embeddings: Dict, top_k: int = 5):
    """
    Find the top-k most similar MITRE ATT&CK techniques by cosine similarity.

    The advisory text is embedded with the module-level ``embed_model`` and
    compared against every stored technique embedding.

    Args:
        advisory_content: Advisory text to embed.
        mitre_embeddings: Mapping of technique id to a dict containing at
            least ``embedding`` (a numeric vector) and ``details``.
        top_k: Maximum number of results; values <= 0 yield an empty list.

    Returns:
        List of dicts with ``technique_id``, ``similarity`` (float) and
        ``details``, sorted by descending similarity. Ties keep the iteration
        order of ``mitre_embeddings``.
    """
    # The advisory vector and its norm do not change per technique, so they
    # are computed once outside the loop.
    advisory_emb = np.array(embed_model.get_text_embedding(advisory_content))
    advisory_norm = np.linalg.norm(advisory_emb)

    similarities = []
    for technique_id, data in mitre_embeddings.items():
        mitre_emb = np.array(data['embedding'])
        denominator = np.linalg.norm(mitre_emb) * advisory_norm

        # A zero-length vector has no direction; dividing would give NaN,
        # which compares false with everything and makes the sort order
        # unreliable. Such pairs are scored as "not similar".
        if denominator > 0:
            cosine_sim = float(np.dot(mitre_emb, advisory_emb) / denominator)
        else:
            cosine_sim = 0.0

        similarities.append({
            'technique_id': technique_id,
            'similarity': cosine_sim,
            'details': data['details']
        })

    # Sort by similarity and return top-k. A negative top_k would otherwise
    # slice "all but the last k", so it is clamped to zero.
    similarities.sort(key=lambda x: x['similarity'], reverse=True)
    return similarities[:max(top_k, 0)]

In [6]:
def map_to_mitre_attack(advisory_content: str, mitre_embeddings=None) -> Dict[str, Any]:
    """
    Enhanced MITRE ATT&CK mapping using two-stage approach:
    1. Embedding-based similarity filtering
    2. LLM-based refined analysis
    """
    try:
        # Stage 1: Use embeddings to find top candidate techniques
        candidate_techniques = None
        if mitre_embeddings:
            print("Stage 1: Finding similar MITRE techniques using embeddings...")
            candidates = find_similar_mitre_techniques(
                advisory_content, mitre_embeddings, top_k=5
            )
            
            # Prepare candidate techniques for LLM analysis
            candidate_techniques = {
                cand['technique_id']: cand['details'] 
                for cand in candidates
            }
            
            print(f"Top candidates: {list(candidate_techniques.keys())}")
        
        # Stage 2: Use LLM for refined mapping on filtered candidates
        print("Stage 2: LLM-based refined analysis...")

        with open("assets/mitre-ics.json", "r", encoding="utf-8") as f:
            mitre_data = json.load(f)
        
        # Use candidate techniques if available, otherwise full set
        techniques_to_analyze = candidate_techniques if candidate_techniques else mitre_data
        
        # Enhanced prompt for refined analysis
        refined_prompt = REFINED_MITRE_PROMPT_TEMPLATE.format(
            advisory_content=advisory_content,
            techniques_to_analyze=json.dumps(techniques_to_analyze, indent=2)
        )
        
        response = llm_model.complete(refined_prompt)
        print(f"LLM response: {response.text}")

        mapping = json.loads(response.text)
        print(f"Final mapping: {mapping}")
        return mapping
        
    except Exception as e:
        print(f"Error in map_to_mitre_attack: {e}")
        return {"mapped_techniques": []}

In [7]:
def _technique_label(technique: Any) -> str:
    """Render one mapped technique as text; non-string entries are JSON-encoded."""
    if isinstance(technique, str):
        return technique
    return json.dumps(technique, default=str)


def create_and_store_index(advisories, mitre_embeddings):
    """
    Create vector store index and persist to storage.

    Each advisory is mapped to MITRE ATT&CK techniques via
    ``map_to_mitre_attack``, wrapped in a ``Document`` whose text includes the
    mapping, and indexed. The index and a JSON copy of the processed
    advisories are written under ``INDEX_PERSIST_PATH``.

    Args:
        advisories: Iterable of advisory dicts providing the keys ``id``,
            ``title``, ``summary``, ``published``, ``link`` and ``content``.
            The input dicts are not modified.
        mitre_embeddings: Technique embeddings forwarded to
            ``map_to_mitre_attack``.

    Returns:
        Tuple ``(index, processed_advisories)`` where each processed advisory
        is a shallow copy of the input with an added ``mitre_mapping`` key.

    Side effects:
        Assigns ``Settings.llm`` and ``Settings.embed_model`` and writes to
        ``INDEX_PERSIST_PATH``.
    """
    print("Processing advisories and mapping to MITRE ATT&CK...")
    documents = []
    processed_advisories = []

    for advisory in advisories:
        # Map to MITRE ATT&CK techniques
        mitre_mapping = map_to_mitre_attack(
            advisory['content'],
            mitre_embeddings,
        )

        # The mapping comes from an LLM, so the list may be null or contain
        # non-string entries; a bare ', '.join would raise TypeError here and
        # abort the whole run.
        mapped_techniques = mitre_mapping.get('mapped_techniques') or []
        techniques_text = ', '.join(_technique_label(t) for t in mapped_techniques)

        # Create enhanced content with MITRE mapping
        enhanced_content = f"""
        Title: {advisory['title']}
        Summary: {advisory['summary']}
        Published: {advisory['published']}
        Link: {advisory['link']}
        
        MITRE ATT&CK Mapping:
        Techniques: {techniques_text}
        
        Full Content: {advisory['content']}
        """

        # Create document with metadata
        doc = Document(
            text=enhanced_content,
            metadata={
                'id': advisory['id'],
                'title': advisory['title'],
                'published': advisory['published'],
                'link': advisory['link'],
                'mitre_techniques': mitre_mapping.get('mapped_techniques', []),
            }
        )

        documents.append(doc)

        # Store processed advisory data
        advisory_data = advisory.copy()
        advisory_data['mitre_mapping'] = mitre_mapping
        processed_advisories.append(advisory_data)

    print("Creating vector store index...")
    # Set up LlamaIndex settings
    Settings.llm = llm_model
    Settings.embed_model = embed_model

    # Create index
    index = VectorStoreIndex.from_documents(documents)

    # Ensure the target directory exists before anything is written into it.
    os.makedirs(INDEX_PERSIST_PATH, exist_ok=True)

    # Persist the index
    print("Persisting index to storage...")
    index.storage_context.persist(persist_dir=INDEX_PERSIST_PATH)

    # Save advisory metadata; explicit UTF-8 matches how the other JSON assets
    # in this notebook are opened and avoids platform-dependent encodings.
    metadata_path = os.path.join(INDEX_PERSIST_PATH, "advisories_metadata.json")
    with open(metadata_path, 'w', encoding='utf-8') as f:
        json.dump(processed_advisories, f, indent=2)

    print(f"Successfully processed and stored {len(documents)} advisories!")
    return index, processed_advisories