# In [7]:
import re
import spacy

# Initialize spaCy model
# Loaded once at module level; extract_entities() runs this pipeline on the
# report and reads the named entities (doc.ents) it produces.
nlp = spacy.load("en_core_web_sm")

# Placeholder for MITRE ATT&CK mappings: display name -> ATT&CK identifier.
# NOTE(review): some technique names look like legacy ATT&CK naming paired
# with current IDs (e.g. "Data Compressed" -> T1560, "Remote File Copy" ->
# T1105) — verify against the current matrix before renaming, because
# extract_ttps() looks entries up by these exact keys.

# Tactics (TAxxxx identifiers)
_MITRE_TACTICS = {
    "Initial Access": "TA0001",
    "Execution": "TA0002",
    "Persistence": "TA0003",
    "Privilege Escalation": "TA0004",
    "Defense Evasion": "TA0005",
    "Credential Access": "TA0006",
    "Discovery": "TA0007",
    "Lateral Movement": "TA0008",
    "Collection": "TA0009",
    "Command and Control": "TA0011",
    "Exfiltration": "TA0010",
    "Impact": "TA0040",
}

# Techniques and sub-techniques (Txxxx / Txxxx.yyy identifiers)
_MITRE_TECHNIQUES = {
    "Spear Phishing Attachment": "T1566.001",
    "Spear Phishing Link": "T1566.002",
    "Exploitation of Remote Services": "T1210",
    "Valid Accounts": "T1078",
    "PowerShell": "T1059.001",
    "Windows Command Shell": "T1059.003",
    "Scheduled Task/Job": "T1053",
    "Registry Run Keys/Startup Folder": "T1547.001",
    "Boot or Logon Autostart Execution": "T1547",
    "Exploitation for Privilege Escalation": "T1068",
    "Masquerading": "T1036",
    "Obfuscated Files or Information": "T1027",
    "Impair Defenses": "T1562",
    "Indicator Removal on Host": "T1070",
    "Credential Dumping": "T1003",
    "Network Service Discovery": "T1046",
    "Remote File Copy": "T1105",
    "Remote Services": "T1021",
    "System Information Discovery": "T1082",
    "File and Directory Discovery": "T1083",
    "Data Compressed": "T1560",
    "Encrypted Channel": "T1573",
    "Web Service": "T1102",
    "Data Staged": "T1074",
    "Data from Local System": "T1005",
    "Inhibit System Recovery": "T1490",
    "System Shutdown/Reboot": "T1529",
    "Data Encrypted for Impact": "T1486",
}

# Single lookup table: tactics first, then techniques (the two key sets are
# disjoint, so merging never overwrites an entry).
MITRE_TTPs = {**_MITRE_TACTICS, **_MITRE_TECHNIQUES}

# Extract Indicators of Compromise (IoCs)
def extract_iocs(report):
    """
    Extracts IPv4 addresses and domain names from a report.

    Args:
        report (str): The input report text.

    Returns:
        dict: A dictionary with the keys "IP addresses" and "Domains", each
        holding the matches in order of appearance (duplicates are kept).
    """
    # The pattern alone accepts any 1-3 digit group (e.g. 999.999.999.999),
    # so candidates are filtered down to octets in the 0-255 range.
    ip_candidates = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', report)
    ips = [
        candidate
        for candidate in ip_candidates
        if all(int(octet) <= 255 for octet in candidate.split("."))
    ]

    # One or more dot-terminated labels followed by an alphabetic TLD of at
    # least two letters; dotted-quad IPs never match because of the TLD rule.
    domains = re.findall(r'(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}', report)

    return {"IP addresses": ips, "Domains": domains}

def extract_ttps(report):
    """
    Extracts MITRE ATT&CK tactics and techniques mentioned in a report.

    Matching is a case-insensitive substring search for a fixed set of
    keywords, so a keyword also matches inside longer words (e.g. "impact"
    inside "impactful"). Some techniques additionally imply a tactic, which
    is added even when the tactic itself is not named in the report.

    Args:
        report (str): The input report text.

    Returns:
        dict: A dictionary with the keys "Tactics" and "Techniques". Each
        value is a list of (ATT&CK ID, name) tuples looked up in MITRE_TTPs.
        Tactics are de-duplicated and sorted by ID; techniques follow the
        order of the keyword mapping.
    """
    tactics = set()  # Use a set to avoid duplicates
    techniques = []

    # Define keyword-to-TTP mapping
    keywords_mapping = {
        # Tactics
        "initial access": ("Tactics", "Initial Access"),
        "execution": ("Tactics", "Execution"),
        "persistence": ("Tactics", "Persistence"),
        "privilege escalation": ("Tactics", "Privilege Escalation"),
        "defense evasion": ("Tactics", "Defense Evasion"),
        # Defined in MITRE_TTPs but previously had no keyword, so it could
        # never be reported.
        "credential access": ("Tactics", "Credential Access"),
        "discovery": ("Tactics", "Discovery"),
        "lateral movement": ("Tactics", "Lateral Movement"),
        "collection": ("Tactics", "Collection"),
        "command and control": ("Tactics", "Command and Control"),
        "exfiltration": ("Tactics", "Exfiltration"),
        "impact": ("Tactics", "Impact"),

        # Techniques
        "spear-phishing": ("Techniques", "Spear Phishing Attachment"),
        "phishing link": ("Techniques", "Spear Phishing Link"),
        "exploitation of remote services": ("Techniques", "Exploitation of Remote Services"),
        "valid accounts": ("Techniques", "Valid Accounts"),
        "powershell": ("Techniques", "PowerShell"),
        "command shell": ("Techniques", "Windows Command Shell"),
        "scheduled task": ("Techniques", "Scheduled Task/Job"),
        "registry run keys": ("Techniques", "Registry Run Keys/Startup Folder"),
        "remote file copy": ("Techniques", "Remote File Copy"),
        "network discovery": ("Techniques", "Network Service Discovery"),
        "data compressed": ("Techniques", "Data Compressed"),
    }

    # Add inferred tactic mapping (techniques to tactics)
    technique_to_tactic = {
        "Spear Phishing Attachment": "Initial Access",
        "PowerShell": "Execution",
        "Windows Command Shell": "Execution",
        "Remote File Copy": "Lateral Movement",
        "Network Service Discovery": "Discovery",
    }

    # Lowercase once instead of on every loop iteration.
    report_lower = report.lower()

    # Search for keywords in the report
    for keyword, (category, description) in keywords_mapping.items():
        if keyword not in report_lower:
            continue
        if category == "Tactics":
            tactics.add((MITRE_TTPs[description], description))
        elif category == "Techniques":
            techniques.append((MITRE_TTPs[description], description))
            # Infer tactic from technique
            inferred_tactic = technique_to_tactic.get(description)
            if inferred_tactic is not None:
                tactics.add((MITRE_TTPs[inferred_tactic], inferred_tactic))

    # Sort the set so the output order is the same on every run (plain
    # list(set) order depends on string hashing, which varies per process).
    return {
        "Tactics": sorted(tactics),
        "Techniques": techniques
    }

# Extract Threat Actors and Targeted Entities
def extract_entities(report, threat_actor_keywords=None, targeted_entity_keywords=None):
    """
    Extracts threat actors and targeted entities from a report.

    Entities labelled PERSON by the spaCy pipeline are treated as threat
    actors and entities labelled ORG as targeted entities. Optional keyword
    lists are then matched against the report as case-insensitive substrings.

    Args:
        report (str): The input report text.
        threat_actor_keywords (list, optional): List of keywords for identifying threat actors.
        targeted_entity_keywords (list, optional): List of keywords for identifying targeted entities.

    Returns:
        dict: A dictionary containing extracted threat actors and targeted entities.
        Both lists are de-duplicated and keep first-seen order (NLP entities
        first, then keyword matches).
    """
    # Initialize NLP processing
    doc = nlp(report)
    # Lowercase once instead of once per keyword.
    report_lower = report.lower()

    # Extract entities using NLP
    threat_actors = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
    targeted_entities = [ent.text for ent in doc.ents if ent.label_ == "ORG"]

    # Dynamically match keywords for threat actors and targeted entities
    # (`or ()` covers both None and an empty list).
    threat_actors.extend(
        keyword for keyword in (threat_actor_keywords or ())
        if keyword.lower() in report_lower
    )
    targeted_entities.extend(
        keyword for keyword in (targeted_entity_keywords or ())
        if keyword.lower() in report_lower
    )

    # Remove duplicates; dict.fromkeys keeps insertion order, so the result is
    # the same on every run (list(set(...)) order varies with string hashing).
    return {
        "Threat Actor(s)": list(dict.fromkeys(threat_actors)),
        "Targeted Entities": list(dict.fromkeys(targeted_entities)),
    }

def extract_malware(report):
    """
    Looks up known malware families mentioned in a report.

    A family is reported when its name occurs anywhere in the report as a
    case-insensitive substring.

    Args:
        report (str): The input report text.

    Returns:
        list: One dictionary per detected family, in the order of the
        built-in table, with the keys "Name", "md5", "sha1", "sha256",
        "ssdeep", "TLSH" and "tags".
    """
    # Built-in table of malware families (this list can be expanded over time).
    # NOTE(review): some digests do not have the standard length for their
    # algorithm (e.g. the Stuxnet sha1 is 36 hex characters) — presumably
    # placeholder values; verify before using them for matching.
    known_malware = {
        "Shamoon": {
            "md5": "5a105e8b9d40e1329780d62ea2265d8a",
            "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
            "ssdeep": "3:ai6hn3nP:BBin",
            "TLSH": "54D15E7E17F9D81AF5E131BEFAE13E1A5E2F",
            "tags": "destructive, data_wiper",
        },
        "WannaCry": {
            "md5": "91c5c4b0b1c6e972f8c6ff679aa87a3b",
            "sha1": "84c82835a5d21bbcf75a61706d8ab5493174fdd5",
            "sha256": "8a68ca91873a6071ed66a81c7629c830b2968f6b24268dbf4b8c4282c9ba9728",
            "ssdeep": "12288:Tbgsfx42SWfCgNuzq4xutG4bd0h9dXNCHou:Tb3fx42gCh0utGh",
            "TLSH": "34A55E17E19A85F212A13FA313E14B1A5F5C",
            "tags": "ransomware, self-propagating",
        },
        "Emotet": {
            "md5": "dbc68e0c63b93254b5f8d3c927d4fca9",
            "sha1": "32ff7cbdc53c06db3214cf6bfa00a79d5a372083",
            "sha256": "a4d5c2e4dc1bd60afae326fc6b774c0b15e3cd589f6e13390d14d458d5c35f8b",
            "ssdeep": "3072:TbIEQvhCfCgNuzc9xutG4XNIANCCAo:T9vhC3CuzrutGh",
            "TLSH": "22F97E171F8D81EAF5E1212FEAE31E4A5A3F",
            "tags": "banking_trojan, information_stealer",
        },
        "Mirai": {
            "md5": "6ef4c4b0f4d5e772f9e7b839a18796c9",
            "sha1": "9fa1c7ffb3b7c6e9827cbfa62f716af916e98e12",
            "sha256": "2a4ed72d7314b9dc6fae62c83223b5a8341d7c8b4a5ad8a7a54a638142b676d8",
            "ssdeep": "1536:TbCBvhCfCbNuzq6xutG4bd0ho:T9vhC3NqkutGh",
            "TLSH": "4E1412E41A9F81AFF5F131BEFAE21E2A5E5A",
            "tags": "botnet, DDoS",
        },
        "Stuxnet": {
            "md5": "de8a6f9f0b2b2e554b8a9f10de7c275b",
            "sha1": "b0b2c4b05f5f1a4b2e5c4f5f4a5a5a5b0c0d",
            "sha256": "4b6a4e5a6e4c9f0e0d6f4b9e7f8a5e6c8b6e7f8d0e5a5c4b7a5e6f8c0a9b6e5",
            "ssdeep": "512:X9vhCbNuzq4xutG4bd0Hoch:X9vhbuzbxth",
            "TLSH": "72E31A8A6B9F81BFF5E151BEAFAE12E8A6A6F",
            "tags": "worm, industrial_control_system",
        },
        "Ryuk": {
            "md5": "7c78e9c4c4e7b9b4a9f7a8b5a6e7c9b4",
            "sha1": "b4b9a5c9f7a8e5a4b2e9a8c6f7b9a4e6c8d5",
            "sha256": "6b4e7f9b7a8e6c9a4e5b2c4a7e8d6f5c8a9e4f7b6c5a4e9a8b7f6c9e5a4b7a6c9",
            "ssdeep": "2048:X9Qh2CdCgNuzq4xutGh:X92dCNqkutGh",
            "TLSH": "11F12E2E1A9F81A5F5E111BFAEA11A4A1A5F",
            "tags": "ransomware, targeted",
        },
    }

    haystack = report.lower()

    # Each hit is a fresh dict: the family name followed by its table fields.
    return [
        {"Name": family, **attributes}
        for family, attributes in known_malware.items()
        if family.lower() in haystack
    ]

# Main function to extract threat intelligence
def extract_threat_intelligence(report):
    """
    Runs every extractor on a report and merges the results.

    Args:
        report (str): The input report text.

    Returns:
        dict: A dictionary with the keys "IoCs", "TTPs", "Threat Actor(s)",
        "Malware" and "Targeted Entities", in that order.
    """
    # IoCs and TTPs go straight into the combined result.
    intelligence = {
        "IoCs": extract_iocs(report),
        "TTPs": extract_ttps(report),
    }

    # One entity pass feeds two separate output keys.
    found_entities = extract_entities(report)
    intelligence["Threat Actor(s)"] = found_entities["Threat Actor(s)"]

    # Malware sits between the two entity keys in the output layout.
    intelligence["Malware"] = extract_malware(report)
    intelligence["Targeted Entities"] = found_entities["Targeted Entities"]

    return intelligence

# Example Input Report
# Sample text containing an IP address, a domain, a malware family name and
# several tactic/technique keywords, used to exercise every extractor.
report_text = '''
The APT33 group, suspected to be from Iran, has launched a new campaign targeting
the energy sector organizations.
The attack utilizes Shamoon malware, known for its destructive capabilities. The threat
actor exploited a vulnerability in the network perimeter to gain initial access.
The malware was delivered via spear-phishing emails containing a malicious
attachment. The malware's behavior was observed communicating with IP address
192.168.1.1 and domain example.com. The attack also involved lateral movement using
PowerShell scripts.
'''

# Run extraction
# Executed at module level, so this runs whenever the cell/file is executed.
result = extract_threat_intelligence(report_text)

# Print results
# json is imported here rather than at the top of the file; the result is
# dumped as indented JSON (tuples are emitted as JSON arrays).
import json
print(json.dumps(result, indent=4))

{
    "IoCs": {
        "IP addresses": [
            "192.168.1.1"
        ],
        "Domains": [
            "example.com"
        ]
    },
    "TTPs": {
        "Tactics": [
            [
                "TA0001",
                "Initial Access"
            ],
            [
                "TA0002",
                "Execution"
            ],
            [
                "TA0008",
                "Lateral Movement"
            ]
        ],
        "Techniques": [
            [
                "T1566.001",
                "Spear Phishing Attachment"
            ],
            [
                "T1059.001",
                "PowerShell"
            ]
        ]
    },
    "Threat Actor(s)": [],
    "Malware": [
        {
            "Name": "Shamoon",
            "md5": "5a105e8b9d40e1329780d62ea2265d8a",
            "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
            "ssdeep": "3:ai6