In [3]:
import re
import spacy

# Initialize spaCy model
# Loaded once at module level; extract_entities() runs every report through
# this pipeline and reads the named entities it produces.
nlp = spacy.load("en_core_web_sm")

# Placeholder for MITRE ATT&CK mappings
# Maps a human-readable tactic/technique name to its ATT&CK identifier
# ("TA…" entries are tactics, "T…" entries are techniques). Only the entries
# that extract_ttps() looks up are listed here.
MITRE_TTPs = {
    "Initial Access": "TA0001",
    "Execution": "TA0002",
    "Lateral Movement": "TA0008",
    "Spear Phishing Attachment": "T1566.001",
    "PowerShell": "T1059.001"
}

# Extract Indicators of Compromise (IoCs)
def extract_iocs(report):
    """Extract IPv4 addresses and domain names from a free-text report.

    Args:
        report: The report text to scan.

    Returns:
        A dict with two keys:
          * "IP addresses": dotted-quad strings whose four octets are all
            in the range 0-255.
          * "Domains": strings that look like dot-separated host names
            ending in an alphabetic label of at least two characters.
        Each list is de-duplicated and keeps first-occurrence order.
    """
    # The regex only checks the "up to three digits per octet" shape, so
    # impossible addresses such as 999.300.1.1 must be filtered afterwards.
    ip_candidates = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', report)
    ips = [
        candidate
        for candidate in ip_candidates
        if all(int(octet) <= 255 for octet in candidate.split("."))
    ]

    domains = re.findall(r'(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}', report)

    # dict.fromkeys drops repeats while preserving insertion order, so an
    # indicator mentioned several times in the report is reported once.
    return {
        "IP addresses": list(dict.fromkeys(ips)),
        "Domains": list(dict.fromkeys(domains)),
    }

# Extract TTPs
def extract_ttps(report):
    """Map keyword mentions in a report to MITRE ATT&CK tactics/techniques.

    Matching is a case-insensitive substring search. Whitespace in the
    report is collapsed first so that a phrase broken across a line wrap
    (e.g. "initial\\naccess") is still recognised.

    Args:
        report: The report text to scan.

    Returns:
        A dict with keys "Tactics" and "Techniques"; each value is a list of
        ``[attack_id, name]`` pairs, in the fixed order the keywords are
        checked below.
    """
    # Lower-case once and normalise every whitespace run to a single space.
    text = " ".join(report.lower().split())

    tactics = [
        [MITRE_TTPs[name], name]
        for name in ("Initial Access", "Execution", "Lateral Movement")
        if name.lower() in text
    ]

    techniques = []
    # Reports spell this technique several ways; accept the common variants.
    spear_phishing_spellings = ("spear-phishing", "spear phishing", "spearphishing")
    if any(spelling in text for spelling in spear_phishing_spellings):
        techniques.append([MITRE_TTPs["Spear Phishing Attachment"], "Spear Phishing Attachment"])
    if "powershell" in text:
        techniques.append([MITRE_TTPs["PowerShell"], "PowerShell"])

    return {"Tactics": tactics, "Techniques": techniques}

# Extract Threat Actors and Targeted Entities
def extract_entities(report):
    """Identify threat actors and targeted entities mentioned in a report.

    Runs the module-level spaCy pipeline over the text and classifies its
    named entities: ORG entities are recorded as targeted entities and
    PERSON entities as threat actors. Two hardcoded keyword rules then add
    "APT33" (case-sensitive match) and "Energy Sector" (case-insensitive
    match on "energy sector").

    Args:
        report: The report text to analyse.

    Returns:
        A dict with keys "Threat Actor(s)" and "Targeted Entities", each a
        list of unique strings in first-seen order.
    """
    doc = nlp(report)
    threat_actors = []
    targeted_entities = []

    for ent in doc.ents:
        if ent.label_ == "ORG":  # Organizations or sectors
            targeted_entities.append(ent.text)
        elif ent.label_ == "PERSON":  # Threat actors
            threat_actors.append(ent.text)

    # Add hardcoded mappings if required
    if "APT33" in report:
        threat_actors.append("APT33")
    if "energy sector" in report.lower():
        targeted_entities.append("Energy Sector")

    # De-duplicate with dict.fromkeys rather than set(): set iteration order
    # for strings varies between interpreter runs, which made the output
    # (and any JSON produced from it) non-reproducible.
    return {
        "Threat Actor(s)": list(dict.fromkeys(threat_actors)),
        "Targeted Entities": list(dict.fromkeys(targeted_entities)),
    }

# Extract Malware Information
def extract_malware(report):
    """Return malware records for known families named in the report.

    Only "Shamoon" is recognised, via a case-sensitive substring check.
    The hash and fuzzy-hash fields are placeholder strings, not real values.

    Args:
        report: The report text to scan.

    Returns:
        A list holding one record dict when "Shamoon" appears in the
        report, otherwise an empty list.
    """
    # Guard clause: nothing to report unless the family name is present.
    if "Shamoon" not in report:
        return []

    shamoon_record = dict(
        Name="Shamoon",
        md5="placeholder_md5",
        sha1="placeholder_sha1",
        sha256="placeholder_sha256",
        ssdeep="placeholder_ssdeep",
        TLSH="placeholder_TLHS",
        tags="destructive, data_wiper",
    )
    return [shamoon_record]

# Main function to extract threat intelligence
def extract_threat_intelligence(report):
    """Run every extractor over a report and merge the results.

    Args:
        report: The report text to analyse.

    Returns:
        A dict with keys "IoCs", "TTPs", "Threat Actor(s)", "Malware" and
        "Targeted Entities" (in that order), populated from extract_iocs,
        extract_ttps, extract_entities and extract_malware.
    """
    # Run the extractors in a fixed sequence: indicators, TTPs, entities,
    # then malware.
    indicators = extract_iocs(report)
    ttp_matches = extract_ttps(report)
    actors_and_targets = extract_entities(report)
    malware_records = extract_malware(report)

    # Assemble the summary key by key so the output ordering is explicit;
    # the entity result is split into its two top-level fields.
    summary = {"IoCs": indicators, "TTPs": ttp_matches}
    summary["Threat Actor(s)"] = actors_and_targets["Threat Actor(s)"]
    summary["Malware"] = malware_records
    summary["Targeted Entities"] = actors_and_targets["Targeted Entities"]
    return summary

# Example Input Report
# Sample narrative used to exercise the extractors; it mentions an actor
# (APT33), a malware family (Shamoon), one IP address, one domain and several
# tactic/technique keywords.
report_text = '''
The APT33 group, suspected to be from Iran, has launched a new campaign targeting
the energy sector organizations.
The attack utilizes Shamoon malware, known for its destructive capabilities. The threat
actor exploited a vulnerability in the network perimeter to gain initial access.
The malware was delivered via spear-phishing emails containing a malicious
attachment. The malware's behavior was observed communicating with IP address
192.168.1.1 and domain example.com. The attack also involved lateral movement using
PowerShell scripts.
'''

# Run extraction
# Produces the combined dict of IoCs, TTPs, actors, malware and targets.
result = extract_threat_intelligence(report_text)

# Print results
# Serialise the result as JSON with a 4-space indent for readability.
import json
print(json.dumps(result, indent=4))

{
    "IoCs": {
        "IP addresses": [
            "192.168.1.1"
        ],
        "Domains": [
            "example.com"
        ]
    },
    "TTPs": {
        "Tactics": [
            [
                "TA0001",
                "Initial Access"
            ],
            [
                "TA0008",
                "Lateral Movement"
            ]
        ],
        "Techniques": [
            [
                "T1566.001",
                "Spear Phishing Attachment"
            ],
            [
                "T1059.001",
                "PowerShell"
            ]
        ]
    },
    "Threat Actor(s)": [
        "APT33"
    ],
    "Malware": [
        {
            "Name": "Shamoon",
            "md5": "placeholder_md5",
            "sha1": "placeholder_sha1",
            "sha256": "placeholder_sha256",
            "ssdeep": "placeholder_ssdeep",
            "TLSH": "placeholder_TLHS",
            "tags": "destructive, data_wiper"
        }
    ],
    "Targeted Entities": [
   