In [8]:
import re
from transformers import pipeline

# Initialize Hugging Face's NER pipeline
# Built once at module level so every extract_entities() call reuses the same
# loaded model. aggregation_strategy="simple" makes the pipeline return grouped
# entities; extract_entities() reads the 'entity_group' and 'word' keys of each
# result.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Placeholder for MITRE ATT&CK mappings
# Maps a human-readable tactic/technique name to its ATT&CK identifier.
# extract_ttps() looks entries up by these exact key strings, so a key must
# not be renamed here without updating the keyword table in that function.
MITRE_TTPs = {
    # Tactics
    "Initial Access": "TA0001",
    "Execution": "TA0002",
    "Persistence": "TA0003",
    "Privilege Escalation": "TA0004",
    "Defense Evasion": "TA0005",
    "Credential Access": "TA0006",
    "Discovery": "TA0007",
    "Lateral Movement": "TA0008",
    "Collection": "TA0009",
    # NOTE(review): TA0011 is listed before TA0010, so entries are not in
    # numeric ID order — presumably intentional; confirm.
    "Command and Control": "TA0011",
    "Exfiltration": "TA0010",
    "Impact": "TA0040",
    
    # Techniques
    # NOTE(review): ATT&CK appears to spell T1566.001 "Spearphishing
    # Attachment" — confirm against the official matrix before relying on the
    # name in reports.
    "Spear Phishing Attachment": "T1566.001",
    "PowerShell": "T1059.001",
}

# Extract Indicators of Compromise (IoCs)
def extract_iocs(report):
    """Extract IPv4 addresses and domain names from free-form report text.

    Args:
        report: The threat report as a single string.

    Returns:
        A dict with two keys:
        ``"IP addresses"`` - dotted-quad strings whose four octets are all in
        the range 0-255, and ``"Domains"`` - dot-separated host names ending
        in an alphabetic label of at least two letters. Both lists are
        de-duplicated and keep the order of first appearance in ``report``.
    """
    ip_candidates = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', report)
    # The pattern alone accepts octets up to 999 (e.g. "999.300.1.1"), so
    # range-check each octet instead of reporting impossible addresses.
    valid_ips = [
        candidate
        for candidate in ip_candidates
        if all(int(octet) <= 255 for octet in candidate.split("."))
    ]

    # The final label must be alphabetic, so dotted-quad IPs are never
    # reported as domains.
    domain_candidates = re.findall(r'(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}', report)

    # dict.fromkeys drops repeated indicators while preserving first-seen
    # order, so an IoC mentioned several times is listed once.
    return {
        "IP addresses": list(dict.fromkeys(valid_ips)),
        "Domains": list(dict.fromkeys(domain_candidates)),
    }

def extract_ttps(report):
    """Find MITRE ATT&CK tactics and techniques mentioned in a report.

    Matching is a case-insensitive substring search for a fixed set of
    keywords. Each hit is emitted as a one-entry dict ``{attack_id: name}``,
    with the ID looked up in the module-level ``MITRE_TTPs`` table.

    Args:
        report: The threat report as a single string.

    Returns:
        A dict with the keys ``"Tactics"`` and ``"Techniques"``, each holding
        a list of ``{attack_id: name}`` dicts in keyword-table order.
    """
    # (keyword to search for, result bucket, key into MITRE_TTPs)
    keyword_table = (
        ("initial access", "Tactics", "Initial Access"),
        ("execution", "Tactics", "Execution"),
        ("lateral movement", "Tactics", "Lateral Movement"),
        ("spear-phishing", "Techniques", "Spear Phishing Attachment"),
        ("powershell", "Techniques", "PowerShell"),
    )

    found = {"Tactics": [], "Techniques": []}
    lowered_report = report.lower()

    for needle, bucket, label in keyword_table:
        if needle not in lowered_report:
            continue
        found[bucket].append({MITRE_TTPs[label]: label})

    return found

def extract_entities(report):
    """Extract threat-actor names and targeted locations using the NER model.

    The report is run through the module-level ``ner_pipeline``. Each returned
    entity is read through its ``'entity_group'`` and ``'word'`` keys:

    * ``ORG`` entities count as threat actors only when the text contains
      ``"APT"`` (case-sensitive) or ``"group"`` / ``"actor"``
      (case-insensitive).
    * ``LOC`` entities are all reported as targeted entities.

    NOTE(review): every location is treated as a target, including one that is
    only named as a suspected origin of the actor — confirm that is intended.

    Args:
        report: The threat report as a single string.

    Returns:
        A dict with the keys ``"Threat Actor(s)"`` and ``"Targeted Entities"``,
        each a de-duplicated list of strings in sorted order.
    """
    # Use the Hugging Face NER model to extract entities
    ner_results = ner_pipeline(report)

    threat_actors = set()
    targeted_entities = set()

    # Inspect and process each entity
    for entity in ner_results:
        group = entity['entity_group']
        if group == 'ORG':  # Organization (threat actors)
            word = entity['word']
            lowered = word.lower()
            if "APT" in word or "group" in lowered or "actor" in lowered:
                threat_actors.add(word)
        elif group == 'LOC':  # Location (targeted entities, i.e., geopolitical entities)
            targeted_entities.add(entity['word'])

    # Set iteration order for strings varies between interpreter runs (hash
    # randomization), so sort to keep the output reproducible.
    return {
        "Threat Actor(s)": sorted(threat_actors),
        "Targeted Entities": sorted(targeted_entities)
    }

def extract_malware(report):
    """Return details for every known malware family named in the report.

    Family names are matched as case-insensitive substrings of ``report``
    against a small built-in table.

    Args:
        report: The threat report as a single string.

    Returns:
        A list with one dict per matched family. Each dict starts with a
        ``"Name"`` key followed by the stored hash/tag fields for that family.
        The list is empty when nothing matches.
    """
    # Built-in reference table: family name -> stored hashes and tags.
    signature_db = {
        "Shamoon": {
            "md5": "5a105e8b9d40e1329780d62ea2265d8a",
            "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
            "ssdeep": "3:ai6hn3nP:BBin",
            "TLSH": "54D15E7E17F9D81AF5E131BEFAE13E1A5E2F",
            "tags": "destructive, data_wiper"
        }
    }

    haystack = report.lower()
    return [
        {"Name": family, **attributes}
        for family, attributes in signature_db.items()
        if family.lower() in haystack
    ]

def extract_threat_intelligence(report):
    """Run every extractor on a report and merge the results into one dict.

    Args:
        report: The threat report as a single string.

    Returns:
        A dict with the keys ``"IoCs"``, ``"TTPs"``, ``"Threat Actor(s)"``,
        ``"Malware"`` and ``"Targeted Entities"`` (in that order), filled from
        extract_iocs, extract_ttps, extract_entities and extract_malware.
    """
    intel = {
        "IoCs": extract_iocs(report),
        "TTPs": extract_ttps(report),
    }
    named_entities = extract_entities(report)
    malware_hits = extract_malware(report)

    # The entity results are split across two top-level keys, with the malware
    # list placed between them.
    intel["Threat Actor(s)"] = named_entities["Threat Actor(s)"]
    intel["Malware"] = malware_hits
    intel["Targeted Entities"] = named_entities["Targeted Entities"]
    return intel

# Example Input Report
# Sample text used to exercise every extractor: it mentions an actor, a
# malware family, an IP address, a domain and several TTP keywords.
report_text = '''
The APT33 group, suspected to be from Iran, has launched a new campaign targeting
the energy sector organizations.
The attack utilizes Shamoon malware, known for its destructive capabilities. The threat
actor exploited a vulnerability in the network perimeter to gain initial access.
The malware was delivered via spear-phishing emails containing a malicious
attachment. The malware's behavior was observed communicating with IP address
192.168.1.1 and domain example.com. The attack also involved lateral movement using
PowerShell scripts.
'''

# Run extraction
# Runs at module level, so executing this cell/file invokes the NER model.
result = extract_threat_intelligence(report_text)

# Print results
# NOTE(review): json is imported here rather than with the other imports at
# the top of the cell; consider moving it there.
import json
print(json.dumps(result, indent=4))

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps:0


{
    "IoCs": {
        "IP addresses": [
            "192.168.1.1"
        ],
        "Domains": [
            "example.com"
        ]
    },
    "TTPs": {
        "Tactics": [
            {
                "TA0001": "Initial Access"
            },
            {
                "TA0008": "Lateral Movement"
            }
        ],
        "Techniques": [
            {
                "T1566.001": "Spear Phishing Attachment"
            },
            {
                "T1059.001": "PowerShell"
            }
        ]
    },
    "Threat Actor(s)": [
        "APT33"
    ],
    "Malware": [
        {
            "Name": "Shamoon",
            "md5": "5a105e8b9d40e1329780d62ea2265d8a",
            "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
            "ssdeep": "3:ai6hn3nP:BBin",
            "TLSH": "54D15E7E17F9D81AF5E131BEFAE13E1A5E2F",
            "tags": "destructive, data_wiper"
       