In [16]:
import os
import re
import requests
from transformers import pipeline
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()
VT_API_KEY = os.getenv("VIRUS_TOTAL_API_KEY")

# Initialize Hugging Face's NER pipeline
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Placeholder for MITRE ATT&CK mappings
MITRE_TTPs = {
    "Initial Access": "TA0001",
    "Execution": "TA0002",
    "Persistence": "TA0003",
    "Privilege Escalation": "TA0004",
    "Defense Evasion": "TA0005",
    "Credential Access": "TA0006",
    "Discovery": "TA0007",
    "Lateral Movement": "TA0008",
    "Collection": "TA0009",
    "Command and Control": "TA0011",
    "Exfiltration": "TA0010",
    "Impact": "TA0040",
    "Spear Phishing Attachment": "T1566.001",
    "Spear Phishing Link": "T1566.002",
    "Exploitation of Remote Services": "T1210",
    "Valid Accounts": "T1078",
    "PowerShell": "T1059.001",
    "Windows Command Shell": "T1059.003",
    "Scheduled Task/Job": "T1053",
    "Registry Run Keys/Startup Folder": "T1547.001",
    "Boot or Logon Autostart Execution": "T1547",
    "Exploitation for Privilege Escalation": "T1068",
    "Masquerading": "T1036",
    "Obfuscated Files or Information": "T1027",
    "Impair Defenses": "T1562",
    "Indicator Removal on Host": "T1070",
    "Credential Dumping": "T1003",
    "Network Service Discovery": "T1046",
    "Remote File Copy": "T1105",
    "Remote Services": "T1021",
    "System Information Discovery": "T1082",
    "File and Directory Discovery": "T1083",
    "Data Compressed": "T1560",
    "Encrypted Channel": "T1573",
    "Web Service": "T1102",
    "Data Staged": "T1074",
    "Data from Local System": "T1005",
    "Inhibit System Recovery": "T1490",
    "System Shutdown/Reboot": "T1529",
    "Data Encrypted for Impact": "T1486"
}

# Extract Indicators of Compromise (IoCs)
def extract_iocs(report):
    ips = re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', report)
    domains = re.findall(r'(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}', report)
    return {"IP addresses": ips, "Domains": domains}

def extract_ttps(report):
    tactics = set()  # Use a set to avoid duplicates
    techniques = []

    # Define keyword-to-TTP mapping
    keywords_mapping = {
        "initial access": ("Tactics", "Initial Access"),
        "execution": ("Tactics", "Execution"),
        "persistence": ("Tactics", "Persistence"),
        "privilege escalation": ("Tactics", "Privilege Escalation"),
        "defense evasion": ("Tactics", "Defense Evasion"),
        "discovery": ("Tactics", "Discovery"),
        "lateral movement": ("Tactics", "Lateral Movement"),
        "collection": ("Tactics", "Collection"),
        "command and control": ("Tactics", "Command and Control"),
        "exfiltration": ("Tactics", "Exfiltration"),
        "impact": ("Tactics", "Impact"),
        "spear-phishing": ("Techniques", "Spear Phishing Attachment"),
        "phishing link": ("Techniques", "Spear Phishing Link"),
        "exploitation of remote services": ("Techniques", "Exploitation of Remote Services"),
        "valid accounts": ("Techniques", "Valid Accounts"),
        "powershell": ("Techniques", "PowerShell"),
        "command shell": ("Techniques", "Windows Command Shell"),
        "scheduled task": ("Techniques", "Scheduled Task/Job"),
        "registry run keys": ("Techniques", "Registry Run Keys/Startup Folder"),
        "remote file copy": ("Techniques", "Remote File Copy"),
        "network discovery": ("Techniques", "Network Service Discovery"),
        "data compressed": ("Techniques", "Data Compressed"),
    }

    technique_to_tactic = {
        "Spear Phishing Attachment": "Initial Access",
        "PowerShell": "Execution",
        "Windows Command Shell": "Execution",
        "Remote File Copy": "Lateral Movement",
        "Network Service Discovery": "Discovery",
    }

    for keyword, (category, description) in keywords_mapping.items():
        if keyword in report.lower():
            if category == "Tactics":
                tactics.add((MITRE_TTPs[description], description))
            elif category == "Techniques":
                techniques.append((MITRE_TTPs[description], description))
                if description in technique_to_tactic:
                    inferred_tactic = technique_to_tactic[description]
                    tactics.add((MITRE_TTPs[inferred_tactic], inferred_tactic))

    return {"Tactics": list(tactics), "Techniques": techniques}

def extract_entities(report):
    ner_results = ner_pipeline(report)
    threat_actors = set()
    targeted_entities = set()
    
    for entity in ner_results:
        if entity['entity_group'] == 'ORG':
            if "APT" in entity['word'] or "group" in entity['word'].lower() or "actor" in entity['word'].lower():
                threat_actors.add(entity['word'])
        
        if entity['entity_group'] == 'LOC':
            targeted_entities.add(entity['word'])
        
    return {"Threat Actor(s)": list(threat_actors), "Targeted Entities": list(targeted_entities)}

# Function to perform a VirusTotal hash lookup
def virus_total_lookup(hash_value):
    url = f"https://www.virustotal.com/api/v3/files/{hash_value}"
    headers = {
        "x-apikey": VT_API_KEY
    }
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
        else:
            return {"error": f"Unable to fetch data: {response.status_code}"}
    except requests.RequestException as e:
        return {"error": str(e)}

# Define known malware families (this list can be expanded over time)
known_malware = {
    "Shamoon": {
        "md5": "5a105e8b9d40e1329780d62ea2265d8a",
        "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
        "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
    },
    "WannaCry": {
        "md5": "91c5c4b0b1c6e972f8c6ff679aa87a3b",
        "sha1": "84c82835a5d21bbcf75a61706d8ab5493174fdd5",
        "sha256": "8a68ca91873a6071ed66a81c7629c830b2968f6b24268dbf4b8c4282c9ba9728",
    },
    "NotPetya": {
        "md5": "91c5c4b0b1c6e972f8c6ff679aa87a3b",
        "sha1": "84c82835a5d21bbcf75a61706d8ab5493174fdd5",
        "sha256": "1aba3e1865753ed876013537812846b7b842306ea5a1b96a2658a0478124a6cf",
    },
    "Conficker": {
        "md5": "f42e2b7c7325b893e2f69497d7d6a10d",
        "sha1": "d6a38422065ed56767a3a702495d7669e755756d",
        "sha256": "ac75996784773afa5465926129939963df09eb2700d5774352abcdeab37845c1",
    }
}
# Add more known malware entries here...

def extract_malware(report):
    malware_details = []

    # Check for known malware names in the report
    for malware_name, hashes in known_malware.items():
        if malware_name.lower() in report.lower():
            # Add the malware details if a known malware name is mentioned
            malware_details.append({
                "Name": malware_name,
                "md5": hashes.get("md5", "N/A"),
                "sha1": hashes.get("sha1", "N/A"),
                "sha256": hashes.get("sha256", "N/A"),
                "ssdeep": "N/A",  # Placeholder (update if VirusTotal provides ssdeep)
                "TLSH": "N/A",  # Placeholder (update if VirusTotal provides TLSH)
                "tags": ["known malware", malware_name]  # Example tag
            })

    if not malware_details:
        print("No malware found in the report.")
    return malware_details


def extract_threat_intelligence(report):
    iocs = extract_iocs(report)
    ttps = extract_ttps(report)
    entities = extract_entities(report)
    malware = extract_malware(report)

    return {
        "IoCs": iocs,
        "TTPs": ttps,
        "Threat Actor(s)": entities["Threat Actor(s)"],
        "Malware": malware,
        "Targeted Entities": entities["Targeted Entities"]
    }

# Example Input Report
report_text = '''
The APT33 group, suspected to be from Iran, has launched a new campaign targeting
the energy sector organizations.
The attack utilizes Shamoon malware, known for its destructive capabilities. The threat
actor exploited a vulnerability in the network perimeter to gain initial access.
The malware was delivered via spear-phishing emails containing a malicious
attachment. The malware's behavior was observed communicating with IP address
192.168.1.1 and domain example.com. The attack also involved lateral movement using
PowerShell scripts.
'''

# Run extraction
result = extract_threat_intelligence(report_text)

# Print results
import json
print(json.dumps(result, indent=4))


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


{
    "IoCs": {
        "IP addresses": [
            "192.168.1.1"
        ],
        "Domains": [
            "example.com"
        ]
    },
    "TTPs": {
        "Tactics": [
            [
                "TA0002",
                "Execution"
            ],
            [
                "TA0001",
                "Initial Access"
            ],
            [
                "TA0008",
                "Lateral Movement"
            ]
        ],
        "Techniques": [
            [
                "T1566.001",
                "Spear Phishing Attachment"
            ],
            [
                "T1059.001",
                "PowerShell"
            ]
        ]
    },
    "Threat Actor(s)": [
        "APT33"
    ],
    "Malware": [
        {
            "Name": "Shamoon",
            "md5": "5a105e8b9d40e1329780d62ea2265d8a",
            "sha1": "2fd4e1c67a2d28fced849ee1bb76e7391b93eb12",
            "sha256": "3a7bd3e2360a4dfafad47e17d0c4a3d7252e98d6e3c8c3f3ed0a792d8fa45b4b",
       