In [None]:
import os
import re
import requests
from transformers import pipeline
from dotenv import load_dotenv
import PyPDF2
import json

# Load environment variables from .env file
load_dotenv()
# VirusTotal API key read from the VIRUS_TOTAL_API_KEY environment variable.
# os.getenv returns None when the variable is not set; the value is sent
# as the "x-apikey" request header by virus_total_lookup().
VT_API_KEY = os.getenv("VIRUS_TOTAL_API_KEY")


# Initialize Hugging Face's NER pipeline
# The pipeline is constructed at module load time, so the model checkpoint is
# loaded as soon as this cell/module runs.
# NOTE(review): none of the functions visible in this file call ner_pipeline;
# confirm it is used elsewhere before paying the load cost here.
ner_pipeline = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english", aggregation_strategy="simple")

# Placeholder for MITRE ATT&CK mappings
# Maps a tactic name to its ATT&CK tactic ID ("TAxxxx").
# NOTE(review): nothing in the visible code reads this table yet, and it does
# not list every ATT&CK tactic (see the trailing comment inside the dict).
MITRE_TTPs = {
    "Initial Access": "TA0001",
    "Execution": "TA0002",
    "Persistence": "TA0003",
    "Privilege Escalation": "TA0004",
    "Defense Evasion": "TA0005",
    "Credential Access": "TA0006",
    "Discovery": "TA0007",
    "Lateral Movement": "TA0008",
    "Collection": "TA0009",
    "Command and Control": "TA0011",
    "Exfiltration": "TA0010",
    "Impact": "TA0040",
    # Add more mappings if needed
}

# Hardcoded malware definitions
# Maps a malware family name to a dict with "md5", "sha1" and "sha256" hex
# digests. extract_malware() searches report text for each name and passes
# the "sha256" value to virus_total_lookup().
# NOTE(review): several of these values look like placeholder digests rather
# than hashes of real samples -- e.g. the three "Ryuk" values are the
# well-known MD5/SHA-1/SHA-256 digests of empty input, and the "Locky"
# md5/sha1 values match the digests of the string "test". Verify every entry
# against a trusted source before relying on the lookup results.
known_malware = {
    "WannaCry": {"md5": "e6f77589c78fabc7cb84c4d5a4318d55", "sha1": "af7b26a981c024c65b9199b1b82a14de48f2596f", "sha256": "3ae56a1fca8cbd5a5b9e0dc573d8fa73a9d3e0d98c56140871f712d764cd44fa"},
    "NotPetya": {"md5": "71b6a493388e7d0b40c83ce903bc6b04", "sha1": "643d1c2f2c78b4cb45d6f6ae8c8e8d342b5e3e41", "sha256": "e285b6b6c0503376f5ff4c1c1231b1cc0415fcadf47fbc7f4551f22b276e20a4"},
    "Emotet": {"md5": "74c8c349d8d517ed74cd950afabb5b26", "sha1": "b5d8ff12e7409372171f6c497604eb08e6bb6a84", "sha256": "b8cb6c7dfc95fdd1b32a5892c7e56b11d95fd75920c47f07b979c7163699b5d7"},
    "TrickBot": {"md5": "5d41402abc4b2a76b9719d911017c592", "sha1": "2c3b9d42a38b6fcb177ed982c702cb65e4a3c9ea", "sha256": "7dfc1f1e68b4b8bb40573a3ea4b9d4d378f0d40b81780d5f2c491c148947a2b1"},
    "Dridex": {"md5": "827ccb0eea8a706c4c34a16891f84e7b", "sha1": "5c7b6c2d9b927e1537c12bc765f2dfed6b841123", "sha256": "d16b6dc9a89f1a567c40c1874b5cfb7fd07c72d8dbdbcb61fc7a09c6fa1b6b36"},
    "Zeus": {"md5": "1a79a4d60de6718e8e5b326e338ae533", "sha1": "7c211433f02071597741e6ff5a8ea34789abbf43", "sha256": "356a192b7913b04c54574d18c28d46e6395428ab5b9e642b0429baad106e94b5"},
    "Ryuk": {"md5": "d41d8cd98f00b204e9800998ecf8427e", "sha1": "da39a3ee5e6b4b0d3255bfef95601890afd80709", "sha256": "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"},
    "Locky": {"md5": "098f6bcd4621d373cade4e832627b4f6", "sha1": "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", "sha256": "9e107d9d372bb6826bd81d3542a419d6bdaea8f5672cd455ad17b2d8b2bbdcbf"},
    "DarkSide": {"md5": "c4ca4238a0b923820dcc509a6f75849b", "sha1": "03c7c0ace395d80182db07ae2c30f034b9e7fd16", "sha256": "e99a18c428cb38d5f260853678922e03abd833d679d5b9921d3c9cd21f4e9b37"},
    "Conti": {"md5": "eccbc87e4b5ce2fe28308fd9f2a7baf3", "sha1": "f96b697d7cb7938d525a2f31aaf161d0ff8b4939", "sha256": "45c48cce2e2d7fbdea1afc51c7c6ad26ee1d0400e22cd0e6c3b23e62e7bd6f72"}
}

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of the PDF at *pdf_path*.

    Pages for which the reader yields no text (``None`` or an empty string)
    are skipped. Page texts are joined with no separator between them.

    Args:
        pdf_path: Filesystem path of the PDF file to read.

    Returns:
        The extracted text, or ``""`` when the file cannot be opened or
        parsed. In that case the error is printed instead of being raised.
    """
    try:
        with open(pdf_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            # Extract each page exactly once: text extraction is the costly
            # step, so it must not be repeated just to test for emptiness.
            page_texts = (page.extract_text() for page in pdf_reader.pages)
            return "".join(text for text in page_texts if text)
    except Exception as e:
        # Deliberate best-effort boundary: any I/O or parsing failure is
        # reported and turned into an empty result for the caller to check.
        print(f"Error reading PDF: {e}")
        return ""

# Function to perform a VirusTotal hash lookup
def virus_total_lookup(hash_value, timeout=30):
    """Fetch the VirusTotal v3 file report for *hash_value*.

    Args:
        hash_value: File hash to look up; it is placed in the request URL.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The decoded JSON body when the API answers with HTTP 200, otherwise
        ``None``. Failures are printed rather than raised.
    """
    # Skip the network round trip when the request cannot possibly succeed.
    if not hash_value:
        print("Error: no hash value supplied for VirusTotal lookup.")
        return None
    if not VT_API_KEY:
        print("Error: VIRUS_TOTAL_API_KEY is not set.")
        return None

    url = f"https://www.virustotal.com/api/v3/files/{hash_value}"
    headers = {"x-apikey": VT_API_KEY}
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            return response.json()
        # Error bodies are not guaranteed to be JSON (e.g. a proxy error
        # page), so fall back to the raw text instead of raising.
        try:
            detail = response.json()
        except ValueError:
            detail = response.text
        print(f"Error: {response.status_code} - {detail}")
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a 200 response whose body is not valid JSON.
        print(f"Request error: {e}")
    return None

# Extract malware information
def extract_malware(report):
    """Find known malware families mentioned in *report*.

    Each name in ``known_malware`` is searched for case-insensitively as a
    standalone name: it must not be directly preceded or followed by an ASCII
    letter. This prevents ordinary words from triggering a hit (for example
    "continue" no longer matches "Conti"), while names followed by digits or
    punctuation still match.

    Args:
        report: Report text to scan.

    Returns:
        A list with one dict per matched family, holding the keys "Name",
        "md5", "sha1", "sha256" and "VirusTotal Report". The last is whatever
        ``virus_total_lookup`` returns for the family's sha256 value.
    """
    malware_details = []

    for malware_name, hashes in known_malware.items():
        # Letter-boundary lookarounds instead of a plain substring test, so a
        # name embedded inside a longer word is not reported (and does not
        # cost a VirusTotal request).
        pattern = rf"(?<![A-Za-z]){re.escape(malware_name)}(?![A-Za-z])"
        if not re.search(pattern, report, flags=re.IGNORECASE):
            continue

        malware_details.append({
            "Name": malware_name,
            "md5": hashes.get("md5"),
            "sha1": hashes.get("sha1"),
            "sha256": hashes.get("sha256"),
            "VirusTotal Report": virus_total_lookup(hashes.get("sha256")),
        })

    return malware_details

# Process a PDF and extract intelligence
def process_pdf(pdf_path):
    """Extract text from the PDF at *pdf_path* and scan it for known malware.

    Returns a dict of the form ``{"Malware": [...]}`` built from the result
    of ``extract_malware``, or ``None`` (after printing a message) when no
    text could be extracted from the PDF.
    """
    text = extract_text_from_pdf(pdf_path)
    if text:
        return {"Malware": extract_malware(text)}

    # Empty text means the PDF was unreadable or contained no text.
    print("Failed to extract text from the PDF.")
    return None

# Main execution
if __name__ == "__main__":
    # Replace with your PDF path
    pdf_path = "path_of_pdf.pdf"
    result = process_pdf(pdf_path)
    # Pretty-print the findings, or a fallback notice when nothing came back.
    print(json.dumps(result, indent=4) if result else "No data extracted.")


Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Error reading PDF: [Errno 2] No such file or directory: 'sample_report.pdf'
Failed to extract text from the PDF.
No data extracted.
