# Entity and Relationship Extraction for Threat Intelligence

## Overview
This notebook implements entity and relationship extraction from threat intelligence text using LLM-based approach.

### Task Description
- **Input**: Threat intelligence text content
- **Output**: Named entities and relationships in structured format
- **Entity Types**: malware, threat type, attacker, vulnerability, tool, etc.
- **Relationship Types**: use, target, exploit, etc.

### Example
**Input**: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).

**Output**:
- Named Entities: (Clasiopa, attacker), (custom malware, malware), (Backdoor.Atharvan, malware)
- Relationships: (Clasiopa, use, custom malware), (custom malware, name, Backdoor.Atharvan)


In [1]:
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict
import datetime

# Load environment and model setup
from dotenv import load_dotenv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load environment variables
load_dotenv()

print("🔧 Setting up Entity & Relationship Extraction Pipeline")
print("=" * 60)


🔧 Setting up Entity & Relationship Extraction Pipeline


In [2]:
def load_data(input_file: str) -> list:
    """
    Load threat intelligence data from JSON file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {input_file}")
        return data
    except Exception as e:
        print(f"❌ Error loading {input_file}: {e}")
        return []

# Load threat intelligence data
data_path = '../data/processed/merged_threat_intelligence.json'
data = load_data(data_path)

if data:
    print(f"📊 Sample data structure:")
    print(f"   Keys: {list(data[0].keys())}")
    print(f"   Title: {data[0]['title'][:100]}...")


✅ Loaded 427 records from ../data/processed/merged_threat_intelligence.json
📊 Sample data structure:
   Keys: ['title', 'content', 'link']
   Title: FortiGuard Labs Threat Research...


In [3]:
# Device setup
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"🖥️  Using device: {device.upper()}")
print(f"🔧 PyTorch version: {torch.__version__}")

# Memory cleanup
if device == "cuda":
    torch.cuda.empty_cache()
elif device == "mps":
    import gc
    gc.collect()
    if hasattr(torch.mps, 'empty_cache'):
        torch.mps.empty_cache()


🖥️  Using device: CUDA
🔧 PyTorch version: 2.7.1+cu128


In [4]:
# Get configuration from environment
HF_TOKEN = os.getenv('HF_TOKEN')
DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')
FALLBACK_MODEL = os.getenv('FALLBACK_MODEL', 'gpt2')

def setup_model_for_extraction(model_name: str = None, hf_token: str = None):
    """
    Tải model từ Hugging Face với token từ environment variables.
    """
    model_name = model_name or DEFAULT_MODEL
    hf_token = hf_token or HF_TOKEN

    print(f"🤖 Đang tải mô hình: {model_name}")
    print(f"📱 Thiết bị: {device.upper()}")
    print(f"🔑 Token: {'✅ Found' if hf_token else '❌ Missing'}")

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        # Thiết lập kiểu dữ liệu và bản đồ thiết bị
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        device_map = "auto" if device == "cuda" else None

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            device_map=device_map,
            use_cache=False
        )

        if device_map is None and device in ["mps", "cuda"]:
            model.to(device)

        if device_map is None:
            # Nếu không sử dụng device_map="auto", có thể chỉ định device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                device=0 if device != "cpu" else -1,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )
        else:
            # Nếu sử dụng device_map="auto", không chỉ định device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )

        print(f"✅ Đã tải thành công {model_name} trên {device.upper()}")
        return pipe

    except Exception as e:
        print(f"❌ Lỗi khi tải {model_name}: {e}")
        return setup_fallback_model(hf_token)

def setup_fallback_model(hf_token: str = None):
    """
    Tải fallback model nếu model chính lỗi.
    """
    fallback_name = FALLBACK_MODEL
    hf_token = hf_token or HF_TOKEN
    print(f"🔄 Đang tải mô hình dự phòng: {fallback_name}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(fallback_name, token=hf_token)
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            fallback_name,
            token=hf_token,
            torch_dtype=torch.float32,
            use_cache=False
        )

        if device in ["cuda", "mps"]:
            model.to(device)

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device != "cpu" else -1,
            model_kwargs={"use_cache": False}
        )

        print(f"✅ {FALLBACK_MODEL} đã sẵn sàng trên {device.upper()}")
        return pipe

    except Exception as e:
        print(f"❌ Lỗi khi tải {FALLBACK_MODEL} fallback: {e}")
        return None

# Load model
extraction_model = setup_model_for_extraction()


🤖 Đang tải mô hình: Qwen/Qwen2.5-1.5B-Instruct
📱 Thiết bị: CUDA
🔑 Token: ✅ Found


Device set to use cuda:0


✅ Đã tải thành công Qwen/Qwen2.5-1.5B-Instruct trên CUDA


In [5]:
def create_entity_extraction_prompt(text: str) -> str:
    """
    Create prompt for entity and relationship extraction focusing on core cybersecurity entity types.
    """
    # Truncate text to avoid token limits
    text_truncated = (text[:1500] if text else "").replace('\n', ' ').strip()
    
    prompt = f"""Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishing')
- Tool: Security tools or attack tools (e.g., 'PowerShell', 'Cobalt Strike', 'EHole')
- Vulnerability: Security weaknesses (e.g., 'CVE-2020-1472', 'CVE-2021-44228')
- IP: IP addresses (e.g., '45.153.243.93', '192.168.1.100')
- Domain: Domain names (e.g., 'malicious-domain[.]com', 'evil[.]example[.]com')
- URL: URLs (e.g., 'hxxp://178.73.192[.]15/cal.exe')
- File: File names (e.g., 'rtk.lnk', 'payload.exe', 'shtasks.exe')
- Hash: File hashes (e.g., '2aee8bb2a953124803bc42e5c42935c9', MD5/SHA1/SHA256)

Relationship Types:
- use, hash, aka, execute, used by, download, resolved to, IP, drop, associated with, deploy, communicate with, connect to, install, exploit, contain, run, launch, target, linked to

If there are no entities and relationships pertaining to the specified types, please state 'No related entities and relations'. Make sure to follow the output format shown in the following examples.

Example 1:
Input: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).
Output: Named Entities: (Clasiopa, Attacker), (Backdoor.Atharvan, Malware)\\nRelationships: (Clasiopa, uses, Backdoor.Atharvan)

Example 2:
Input: The Emotet malware has been observed using new phishing techniques to target banking institutions. The malware exploits CVE-2021-1234 vulnerability in Microsoft Office.
Output: Named Entities: (Emotet, Malware), (phishing, Technique), (CVE-2021-1234, Vulnerability), (Microsoft Office, Tool)\\nRelationships: (Emotet, uses, phishing), (Emotet, exploits, CVE-2021-1234)

Example 3:
Input: The threat actor downloaded malicious payload from hxxp://malicious-domain[.]com/payload.exe and used hash 2aee8bb2a953124803bc42e5c42935c9 to verify file integrity. The attack targeted IP address 192.168.1.100.
Output: Named Entities: (threat actor, Attacker), (malicious payload, File), (hxxp://malicious-domain[.]com/payload.exe, URL), (2aee8bb2a953124803bc42e5c42935c9, Hash), (192.168.1.100, IP)\\nRelationships: (threat actor, uses, hxxp://malicious-domain[.]com/payload.exe), (threat actor, targets, 192.168.1.100)

Example 4:
Input: H2Miner botnet uses Kinsing malware and Cobalt Strike to deploy XMRig miners. The campaign communicates with C2 server at evil[.]domain[.]com and is attributed to APT group.
Output: Named Entities: (H2Miner, Threat Type), (Kinsing, Malware), (Cobalt Strike, Tool), (XMRig, Tool), (evil[.]domain[.]com, Domain), (APT group, Attacker)\\nRelationships: (H2Miner, uses, Kinsing), (H2Miner, uses, Cobalt Strike), (H2Miner, uses, XMRig), (Kinsing, communicatesWith, evil[.]domain[.]com), (H2Miner, attributedTo, APT group)

Example 5:
Input: The weather forecast shows sunny skies and moderate temperatures for the weekend.
Output: No related entities and relations

Now extract entities and relationships from the following text:
Input: {text_truncated}
Output:"""
    
    return prompt

# Test the prompt creation
if data:
    sample_prompt = create_entity_extraction_prompt(data[0]['content'])
    print("📝 Sample prompt (first 500 chars):")
    print(sample_prompt[:500] + "...")


📝 Sample prompt (first 500 chars):
Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishi...


In [6]:
def extract_entities_and_relationships(pipe, text: str) -> Dict[str, Any]:
    """
    Extract entities and relationships from text using the LLM.
    """
    try:
        prompt = create_entity_extraction_prompt(text)
        
        # Generate response
        response = pipe(
            prompt,
            max_new_tokens=300,
            do_sample=False,
            temperature=0.1,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )
        
        # Extract generated text
        generated_text = response[0]['generated_text']
        answer = generated_text[len(prompt):].strip()
        
        print(f"🔍 Raw model output: {answer[:200]}...")
        
        # Parse the response
        entities, relationships = parse_extraction_output(answer)
        
        return {
            "raw_output": answer,
            "entities": entities,
            "relationships": relationships,
            "has_entities": len(entities) > 0
        }
        
    except Exception as e:
        print(f"❌ Error in extraction: {e}")
        return {
            "raw_output": "",
            "entities": [],
            "relationships": [],
            "has_entities": False,
            "error": str(e)
        }

def parse_extraction_output(output: str) -> Tuple[List[Tuple], List[Tuple]]:
    """
    Parse the model output to extract entities and relationships.
    """
    entities = []
    relationships = []
    
    # Check for "No related entities" case
    if "no related entities" in output.lower():
        return entities, relationships
    
    try:
        # Split output into lines
        lines = [line.strip() for line in output.split('\n') if line.strip()]
        
        current_section = None
        for line in lines:
            line_lower = line.lower()
            
            if "named entities:" in line_lower:
                current_section = "entities"
                # Extract entities from the same line
                entity_part = line.split(":", 1)[1] if ":" in line else ""
                entities.extend(extract_tuples_from_text(entity_part))
                
            elif "relationships:" in line_lower:
                current_section = "relationships"
                # Extract relationships from the same line
                rel_part = line.split(":", 1)[1] if ":" in line else ""
                relationships.extend(extract_tuples_from_text(rel_part))
                
            elif current_section == "entities":
                entities.extend(extract_tuples_from_text(line))
                
            elif current_section == "relationships":
                relationships.extend(extract_tuples_from_text(line))
    
    except Exception as e:
        print(f"⚠️  Error parsing output: {e}")
    
    return entities, relationships

def extract_tuples_from_text(text: str) -> List[Tuple]:
    """
    Extract tuples from text using regex pattern matching.
    """
    tuples = []
    
    # Pattern to match (item1, item2) or (item1, item2, item3)
    pattern = r'\(([^)]+)\)'
    matches = re.findall(pattern, text)
    
    for match in matches:
        # Split by comma and clean up
        parts = [part.strip() for part in match.split(',')]
        if len(parts) >= 2:
            tuples.append(tuple(parts))
    
    return tuples



In [None]:

# Test the extraction function
if extraction_model and data:
    print("\n🧪 Testing entity extraction on sample data...")
    test_result = extract_entities_and_relationships(extraction_model, data[0]['content'])

    print(f"\n📊 Extraction Results:")
    print(f"   Entities found: {len(test_result['entities'])}")
    print(f"   Relationships found: {len(test_result['relationships'])}")

    if test_result['entities']:
        print("\n🏷️  Sample Entities:")
        for entity in test_result['entities'][:5]:
            print(f"     {entity}")

    if test_result['relationships']:
        print("\n🔗 Sample Relationships:")
        for rel in test_result['relationships'][:5]:
            print(f"     {rel}")

In [7]:
def process_articles_for_extraction(data: List[Dict], pipe, start: int = 0, offset:int=5) -> List[Dict]:
    """
    Process multiple articles for entity and relationship extraction.
    """
    end = min(start + offset, len(data))
    articles_to_process = data[start:end]
    results = []

    print(f"🔍 Processing {len(articles_to_process)} articles for entity extraction...")

    for i, article in enumerate(articles_to_process):
        print(f"\nProcessing {i+1}/{len(articles_to_process)}: {article.get('title', 'Unknown')[:60]}...")

        # Extract entities and relationships
        extraction_result = extract_entities_and_relationships(pipe, article.get('content', ''))

        # Combine with original article data
        result = {
            "title": article.get('title', ''),
            "link": article.get('link', ''),
            "content": article.get('content', ''),
            "extraction": extraction_result,
            "entity_count": len(extraction_result['entities']),
            "relationship_count": len(extraction_result['relationships'])
        }

        results.append(result)
        
        # Progress update
        if (i + 1) % 5 == 0:
            print(f"  ✅ Processed {i+1}/{len(articles_to_process)} articles")

    return results

# # Process a small batch first for testing
# print("\n🚀 Processing first 5 articles for entity extraction...")
# extraction_results = process_articles_for_extraction(data, extraction_model, start=0, offset = 2)


In [8]:
def save_extraction_results(results: List[Dict], output_file: str = "entity-extraction.json"):
    """
    Save extraction results to files.
    """
    try:
        # Create output directory
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Convert to an absolute path to avoid relative path issues
        absolute_path = output_path.resolve()
        print(f"💾 Saving to: {absolute_path}")
        
        with open(absolute_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        print(f"\n💾 SAVED EXTRACTION RESULTS to {output_file}")

        
    except Exception as e:
        print(f"❌ Error saving results: {e}")
        print(f"   Attempted path: {output_path}")
        print(f"   Current working directory: {os.getcwd()}")
        print(f"   Absolute path would be: {Path(output_path).resolve()}")


In [10]:
# test save results
import datetime
today = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
offset = 2
start = 0
end = min(len(data), start+offset)

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 2 articles for entity extraction...

Processing 1/2: FortiGuard Labs Threat Research...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (Dark 101, Malware), (FortiCNAPP Composite Alerts, Tool), (Lcrypt0rx, Malware), (FortiCNAPP Labs, Tool), (FortiSandbox ...

Processing 2/2: NailaoLocker Ransomware’s “Cheese”...
🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (NailaoLocker, Malware), (AES-256-CBC, Technique), (SM2 cryptographic key, Vulnerability), (Windows, Platform), (user files, File), (high sev...


In [11]:
output_path = f"../data/entity-extraction/entity-extraction_{today}_{start}_{end}.json"
save_extraction_results(results, output_path)

💾 Saving to: /Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG/data/entity-extraction/entity-extraction_2025-08-04_18-30-40_0_2.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity-extraction_2025-08-04_18-30-40_0_2.json


In [9]:
import datetime
today = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
offset = 50


In [13]:
start = 0
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 50 articles for entity extraction...

Processing 1/50: FortiGuard Labs Threat Research...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (Dark 101, Malware), (FortiCNAPP Composite Alerts, Tool), (Lcrypt0rx, Malware), (FortiCNAPP Labs, Tool), (FortiSandbox ...

Processing 2/50: NailaoLocker Ransomware’s “Cheese”...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (NailaoLocker, Malware), (AES-256-CBC, Technique), (SM2 cryptographic key, Vulnerability), (Windows, Platform), (user files, File), (high sev...

Processing 3/50: Improving Cloud Intrusion Detection and Triage with FortiCNA...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Cloud, Entity), (Multi-stage technique, Technique), (Authentication abuse, Technique), (Privilege escalation, Technique), (Command execution...

Processing 4/50: Old Miner, New Tricks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiCNAPP team, Attacker), (H2miner, Threat Type), (Lcrypt0rx, Malware), (Linux, OS), (Windows, OS), (Containers, OS), (KinSing, Tool), (Xmrig miners, Tool), (Lcrypt0rx, Malware), (L...

Processing 5/50: How FortiSandbox 5.0 Detects Dark 101 Ransomware Despite Eva...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Dark 101, Malware Family), (ransomware, Threat Type), (ransomnote, Threat Type), (Bitcoin, Currency), (Task Manager, Tool), (backupcatalog, Object), (...
  ✅ Processed 5/50 articles

Processing 6/50: Catching Smarter Mice with Even Smarter Cats...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (AI, Technique), (Malware, Malware), (IDA Pro, Tool), (Ghidra, Tool), (Linux/Prometei botnet, Threat Type), (February 2025, Date), (Lin...

Processing 7/50: NordDragonScan: Quiet Data-Harvester on Windows...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Microsoft Windows, Tool), (NordDragonScan, Malware), (kpuszkiev.com, Domain), (hxxps://cutt[.]ly/4rnmskDe, URL), (hxxps://secfileshare[.]com...

Processing 8/50: RondoDox Unveiled: Breaking Down a New Botnet Threat...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (botnet, Threat Type), (TBK DVR-4104, Platform), (TBK DVR-4216, Platform), (Four-Faith router model, Device), (F3x24, Device), (F3x36, Device), (RondoD...

Processing 9/50: DCRAT Impersonating the Colombian Government...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Attacker), (Remote Access Trojan, Malware), (DCRAT, Malware), (Modular Architecture, Technique), (Comprehensive Surveillance Capabilities, Technique),...

Processing 10/50: Dissecting a Malicious Havoc Sample...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Intrusion into Middle East Critical National Infrastructure, Threat Type), (Windows, Platform), (Windows Users, Target), (High, Severity Lev...
  ✅ Processed 10/50 articles

Processing 11/50: Threat Group Targets Companies in Taiwan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (winos 4.0, Malware), (Taiwan, Country), (twszz[.]xin, Domain), (Gh0stBins, Malware), (holdinghands rat, Threat Type), (tax inspection, Technique), (ac...

Processing 12/50: RolandSkimmer: Silent Credit Card Thief Uncovered...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Activity), (Windows, Platform), (RolandSkimmer, Malware), (Bulgaria, Country), (Chrome, Browser), (Edge, Browser), (Firefox, Browser)...

Processing 13/50: How a Malicious Excel File (CVE-2017-0199) Delivers the Form...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Microsoft Windows, Platform), (Windows Users, Targeted Party), (FormBook, Malware), (CVE-2017-0199, Vulnerability), (Figure 1, Image), (Fort...

Processing 14/50: Deep Dive into a Dumped Malware without a PE Header...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (incident investigation, Activity), (FortiGuard Incident Response Team, Attacker), (malware, Malware), (dllhost.exe, Process), (pid.8200.vad....

Processing 15/50: Infostealer Malware FormBook Spread via Phishing Campaign – ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Microsoft Windows, Platform), (Windows Users, Impact), (fully remotely control the victim’s computer, Severity level), (FormBook, Threat Typ...
  ✅ Processed 15/50 articles

Processing 16/50: Ransomware Roundup – VanHelsing...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (VanHelsing ransomware, Malware), (Windows, Platform), (Microsoft Windows, Platform), (SHA2, Hash), (99959C5141F62D4FBB60EFDC...

Processing 17/50: Horabot Unleashed: A Stealthy Phishing Threat...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Horabot, Malware), (VBScript, Tool), (AutoIt, Tool), (PowerShell, Tool), (Outlook COM Automation, TTP), (Latin America, Region), (Mexico, Country), (G...

Processing 18/50: Multilayered Email Attack: How a PDF Invoice and Geo-Fencing...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Windows, Platform), (Linux & macOS, Platform), (Java, Tool), (users, Party), (system, Party), (Java Runtime Environment (JRE), Party), (atta...

Processing 19/50: FortiGuard Incident Response Team Detects Intrusion into Mid...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (FGIR, Attacker), (Iranian state-sponsored threat group, Attacker), (Novel malware, Malware), (plink, Tool), (Ngrok, Tool), (glider proxy, To...

Processing 20/50: Key Takeaways from the 2025 Global Threat Landscape Report...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Topic), (2024, Year), (Global Threat Landscape Report, Document), (SIP-based VoIP system, Device), (RDP server, Device), (Industrial proto...
  ✅ Processed 20/50 articles

Processing 21/50: IngressNightmare: Understanding CVE‑2025‑1974 in Kubernetes ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity Type), (Ingress-NGINX Containers v1.11.0-4, Malware), (v1.12.0, Malware), (<v1.11.0, Malware), (Impacted Users, Entity Type), (Any Organization...

Processing 22/50: Infostealer Malware FormBook Spread via Phishing Campaign – ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (Fortinet, Company), (FortiGuard Labs, Malware), (Formbook, Malware), (CVE-2017-11882, Vulnerability), (Figure 1, Image)
Rela...

Processing 23/50: New Rust Botnet "RustoBot" is Routed via Routers...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (TOTOLINK N600R V4.3.0cu.7570_B20200620, Device), (TOTOLINK A830R V5.9c.4729_B20191112, Device), (A3100R V4.1.2cu.5050_B20200504, Device), (A950RG V4.1...

Processing 24/50: Malicious NPM Packages Targeting PayPal Users...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (OSS malware detection system, Tool), (NPM packages, File), (stolen sensitive information, Exploit), (PayPal users, Target), (PayPal, Organization), (PayPal...

Processing 25/50: Real-Time Anti-Phishing: Essential Defense Against Evolving ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research Phishing, Threat Type), (AI, Technique), (email filter, Tool), (blacklist, Tool), (real-time anti-phishing (RTAP), Tool), (Verizon DBIR report, Documen...
  ✅ Processed 25/50 articles

Processing 26/50: Fortinet Identifies Malicious Packages in the Wild: Insights...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Malicious Software Packages, File), (November 2024, Date), (OSS malware detection system, Tool), (Automated threat detection platform, Tool), (2025 Gl...

Processing 27/50: Havoc: SharePoint with Microsoft Graph API turns into FUD C2...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Topic), (Microsoft Windows, Platform), (Any organization, Target), (Attackers, Actor), (Havoc, Malware), (ClickFix, Tool), (multi-stage ma...

Processing 28/50: Winos 4.0 Spreads via Impersonation of Official Email to Tar...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Winos4.0, Malware), (Taiwan, Country), (gaming-related applications, Tool), (email, Communication medium), (National Taxation Bureau, Organization), (...

Processing 29/50: FortiSandbox 5.0 Detects Evolving Snake Keylogger Variant...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Snake Keylogger, Malware), (AutoIt/Injector.GTY!tr, Malware), (SMTP, Protocol), (Telegram, Service), (China, Region), (Turkey, Region), (Indonesia, Re...

Processing 30/50: Ransomware Roundup – Lynx...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (Ransomware Roundup, Publication), (Lynx ransomware, Malware), (INC ransomware, Malware), (Windows, Platform), (ESXi, Platfor...
  ✅ Processed 30/50 articles

Processing 31/50: Analyzing ELF/Sshdinjector. A!tr with a Human and Artificial...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (LinuxImpacted, Threat Type), (Linux-based network appliances or IoT, Device), (Data exfiltration, Impact), (Medium, Severity Level), (ELF/Ss...

Processing 32/50: Coyote Banking Trojan: A Stealthy Attack via LNK Files...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Windows, Operating System), (Coyote Banking Trojan, Malware), (Brazil, Country), (keylogging, Technique), (capturing screenshots, Technique), (display...

Processing 33/50: Deep Dive Into a Linux Rootkit Malware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (CentOS Linux, OS), (CentOS Users, Target), (Linux, OS), (Azero Day Exploit, Threat Type), (Threat Actor, Attacker), (rootkit, Malware), (sys...

Processing 34/50: Phish-free PayPal Phishing...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (CISO, Attacker), (Phishing attempt, Threat Type), (PayPal, Tool), (billingdepartments1[@]gkjyryfjy876.onmicrosoft.com, Domain), (MS365 test ...

Processing 35/50: Catching "EC2 Grouper"- no indicators required!...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (EC2 Grouper, Attacker), (AWS tools, Tool), (AWSPowerShell.Common/4.1.90.0, Tool), (NET_Core/6.0.5, Tool), (OS/Microsoft_Windows_10.0.1...
  ✅ Processed 35/50 articles

Processing 36/50: Botnets Continue to Target Aging D-Link Vulnerabilities...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Mirai, Malware), (Kaiten, Malware), (HNAP, Tool), (CVE-2015-2051, Vulnerability), (CVE-2019-10891, Vulnerability), (CVE-2022-37056, Vulnerability), (C...

Processing 37/50: Analyzing Malicious Intent in Python Code: A Case Study...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Zebo-0.1.0, Malware), (Cometlogger-0.1, Malware), (pynput, Tool), (ImageGrab, Tool), (Obfuscation, Technique), (Keylogging, Technique), (Scr...

Processing 38/50: Fortinet Contributes to Major Cybercrime Operation Arrests...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity Type), (International Criminal Police Organization, Attacker), (African Union Mechanism for Police Cooperation, Attacker), (cybercrime groups, ...

Processing 39/50: SmokeLoader Attack Targets Companies in Taiwan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (SmokeLoader, Malware), (Taiwan, Location), (VBS file, File), (AndeLoader, Malware), (idem, File)\nRelationships: (FortiGuard Labs, observes, attack), ...

Processing 40/50: Ransomware Roundup - Interlock...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (ransomware, Threat Type), (Interlock ransomware, Malware), (Windows, Platform), (FreeBSD, Platform), (Sina Kheirkhah, Attacker), (backdoor, Technique)...
  ✅ Processed 40/50 articles

Processing 41/50: Advanced Cyberthreats Targeting Holiday Shoppers...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Black Friday, Event), (Cyber Monday, Event), (holiday sales, Event), (AI-powered phishing lures, Technique), (website cloning tools, Tool), (remote code ex...

Processing 42/50: Threat Predictions for 2025: Get Ready for Bigger, Bolder At...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Topic), (Cybercrime-as-a-Service, Threat Type), (CaaS, Threat Type), (adversaries, Attacker), (Sophisticated Playbook, Technique), (digita...

Processing 43/50: New Campaign Uses Remcos RAT to Exploit Victims...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Remcos, Malware), (Microsoft Office, Tool), (WordPad, Tool), (CVE-2017-0199, Vulnerability), (Figure 1, Image), (Figure 2, Image)\nRelationships: (For...

Processing 44/50: Threat Campaign Spreads Winos4.0 Through Game Application...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Winos4.0, Malware), (ad59t82g[.]com, Domain), (you.dll, File), (you, Function)\nRelationships: (FortiGuard Labs, affectedPlatforms, Microsoft Windows)...

Processing 45/50: Burning Zero Days: Suspected Nation-State Adversary Targets ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Ivanti Cloud Services Appliance, Malware), (CVE-2024-8190, Vulnerability), (PHP, Tool), (IP address, IP), (206[.]189[.]156[.]69, IP)\nRelati...
  ✅ Processed 45/50 articles

Processing 46/50: Threat Actors Exploit GeoServer Vulnerability CVE-2024-36401...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (GeoServer, Tool), (GeoServer, Vulnerability), (GeoServer, Threat Type), (GeoServer, Associated With), (GeoServer, Deployed By), (GeoServer, Targeted B...

Processing 47/50: Emansrepo Stealer: Multi-Vector Attack Chains...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Threat Type), (Emansrepo, Malware), (Purchase-Order.7z, File), (Purchase-Order.exe, File), (HTML file, File), (emailed, Method), (PyI...

Processing 48/50: Ransomware Roundup - Underground...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Ransomware Roundup, Report), (Underground ransomware, Threat Type), (Windows, Platform), (RomCom group, Attacker), (CVE-2023-36884, Vulnerability), (e...

Processing 49/50: Deep Analysis of Snake Keylogger’s New Variant...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Attacker), (Malicious Excel document, File), (snake keylogger, Malware), (Fortinet's FortiGuard Labs, Tool), (swift copy.xls, File), (virus detected, Hash)
Relationsh...

Processing 50/50: A Deep Dive into a New ValleyRAT Campaign Targeting Chinese ...
🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (ValleyRAT, Malware), (Chinese speakers, Target), (Microsoft Office, Tool), (Temp\%, File), (Test, Mutex), (Registry, Registry), (Temp\%, Directory), (...
  ✅ Processed 50/50 articles
💾 Saving to: C:\Users\KietVu\Testplace\LLM_TKIG\data\entity-extraction\entity_extraction_results_Qwen\Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_0_50.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_0_50.json


In [14]:
start = 51
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 50 articles for entity extraction...

Processing 1/50: PureHVNC Deployed via Python Multi-stage Loader...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (VenomRAT, Malware), (ScrubCrypt, Tool), (Python obfuscator, Tool), ('Kramer', Tool), ('donut', Tool), ('laZzzy', Tool), (XWorm, Malware), (Venom RAT, ...

Processing 2/50: Malicious Packages Hidden in PyPI...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (PyPI, Tool), (zlibxjson, Malware), (Discord_token_grabber.pyget_cookies.pypassword_grabber.py, File), (discord, Domain), (python, Tool), (dll, File), ...

Processing 3/50: Phishing Campaign Targeting Mobile Users in India Using Indi...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (iPhone users, User), (India Post, Organization), (smishing, Technique), (China-based threat actor, Attacker), (Apple, Tool), (Hotmail,...

Processing 4/50: Dark Web Shows Cybercriminals Ready for Olympics. Are You?...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Topic), (World Cup, Event), (Super Bowl, Event), (Wimbledon, Event), (Qatar 2022 World Cup, Event), (Olympics, Event), (Tokyo 2020 Games, ...

Processing 5/50: MerkSpy: Exploiting CVE-2021-40444 to Infiltrate Systems...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Microsoft Windows, Platform), (Spyware, Malware), (CVE-2021-40444, Vulnerability), (MerkSpy, Malware), (MSHTML, Tool), (Internet Explorer, T...
  ✅ Processed 5/50 articles

Processing 6/50: The Growing Threat of Malware Concealed Behind Cloud Service...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Linux Distributions, Platform), (UNSTABLE, Malware), (JAWS, Website), (Dasan GPON home router, Device), (Huawei HG532 router, Device), (TP-Link Archer...

Processing 7/50: Fickle Stealer Distributed via Multiple Attack Chain...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Windows, Operating System), (VBA dropper, Technique), (VBA downloader, Technique), (link downloader, Technique), (executable downloader, Technique), (...

Processing 8/50: Ransomware Roundup – Shinra and Limpopo Ransomware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (ransomware variants, Threat Type), (FortiGuard Labs, Organization), (OSINT, Topic), (Ransomware Roundup, Report), (Microsoft...

Processing 9/50: New Agent Tesla Campaign Targeting Spanish-Speaking People...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Agent Tesla, Malware), (Spanish, Language), (JavaScript code, Technique), (PowerShell code, Technique), (fileless modules, Technique), (attacker, Atta...

Processing 10/50: Menace Unleashed: Excel File Deploys Cobalt Strike at Ukrain...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Malware, Malware), (VBA macro, Technique), (DLL file, File), (Cobalt Strike, Tool), (Excel document, File), (Ukraine, Country), (CERT-UA, Organization...
  ✅ Processed 10/50 articles

Processing 11/50: zEus Stealer Distributed via Crafted Minecraft Source Pack...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (Microsoft Windows, Platform), (zEus stealer, Malware), (Discord webhook, Tool), (WinRAR, Tool), (zEus, Malware), (zEus, Variant), (zEu...

Processing 12/50: Key Findings from the 2H 2023 FortiGuard Labs Threat Report...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Function), (second half of 2023, Time Period), (APT groups, Attacker), (ransomware, Threat Type), (botnets, Threat Type), (IoT, Technology...

Processing 13/50: New “Goldoon” Botnet Targeting D-Link Devices...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (D-Link, Manufacturer), (DIR-645 Wired/Wireless Router Rev. Ax, Device), (CVE-2015-2051, Vulnerability), (hxxp://94[.]228[.]168[.]60:8080, URL), (94[.]...

Processing 14/50: Ransomware Roundup - KageNoHitobito and DoNex...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (ransomware, Threat Type), (KageNoHitobito, Malware), (DoNex, Malware), (TOR, Tool), (file sharing, Technique), (Chile, Country), (China, Country), (Cu...

Processing 15/50: Unraveling Cyber Threats: Insights from Code Analysis...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (discordpy_bypass-1.7, Malware), (discordpy_bypass, Malware), (PyPI, Tool), (March 10, 2024, Date), (March 12, 2024, Date), (aos, Author), (authored by...
  ✅ Processed 15/50 articles

Processing 16/50: Botnets Continue Exploiting CVE-2023-1389 for Wide-Scale Spr...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (TP-Link Archer AX21 (AX1800), Device), (AX1800, Model), (Golang, Tool), (hxxp://5[.]10[.]249[.]153, URL), (AGoent, Tool), (Moobot, Botnet), (Miori, Bo...

Processing 17/50: ScrubCrypt Deploys VenomRAT with an Arsenal of Plugins...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Microsoft Windows, Platform), (ScrubCrypt, Malware), (Oracle WebLogic Servers, Targeted Systems), (VenomRAT, Malware), (BatCloak, Tool), (C2...

Processing 18/50: Byakugan – The Malware Behind a Phishing Attack...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Threat Type), (Byakugan, Malware), (PDF, File), (thinkforce[.]com[.]br, Domain), (Port 8080, Port)\nRelationships: (FortiGuard Labs, ...

Processing 19/50: Ransomware Roundup – RA World...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (RA World ransomware, Malware), (Microsoft Windows, Platform), (Victims, Target), (Volume Shadow Copies, Vulnerability), (System backups, Vulnerability)
Relationships: (RA World ransom...

Processing 20/50: VCURMS: A Simple and Functional Weapon...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Java, Tool), (VCURMS, Threat Type), (STRRAT, Threat Type), (RAT, Threat Type), (Amazon Web Services, Commercial Protector), (GitHub, Commercial Protec...
  ✅ Processed 20/50 articles

Processing 21/50: New Banking Trojan “CHAVECLOAK” Targets Brazil...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (Microsoft Windows, Platform), (CHAVECLOAK, Malware), (PDF, File), (ZIP file, File), (DLL, Tool), (Brazil, Country), (Casbaneiro, Banki...

Processing 22/50: FortiGuard Labs Outbreak Alerts Annual Report 2023: A Glimps...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Outbreak Alerts, Threat Type), (Annual Report, Document), (2023, Year), (Cybersecurity, Topic), (Subscriptions, Action), (FortiGuard Labs, Company), (Repor...

Processing 23/50: Ransomware Roundup – Abyss Locker...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (ransomware, Threat Type), (Abyss Locker, Malware), (Linux, Platform), (Windows, Platform), (users, Target), (high severity, Severity Level), (HelloKit...

Processing 24/50: Android/SpyNote Moves to Crypto Currencies...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity Type), (Android Impacted Users, Entity Type), (Android users, Entity Type), (mobile crypto wallet, Entity Type), (banking applications, Entity ...

Processing 25/50: TicTacToe Dropper...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (Malware dropper, Malware), (Leonem, File), (AgentTesla, File), (SnakeLogger, File), (RemLoader, File), (Sabsik, File), (LokiBot, File)...
  ✅ Processed 25/50 articles

Processing 26/50: Python Info-stealer Distributed by Malicious Excel Document...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Excel document, File), (info-stealer, Malware), (Windows Update.bat, File), (filebin.net, URL), (Abobus, Tool), (test.vbs, File), (script.py, File), (...

Processing 27/50: Ransomware Roundup - Albabat...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Albabat ransomware, Malware), (White Bat, AKA), (Counter-Strike 2, Game), (Windows 10, Operating System), (Argentine, Country), (Brazilian, Country), ...

Processing 28/50: Another Phobos Ransomware Variant Launches Attack – FAUST...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (Phobos ransomware, Malware), (EKING, Malware), (8Base, Malware), (FAUST ransomware, Malware), (Gitea, Tool), (XLAM document,...

Processing 29/50: Info Stealing Packages Hidden in PyPI...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (PyPI, Tool), (Malware, Malware), (Checkmarx blog, Website), (PE, File), (base64, Hash), (setup.py, File), (TestLibs111, File), (telerer, Fil...

Processing 30/50: Deceptive Cracked Software Spreads Lumma Variant on YouTube...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity Type), (YouTube, Tool), (GitHub, Tool), (MediaFire, Tool), (Telegram, Tool), (dark web, Domain), (Lumma Stealer, Malware), (private.NET loader,...
  ✅ Processed 30/50 articles

Processing 31/50: Three New Malicious PyPI Packages Deploy CoinMiner on Linux ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (Linux, Platform), (PyPI, Tool), (CoinMiner, Malware), (modularseven-1.0, File), (driftme-1.0, File), (catme-1.0, File), (sastra, Attac...

Processing 32/50: Ransomware Roundup - 8base...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Ransomware Roundup, Report), (8base ransomware, Malware), (SmokeLoader, Tool), (bab3c87cac6db1700f0a0babaa31f5cd544961d1b9ec03fd8bcdeff837fc9755, File...

Processing 33/50: Bandook - A Persistent Threat That Keeps Evolving...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Bandook, Malware), (msinfo32.exe, File), (PDF file, File), (password-protected.7z file, File), (registry key, Hash), (PID, Hash), (control code, Hash), (msinfo32.exe, File)\nRelations...

Processing 34/50: MrAnon Stealer Spreads via Email with Fake Hotel Booking PDF...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Malware, Malware), (MrAnon Stealer, Malware), (PowerShell, Tool), (cx-Freeze, Tool), (Germany, Country), (November 2023, Month), (Hotel room reservati...

Processing 35/50: GoTitan Botnet - Ongoing Exploitation on Apache ActiveMQ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Apache Active MQ, Software), (FortiGuard Labs, Organization), (CVE-2023-46604, Vulnerability), (Golang, Language), (Sliver, Tool), (PrCtrl Rat, Tool)
Relationships: (Apache Active MQ,...
  ✅ Processed 35/50 articles

Processing 36/50: Konni Campaign Distributed Via Malicious Document...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Konni campaign, Campaign), (Russian-language Word document, File), (VBA script, File), (temp.zip, File), (check.bat, File), (vbHide, Parameter), (oleF...

Processing 37/50: Investigating the New Rhysida Ransomware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Rhysida ransomware group, Malware), (ESXi-based ransomware, Technique), (PSExec, Tool), (AnyDesk, Tool), (WinSCP, Tool), (FortiGuard IR team...

Processing 38/50: Ransomware Roundup – NoEscape...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (FortiGuard Labs Threat Research, Function), (ransomware variants, Threat Type), (NoEscape ransomware, Malware), (Microsoft Windows, Platform), (Linux,...

Processing 39/50: Threat Predictions for 2024: Chained AI and CaaS Operations ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Topic), (Cybercrime-as-a-Service, Threat Type), (generative AI, Technology), (advanced persistent cybercrime, Threat Type), (APT groups, A...

Processing 40/50: Ransomware Roundup - Knight...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Knight ransomware, Malware), (CERT Italy, Organization), (Italian organizations, Target), (Remcos, Tool), (Qakbot, Tool)
Relationships: (Knight ransomware, uses, phishing), (Knight ra...
  ✅ Processed 40/50 articles

Processing 41/50: Another InfoStealer Enters the Field, ExelaStealer...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (InfoStealer, Malware), (RedLine, Attacker), (Raccoon, Attacker), (Vidar, Attacker), (SaphireStealer, Attacker), (ExelaStealer, Malware), (quicaxd, Attacker...

Processing 42/50: Ransomware Roundup - Akira...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Akira Ransomware, Malware), (CERT India, Organization), (VPN, Service), (multi-factor authentication, Vulnerability), (network access, Tool)
Relationships: (Akira Ransomware, contains...

Processing 43/50: IZ1H9 Campaign Enhances Its Arsenal with Scores of Exploits...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Mirai-based DDoS campaign, Threat Type), (IZ1H9, Malware), (D-Link, Vulnerability), (Netis wireless router, Vulnerability), (Sunhillo SureLine, Vulner...

Processing 44/50: Malicious Packages Hidden in NPM...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (NPM, Tool), (PyPI, Domain), (Node Package Manager, Tool), (webhook, Technique), (file-sharing link, Technique), (system, Vulnerability), (user, Vulner...

Processing 45/50: Threat Actors Exploit the Tensions Between Azerbaijan and Ar...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Nagorno-Karabakh Autonomous Oblast, Territory), (Azerbaijan, Country), (Armenia, Country), (Balkans, Region), (Soviet Union, Country), (Ukraine, Country), ...
  ✅ Processed 45/50 articles

Processing 46/50: Ransomware Roundup - Retch and S. H. O....


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Retch, Malware), (S.H.O, Malware), (FortiGuard Labs, Organization), (Ransomware Roundup, Report), (Windows, Platform), (Microsoft Windows, Platform), (High, Severity Level)
Relationsh...

Processing 47/50: New MidgeDropper Variant...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (MidgeDropper, Malware), (RAR archive, File), (!PENTING_LIST OF OFFICERS.rar, Archive), ("Notice to Work-From-Home groups.pdf", File), ("062023_PENTING...

Processing 48/50: OriginBotnet Spreads via Malicious Word Document...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Word document, File), (malicious URL, URL), (https://bankslip[.]info/document/scancop20233108[.]exe, File), (Aphishing email, Email), (XOR operation, ...

Processing 49/50: New Agent Tesla Variant Being Spread by Crafted Excel Docume...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (Agent Tesla, Malware), (CVE-2017-11882, Vulnerability), (CVE-2018-0802, Vulnerability), (Microsoft, Company), (IPS, Tool), (...

Processing 50/50: Ransomware Roundup - Rhysida...
🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Rhysida ransomware, Malware), (Health Sector Cybersecurity Coordination Center, Organization), (Cobalt Strike, Tool), (HC3, Organization), (August 4, ...
  ✅ Processed 50/50 articles
💾 Saving to: C:\Users\KietVu\Testplace\LLM_TKIG\data\entity-extraction\entity_extraction_results_Qwen\Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_51_101.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_51_101.json


In [None]:
start = 102
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 50 articles for entity extraction...

Processing 1/50: Ransomware Roundup – Trash Panda and A New Minor Variant of ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Trash Panda, Malware), (NoCry ransomware, Malware), (Windows, Platform), (Fortinet, Tool), (file extension, File), (Command Prompt, Tool), (public file scanning service, Tool), (diagc...

Processing 2/50: FortiGuard AI Detects Malicious Packages Hidden in the Pytho...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research Python Package Index (PyPI), Tool), (Malware, Malware), (FortiGuard Labs team, Attacker), (AI engine, Tool), (OSS supply chain threats hunting system, ...

Processing 3/50: Attackers Distribute Malware via Freeze.rs And SYK Crypter...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (injector, Malware), (Rust, Language), (shellcode, Payload), (XWorm, Threat Type), (Windows, Platform), (SYK Crypter, Tool), (Remcos, Threat Type), (nj...

Processing 4/50: Key Findings from the 1H 2023 FortiGuard Labs Threat Report...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Company), (Threat Research, Department), (Fortinet, Company), (billions of threat events, Quantity), (worldwide, Location), (APT groups, Threat Type), (ransomware, Th...

Processing 5/50: Ransomware Roundup - DoDo and Proton...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (ransomware, Threat Type), (DoDo, Malware), (Proton, Malware), (Mercurial Grabber, Tool), (GitHub, Tool), (June 3, 2021, Date), (discord tokens, Vulner...
  ✅ Processed 5/50 articles

Processing 6/50: FortiGuard Labs Discovers Multiple Vulnerabilities in Micros...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Microsoft Message Queuing (MSMQ), Service), (MQSVC.EXE, Executable), (MQQM.DLL, DLL), (MQAC.SYS, System), (RabbitMQ, Tool), (Table 1, Document)
Relati...

Processing 7/50: Ransomware Roundup - Cl0p...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Ransomware Roundup, Report), (Cl0p ransomware, Malware), (MOVEit Transfer, Tool), (CVE-2023-34362, Vulnerability), (Linux, Platform), (Windows, Platfo...

Processing 8/50: DDoS Botnets Target Zyxel Vulnerability CVE-2023-28771...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Linux, OS), (DDoS botnets, Threat Type), (Zyxel, Malware), (CVE-2023-28771, Vulnerability), (command injection, Technique), (TRAPA Security, Organizat...

Processing 9/50: FortiGuard Labs Discovers Multiple Vulnerabilities in Adobe ...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Adobe InDesign, Tool), (CVE-2023-29308, Vulnerability), (CVE-2023-29309, Vulnerability), (CVE-2023-29310, Vulnerability), (CVE-2023-29311, V...

Processing 10/50: LokiBot Campaign Targets Microsoft Office Document Using Vul...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Topic), (Microsoft Windows, Platform), (Windows users, Targeted Party), (Control and collect sensitive information from a victim’s de...
  ✅ Processed 10/50 articles

Processing 11/50: Meet LockBit: The Most Prevalent Ransomware in 2022...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity Type), (Microsoft Windows, Platform), (Linux, Platform), (ESXi, Platform), (MacOS, Platform), (LockBit ransomware, Malware), (LockBit Group, At...

Processing 12/50: Ransomware Roundup - Rancoz...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Ransomware Roundup, Report), (Rancoz ransomware, Malware), (Microsoft Windows, Platform), (Tor, Domain), (November 2022, Date)
Relationships: (FortiGu...

Processing 13/50: New Fast-Developing ThirdEye Infostealer Pries Open System I...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Windows, Platform), (ThirdEye, Malware), (time sheet, File), (CMK Правила оформления больничных листов.pdf, File), (f6e6d44137cb5fcee20bcde0a162768dad...

Processing 14/50: Ransomware Roundup - Black Basta...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Black Basta, Threat Type), (Conti ransomware, Malware), (FortiGuard Labs, Organization), (Fortinet, Tool), (VMWare ESXi servers, Device), (US government contractor, Impacted Party), (...

Processing 15/50: Fortinet Reverses Flutter-based Android Malware “Fluhorse”...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Android/Fluhorseis, Malware), (Flutter, Tool), (previous instances, Threat Type), (MoneyMonger, Malware), (reverse engineering, Technique), ...
  ✅ Processed 15/50 articles

Processing 16/50: Condi DDoS Botnet Spreads via TP-Link's CVE-2023-1389...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Linux, OS), (Condi, Malware), (TP-Link Archer AX21 (AX1800), Device), (CVE-2023-1389, Vulnerability), (CDN2[.]duc3k[.]com, Domain), (admin[.]duc3k[.]c...

Processing 17/50: Ransomware Roundup - Big Head...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Big Head, Malware), (ransomware, Threat Type), (May 2023, Date), (Microsoft Word, Tool), (counterfeit software, Tool), (variant A, Malware Variant)\nR...

Processing 18/50: MOVEit Transfer Critical Vulnerability (CVE-2023-34362) Expl...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (MOVEit Transfer, Software), (SQL database, Vulnerability), (Cl0p ransomware, Attacker), (CVE-2023-34362, Vulnerability), (CISA, Organization)
Relationships: (MOVEit Transfer, uses, SQ...

Processing 19/50: YouTube Pirated Software Videos Deliver Triple Threat: Vidar...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Threat Research, Service), (YouTube, Platform), (pirated software, Product), (cracked, AKA), (malicious binaries, File), (multiple malware, Malware), ...

Processing 20/50: WINTAPIX: A New Kernel Driver Targeting Countries in The Mid...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Organization), (WindowsImpacted parties, User), (WinTapix.sys, File), (WINTAPIX, File), (Donutproject, Tool), (kernel driver, Device), (Virus Total, S...
  ✅ Processed 20/50 articles

Processing 21/50: More Supply Chain Attacks via Malicious Python Packages...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (PyPI, Tool), (setup.py, File), (URL, Domain), (Python, Tool), (Discord, Tool), (zip file, File), (script, File), (encoded data, File), (Figure 5, Imag...

Processing 22/50: Ransomware Roundup - Maori...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Maori, Malware), (Linux, Platform), (Linux Users, Impact), (Go, Tool), (FortiGuard Labs, Organization), (Linux, Platform), (Linux Users, Impact)
Relationships: (Maori, uses, Linux), (...

Processing 23/50: RapperBot DDoS Botnet Expands into Cryptojacking...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Linux, Platform), (RapperBot, Malware), (Intel x64, Vulnerability), (YouTube, Domain), (January 2023, Date), (late January 2023, Date)
Relationships: ...

Processing 24/50: AndoryuBot – New Botnet Campaign Targets Ruckus Wireless Adm...


In [None]:
start = 153
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 50 articles for entity extraction...

Processing 1/50: Analyzing Malware Code that Cryptojacks System to Mine for M...


In [10]:
start = 204
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=46)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 46 articles for entity extraction...

Processing 1/46: spyware obfuscation static analysis...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Spyware, Malware), (code obfuscation, Technique), (Jadx, Tool), (Android applications, File)
Relationships: (Spyware, employs, code obfuscation), (Spyware, uses, Jadx)...

Processing 2/46: crambus middle east government...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Iranian Crambus espionage group, Attacker), (OilRig, Threat Type), (APT34, Attacker), (PowerShell backdoor, Malware), (Plink, Tool), (Windows firewall rule modification, Technique), (...

Processing 3/46: grayling taiwan cyber attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Grayling, Attacker), (custom malware, Malware), (publicly available tools, Tool), (DLL sideloading, Technique), (web shell, File), (Cobalt Strike, Tool), (NetSpy, Tool), (Havoc framew...

Processing 4/46: budworm tool update telecoms govt...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Budworm, Attacker), (SysUpdate, Malware), (INISafeWebSSO, Tool), (DLL sideloading, Technique)
Relationships: (Budworm, uses, SysUpdate), (Budworm, uses, INISafeWebSSO), (Budworm, empl...

Processing 5/46: 3am ransomware lockbit...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (ransomware, Threat Type), (LockBit, Malware), (3AM, Malware), (Cobalt Strike, Tool), (Wput, Tool), (FTP server, Domain), (Windows PowerShell, Tool), (whoami, Command), (netstat, Comma...
  ✅ Processed 5/46 articles

Processing 6/46: critical infrastructure attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Espionage actors, Attacker), (Redfly, Attacker), (ShadowPad, Malware), (national grid, Target), (credentials, Exploit), (computers, Target), (U.S., UK, Australian, Canadian, New Zeala...

Processing 7/46: qakbot takedown disruption...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Qakbot, Malware), (Batbug, Attacker), (Emotet, Threat Type), (SMB, Technique), (Active Directory, Vulnerability), (networks, Environment), (email, Method)
Relationships: (Qakbot, uses...

Processing 8/46: carderbee software supply chain certificate abuse...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (APT group, Attacker), (Korplug, Malware), (PlugX, Malware), (Carderbee, Attacker), (Cobra DocGuard, Tool), (Cobra DocGuard Client, Tool), (Esfanet, Tool), (NSFOCUS, Tool), (Budworm, A...

Processing 9/46: syssphinx fin8 backdoor...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec's Threat Hunter Team, Attacker), (Syrinx, Malware), (FIN8, Attacker), (Sardonic, Malware), (Noberus, Malware), (living-off-the-land, Technique), (POSS, Technique), (ragnar lo...

Processing 10/46: microsoft zeroday exploit...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (CVE-2023-36884, Vulnerability), (Microsoft, Tool), (Storm-0978, Attacker), (RomCom, Attacker), (NATO Summit, Event)
Relationships: (CVE-2023-36884, contains, Microsoft), (Storm-0978, ...
  ✅ Processed 10/46 articles

Processing 11/46: flea backdoor microsoft graph apt15...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Flea, Attacker), (backdoor.Graphican, Malware), (BS2005, Malware), (Ketrican, Malware)
Relationships: (Flea, uses, backdoor.Graphican), (Flea, uses, BS2005), (Flea, uses, Ketrican)...

Processing 12/46: shuckworm russia ukraine military...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Shuckworm, Attacker), (Ukraine, Target), (Gamaredon, aka), (armageddon, aka), (Russia, Country), (FSB, Organization), (armed conflicts, Threat Type), (criminal proceedings, Threat Typ...

Processing 13/46: moveit vulnerabilities exploits...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Extortion actors, Attacker), (MOVEit Transfer, Tool), (CVE-2023-34362, Vulnerability), (Progress Software, Company), (Clop ransomware, Threat Actor), (SQL, Technology)
Relationships: ...

Processing 14/46: buhti ransomware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Buhti, Threat Type), (LockBit, Malware), (Babuk, Malware), (PaperCut, Vulnerability), (Windows, OS), (Linux, OS), (Blacktail, Attacker), (LockBit Black, Malware), (LockBit, Malware), ...

Processing 15/46: lancefly merdoor zxshell custom backdoor...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Lancefly APT, Attacker), (backdoor, Malware), (Merdoor, Malware), (ZXShell rootkit, Tool), (South and Southeast Asia, Region), (government, Sector), (aviation, Sector), (education, Se...
  ✅ Processed 15/46 articles

Processing 16/46: xtrader 3cx supply chain...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (X_Trader, Tool), (Trojanized X_Trader software, Malware), (3CX, Attacker), (Trading Technologies, Tool), (energy futures, Vulnerability), (North Korea, Attacker)\nRelationships: (X_Tr...

Processing 17/46: apt attacks telecoms africa mgbot...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Daggerfly, Attacker), (MgBot, Malware), (PlugX, Tool), (AnyDesk, Tool), (DLL, File), (Exchange, Domain), (Malwarebytes, Tool), (Evasive Panda, Attacker), (China, Country), (Symantec, ...

Processing 18/46: play ransomware volume shadow copy...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Play ransomware group, Attacker), (Grixba, Malware), (.NET infostealer, Tool), (Volume Shadow Copy Service, Tool), (Symantec, Organization), (Costura, Tool), (costura.commandline.dll,...

Processing 19/46: mantis palestinian attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Mantis cyber-espionage group, Attacker), (Arid Viper, aka), (Desert Falcon, aka), (APT-C-23, Attacker), (Spear-phishing, Technique), (fake social media profile, Technique), (Israel, T...

Processing 20/46: 3cx supply chain attack...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Yara rule, Tool), (Trojan, Malware), (3CX's DesktopApp, Tool), (Windows, OS), (MacOS, OS), (SolarWinds, Threat Type), (North Korea, Attacker), (information-stealing malware, Malware),...
  ✅ Processed 20/46 articles

Processing 21/46: blackfly espionage materials...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Blackfly, Attacker), (PlugX/Fast, Malware), (Winnti/Pasteboy, Malware), (Shadowpad, Malware), (Grayfly, Attacker), (APT41, Attacker), (Apt41.A, Indictment)
Relationships: (Blackfly, u...

Processing 22/46: clasiopa materials research...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Clasiopa, Attacker), (Backdoor.Atharvan, Malware), (Agile DGS, Tool), (Agile FD, Tool), (HCL Domino, Tool), (IBM Domino, Tool), (mutex, Hash)
Relationships: (Clasiopa, uses, Backdoor....

Processing 23/46: hydrochasma asia medical shipping intelligence gathering...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Shipping companies, Entity), (medical laboratories, Entity), (intelligence-gathering campaign, Threat Type), (Hydrochasma, Attacker), (living-off-the-land tools, Tool), (Symantec, Too...

Processing 24/46: frebniis malware iis...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Broadcom Software, Tool), (Frebniis, Malware), (Backdoor.Frebniis, Malware), (Failed Request Event Buffering, Technique), (Windows System, Tool), (IIS, Tool), (HT...

Processing 25/46: nodaria ukraine infostealer...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Nodaria, Attacker), (UAC-0056, Attacker), (Infostealer.Graphiron, Malware), (Downloader.Graphiron, Tool), (Infostealer.Graphiron, Tool), (AES, Technique), (Table 1, Hash), (OfficeTemp...
  ✅ Processed 25/46 articles

Processing 26/46: bluebottle banks targeted africa...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Bluebottle, Attacker), (living off the land, Technique), (dual-use tools, Tool), (commodity malware, Malware), (operated by, employs), (Symantec, Tool), (Group-IB, Report), (OPERA1ER,...

Processing 27/46: espionage asia governments cert authority...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Billbug, Attacker), (Thrip, Attacker), (Lotus Blossom, Attacker), (Hannotog, Malware), (Sagerunex, Malware), (certificate authority, Victim), (government agency, ...

Processing 28/46: cranefly new tools technique geppei danfuan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (UNC3524, Attacker), (Trojan.Geppei, Malware), (Trojan.Danfuan, Malware), (Regeorg webshell, Tool), (PyInstaller, Tool), (IIS, Tool), (webpages, File), (apps, File)
Relationships: (UNC...

Processing 29/46: blackbyte exbyte ransomware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (BlackByte, Threat Type), (Infostealer.Exbyte, Malware), (Exbyte, Tool), (Mega.co.nz, Domain), (IsDebuggerPresent, Technique), (CheckRemoteDebuggerPresent, Technique), (anti-virus, Vul...

Processing 30/46: spyder loader cuckoobees hong kong...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Operation CuckooBees, Threat Type), (Spyder Loader, Malware), (Cybereason, Attacker), (SonicWall, Tool), (March 2021, Date), (May 2022, Date), (Hong Kong, Location), (information stor...
  ✅ Processed 30/46 articles

Processing 31/46: budworm espionage us state...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Budworm, Attacker), (Log4j vulnerabilities, Vulnerability), (Apache Tomcat, Tool), (HyperBro malware family, Malware), (CyberArk Viewfinity, Tool), (vf_host.exe, File), (Telstra, IP),...

Processing 32/46: witchetty steganography espionage...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Witchetty, Attacker), (LookingFrog, aka), (Backdoor.Stegmap, Malware), (ProxyShell, Vulnerability), (CVE-2021-34473, Vulnerability), (CVE-2021-34523, Vulnerability), (CVE-2021-31207, ...

Processing 33/46: noberus blackcat ransomware ttps...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Noberus, Malware), (BlackCat, Malware), (Exmatter, Tool), (Eamfo, Malware), (Darkside, Malware), (Coreid, Attacker), (FIN7, Attacker), (Carbon Spider, Attacker), (Colonial Pipeline, T...

Processing 34/46: webworm espionage rats...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Webworm, Attacker), (Trochilus, Malware), (GitHub, Tool), (Space Pirates, Attacker), (Positive Technologies, Company), (May 2022, Date), (IT service provider, Vic...

Processing 35/46: espionage asia governments...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (espionage attackers, Attacker), (ShadowPad RAT, Malware), (DLL side-loading, Technique)
Relationships: (espionage attackers, uses, ShadowPad RAT), (espionage attackers, employs, DLL s...
  ✅ Processed 35/46 articles

Processing 36/46: russia ukraine shuckworm...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (UPDATE, Event), (17.40 BST, Time), (August 15, 2022, Date), (UPDATE, Event), (17.50 BST, Time), (August 17, 2022, Date), (additional IOCs, Indicator), (ASC, Tool), (H264, Tool), (VCD,...

Processing 37/46: lockbit targets servers...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (LockBit, Malware), (Syrphid, Attacker), (AnyDesk, Tool), (Windows RDP, Tool), (bug bounty program, Campaign), (Group Policy, Technique), (domain controller, Syste...

Processing 38/46: bumblebee loader cybercrime...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Bumblebee, Malware), (Trickbot, Malware), (BazarLoader, Malware), (Conti, Threat Type), (Quantum, Threat Type), (Mountlocker, Threat Type), (Spear-phishing, Technique), (ISO file, Fil...

Processing 39/46: follina msdt exploit malware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Broadcom Software, Company), (Threat actors, Attacker), (Follina, Malware), (CVE-2022-30190, Vulnerability), (ms-msdt, Tool), (RTF, File), (WinWord, Tool), (msdt....

Processing 40/46: clipminer bitcoin mining hijacking...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec's Threat Hunter Team, Attacker), (Trojan.Clipminer, Malware), (KryptoCibule, Malware), (Clipminer, Malware), (WinRAR, Tool), (Packed Portable Executable DLL, File), (CPL, Fil...
  ✅ Processed 40/46 articles

Processing 41/46: ransomware hive conti avoslocker...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (ransomware, Threat Type), (Hive, Malware), (Conti, Malware), (Avoslocker, Malware), (Symantec, Company), (Broadcom Software, Company), (ransomware-as-a-service, Business Model), (affi...

Processing 42/46: stonefly north korea espionage...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Stonefly, Attacker), (Backdoor.Prioxer, Malware), (Jokra, Threat Type), (DDoS, Technique), (South Korean, Country), (U.S. government, Organization), (financial website, Website), (ban...

Processing 43/46: shuckworm intense campaign ukraine...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Shuckworm, Attacker), (Backdoor.Pterodo, Malware), (Visual Basic Script, Tool), (Scheduled Tasks, Tool), (C&C server, Domain), (Symantec's Threat Hunter Team, Tool), (Broadcom Softwar...

Processing 44/46: lazarus dream job chemical...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Broadcom Software, Company), (North Korea, Country), (Advanced Persistent Threat, Threat Type), (Lazarus, Attacker), (Operation Dream Job, Campaign), (fake job of...

Processing 45/46: cicada apt10 china ngo government attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cicada, Attacker), (Apt10, Threat Type), (custom loader, Tool), (custom malware, Malware), (mid-2021, Date), (February 2022, Date)
Relationships: (Cicada, uses, custom loader), (Cicad...
  ✅ Processed 45/46 articles

Processing 46/46: spring4shell rce vuln java...
🔍 Raw model output: Named Entities: (Spring Core Java framework, Tool), (unauthenticated remote code execution (RCE), Technique), (CVE-2022-22965, CVE), (Symantec products, Tool), (Data Center Security (DCS) Intrusion Pr...
💾 Saving to: C:\Users\KietVu\Testplace\LLM_TKIG\data\entity-extraction\entity_extraction_results_Qwen\Qwen2.5-1.5B-Instruct_test_2025-08-17_16-40-06_204_254.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-17_16-40-06_204_254.json


In [18]:
start = 250
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [19]:
start = 300
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [20]:
start = 350
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 50 articles for entity extraction...

Processing 1/50: symantec latest intelligence refresh...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (PDF report, Document), (financial Trojan, Threat Type), (Ramnit, Malware), (September, Month), (August, Month), (IoT device, Device)
Relationships: (Symantec, pub...

Processing 2/50: formjacking attacks retailers...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Magecart, Attacker), (Ticketmaster, Domain), (British Airways, Domain), (Feedify, Domain), (Newegg, Domain), (formjacking, Threat Type), (Malicious JavaScript, Tool), (payment card de...

Processing 3/50: microsoft patch tuesday september 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Microsoft, Tool), (Chakra Scripting Engine, Tool), (Memory Corruption Vulnerability, Vulnerability), (Internet Explorer, Tool), (PDF Remote Code Execution Vulnerability, Vulnerability...

Processing 4/50: wmic download malware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Windows Management Instrumentation Command-line, Tool), (eXtensible Stylesheet Language, Tool), (WMIC, Tool), (XSL, Tool), (Malware, Malware), (XML, File), (WMI, Technique), (eXtensib...

Processing 5/50: mirai cross platform infection...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Mirai botnet, Threat Type), (Linux.Mirai, Malware), (Mirai, Attacker), (shell script, Tool), (vulnerable device, Target), (executables, File), (remote server, Host), (July, Date)
Rela...
  ✅ Processed 5/50 articles

Processing 6/50: jrat new anti parsing techniques...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (jRAT, Malware), (Trojan.Maljava, Malware), (MZ, Tool), (JAR file, File), (spam email, Threat Type), (social engineering, Technique)
Relationships: (jRAT, uses, Trojan.Maljava), (jRAT,...

Processing 7/50: microsoft patch tuesday august 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Microsoft, Tool), (Browsers, Tool), (Chakra, Tool), (Scripting Engine, Tool), (Memory Corruption Vulnerability, Vulnerability), (Information Disclosure Vulnerability, Vulnerability), ...

Processing 8/50: hacked mikrotik router...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cryptocurrency coinminers, Malware), (ransomware, Threat Type), (Symantec, Attacker), (MikroTik routers, Tool), (Brazil, Country), (August 2018, Time Period), (Figure 2, Image), (Figu...

Processing 9/50: leafminer espionage middle east...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Leafminer, Attacker), (Malware, Malware), (e-qht.az, Domain), (publicly accessible, Linked To)
Relationships: (Leafminer, uses, Malware), (Leafminer, downloads, e-qht.az), (e-qht.az, ...

Processing 10/50: evolution emotet trojan distributor...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Mealybug, Attacker), (Trojan.Emotet, Malware), (WannaCry, Threat Type), (Petya/NotPetya, Threat Type), (Conficker, Threat Type), (W32.Downadup, Threat Type)
Relationships: (Mealybug, ...
  ✅ Processed 10/50 articles

Processing 11/50: powershell threats grow further and operate plain sight...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Windows PowerShell, Tool), (WMI, Tool), (PsExec, Tool), (PowerSploit, Tool), (Empire, Tool), (living off the land, Technique), (fileless, Technique), (PowerShell, Tool), (PowerShell s...

Processing 12/50: microsoft patch tuesday july 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cumulative Security Update for Microsoft Browsers Scripting Engine Memory Corruption Vulnerability(CVE-2018-8242), Technique), (Scripting Engine Memory Corruption Vulnerability(CVE-20...

Processing 13/50: thrip hits satellite telecoms defense targets...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Living off the land, Technique), (operating system features, Tool), (legitimate network administration tools, Tool), (victim's network, Target), (Sunny skies, Weather), (moderate temp...

Processing 14/50: microsoft patch tuesday june 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cumulative Security Update for Microsoft Browsers, Tool), (Internet Explorer Memory Corruption Vulnerability, Vulnerability), (Chakra Scripting Engine Memory Corruption Vulnerability,...

Processing 15/50: industry and law enforcement cooperation bears fruit fight a...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Business email attacks, Threat Type), (419 scams, Threat Type), (FBI, Organization), (Symantec, Company), (Operation Wire-Wire, Name), (BEC attackers, Attacker), (private sector compa...
  ✅ Processed 15/50 articles

Processing 16/50: vpnfilter iot malware...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cisco Talos, Attacker), (Stage 3 module, Malware), (ssler, Malware), (VPNFilter, Malware), (Modbus SCADA, Vulnerability), (SCADA industrial control systems, Vulnerability), (Ukraine, ...

Processing 17/50: scan4you masterminds guilty...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Scan4You, Threat Type), (Jurijs Martisevs, Attacker), (Ruslans Bondars, Attacker), (Malware, Malware), (credit and debit card numbers, Vulnerable Data), (FBI, Organization), (undergro...

Processing 18/50: latest intelligence march 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Inception Framework, Attacker), (email malware, Threat Type), (Trojan.Coinminer, Malware), (browser-based cryptocurrency mining, Technique), (Agriculture, Sector), (1 in 1,394, Vulner...

Processing 19/50: coin mining without browser...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (browser-based cryptocurrency mining, Threat Type), (JavaScript, Tool), (WebAssembly (WASM), Tool), (Portable Executable file (.NET), Tool), (Coinhive, Tool), (Form1, File), (script ta...

Processing 20/50: istr 23 cyber security threat landscape...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Coin mining, Threat Type), (ransomware, Threat Type), (targeted attacks, Threat Type), (mobile security, Threat Type), (software supply chain, Threat Type)\nRelationships: (Coin minin...
  ✅ Processed 20/50 articles

Processing 21/50: fakebank intercepts calls banks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Fakebank, Threat Type), (Android, Platform), (command and control (C&C), Technique), (phone number, Vulnerability), (fake UI, File), (system alert window, Permission), (scammer, Attac...

Processing 22/50: inception framework hiding behind proxies...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Inception Framework, Attacker), (stealthy new tools, Technique), (cloud, Tool), (Internet of Things (IoT), Tool), (advanced, Technique), (automated framework, Tool), (spear-phishing e...

Processing 23/50: microsoft patch tuesday march 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: No related entities and relations

Explanation: The input text does not contain any named entities or relationships that match the provided entity types and relationship types. It appears to be a list...

Processing 24/50: latest intelligence february 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Chafer attack group, Attacker), (email malware, Threat Type), (Necurs botnet, Tool), (Facebook account, Target), (Finance, Industry), (Mining, Industry), (Ne, Domain)
Relationships: (...

Processing 25/50: chafer latest attacks reveal heightened ambitions...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Chafer, Attacker), (Iran, Country), (targeted attack group, Entity Type), (seven new tools, Technique), (nine new target organizations, Target), (Israel, Country), (Jordan, Country), ...
  ✅ Processed 25/50 articles

Processing 26/50: android malware harvests facebook details...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Facebook, Website), (Android.Fakeapp, Malware), (third-party markets, Tool), (English speakers, Target), (C&C server, IP)
Relationships: (Facebook, contains, Android.Fakeapp), (Facebo...

Processing 27/50: microsoft patch tuesday february 2018...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Microsoft, Tool), (Browsers, Vulnerability), (Edge, Tool), (CVE-2018-0763, Vulnerability), (Critical, MS Rating), (Scripting Engine, Tool), (Memory Corruption, Vulnerability), (CVE-20...

Processing 28/50: meltdown spectre cpu bugs...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Meltdown, Vulnerability), (Spectre, Vulnerability), (kernel, File), (JavaScript, Tool), (operating system, Tool), (Symantec, Company), (personal computer, Device), (virtual machine, D...

Processing 29/50: android malware uber credentials deep links...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Android.Fakeappmalware, Malware), (Uber, Threat Type), (deep link URI, Technique), (current location, Vulnerability), (Ride Request activity, Technique), (victim, Target)
Relationship...

Processing 30/50: browser mining cryptocurrency...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Browser-based cryptocurrency mining, Threat Type), (Coinhive, Malware), (JavaScript, Tool), (BitcoinPlus.com, Domain), (Monero, Vulnerable Software), (ASIC mining, Technique)
Relation...
  ✅ Processed 30/50 articles

Processing 31/50: triton malware ics...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (Trojan, Malware), (Triton, Malware), (Trojan.Trisis, Malware), (Safety Instrumented Systems, Vulnerability), (Windows, Tool), (Industrial Control System, Threat T...

Processing 32/50: microsoft patch tuesday december...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cumulative Security Update for Microsoft Browsers, Tool), (CVE-2017-11888, Vulnerability), (Critical, Severity), (Scripting Engine Memory Corruption Vulnerability, Malware), (CVE-2017...

Processing 33/50: mailsploit email exploit spoofing...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Mailsploit, Malware), (RFC-1342, Vulnerability), (Yahoo Mail for iOS and Android, Tool), (Sabri Haddouche, Attacker), (Domain-based Message Authentication, Reporting and Conformance (...

Processing 34/50: surge adwind distribution emails...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Adwind, Malware), (JAR, File), (ZIP, File), (Symantec, Tool), (August 2017, Month), (October 2017, Month), (November 2017, Month), (Holiday/Shopping Season, Event)
Relationships: (Adw...

Processing 35/50: latest intel november 2017...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (spam, Threat Type), (Black Friday, Event), (Cyber Monday, Event), (Necurs, Malware), (SMS, Tool), (legitimate company, Victim), (personal information, Target), (Malware, Malware), (SM...
  ✅ Processed 35/50 articles

Processing 36/50: doublehidden android malware google play...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Android.Trojan, Malware), (Google Play Store, Tool), (photograph by fiery, File), (i.r.r developer, Attacker), (com.aseee.apptec.treeapp, File), (Device Administrator, Vulnerability),...

Processing 37/50: android malware porn apps chinese...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Android.Rootnik.B, Malware), (Android.Reputation.1, Malware), (app-centric websites, Technique), (forums, Technique), (torrent sites, Technique), (social messaging networks, Technique...

Processing 38/50: ms patch tuesday november 2017...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cumulative Security Update for Microsoft Browsers Scripting Engine Memory Corruption Vulnerability(CVE-2017-11858), Tool), (Scripting Engine Memory Corruption Vulnerability(CVE-2017-1...

Processing 39/50: tech support scams aes...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (tech support scam, Threat Type), (code obfuscation, Technique), (string-based detection engines, Tool), (JavaScript, Language), (SMB, Protocol), (ransomware, Malware), (Microsoft, Org...

Processing 40/50: sowbug cyber espionage south america asia...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Sowbug, Attacker), (Felismus, Malware), (South America, Region), (Southeast Asia, Region), (Argentina, Country), (Brazil, Country), (Ecuador, Country), (Peru, Country), (Brunei, Count...
  ✅ Processed 40/50 articles

Processing 41/50: ransomeware risks 2017...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (ransomware, Threat Type), (WannaCry, Malware), (Petya, Malware), (EternalBlue, Vulnerability), (Windows SMB protocol, Tool), (SMB protocol, Tool)
Relationships: (ransomware, contains,...

Processing 42/50: petya ransomware wiper...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Petya, Malware), (EternalBlue, Technique), (MEDoc, Tool), (Norton products, Tool), (Symantec Endpoint Protection, Tool)
Relationships: (Petya, uses, EternalBlue), (Petya, spreadsAcros...

Processing 43/50: wannacry ransomware attack...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Symantec, Company), (WannaCry, Malware), (Lazarus group, Attacker), (Eternal Blue, Technique), (Shadow Brokers, Threat Actor), (Equation cyber espionage group, Threat Actor), (SEP, To...

Processing 44/50: dragonfly energy sector cyber attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Cyber attack, Threat Type), (Dragonfly, Attacker), (Ukraine's power system, Target), (operational systems, Target), (Nuclear facility, Target), (Symantec, Tool), (energy sector, Targe...

Processing 45/50: bachosens cyber crime investigation...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Trojan.Bachosens, Malware), (Igor, Attacker), (international airline, Target), (Chinese auto-tech company, Target), (car diagnostics software, Vulnerability), (underground forums and ...
  ✅ Processed 45/50 articles

Processing 46/50: longhorn cyberespionage vault7...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Longhorn, Attacker), (Vault 7, Document), (back door Trojans, Tool), (zero-day vulnerabilities, Vulnerability), (United States, Country)
Relationships: (Longhorn, uses, back door Troj...

Processing 47/50: bayrob suspects extradited...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Bayrob, Threat Type), (Bogdan Nicolescu, Attacker), (Danet Tiberiu, Attacker), (Radu Miclaus, Attacker), (Masterfraud, Attacker), (Amy, Attacker), (Minolta, Attacker), (Amightysa, Att...

Processing 48/50: shamoon back destructive...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Shamoon, Malware), (W32.Disttrack, Malware), (W32.Disttrack.B, Malware), (Alan Kurdi, Victim), (Saudi Arabian, Location), (working week, Time Period), (Thursday, Day)
Relationships: (...

Processing 49/50: gatak healthcare...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Gatak Trojan, Malware), (healthcare sector, Target), (insurance sector, Target), (enterprise computers, Device), (product key generator, File), (software, Software), (website, Website...

Processing 50/50: odinaff trojan financial attacks...
🔍 Raw model output: Named Entities: (Trojan.Odinaff, Malware), (Odinaff, Malware), (Carbanak, Threat Type), (Backdoor.Batel, Malware), (Carbanak, Threat Type)
Relationships: (Trojan.Odinaff, uses, Backdoor.Batel), (Carba...
  ✅ Processed 50/50 articles
💾 Saving to: /Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG/data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_350_400.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_350_400.json


In [21]:
start = 400
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/entity_extraction_results_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 27 articles for entity extraction...

Processing 1/27: buckeye cyberespionage hong kong...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Buckeye, Attacker), (APT3, Attacker), (Gothic Panda, Attacker), (UPS Team, Attacker), (TG-0110, Attacker), (Hong Kong, Location), (US, Location), (backdoor.pirpi, Malware), (spear-phi...

Processing 2/27: equation cyberespionage group breached...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Shadow Brokers, Attacker), (Equation, Threat Type), (Malware, Tool), (router, Device), (firewall appliance, Device), (exploit, Technique)
Relationships: (Shadow Brokers, uses, Equatio...

Processing 3/27: strider cyberespionage sauron...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Strider, Attacker), (Remsec, Malware), (Sauron, Threat Type), (Regin, Threat Type), (Flamer, Threat Type), (Lua, Technique)
Relationships: (Strider, uses, Remsec), (Strider, linkedTo,...

Processing 4/27: swift malware financial attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (bank in the Philippines, Target), (Bangladesh central bank, Target), (Tien Phong Bank, Target), (Vietnam's Tien Phong Bank, Target), (Banco del Austro, Target), (Ecuador's Banco del A...

Processing 5/27: tick cyberespionage japan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Tick, Attacker), (Backdoor.Daserf, Malware), (Gofarer, Tool), (Flash(.swf), Vulnerability), (Japanese, Location), (technology, Sector), (aquatic engineering, Sector), (broadcasting, S...
  ✅ Processed 5/27 articles

Processing 6/27: taiwan cyberespionage backdoor trojan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Backdoor.Dripion, Malware), (Budminer, Attacker), (Trojan.Taidoor, Malware), (file hashes, Hash), (Taiwan, Location), (Brazil, Location), (United States, Location), (command and contr...

Processing 7/27: operation blockbuster lazarus...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Operation Blockbuster, Initiative), (Lazarus, Attacker), (Novetta, Company), (Symantec, Company), (u, Technique)
Relationships: (Operation Blockbuster, launchedBy, Novetta), (Operatio...

Processing 8/27: dridex financial trojan spam...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Dridex, Malware), (Symantec, Company), (W32.Cridex, Malware), (English, Language), (financial, Threat Type), (banking, Target), (Symantec, Company), (whitepaper, Document), (Symantec,...

Processing 9/27: dyre bank fraud group takedown...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Dyre, Malware), (Upatre, Malware), (Downloader.Upatre, Tool), (email spam campaigns, Technique), (November, Date), (November 18, Date), (Downloader.Upatre, usedBy, Dyre), (Updater.Upa...

Processing 10/27: destructive disakil malware ukraine...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Disakil, Malware), (BlackEnergy, Malware), (Sandworm, Attacker), (Apress, Organization), (SBU, Organization), (Ukraine, Location), (energy sector, Threat Type)
Relationships: (Disakil...
  ✅ Processed 10/27 articles

Processing 11/27: dridex takedown...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Dridex, Threat Type), (W32.Cridex, Malware), (Bugat, Malware), (financial threat, Threat Type), (malicious macros, Technique), (Microsoft Office, Tool), (Symantec, Company), (State of...

Processing 12/27: regin mysteries cyberespionage...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Regin, Malware), (Symantec, Attacker), (technical whitepaper, Document), (command-and-control (C&C) infrastructure, Infrastructure)
Relationships: (Regin, uses, technical whitepaper),...

Processing 13/27: black vine cyberespionage aerospace healthcare...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Anthem, Threat Type), (Black Vine, Attacker), (zero-day vulnerability, Vulnerability), (watering-hole attack, Technique), (custom malware, Malware), (legitimate website, Domain), (rem...

Processing 14/27: forkmeiamfamous seaduke duke...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Seaduke, Malware), (Cozyduke, Malware), (Cyberespionage group, Attacker), (United States, Country), (Europe, Continent)
Relationships: (Seaduke, uses, Cozyduke), (Cyberespionage group...

Processing 15/27: butterfly corporate attacks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Butterfly, Attacker), (Windows, OS), (Apple, OS), (zero-day vulnerability, Vulnerability), (Twitter, Domain), (Facebook, Domain), (Apple, Company), (Microsoft, Company)
Relationships:...
  ✅ Processed 15/27 articles

Processing 16/27: dyre financial trojan...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Dyre, Malware), (Symantec, Tool), (Infostealer.Dyre, Hash), (spam emails, Technique), (Malicious website, URL)
Relationships: (Dyre, uses, Infostealer.Dyre), (Dyre, spreadsUsing, spam...

Processing 17/27: duqu 20 cyberespionage...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Duqu 2.0, Malware), (Kaspersky Lab, Attacker), (Stuxnet, Malware), (Iranian nuclear development program, Target), (European telecoms operator, Target), (North African telecoms operato...

Processing 18/27: equation cyberespionage group...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Equation, Attacker), (Malware, Malware), (Wipbot, Malware), (Trojan Turla, Malware), (Infostealer.Micstus, Malware), (Trojan.Tripfant, Malware), (Grayphish, Malware), (GrayFish, Malwa...

Processing 19/27: carbanak cybercrime gang...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Carbanak, Threat Type), (Trojan.Carberp.B, Malware), (Trojan.Carberp, Malware), (Silicon, Attacker), (Anunak, Attacker), (ATM, Object), (money mule, Object)
Relationships: (Carbanak, ...

Processing 20/27: destover destructive malware south korea...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Backdoor.Destover, Malware), (FBI, Attacker), (Trojan.Volgmer, Malware), (Volgmer, Malware), (Jokra, Malware), (Shamoon, Malware), (commercially available drivers, Tool), (Destover, M...
  ✅ Processed 20/27 articles

Processing 21/27: regin espionage surveillance...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Regin, Malware), (Backdoor.Regin, Malware)
Relationships: (Regin, uses, Backdoor.Regin) ```...

Processing 22/27: turla espionage diplomats...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Wipbot, Malware), (Turla, Malware), (Spear phishing, Technique), (Watering hole, Technique), (IP address, IP), (Legitimate website, Domain), (Compromised website, Domain), (Malware, F...

Processing 23/27: dragonfly energy companies sabotage...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Dragonfly, Attacker), (Stuxnet, Malware), (remote access type Trojan, Malware), (ICS equipment providers, Target), (software, File), (ICS equipment, File), (ICS computers, Target), (S...

Processing 24/27: hidden lynx professional hackers hire...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Hidden Lynx, Attacker), (Advanced Persistent Threats, Threat Type), (Watering hole, Technique), (zero-day vulnerabilities, Vulnerability), (supply chain, Tool), (intelligent hunter, A...

Processing 25/27: darkseoul cyberattacks south korea...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (DarkSeoul gang, Attacker), (DDoS, Technique), (Trojan.Castov, Malware), (Jokra attacks, Threat Type), (United States Independence Day, Event), (South Korean independence day, Event), ...
  ✅ Processed 25/27 articles

Processing 26/27: duqu next stuxnet...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (Stuxnet, Malware), (Duqu, Malware), (Industrial Control System, Vulnerability), (remote access trojan, Tool), (telemetry, Data)
Relationships: (Stuxnet, contains, Duqu), (Duqu, uses, ...

Processing 27/27: stuxnet dossier espionage...
🔍 Raw model output: Named Entities: (Stuxnet, Malware), (VirusBlokada, Attacker), (unpatched vulnerability, Vulnerability), (removable drive, Device), (industrial control systems, Target)
Relationships: (Stuxnet, uses, u...
💾 Saving to: /Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG/data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_400_427.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity_extraction_results_Qwen/Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_400_427.json


In [21]:
from typing import List, Dict, Any

def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        if not isinstance(data, list):
            return []
        return data
        
    except Exception as e:
        return []

In [22]:
def get_unique_records(records: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    seen = set()
    unique_records = []
    duplicates_count = 0
    
    for record in records:
        # Create a unique identifier based on title and link
        if isinstance(record, dict) and 'title' in record and 'link' in record:
            identifier = (record['title'], record['link'])
        else:
            # If record doesn't have expected structure, use the entire record as identifier
            identifier = str(record)
        
        if identifier not in seen:
            seen.add(identifier)
            unique_records.append(record)
        else:
            duplicates_count += 1
    
    return unique_records

In [23]:
import glob
import os
import json

def merge_entity_extraction_files(input_dir: str, output_file: str) -> None:
    """
    Merge all entity extraction result files into a single file.
    """
    # Get all JSON files in the input directory
    json_files = glob.glob(os.path.join(input_dir, "*.json"))
    
    if not json_files:
        print(f"❌ No JSON files found in {input_dir}")
        return
    
    print(f"📁 Found {len(json_files)} JSON files to merge")
    
    # Load all records from all files
    all_records = []
    total_files_processed = 0
    
    for file_path in sorted(json_files):
        file_name = os.path.basename(file_path)
        print(f"📄 Processing {file_name}...")
        
        records = load_json_file(file_path)
        if records:
            all_records.extend(records)
            total_files_processed += 1
    
    if not all_records:
        print("❌ No records found in any files")
        return
    
    print(f"📊 Total records loaded: {len(all_records)}")
    print(f"�� Files successfully processed: {total_files_processed}")
    
    # Remove duplicates
    unique_records = get_unique_records(all_records)
    
    # Create output directory if it doesn't exist
    output_dir = os.path.dirname(output_file)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"📁 Created output directory: {output_dir}")
    
    # Save merged data
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(unique_records, f, indent=2, ensure_ascii=False)
        
        print(f"💾 Successfully merged data to {output_file}")
        print(f"�� Final record count: {len(unique_records)}")
        
        # Print summary statistics
        print("\n" + "="*60)
        print("MERGE SUMMARY")
        print("="*60)
        print(f"Input files processed: {total_files_processed}")
        print(f"Total records loaded: {len(all_records)}")
        print(f"Duplicate records removed: {len(all_records) - len(unique_records)}")
        print(f"Final unique records: {len(unique_records)}")
        print(f"Output file: {output_file}")
        print("="*60)
        
    except Exception as e:
        print(f"❌ Error saving merged file: {e}")
        raise

In [24]:
def convert_to_instruction_format(records: list) -> list:
    """
    Convert records to instruction format.
    """
    instruction_records = []
    processed_count = 0
    skipped_count = 0
    
    for record in records:
        try:
            # Check if record has required fields
            if not isinstance(record, dict):
                skipped_count += 1
                continue
                
            if 'content' not in record or 'extraction' not in record:
                skipped_count += 1
                continue
                
            extraction = record['extraction']
            if not isinstance(extraction, dict) or 'raw_output' not in extraction:
                skipped_count += 1
                continue
            
            # Create instruction format record
            instruction_record = {
                "instruction": record['content'],
                "input": None,
                "output": extraction['raw_output']
            }
            
            instruction_records.append(instruction_record)
            processed_count += 1
            
        except Exception as e:
            print(f"⚠️  Error processing record: {e}")
            skipped_count += 1
            continue
    
    print(f"✅ Successfully processed {processed_count} records")
    if skipped_count > 0:
        print(f"⚠️  Skipped {skipped_count} records due to missing fields")
    
    return instruction_records

def create_entity_extraction_instructions(input_file: str, output_file: str) -> None:
    """
    Create entity extraction instruction file from merged entity extraction results.
    """
    print(f"🚀 Starting conversion process...")
    print(f"�� Input file: {input_file}")
    print(f"📤 Output file: {output_file}")
    
    try:
        # Load merged data
        records = load_json_file(input_file)
        
        # Convert to instruction format
        instructions = convert_to_instruction_format(records)
        
        if not instructions:
            print("❌ No valid instructions created")
            return
        
        # Save instruction file
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(instructions, f, indent=2, ensure_ascii=False)
        
        print(f"�� Successfully saved {len(instructions)} instructions to {output_file}")
        
        # Print summary
        print("\n" + "="*60)
        print("CONVERSION SUMMARY")
        print("="*60)
        print(f"Input records: {len(records)}")
        print(f"Valid instructions created: {len(instructions)}")
        print(f"Output file: {output_file}")
        print("="*60)
        
        print("✅ Conversion process completed successfully!")
        
    except Exception as e:
        print(f"❌ Conversion process failed: {e}")
        raise

In [26]:
def run_complete_pipeline():
    """
    Run the complete pipeline: merge files and create instructions
    """
    print("🚀 Starting complete entity extraction pipeline...")
    
    # Step 1: Merge files
    input_dir = "../data/entity-extraction/entity_extraction_results_Qwen"
    merged_file = "../data/entity-extraction/Qwen2.5-1.5B-Instruct-entity-extraction.json"
    
    print("\n📁 Step 1: Merging entity extraction files...")
    merge_entity_extraction_files(input_dir, merged_file)
    
    # Step 2: Create instructions
    instruction_file = "../data/entity-extraction/entity_extraction_instruction.json"
    
    print("\n📝 Step 2: Creating entity extraction instructions...")
    create_entity_extraction_instructions(merged_file, instruction_file)
    
    print("\n🎉 Complete pipeline finished successfully!")

# Uncomment to run the complete pipeline
run_complete_pipeline()

🚀 Starting complete entity extraction pipeline...

📁 Step 1: Merging entity extraction files...
📁 Found 9 JSON files to merge
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-04_09-45-09_250_300.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-04_09-45-09_300_350.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_350_400.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-04_18-31-25_400_427.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_0_50.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_102_152.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-17_13-24-10_51_101.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-17_15-52-40_153_203.json...
📄 Processing Qwen2.5-1.5B-Instruct_test_2025-08-17_16-40-06_204_254.json...
📊 Total records loaded: 423
�� Files successfully processed: 9
💾 Successfully merged data to ../data/entity-extraction/Qwen2.5-1.5B-Instruct-entity-extraction.json
�� Final record count: 423

ME