# Entity and Relationship Extraction for Threat Intelligence

## Overview
This notebook implements entity and relationship extraction from threat intelligence text using LLM-based approach.

### Task Description
- **Input**: Threat intelligence text content
- **Output**: Named entities and relationships in structured format
- **Entity Types**: malware, threat type, attacker, vulnerability, tool, etc.
- **Relationship Types**: use, target, exploit, etc.

### Example
**Input**: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).

**Output**:
- Named Entities: (Clasiopa, attacker), (custom malware, malware), (Backdoor.Atharvan, malware)
- Relationships: (Clasiopa, use, custom malware), (custom malware, name, Backdoor.Atharvan)


In [8]:
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict
import datetime

# Load environment and model setup
from dotenv import load_dotenv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load environment variables
load_dotenv()

print("üîß Setting up Entity & Relationship Extraction Pipeline")
print("=" * 60)


üîß Setting up Entity & Relationship Extraction Pipeline


In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
def load_data(input_file: str) -> list:
    """
    Load threat intelligence data from JSON file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"‚úÖ Loaded {len(data)} records from {input_file}")
        return data
    except Exception as e:
        print(f"‚ùå Error loading {input_file}: {e}")
        return []

# Load threat intelligence data
data_path = '/content/drive/MyDrive/LLM-TKIG/data/processed/merged_threat_intelligence.json'
data = load_data(data_path)

if data:
    print(f"üìä Sample data structure:")
    print(f"   Keys: {list(data[0].keys())}")
    print(f"   Title: {data[0]['title'][:100]}...")


‚úÖ Loaded 427 records from /content/drive/MyDrive/LLM-TKIG/data/processed/merged_threat_intelligence.json
üìä Sample data structure:
   Keys: ['title', 'content', 'link']
   Title: FortiGuard Labs Threat Research...


In [10]:
# Device setup
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"üñ•Ô∏è  Using device: {device.upper()}")
print(f"üîß PyTorch version: {torch.__version__}")

# Memory cleanup
if device == "cuda":
    torch.cuda.empty_cache()
elif device == "mps":
    import gc
    gc.collect()
    if hasattr(torch.mps, 'empty_cache'):
        torch.mps.empty_cache()


üñ•Ô∏è  Using device: CUDA
üîß PyTorch version: 2.8.0+cu126


In [14]:
from google.colab import userdata
HF_TOKEN = userdata.get('HF_TOKEN')
DEFAULT_MODEL = userdata.get('DEFAULT_MODEL')
FALLBACK_MODEL = userdata.get('FALLBACK_MODEL')

In [15]:
# Get configuration from environment
# DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'unsloth/Qwen3-8B')
# FALLBACK_MODEL = os.getenv('FALLBACK_MODEL', 'unsloth/Qwen3-4B')

def setup_model_for_extraction(model_name: str = None, hf_token: str = None):
    """
    T·∫£i model t·ª´ Hugging Face v·ªõi token t·ª´ environment variables.
    """
    model_name = model_name or DEFAULT_MODEL
    hf_token = hf_token or HF_TOKEN

    print(f"ü§ñ ƒêang t·∫£i m√¥ h√¨nh: {model_name}")
    print(f"üì± Thi·∫øt b·ªã: {device.upper()}")
    print(f"üîë Token: {'‚úÖ Found' if hf_token else '‚ùå Missing'}")

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        # Thi·∫øt l·∫≠p ki·ªÉu d·ªØ li·ªáu v√† b·∫£n ƒë·ªì thi·∫øt b·ªã
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        device_map = "auto" if device == "cuda" else None

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            device_map=device_map,
            use_cache=False
        )

        if device_map is None and device in ["mps", "cuda"]:
            model.to(device)

        if device_map is None:
            # N·∫øu kh√¥ng s·ª≠ d·ª•ng device_map="auto", c√≥ th·ªÉ ch·ªâ ƒë·ªãnh device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                device=0 if device != "cpu" else -1,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )
        else:
            # N·∫øu s·ª≠ d·ª•ng device_map="auto", kh√¥ng ch·ªâ ƒë·ªãnh device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )

        print(f"‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng {model_name} tr√™n {device.upper()}")
        return pipe

    except Exception as e:
        print(f"‚ùå L·ªói khi t·∫£i {model_name}: {e}")
        return setup_fallback_model(hf_token)

def setup_fallback_model(hf_token: str = None):
    """
    T·∫£i fallback model n·∫øu model ch√≠nh l·ªói.
    """
    fallback_name = FALLBACK_MODEL
    hf_token = hf_token or HF_TOKEN
    print(f"üîÑ ƒêang t·∫£i m√¥ h√¨nh d·ª± ph√≤ng: {fallback_name}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(fallback_name, token=hf_token)
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            fallback_name,
            token=hf_token,
            torch_dtype=torch.float32,
            use_cache=False
        )

        if device in ["cuda", "mps"]:
            model.to(device)

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device != "cpu" else -1,
            model_kwargs={"use_cache": False}
        )

        print(f"‚úÖ {FALLBACK_MODEL} ƒë√£ s·∫µn s√†ng tr√™n {device.upper()}")
        return pipe

    except Exception as e:
        print(f"‚ùå L·ªói khi t·∫£i {FALLBACK_MODEL} fallback: {e}")
        return None

# Load model
extraction_model = setup_model_for_extraction()


ü§ñ ƒêang t·∫£i m√¥ h√¨nh: unsloth/Qwen3-8B
üì± Thi·∫øt b·ªã: CUDA
üîë Token: ‚úÖ Found


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Device set to use cuda:0


‚úÖ ƒê√£ t·∫£i th√†nh c√¥ng unsloth/Qwen3-8B tr√™n CUDA


In [16]:
def create_entity_extraction_prompt(text: str) -> str:
    """
    Create prompt for entity and relationship extraction focusing on core cybersecurity entity types.
    """
    # Truncate text to avoid token limits
    text_truncated = (text[:1500] if text else "").replace('\n', ' ').strip()

    prompt = f"""Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishing')
- Tool: Security tools or attack tools (e.g., 'PowerShell', 'Cobalt Strike', 'EHole')
- Vulnerability: Security weaknesses (e.g., 'CVE-2020-1472', 'CVE-2021-44228')
- IP: IP addresses (e.g., '45.153.243.93', '192.168.1.100')
- Domain: Domain names (e.g., 'malicious-domain[.]com', 'evil[.]example[.]com')
- URL: URLs (e.g., 'hxxp://178.73.192[.]15/cal.exe')
- File: File names (e.g., 'rtk.lnk', 'payload.exe', 'shtasks.exe')
- Hash: File hashes (e.g., '2aee8bb2a953124803bc42e5c42935c9', MD5/SHA1/SHA256)

Relationship Types:
- use, hash, aka, execute, used by, download, resolved to, IP, drop, associated with, deploy, communicate with, connect to, install, exploit, contain, run, launch, target, linked to

If there are no entities and relationships pertaining to the specified types, please state 'No related entities and relations'. Make sure to follow the output format shown in the following examples.

Example 1:
Input: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).
Output: Named Entities: (Clasiopa, Attacker), (Backdoor.Atharvan, Malware)\nRelationships: (Clasiopa, uses, Backdoor.Atharvan)

Example 2:
Input: The Emotet malware has been observed using new phishing techniques to target banking institutions. The malware exploits CVE-2021-1234 vulnerability in Microsoft Office.
Output: Named Entities: (Emotet, Malware), (phishing, Technique), (CVE-2021-1234, Vulnerability), (Microsoft Office, Tool)\nRelationships: (Emotet, uses, phishing), (Emotet, exploits, CVE-2021-1234)

Example 3:
Input: The threat actor downloaded malicious payload from hxxp://malicious-domain[.]com/payload.exe and used hash 2aee8bb2a953124803bc42e5c42935c9 to verify file integrity. The attack targeted IP address 192.168.1.100.
Output: Named Entities: (threat actor, Attacker), (malicious payload, File), (hxxp://malicious-domain[.]com/payload.exe, URL), (2aee8bb2a953124803bc42e5c42935c9, Hash), (192.168.1.100, IP)\nRelationships: (threat actor, uses, hxxp://malicious-domain[.]com/payload.exe), (threat actor, targets, 192.168.1.100)

Example 4:
Input: H2Miner botnet uses Kinsing malware and Cobalt Strike to deploy XMRig miners. The campaign communicates with C2 server at evil[.]domain[.]com and is attributed to APT group.
Output: Named Entities: (H2Miner, Threat Type), (Kinsing, Malware), (Cobalt Strike, Tool), (XMRig, Tool), (evil[.]domain[.]com, Domain), (APT group, Attacker)\nRelationships: (H2Miner, uses, Kinsing), (H2Miner, uses, Cobalt Strike), (H2Miner, uses, XMRig), (Kinsing, communicatesWith, evil[.]domain[.]com), (H2Miner, attributedTo, APT group)

Example 5:
Input: The weather forecast shows sunny skies and moderate temperatures for the weekend.
Output: No related entities and relations

Now extract entities and relationships from the following text:
Input: {text_truncated}
Output:"""

    return prompt

# Test the prompt creation
if data:
    sample_prompt = create_entity_extraction_prompt(data[0]['content'])
    print("üìù Sample prompt (first 500 chars):")
    print(sample_prompt[:500] + "...")


üìù Sample prompt (first 500 chars):
Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishi...


In [17]:
def extract_entities_and_relationships(pipe, text: str) -> Dict[str, Any]:
    """
    Extract entities and relationships from text using the LLM.
    """
    try:
        prompt = create_entity_extraction_prompt(text)

        # Generate response
        response = pipe(
            prompt,
            max_new_tokens=300,
            do_sample=False,
            temperature=0.1,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )

        # Extract generated text
        generated_text = response[0]['generated_text']
        answer = generated_text[len(prompt):].strip()

        print(f"üîç Raw model output: {answer[:200]}...")

        # Parse the response
        entities, relationships = parse_extraction_output(answer)

        return {
            "raw_output": answer,
            "entities": entities,
            "relationships": relationships,
            "has_entities": len(entities) > 0
        }

    except Exception as e:
        print(f"‚ùå Error in extraction: {e}")
        return {
            "raw_output": "",
            "entities": [],
            "relationships": [],
            "has_entities": False,
            "error": str(e)
        }

def parse_extraction_output(output: str) -> Tuple[List[Tuple], List[Tuple]]:
    """
    Parse the model output to extract entities and relationships.
    """
    entities = []
    relationships = []

    # Check for "No related entities" case
    if "no related entities" in output.lower():
        return entities, relationships

    try:
        # Split output into lines
        lines = [line.strip() for line in output.split('\n') if line.strip()]

        current_section = None
        for line in lines:
            line_lower = line.lower()

            if "named entities:" in line_lower:
                current_section = "entities"
                # Extract entities from the same line
                entity_part = line.split(":", 1)[1] if ":" in line else ""
                entities.extend(extract_tuples_from_text(entity_part))

            elif "relationships:" in line_lower:
                current_section = "relationships"
                # Extract relationships from the same line
                rel_part = line.split(":", 1)[1] if ":" in line else ""
                relationships.extend(extract_tuples_from_text(rel_part))

            elif current_section == "entities":
                entities.extend(extract_tuples_from_text(line))

            elif current_section == "relationships":
                relationships.extend(extract_tuples_from_text(line))

    except Exception as e:
        print(f"‚ö†Ô∏è  Error parsing output: {e}")

    return entities, relationships

def extract_tuples_from_text(text: str) -> List[Tuple]:
    """
    Extract tuples from text using regex pattern matching.
    """
    tuples = []

    # Pattern to match (item1, item2) or (item1, item2, item3)
    pattern = r'\(([^)]+)\)'
    matches = re.findall(pattern, text)

    for match in matches:
        # Split by comma and clean up
        parts = [part.strip() for part in match.split(',')]
        if len(parts) >= 2:
            tuples.append(tuple(parts))

    return tuples

# Test the extraction function
if extraction_model and data:
    print("\nüß™ Testing entity extraction on sample data...")
    test_result = extract_entities_and_relationships(extraction_model, data[0]['content'])

    print(f"\nüìä Extraction Results:")
    print(f"   Entities found: {len(test_result['entities'])}")
    print(f"   Relationships found: {len(test_result['relationships'])}")

    if test_result['entities']:
        print("\nüè∑Ô∏è  Sample Entities:")
        for entity in test_result['entities'][:5]:
            print(f"     {entity}")

    if test_result['relationships']:
        print("\nüîó Sample Relationships:")
        for rel in test_result['relationships'][:5]:
            print(f"     {rel}")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



üß™ Testing entity extraction on sample data...
üîç Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (H2Miner, Threat Type), (Monero, Tool), (Dark 101, Malware), (NordDragonScan, Tool), (RondoDox, Malware), (TBK DVRs, To...

üìä Extraction Results:
   Entities found: 17
   Relationships found: 0

üè∑Ô∏è  Sample Entities:
     ('NailaoLocker', 'Malware')
     ('SM2', 'Technique')
     ('Lcrypt0rx', 'Malware')
     ('H2Miner', 'Threat Type')
     ('Monero', 'Tool')


In [23]:
def _load_existing_records(output_path: Path) -> List[Dict]:
    """Load existing JSON array from file; return [] if file doesn't exist/invalid."""
    if not output_path.exists():
        return []
    try:
        with output_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
            if isinstance(data, list):
                return data
    except Exception:
        pass
    return []

def _persist_records(output_path: Path, all_records: List[Dict]) -> None:
    """Atomically write JSON array to file (UTF-8, pretty, no ASCII escaping)."""
    tmp_path = output_path.with_suffix(output_path.suffix + ".tmp")
    with tmp_path.open("w", encoding="utf-8") as f:
        json.dump(all_records, f, ensure_ascii=False, indent=2)
        f.write("\n")
    tmp_path.replace(output_path)



def process_articles_for_extraction(data: List[Dict], pipe, start: int = 0, offset:int=5, output_path: Path=None) -> List[Dict]:
    """
    Process multiple articles for entity and relationship extraction.
    """
    end = min(start + offset, len(data))
    articles_to_process = data[start:end]
    results = []

    print(f"üîç Processing {len(articles_to_process)} articles for entity extraction...")


    # Chu·∫©n b·ªã output_path
    if output_path is None:
      raise ValueError("Output path is required")
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # T·∫£i d·ªØ li·ªáu hi·ªán c√≥ & x√¢y set ti√™u ƒë·ªÅ ƒë·ªÉ ki·ªÉm tra tr√πng
    existing_records = _load_existing_records(output_path)
    existing_titles = {(rec.get("title") or "").strip() for rec in existing_records}

    # tr√°nh tr√πng trong batch hi·ªán t·∫°i
    batch_titles_seen = set()


    for i, article in enumerate(articles_to_process):
        print(f"\nProcessing {i+1}/{len(articles_to_process)}: {article.get('title', 'Unknown')[:60]}...")

        title = (article.get('title', '') or '').strip()
        link  = article.get('link', '')
        content = article.get('content', '')

        if title in existing_titles:
            print(f"  ‚è≠Ô∏è  Skip (duplicate title in file): {title[:60] or 'Untitled'}")
            continue
        if title in batch_titles_seen:
            print(f"  ‚è≠Ô∏è  Skip (duplicate title in batch): {title[:60] or 'Untitled'}")
            continue


        # Extract entities and relationships
        extraction_result = extract_entities_and_relationships(pipe, content)
        entities = extraction_result.get('entities', []) if isinstance(extraction_result, dict) else []
        relationships = extraction_result.get('relationships', []) if isinstance(extraction_result, dict) else []

        # Combine with original article data
        result = {
            "title": title,
            "link": link,
            "content": content,
            "extraction": extraction_result,
            "entity_count": len(entities),
            "relationship_count": len(relationships)
        }

        results.append(result)

        # Ghi n·ªëi v√†o file (th√¥ng qua m·∫£ng t·ªïng r·ªìi ghi atomically)
        existing_records.append(result)
        _persist_records(output_path, existing_records)

        # ƒê√°nh d·∫•u ƒë√£ th·∫•y
        batch_titles_seen.add(title)
        existing_titles.add(title)

        # Progress update
        if (i + 1) % 5 == 0:
            print(f"  ‚úÖ Processed {i+1}/{len(articles_to_process)} articles")

    print(f"\nüíæ Saved {len(results)} new records to {str(output_path)}")
    return results




In [24]:
import datetime
today = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
offset = 427


In [28]:
start = 0
end = min(len(data), start+offset)
output_path = Path(f"/content/drive/MyDrive/LLM-TKIG/data/entity-extraction/{DEFAULT_MODEL}_{today}_{start}_{end}.json")

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset, output_path=output_path)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Processing 427 articles for entity extraction...

Processing 1/427: FortiGuard Labs Threat Research...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (H2Miner, Threat Type), (Monero, Tool), (Dark 101, Malware), (NordDragonScan, Tool), (RondoDox, Malware), (TBK DVRs, To...

Processing 2/427: NailaoLocker Ransomware‚Äôs ‚ÄúCheese‚Äù...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


üîç Raw model output: Named Entities: (NailaoLocker, Malware), (AES-256-CBC, Technique), (SM2, Technique), (usysdiag.exe, File), (sensapi, File)\nRelationships: (NailaoLocker, uses, AES-256-CBC), (NailaoLocker, uses, SM2),...

Processing 3/427: Improving Cloud Intrusion Detection and Triage with FortiCNA...


KeyboardInterrupt: 