# Entity and Relationship Extraction for Threat Intelligence

## Overview
This notebook implements entity and relationship extraction from threat intelligence text using LLM-based approach.

### Task Description
- **Input**: Threat intelligence text content
- **Output**: Named entities and relationships in structured format
- **Entity Types**: malware, threat type, attacker, vulnerability, tool, etc.
- **Relationship Types**: use, target, exploit, etc.

### Example
**Input**: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).

**Output**:
- Named Entities: (Clasiopa, attacker), (custom malware, malware), (Backdoor.Atharvan, malware)
- Relationships: (Clasiopa, use, custom malware), (custom malware, name, Backdoor.Atharvan)


In [1]:
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple, Any
from collections import defaultdict
import datetime

# Load environment and model setup
from dotenv import load_dotenv
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Load environment variables
load_dotenv()

print("🔧 Setting up Entity & Relationship Extraction Pipeline")
print("=" * 60)


🔧 Setting up Entity & Relationship Extraction Pipeline


In [4]:
def load_data(input_file: str) -> list:
    """
    Load threat intelligence data from JSON file.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Loaded {len(data)} records from {input_file}")
        return data
    except Exception as e:
        print(f"❌ Error loading {input_file}: {e}")
        return []

# Load threat intelligence data
data_path = '../data/processed/merged_threat_intelligence.json'
data = load_data(data_path)

if data:
    print(f"📊 Sample data structure:")
    print(f"   Keys: {list(data[0].keys())}")
    print(f"   Title: {data[0]['title'][:100]}...")


✅ Loaded 427 records from ../data/processed/merged_threat_intelligence.json
📊 Sample data structure:
   Keys: ['title', 'content', 'link']
   Title: FortiGuard Labs Threat Research...


In [5]:
# Device setup
device = (
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

print(f"🖥️  Using device: {device.upper()}")
print(f"🔧 PyTorch version: {torch.__version__}")

# Memory cleanup
if device == "cuda":
    torch.cuda.empty_cache()
elif device == "mps":
    import gc
    gc.collect()
    if hasattr(torch.mps, 'empty_cache'):
        torch.mps.empty_cache()


🖥️  Using device: CUDA
🔧 PyTorch version: 2.7.1+cu128


In [6]:
# Get configuration from environment
HF_TOKEN = os.getenv('HF_TOKEN')
DEFAULT_MODEL = os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')
FALLBACK_MODEL = os.getenv('FALLBACK_MODEL', 'gpt2')

def setup_model_for_extraction(model_name: str = None, hf_token: str = None):
    """
    Tải model từ Hugging Face với token từ environment variables.
    """
    model_name = model_name or DEFAULT_MODEL
    hf_token = hf_token or HF_TOKEN

    print(f"🤖 Đang tải mô hình: {model_name}")
    print(f"📱 Thiết bị: {device.upper()}")
    print(f"🔑 Token: {'✅ Found' if hf_token else '❌ Missing'}")

    try:
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True
        )
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        # Thiết lập kiểu dữ liệu và bản đồ thiết bị
        torch_dtype = torch.float16 if device == "cuda" else torch.float32
        device_map = "auto" if device == "cuda" else None

        # Load model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            trust_remote_code=True,
            torch_dtype=torch_dtype,
            device_map=device_map,
            use_cache=False
        )

        if device_map is None and device in ["mps", "cuda"]:
            model.to(device)

        if device_map is None:
            # Nếu không sử dụng device_map="auto", có thể chỉ định device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                device=0 if device != "cpu" else -1,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )
        else:
            # Nếu sử dụng device_map="auto", không chỉ định device
            pipe = pipeline(
                "text-generation",
                model=model,
                tokenizer=tokenizer,
                torch_dtype=torch_dtype,
                model_kwargs={"use_cache": False}
            )

        print(f"✅ Đã tải thành công {model_name} trên {device.upper()}")
        return pipe

    except Exception as e:
        print(f"❌ Lỗi khi tải {model_name}: {e}")
        return setup_fallback_model(hf_token)

def setup_fallback_model(hf_token: str = None):
    """
    Tải fallback model nếu model chính lỗi.
    """
    fallback_name = FALLBACK_MODEL
    hf_token = hf_token or HF_TOKEN
    print(f"🔄 Đang tải mô hình dự phòng: {fallback_name}")

    try:
        tokenizer = AutoTokenizer.from_pretrained(fallback_name, token=hf_token)
        tokenizer.pad_token = tokenizer.pad_token or tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            fallback_name,
            token=hf_token,
            torch_dtype=torch.float32,
            use_cache=False
        )

        if device in ["cuda", "mps"]:
            model.to(device)

        pipe = pipeline(
            "text-generation",
            model=model,
            tokenizer=tokenizer,
            device=0 if device != "cpu" else -1,
            model_kwargs={"use_cache": False}
        )

        print(f"✅ {FALLBACK_MODEL} đã sẵn sàng trên {device.upper()}")
        return pipe

    except Exception as e:
        print(f"❌ Lỗi khi tải {FALLBACK_MODEL} fallback: {e}")
        return None

# Load model
extraction_model = setup_model_for_extraction()


🤖 Đang tải mô hình: Qwen/Qwen2.5-1.5B-Instruct
📱 Thiết bị: CUDA
🔑 Token: ✅ Found


Device set to use cuda:0


✅ Đã tải thành công Qwen/Qwen2.5-1.5B-Instruct trên CUDA


In [7]:
def create_entity_extraction_prompt(text: str) -> str:
    """
    Create prompt for entity and relationship extraction focusing on core cybersecurity entity types.
    """
    # Truncate text to avoid token limits
    text_truncated = (text[:1500] if text else "").replace('\n', ' ').strip()
    
    prompt = f"""Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishing')
- Tool: Security tools or attack tools (e.g., 'PowerShell', 'Cobalt Strike', 'EHole')
- Vulnerability: Security weaknesses (e.g., 'CVE-2020-1472', 'CVE-2021-44228')
- IP: IP addresses (e.g., '45.153.243.93', '192.168.1.100')
- Domain: Domain names (e.g., 'malicious-domain[.]com', 'evil[.]example[.]com')
- URL: URLs (e.g., 'hxxp://178.73.192[.]15/cal.exe')
- File: File names (e.g., 'rtk.lnk', 'payload.exe', 'shtasks.exe')
- Hash: File hashes (e.g., '2aee8bb2a953124803bc42e5c42935c9', MD5/SHA1/SHA256)

Relationship Types:
- use, hash, aka, execute, used by, download, resolved to, IP, drop, associated with, deploy, communicate with, connect to, install, exploit, contain, run, launch, target, linked to

If there are no entities and relationships pertaining to the specified types, please state 'No related entities and relations'. Make sure to follow the output format shown in the following examples.

Example 1:
Input: A hitherto unknown attack group has been observed targeting a materials research organization in Asia. The group, which Symantec calls Clasiopa, is characterized by a distinct toolset, which includes one piece of custom malware (Backdoor.Atharvan).
Output: Named Entities: (Clasiopa, Attacker), (Backdoor.Atharvan, Malware)\\nRelationships: (Clasiopa, uses, Backdoor.Atharvan)

Example 2:
Input: The Emotet malware has been observed using new phishing techniques to target banking institutions. The malware exploits CVE-2021-1234 vulnerability in Microsoft Office.
Output: Named Entities: (Emotet, Malware), (phishing, Technique), (CVE-2021-1234, Vulnerability), (Microsoft Office, Tool)\\nRelationships: (Emotet, uses, phishing), (Emotet, exploits, CVE-2021-1234)

Example 3:
Input: The threat actor downloaded malicious payload from hxxp://malicious-domain[.]com/payload.exe and used hash 2aee8bb2a953124803bc42e5c42935c9 to verify file integrity. The attack targeted IP address 192.168.1.100.
Output: Named Entities: (threat actor, Attacker), (malicious payload, File), (hxxp://malicious-domain[.]com/payload.exe, URL), (2aee8bb2a953124803bc42e5c42935c9, Hash), (192.168.1.100, IP)\\nRelationships: (threat actor, uses, hxxp://malicious-domain[.]com/payload.exe), (threat actor, targets, 192.168.1.100)

Example 4:
Input: H2Miner botnet uses Kinsing malware and Cobalt Strike to deploy XMRig miners. The campaign communicates with C2 server at evil[.]domain[.]com and is attributed to APT group.
Output: Named Entities: (H2Miner, Threat Type), (Kinsing, Malware), (Cobalt Strike, Tool), (XMRig, Tool), (evil[.]domain[.]com, Domain), (APT group, Attacker)\\nRelationships: (H2Miner, uses, Kinsing), (H2Miner, uses, Cobalt Strike), (H2Miner, uses, XMRig), (Kinsing, communicatesWith, evil[.]domain[.]com), (H2Miner, attributedTo, APT group)

Example 5:
Input: The weather forecast shows sunny skies and moderate temperatures for the weekend.
Output: No related entities and relations

Now extract entities and relationships from the following text:
Input: {text_truncated}
Output:"""
    
    return prompt

# Test the prompt creation
if data:
    sample_prompt = create_entity_extraction_prompt(data[0]['content'])
    print("📝 Sample prompt (first 500 chars):")
    print(sample_prompt[:500] + "...")


📝 Sample prompt (first 500 chars):
Instruction: Please identify the following types of entities and then extract the relationships between these extracted entities:

Entity Types (focus on these only):
- Malware: Malicious software (e.g., 'Stuxnet', 'Emotet', 'Backdoor.Atharvan')
- Threat Type: Category of threats (e.g., 'Ransomware', 'APT', 'Botnet')
- Attacker: Threat actors/groups (e.g., 'APT28', 'Lazarus Group', 'Shuckworm')
- Technique: Attack techniques/TTPs (e.g., 'T1057: Process Discovery', 'Privilege Escalation', 'Phishi...


In [8]:
def extract_entities_and_relationships(pipe, text: str) -> Dict[str, Any]:
    """
    Extract entities and relationships from text using the LLM.
    """
    try:
        prompt = create_entity_extraction_prompt(text)
        
        # Generate response
        response = pipe(
            prompt,
            max_new_tokens=300,
            do_sample=False,
            temperature=0.1,
            pad_token_id=pipe.tokenizer.eos_token_id,
        )
        
        # Extract generated text
        generated_text = response[0]['generated_text']
        answer = generated_text[len(prompt):].strip()
        
        print(f"🔍 Raw model output: {answer[:200]}...")
        
        # Parse the response
        entities, relationships = parse_extraction_output(answer)
        
        return {
            "raw_output": answer,
            "entities": entities,
            "relationships": relationships,
            "has_entities": len(entities) > 0
        }
        
    except Exception as e:
        print(f"❌ Error in extraction: {e}")
        return {
            "raw_output": "",
            "entities": [],
            "relationships": [],
            "has_entities": False,
            "error": str(e)
        }

def parse_extraction_output(output: str) -> Tuple[List[Tuple], List[Tuple]]:
    """
    Parse the model output to extract entities and relationships.
    """
    entities = []
    relationships = []
    
    # Check for "No related entities" case
    if "no related entities" in output.lower():
        return entities, relationships
    
    try:
        # Split output into lines
        lines = [line.strip() for line in output.split('\n') if line.strip()]
        
        current_section = None
        for line in lines:
            line_lower = line.lower()
            
            if "named entities:" in line_lower:
                current_section = "entities"
                # Extract entities from the same line
                entity_part = line.split(":", 1)[1] if ":" in line else ""
                entities.extend(extract_tuples_from_text(entity_part))
                
            elif "relationships:" in line_lower:
                current_section = "relationships"
                # Extract relationships from the same line
                rel_part = line.split(":", 1)[1] if ":" in line else ""
                relationships.extend(extract_tuples_from_text(rel_part))
                
            elif current_section == "entities":
                entities.extend(extract_tuples_from_text(line))
                
            elif current_section == "relationships":
                relationships.extend(extract_tuples_from_text(line))
    
    except Exception as e:
        print(f"⚠️  Error parsing output: {e}")
    
    return entities, relationships

def extract_tuples_from_text(text: str) -> List[Tuple]:
    """
    Extract tuples from text using regex pattern matching.
    """
    tuples = []
    
    # Pattern to match (item1, item2) or (item1, item2, item3)
    pattern = r'\(([^)]+)\)'
    matches = re.findall(pattern, text)
    
    for match in matches:
        # Split by comma and clean up
        parts = [part.strip() for part in match.split(',')]
        if len(parts) >= 2:
            tuples.append(tuple(parts))
    
    return tuples

# Test the extraction function
if extraction_model and data:
    print("\n🧪 Testing entity extraction on sample data...")
    test_result = extract_entities_and_relationships(extraction_model, data[0]['content'])
    
    print(f"\n📊 Extraction Results:")
    print(f"   Entities found: {len(test_result['entities'])}")
    print(f"   Relationships found: {len(test_result['relationships'])}")
    
    if test_result['entities']:
        print("\n🏷️  Sample Entities:")
        for entity in test_result['entities'][:5]:
            print(f"     {entity}")
    
    if test_result['relationships']:
        print("\n🔗 Sample Relationships:")
        for rel in test_result['relationships'][:5]:
            print(f"     {rel}")


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



🧪 Testing entity extraction on sample data...
🔍 Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (Dark 101, Malware), (FortiCNAPP Composite Alerts, Tool), (Lcrypt0rx, Malware), (FortiCNAPP Labs, Tool), (FortiSandbox ...

📊 Extraction Results:
   Entities found: 12
   Relationships found: 9

🏷️  Sample Entities:
     ('NailaoLocker', 'Malware')
     ('SM2', 'Technique')
     ('Lcrypt0rx', 'Malware')
     ('Dark 101', 'Malware')
     ('FortiCNAPP Composite Alerts', 'Tool')

🔗 Sample Relationships:
     ('NailaoLocker', 'uses', 'SM2')
     ('NailaoLocker', 'contains', 'Lcrypt0rx')
     ('Lcrypt0rx', 'uses', 'Dark 101')
     ('FortiCNAPP Composite Alerts', 'linksWeakSignalsIntoClearTimelines')
     ('Lcrypt0rx', 'uses', 'FortiCNAPP Labs')


In [9]:
def process_articles_for_extraction(data: List[Dict], pipe, start: int = 0, offset:int=5) -> List[Dict]:
    """
    Process multiple articles for entity and relationship extraction.
    """
    end = min(start + offset, len(data))
    articles_to_process = data[start:end]
    results = []

    print(f"🔍 Processing {len(articles_to_process)} articles for entity extraction...")

    for i, article in enumerate(articles_to_process):
        print(f"\nProcessing {i+1}/{len(articles_to_process)}: {article.get('title', 'Unknown')[:60]}...")

        # Extract entities and relationships
        extraction_result = extract_entities_and_relationships(pipe, article.get('content', ''))

        # Combine with original article data
        result = {
            "title": article.get('title', ''),
            "link": article.get('link', ''),
            "content": article.get('content', ''),
            "extraction": extraction_result,
            "entity_count": len(extraction_result['entities']),
            "relationship_count": len(extraction_result['relationships'])
        }

        results.append(result)
        
        # Progress update
        if (i + 1) % 5 == 0:
            print(f"  ✅ Processed {i+1}/{len(articles_to_process)} articles")

    return results

# Process a small batch first for testing
print("\n🚀 Processing first 5 articles for entity extraction...")
extraction_results = process_articles_for_extraction(data, extraction_model, start=0, offset = 5)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



🚀 Processing first 5 articles for entity extraction...
🔍 Processing 5 articles for entity extraction...

Processing 1/5: FortiGuard Labs Threat Research...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (Dark 101, Malware), (FortiCNAPP Composite Alerts, Tool), (Lcrypt0rx, Malware), (FortiCNAPP Labs, Tool), (FortiSandbox ...

Processing 2/5: NailaoLocker Ransomware’s “Cheese”...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (NailaoLocker, Malware), (AES-256-CBC, Technique), (SM2 cryptographic key, Vulnerability), (Windows, Platform), (user files, File), (high sev...

Processing 3/5: Improving Cloud Intrusion Detection and Triage with FortiCNA...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (Cloud, Entity), (Multi-stage technique, Technique), (Authentication abuse, Technique), (Privilege escalation, Technique), (Command execution...

Processing 4/5: Old Miner, New Tricks...


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (FortiCNAPP team, Attacker), (H2miner, Threat Type), (Lcrypt0rx, Malware), (Linux, OS), (Windows, OS), (Containers, OS), (KinSing, Tool), (Xmrig miners, Tool), (Lcrypt0rx, Malware), (L...

Processing 5/5: How FortiSandbox 5.0 Detects Dark 101 Ransomware Despite Eva...
🔍 Raw model output: Named Entities: (FortiGuard Labs, Organization), (Dark 101, Malware Family), (ransomware, Threat Type), (ransomnote, Threat Type), (Bitcoin, Currency), (Task Manager, Tool), (backupcatalog, Object), (...
  ✅ Processed 5/5 articles


In [29]:
def save_extraction_results(results: List[Dict], output_file: str = "entity-extraction.json"):
    """
    Save extraction results to files.
    """
    try:
        # Create output directory
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        # Convert to an absolute path to avoid relative path issues
        absolute_path = output_path.resolve()
        print(f"💾 Saving to: {absolute_path}")
        
        with open(absolute_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        
        print(f"\n💾 SAVED EXTRACTION RESULTS to {output_file}")

        
    except Exception as e:
        print(f"❌ Error saving results: {e}")
        print(f"   Attempted path: {output_path}")
        print(f"   Current working directory: {os.getcwd()}")
        print(f"   Absolute path would be: {Path(output_path).resolve()}")


In [16]:
# test save results
import datetime
today = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
offset = 2
start = 0
end = min(len(data), start+offset)

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Processing 2 articles for entity extraction...

Processing 1/2: FortiGuard Labs Threat Research...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔍 Raw model output: Named Entities: (NailaoLocker, Malware), (SM2, Technique), (Lcrypt0rx, Malware), (Dark 101, Malware), (FortiCNAPP Composite Alerts, Tool), (Lcrypt0rx, Malware), (FortiCNAPP Labs, Tool), (FortiSandbox ...

Processing 2/2: NailaoLocker Ransomware’s “Cheese”...
🔍 Raw model output: Named Entities: (FortiGuard Labs Threat Research, Entity), (NailaoLocker, Malware), (AES-256-CBC, Technique), (SM2 cryptographic key, Vulnerability), (Windows, Platform), (user files, File), (high sev...


In [30]:
output_path = f"../data/entity-extraction/entity-extraction_{today}_{start}_{end}.json"
save_extraction_results(results, output_path)

💾 Saving to: C:\Users\KietVu\Testplace\LLM_TKIG\data\entity-extraction\entity-extraction_2025-08-03_19-54-10_0_2.json

💾 SAVED EXTRACTION RESULTS to ../data/entity-extraction/entity-extraction_2025-08-03_19-54-10_0_2.json


In [None]:
import datetime
today = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
offset = 50


In [None]:
start = 0
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 50
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 100
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 150
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 200
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 250
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 300
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 350
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)

In [None]:
start = 400
end = min(len(data), start+offset)
output_path = f"../data/entity-extraction/topic_classification_{os.getenv('DEFAULT_MODEL', 'Qwen/Qwen2.5-1.5B-Instruct')}_test_{today}_{start}_{end}.json"

results = process_articles_for_extraction(data, extraction_model,start=start,offset=offset)
save_extraction_results(results, output_path)