# Optimized Entity and Relationship Extraction with Ollama (Llama3)

This notebook extracts entities and relationships from threat intelligence data using Llama3 via Ollama.

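- Optimized for speed: shorter prompts and fewer few-shot examples. Note: the 500-character content limit is not applied by the code below; the full `content` field of each record is sent to the model.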
- Fixed parsing/normalization to preserve data.
- Input: data/raw/merged_threat_intelligence.json
- Output: data/entity-extraction/extractions_llama3_{start}_{end}_{timestamp}.json


In [1]:
import os
import json
import datetime as dt
from typing import List, Dict, Any, Optional
from pathlib import Path
from tqdm import tqdm
import logging

import ollama

# Setup
# NOTE(review): this sets the variable in the notebook process only; confirm that
# the Ollama server actually reads it before relying on it for GPU offload.
os.environ['OLLAMA_NUM_GPU_LAYERS'] = '999'  # Force GPU offload

# The project root can be overridden with the LLM_TKIG_ROOT environment variable so
# the notebook also runs on other machines; the default keeps the original location.
PROJECT_ROOT = Path(os.environ.get('LLM_TKIG_ROOT', '/Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG'))
os.chdir(PROJECT_ROOT)  # all data paths below are relative to the project root

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("extraction")

# Pull Llama3 if not present.
# Only a "model not found" response (HTTP 404) triggers a pull; any other failure
# (server down, interrupted by the user, ...) is surfaced instead of being hidden.
try:
    ollama.show('llama3')
except ollama.ResponseError as err:
    if err.status_code != 404:
        raise
    ollama.pull('llama3')


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/show "HTTP/1.1 200 OK"


In [2]:
# Prompts (optimized: fewer few-shots, shorter content)
SYSTEM_PROMPT = """Please identify the following types of entities and then extract the relationships between these extracted entities: malware(e.g., 'Stuxnet'), threat type (e.g.,'ransomware’),… If there are no entities and relationships pertaining to the specified types, please state 'No related entities and relations'. Make sure to follow the output format shown in the following example.
"""

USER_TEMPLATE = "Title: {title}\nLink: {link}\nContent:\n{content}\n\nReturn JSON only."

# Reduced few-shot examples (only 1 for speed)
FEW_SHOT_EXAMPLES = [
    {
        "title": "Example Title",
        "link": "example.com",
        "content": "APT29 uses Mimikatz to exploit CVE-2019-1234 in Windows.",
        "output": {
            "entities": [{"text": "APT29", "type": "ThreatActor"}, {"text": "Mimikatz", "type": "Tool"}, {"text": "CVE-2019-1234", "type": "CVE"}, {"text": "Windows", "type": "Platform"}],
            "relationships": [["APT29", "uses", "Mimikatz"], ["Mimikatz", "exploits", "CVE-2019-1234"], ["APT29", "targets", "Windows"]]
        }
    }
]


In [3]:
# Functions
def build_prompt(title, link, content):
    """Assemble the chat messages for one article.

    The returned list starts with the system prompt, replays every few-shot
    example as a user/assistant exchange, and ends with the user message built
    from ``title``, ``link`` and ``content``.
    """
    few_shot_turns = [
        turn
        for example in FEW_SHOT_EXAMPLES
        for turn in (
            # extra keys in the example (e.g. "output") are ignored by format()
            {"role": "user", "content": USER_TEMPLATE.format(**example)},
            {"role": "assistant", "content": json.dumps(example['output'])},
        )
    ]
    query = {
        "role": "user",
        "content": USER_TEMPLATE.format(title=title, link=link, content=content),
    }
    return [{"role": "system", "content": SYSTEM_PROMPT}, *few_shot_turns, query]

def extract_with_model(messages, model_name='llama3'):
    """Send ``messages`` to the Ollama model ``model_name`` and return the reply text."""
    reply = ollama.chat(messages=messages, model=model_name)
    message = reply['message']
    return message['content']


In [4]:
# Robust safe_parse_json: decode the JSON object that starts at the first "{",
# ignoring any text before or after it.
def safe_parse_json(s: str) -> Optional[Dict[str, Any]]:
    """Extract the JSON object embedded in a model reply.

    Decoding starts at the first ``{`` and stops at the end of that object, so
    leading prose, Markdown code fences and trailing commentary (even commentary
    that itself contains braces) do not break parsing.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded object as a dict, or ``None`` when ``s`` is not a string,
        contains no ``{``, or the object starting at the first ``{`` is not
        valid JSON (for example a truncated reply). A warning is logged in
        every ``None`` case.
    """
    if not isinstance(s, str):
        logger.warning(f"JSON parse error: expected str, got {type(s).__name__}")
        return None

    start = s.find("{")
    if start == -1:
        # Bare scalars or arrays are not usable extraction results, so they are
        # rejected here instead of being returned as a non-dict value.
        logger.warning("JSON parse error: no JSON object found")
        return None

    try:
        # raw_decode parses exactly one value and ignores whatever follows it.
        obj, _ = json.JSONDecoder().raw_decode(s, start)
    except json.JSONDecodeError as e:
        logger.warning(f"JSON parse error: {e}")
        return None
    return obj


In [5]:
# normalize_output: lenient clean-up of the parsed model reply (keeps original
# casing, de-duplicates, converts relationships to dicts).
def _clean_str(value: Any) -> str:
    """Return ``value`` stripped of surrounding whitespace, or "" if it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def normalize_output(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Clean and de-duplicate the entities and relationships of a parsed reply.

    Malformed items are skipped rather than raising, so that one unexpected
    model reply cannot abort a long-running batch.

    Args:
        parsed: Decoded model reply. Expected keys are ``"entities"`` (list of
            ``{"text", "type"}`` dicts) and ``"relationships"`` (list of
            ``[subject, predicate, object]`` triples, or dicts with the keys
            ``subject``/``predicate``/``object``).

    Returns:
        ``{"entities": [...], "relationships": [...]}`` where entities are
        ``{"text", "type"}`` dicts and relationships are
        ``{"subject", "predicate", "object"}`` dicts. Values are stripped,
        casing is left untouched, first-seen order is kept and exact duplicates
        are removed. Items with a missing, empty or non-string field are dropped.
    """
    if not isinstance(parsed, dict):
        return {"entities": [], "relationships": []}

    entities = parsed.get("entities")
    relationships = parsed.get("relationships")
    # A missing key, null, or any non-list value is treated as "nothing extracted".
    if not isinstance(entities, list):
        entities = []
    if not isinstance(relationships, list):
        relationships = []

    norm_entities: List[Dict[str, str]] = []
    seen_e = set()
    for e in entities:
        if not isinstance(e, dict):
            continue
        text = _clean_str(e.get("text"))
        etype = _clean_str(e.get("type"))  # No capitalization
        if text and etype and (text, etype) not in seen_e:
            seen_e.add((text, etype))
            norm_entities.append({"text": text, "type": etype})

    norm_rels: List[Dict[str, str]] = []
    seen_r = set()
    for r in relationships:
        if isinstance(r, dict):
            # Already in the output shape; this also makes the function idempotent.
            triple = (r.get("subject"), r.get("predicate"), r.get("object"))
        elif isinstance(r, (list, tuple)) and len(r) == 3:
            triple = tuple(r)
        else:
            continue
        sub, pred, obj = (_clean_str(x) for x in triple)
        if not (sub and pred and obj):
            continue
        tuple_r = (sub, pred, obj)
        if tuple_r not in seen_r:
            seen_r.add(tuple_r)
            norm_rels.append({"subject": sub, "predicate": pred, "object": obj})

    return {"entities": norm_entities, "relationships": norm_rels}


In [6]:
# Read the merged threat-intelligence records (path is relative to the project root)
INPUT_PATH = "data/raw/merged_threat_intelligence.json"
data = json.loads(Path(INPUT_PATH).read_text(encoding="utf-8"))
print(f"Loaded {len(data)} records")


Loaded 427 records


In [7]:
# Smoke test: run the full pipeline on the first record and show each stage
sample = data[0]
messages = build_prompt(
    title=sample['title'],
    link=sample['link'],
    content=sample['content'],
)
raw = extract_with_model(messages, model_name='llama3')
print("Raw Output:", raw)

# Fall back to an empty extraction when the reply cannot be parsed (or is empty)
parsed = safe_parse_json(raw)
if not parsed:
    parsed = {"entities": [], "relationships": []}
normalized = normalize_output(parsed)
print("Normalized:", normalized)


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Raw Output: {"entities": [{"text": "NailaoLocker", "type": "Malware"}, {"text": "FortiGuard Labs", "type": "Vendor"}, {"text": "SM2", "type": "Crypto"}, {"text": "Fortinet", "type": "Vendor"}, {"text": "Lcrypt0rx", "type": "Malware"}, {"text": "H2Miner", "type": "ThreatActor"}, {"text": "Monero", "type": "Currency"}, {"text": "Dark 101", "type": "Malware"}, {"text": "FortiSandbox", "type": "Tool"}, {"text": "AI", "type": "Technique"}, {"text": "Flutter", "type": "Platform"}, {"text": "Rust", "type": "Platform"}, {"text": "Delphi", "type": "Platform"}, {"text": "NordDragonScan", "type": "Tool"}, {"text": "CVE-2024-3721", "type": "CVE"}, {"text": "CVE-2024-12856", "type": "CVE"}, {"text": "TBK DVRs", "type": "Device"}, {"text": "Four-Faith routers", "type": "Device"}, {"text": "DCRAT", "type": "Malware"}, {"text": "Havoc RAT", "type": "Tool"}, {"text": "Middle East", "type": "GeographicLocation"}, {"text": "PowerShell", "type": "Technique"}], "relationships": [["FortiGuard Labs", "analyz

In [8]:
# Batch annotation function
def run_batch(start: int, size: int, model_name: str):
    """Run extraction on ``data[start:start + size]`` and save the results as JSON.

    Each record is sent to the model, the reply is parsed and normalized, and one
    result entry (original fields plus the extraction) is collected per record.
    The batch is written to
    ``data/entity-extraction/extractions_{model_name}_{start}_{end}_{timestamp}.json``
    where ``end`` is the index of the last record actually processed.

    Args:
        start: Zero-based index of the first record in the global ``data`` list.
        size: Maximum number of records to process.
        model_name: Ollama model name passed to ``extract_with_model``.

    Returns:
        None. If the requested range contains no records, a warning is logged
        and no file is written.
    """
    batch = data[start:start + size]
    if not batch:
        logger.warning(f"No records in range {start}-{start + size - 1}; nothing to do")
        return

    # The slice may be shorter than ``size`` at the end of the data set; label the
    # progress bar and the output file with the real last index.
    end = start + len(batch) - 1

    results = []
    for offset, rec in enumerate(tqdm(batch, desc=f"Batch {start}-{end}")):
        logger.info(f"Processing {start + offset + 1}/{len(data)}")
        # Read each field once with a default so that a record with a missing key
        # cannot raise KeyError after the (slow) model call has already been made.
        title = rec.get("title", "")
        link = rec.get("link", "")
        content = rec.get("content", "")

        messages = build_prompt(title, link, content)
        raw = extract_with_model(messages, model_name)
        parsed = safe_parse_json(raw) or {"entities": [], "relationships": []}
        norm = normalize_output(parsed)
        results.append({
            "title": title,
            "link": link,
            "content": content,
            "extraction": {
                "model": model_name,
                "raw_output": raw,  # kept so failed parses can be inspected later
                "entities": norm["entities"],
                "relationships": norm["relationships"]
            }
        })

    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path("data/entity-extraction")
    # Create the directory up front so results are not lost at the final write.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"extractions_{model_name}_{start}_{end}_{timestamp}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved batch to {output_path}")


In [9]:
# Run batch 0-49 (50 records)
#run_batch(0, 50, 'llama3')

In [10]:
# Run batch 50-99 (50 records)
#run_batch(50, 50, 'llama3')

In [11]:
# Process records 100-149 (50 records) with Llama3
run_batch(start=100, size=50, model_name='llama3')

Batch 100-149:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 101/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   2%|▏         | 1/50 [00:15<12:56, 15.85s/it]INFO:extraction:Processing 102/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   4%|▍         | 2/50 [00:38<15:51, 19.82s/it]INFO:extraction:Processing 103/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   6%|▌         | 3/50 [00:57<15:24, 19.66s/it]INFO:extraction:Processing 104/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   8%|▊         | 4/50 [01:15<14:19, 18.68s/it]INFO:extraction:Processing 105/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:  10%|█         | 5/50 [01:38<15:11, 20.26s/it]INFO:extraction:Processing 106/427
INFO:httpx:HTTP Request: POST ht

In [12]:
# Run batch 150-199 (50 records)
#run_batch(150, 50, 'llama3')

In [13]:
# Run batch 200-249 (50 records)
#run_batch(200, 50, 'llama3')

In [14]:
# Run batch 250-299 (50 records)
#run_batch(250, 50, 'llama3')

In [15]:
# Run batch 300-349 (50 records)
#run_batch(300, 50, 'llama3')

In [16]:
# Run batch 350-399 (50 records)
#run_batch(350, 50, 'llama3')

In [17]:
# Run batch 400-426 (remaining 27 records)
#run_batch(400, 27, 'llama3')

In [18]:
# Merge all batch outputs
import glob
import re


def _batch_start(path):
    """Sort key: the numeric start index embedded in a batch file name.

    Files whose name does not contain a start index sort last, by name.
    """
    match = re.search(r"extractions_llama3_(\d+)_", os.path.basename(path))
    return (int(match.group(1)) if match else float("inf"), path)


# glob returns files in arbitrary order; sort numerically by batch start so the
# merged records follow the order of the input data (a plain string sort would
# put batch 100 before batch 50).
output_files = sorted(
    glob.glob("data/entity-extraction/extractions_llama3_*.json"),
    key=_batch_start,
)
merged = []
for file in output_files:
    # Batch files are written as UTF-8 with ensure_ascii=False, so they must be
    # read as UTF-8 regardless of the platform's default encoding.
    with open(file, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))

merged_path = "data/entity-extraction/merged_llama3_extractions_full_content.json"
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"✅ Merged {len(merged)} records into {merged_path}")
print("🎉 All batches completed with FULL CONTENT!")

✅ Merged 427 records into data/entity-extraction/merged_llama3_extractions_full_content.json
🎉 All batches completed with FULL CONTENT!
