# Optimized Entity and Relationship Extraction with Ollama (Llama3)

This notebook extracts entities and relationships from threat intelligence data using Llama3 via Ollama.

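- Optimized for speed: a short system prompt and a single few-shot example. Note that the article content is passed to the model in full; no 500-character limit is applied by the code in this notebook.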
- Fixed parsing/normalization to preserve data.
- Input: data/raw/merged_threat_intelligence.json
- Output: data/entity-extraction/extractions_llama3_{start}_{end}_{timestamp}.json


In [2]:
import os
import json
import datetime as dt
from typing import List, Dict, Any, Optional
from pathlib import Path
from tqdm import tqdm
import logging

import ollama

# Setup
# NOTE(review): an environment variable set here is only visible to this kernel
# and to processes it starts; an Ollama server that is already running will not
# see it. Confirm this actually has the intended effect on GPU offload.
os.environ['OLLAMA_NUM_GPU_LAYERS'] = '999'  # Force GPU offload

# The project root can be overridden through the LLM_TKIG_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is not
# set, the original absolute path is used.
PROJECT_ROOT = Path(os.environ.get('LLM_TKIG_ROOT', '/Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG'))
os.chdir(PROJECT_ROOT)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("extraction")

# Pull Llama3 if not present
try:
    ollama.show('llama3')
except ollama.ResponseError as err:
    # Only a 404 response means "model not available locally". Any other
    # failure (and KeyboardInterrupt, which a bare `except:` would also catch)
    # must surface instead of silently starting a multi-gigabyte download.
    if err.status_code != 404:
        raise
    ollama.pull('llama3')


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/show "HTTP/1.1 200 OK"


In [3]:
# Prompt material: the system instruction, the per-document user template and
# the single worked example that is replayed as a few-shot turn.
SYSTEM_PROMPT = """
You are an expert cyber threat intelligence annotator. Extract named entities and relationships from the given document.
Return strict JSON with keys: entities, relationships.
- entities: list of {"text": str, "type": str} where type in [Malware, Tool, ThreatActor, CVE, IPAddress, Domain, URL, Email, File, Crypto, Platform, Technique, Vendor, Device, Severity, GeographicLocation].
- relationships: list of [subject, predicate, object]. Predicates: uses, exploits, targets, etc.
No extra commentary.
"""

# Placeholders: {title}, {link}, {content}
USER_TEMPLATE = (
    "Title: {title}\n"
    "Link: {link}\n"
    "Content:\n"
    "{content}\n"
    "\n"
    "Return JSON only."
)

# One example only, to keep every request short.
FEW_SHOT_EXAMPLES = [
    {
        "title": "Example Title",
        "link": "example.com",
        "content": "APT29 uses Mimikatz to exploit CVE-2019-1234 in Windows.",
        "output": {
            "entities": [
                {"text": "APT29", "type": "ThreatActor"},
                {"text": "Mimikatz", "type": "Tool"},
                {"text": "CVE-2019-1234", "type": "CVE"},
                {"text": "Windows", "type": "Platform"},
            ],
            "relationships": [
                ["APT29", "uses", "Mimikatz"],
                ["Mimikatz", "exploits", "CVE-2019-1234"],
                ["APT29", "targets", "Windows"],
            ],
        },
    },
]


In [4]:
# Functions
def build_prompt(title, link, content):
    """Assemble the chat transcript sent to the model for one document.

    The transcript consists of the system instruction, then one
    user/assistant pair per entry of FEW_SHOT_EXAMPLES (the assistant turn is
    the example's ``output`` serialized with ``json.dumps``), and finally the
    user turn built from ``title``, ``link`` and ``content``.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts.
    """
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    example_turns = [
        turn
        for example in FEW_SHOT_EXAMPLES
        for turn in (
            {"role": "user", "content": USER_TEMPLATE.format(**example)},
            {"role": "assistant", "content": json.dumps(example['output'])},
        )
    ]
    query_turn = {
        "role": "user",
        "content": USER_TEMPLATE.format(title=title, link=link, content=content),
    }
    return [system_turn, *example_turns, query_turn]

def extract_with_model(messages, model_name='llama3'):
    """Send ``messages`` to an Ollama chat model and return the reply text.

    ``messages`` is handed to ``ollama.chat`` unchanged; the value returned is
    the ``['message']['content']`` field of the response.
    """
    reply = ollama.chat(messages=messages, model=model_name)
    message = reply['message']
    return message['content']


In [5]:
# Fixed safe_parse_json (more robust: extract JSON block if extra text)
def safe_parse_json(s: str) -> Optional[Dict[str, Any]]:
    """Extract the JSON object embedded in a model reply.

    Decoding starts at the first ``{`` in ``s`` and stops at the end of that
    object, so leading text (prose, code fences) and trailing text are both
    ignored, even when the trailing text itself contains braces.

    Args:
        s: Raw text returned by the model.

    Returns:
        The decoded object as a dict, or ``None`` (after logging a warning)
        when ``s`` is not a string, contains no ``{``, or the text starting at
        the first ``{`` is not a valid JSON object. Top-level JSON values that
        are not objects (lists, numbers, strings) are never returned, so the
        result always matches the annotated return type.
    """
    if not isinstance(s, str):
        logger.warning(f"JSON parse error: expected str, got {type(s).__name__}")
        return None

    start = s.find("{")
    if start == -1:
        logger.warning("JSON parse error: no JSON object found in model output")
        return None

    try:
        # raw_decode parses exactly one JSON value and ignores whatever
        # follows it; because the slice begins with "{", a successful decode
        # is always a dict.
        obj, _end = json.JSONDecoder().raw_decode(s[start:])
    except json.JSONDecodeError as e:
        logger.warning(f"JSON parse error: {e}")
        return None
    return obj


In [6]:
# Fixed normalize_output (less strict, keep originals, convert rel to dict)
def _clean_text(value: Any) -> str:
    """Return ``value`` without surrounding whitespace, or "" when it is not a string."""
    return value.strip() if isinstance(value, str) else ""


def normalize_output(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Clean and de-duplicate the entities and relationships of one parsed reply.

    Args:
        parsed: Decoded model output, expected to carry an ``entities`` list of
            ``{"text", "type"}`` dicts and a ``relationships`` list. A
            relationship may be a 3-item list/tuple
            ``[subject, predicate, object]`` or a dict with the keys
            ``subject``, ``predicate`` and ``object``.

    Returns:
        ``{"entities": [...], "relationships": [...]}`` where entities are
        ``{"text", "type"}`` dicts and relationships are
        ``{"subject", "predicate", "object"}`` dicts. Values are stripped of
        surrounding whitespace but otherwise kept as the model wrote them (no
        case changes). Items with a missing, empty or non-string field, items
        of the wrong shape, and exact duplicates are dropped; first-seen order
        is preserved. Malformed input never raises: a non-dict ``parsed`` or a
        non-list ``entities``/``relationships`` value is treated as empty.
    """
    if not isinstance(parsed, dict):
        return {"entities": [], "relationships": []}

    entities = parsed.get("entities")
    if not isinstance(entities, list):
        entities = []
    relationships = parsed.get("relationships")
    if not isinstance(relationships, list):
        relationships = []

    norm_entities: List[Dict[str, str]] = []
    seen_e = set()
    for e in entities:
        if not isinstance(e, dict):
            continue
        text = _clean_text(e.get("text"))
        etype = _clean_text(e.get("type"))  # No capitalization
        if text and etype and (text, etype) not in seen_e:
            seen_e.add((text, etype))
            norm_entities.append({"text": text, "type": etype})

    norm_rels: List[Dict[str, str]] = []
    seen_r = set()
    for r in relationships:
        if isinstance(r, dict):
            # Models sometimes answer in the output shape instead of the
            # requested triple; accept it rather than lose the relationship.
            parts = (r.get("subject"), r.get("predicate"), r.get("object"))
        elif isinstance(r, (list, tuple)) and len(r) == 3:
            parts = tuple(r)
        else:
            continue
        sub, pred, obj = (_clean_text(p) for p in parts)
        if sub and pred and obj:
            tuple_r = (sub, pred, obj)
            if tuple_r not in seen_r:
                seen_r.add(tuple_r)
                norm_rels.append({"subject": sub, "predicate": pred, "object": obj})

    return {"entities": norm_entities, "relationships": norm_rels}


In [7]:
# Load the merged threat-intelligence records (UTF-8 JSON) into `data`
INPUT_PATH = "data/raw/merged_threat_intelligence.json"
data = json.loads(Path(INPUT_PATH).read_text(encoding="utf-8"))
print(f"Loaded {len(data)} records")


Loaded 427 records


In [8]:
# Smoke test: run the whole pipeline on the first record and show each stage
sample = data[0]
messages = build_prompt(
    title=sample['title'],
    link=sample['link'],
    content=sample['content'],
)
raw = extract_with_model(messages, model_name='llama3')
print("Raw Output:", raw)

parsed = safe_parse_json(raw)
if not parsed:
    # An unparseable (or empty) reply is treated as "nothing extracted"
    parsed = {"entities": [], "relationships": []}
normalized = normalize_output(parsed)
print("Normalized:", normalized)


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Raw Output: {"entities": [{"text": "NailaoLocker", "type": "Malware"}, {"text": "FortiGuard Labs", "type": "Vendor"}, {"text": "SM2 encryption", "type": "Technique"}, {"text": "Fortinet", "type": "Vendor"}, {"text": "FortiCNAPP", "type": "Tool"}, {"text": "Lcrypt0rx", "type": "Malware"}, {"text": "H2Miner", "type": "Malware"}, {"text": "Monero mining", "type": "Technique"}, {"text": "Dark 101 ransomware", "type": "Malware"}, {"text": "FortiSandbox 5.0", "type": "Tool"}, {"text": "AI", "type": "Technique"}, {"text": "Flutter", "type": "Platform"}, {"text": "Rust", "type": "Platform"}, {"text": "Delphi", "type": "Platform"}, {"text": "NordDragonScan", "type": "Malware"}, {"text": "TBK DVRs", "type": "Device"}, {"text": "Four-Faith routers", "type": "Device"}, {"text": "CVE-2024-3721", "type": "CVE"}, {"text": "CVE-2024-12856", "type": "CVE"}, {"text": "DCRAT", "type": "Malware"}, {"text": "Havoc Remote Access Trojan (RAT)", "type": "Malware"}, {"text": "Middle East critical national infr

In [9]:
# Batch annotation function
def run_batch(start: int, size: int, model_name: str):
    """Annotate ``data[start:start+size]`` with the model and save the results.

    For every record the prompt is built, the model is queried, and the reply
    is parsed and normalized; a reply that cannot be parsed yields empty
    entity/relationship lists while the raw reply text is still stored. The
    results are written as one UTF-8 JSON file:
    ``data/entity-extraction/extractions_{model_name}_{start}_{end}_{timestamp}.json``
    where ``end`` is the index of the last record actually processed (equal to
    ``start+size-1`` for a full batch).

    Args:
        start: Index into the global ``data`` list of the first record.
        size: Maximum number of records to process.
        model_name: Ollama model to query; also used in the file name.

    Returns:
        None. When the slice is empty a warning is logged and no file is
        written.
    """
    batch = data[start:start+size]
    if not batch:
        logger.warning(f"No records in range {start}-{start+size-1}; nothing to do")
        return
    # Label the batch by what is really in it, so a short final slice is not
    # reported (or saved) under a range it does not cover.
    end = start + len(batch) - 1

    # Create the output folder before the slow model calls: discovering that it
    # is missing only at write time would throw away the whole batch.
    output_dir = "data/entity-extraction"
    os.makedirs(output_dir, exist_ok=True)

    results = []
    for offset, rec in enumerate(tqdm(batch, desc=f"Batch {start}-{end}")):
        pos = start + offset + 1
        logger.info(f"Processing {pos}/{len(data)}")
        # Read each field once, with a default, so that the stored record and
        # the prompt agree and a missing key cannot raise after the model call.
        title = rec.get("title", "")
        link = rec.get("link", "")
        content = rec.get("content", "")
        messages = build_prompt(title, link, content)
        raw = extract_with_model(messages, model_name)
        parsed = safe_parse_json(raw) or {"entities": [], "relationships": []}
        norm = normalize_output(parsed)
        results.append({
            "title": title,
            "link": link,
            "content": content,
            "extraction": {
                "model": model_name,
                "raw_output": raw,
                "entities": norm["entities"],
                "relationships": norm["relationships"]
            }
        })

    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_path = f"{output_dir}/extractions_{model_name}_{start}_{end}_{timestamp}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved batch to {output_path}")


In [10]:
# Merge outputs
import glob
import re

# glob returns matches in arbitrary order. Sort them "naturally" so the numeric
# batch ranges in the file names come out as 0, 50, 100, ... and the merged
# file has a stable record order. re.split with a capturing group alternates
# text (even positions) and digit runs (odd positions), so two keys always
# compare str with str and int with int.
# NOTE: every file matching the pattern is merged, so a batch that was run more
# than once (different timestamps) contributes its records more than once.
output_files = sorted(
    glob.glob("data/entity-extraction/extractions_llama3_*.json"),
    key=lambda path: [
        int(token) if index % 2 else token
        for index, token in enumerate(re.split(r"(\d+)", path))
    ],
)
merged = []
for file in output_files:
    # Batch files are written as UTF-8 with ensure_ascii=False; read them the
    # same way instead of relying on the platform default encoding.
    with open(file, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))

merged_path = "data/entity-extraction/merged_llama3_extractions.json"
# ensure_ascii=False emits non-ASCII characters verbatim, so the target
# encoding must be explicit as well.
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"Merged {len(merged)} records into {merged_path}")


Merged 327 records into data/entity-extraction/merged_llama3_extractions.json


In [None]:
%%sql


In [11]:
# Records 0-49 (50 records)
run_batch(start=0, size=50, model_name='llama3')

Batch 0-49:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 1/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-49:   2%|▏         | 1/50 [00:29<24:05, 29.50s/it]INFO:extraction:Processing 2/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-49:   4%|▍         | 2/50 [00:43<16:27, 20.58s/it]INFO:extraction:Processing 3/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-49:   6%|▌         | 3/50 [01:06<16:45, 21.40s/it]INFO:extraction:Processing 4/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-49:   8%|▊         | 4/50 [01:38<19:49, 25.86s/it]INFO:extraction:Processing 5/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-49:  10%|█         | 5/50 [01:57<17:24, 23.21s/it]INFO:extraction:Processing 6/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat 

In [12]:
# Records 50-99 (50 records)
run_batch(start=50, size=50, model_name='llama3')

Batch 50-99:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 51/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 50-99:   2%|▏         | 1/50 [00:22<18:03, 22.10s/it]INFO:extraction:Processing 52/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 50-99:   4%|▍         | 2/50 [00:47<19:08, 23.93s/it]INFO:extraction:Processing 53/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 50-99:   6%|▌         | 3/50 [01:00<14:56, 19.08s/it]INFO:extraction:Processing 54/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 50-99:   8%|▊         | 4/50 [01:40<20:55, 27.30s/it]INFO:extraction:Processing 55/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 50-99:  10%|█         | 5/50 [02:11<21:26, 28.59s/it]INFO:extraction:Processing 56/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:114

In [13]:
# Records 100-149 (50 records)
run_batch(start=100, size=50, model_name='llama3')

Batch 100-149:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 101/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   2%|▏         | 1/50 [00:16<13:35, 16.64s/it]INFO:extraction:Processing 102/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   4%|▍         | 2/50 [00:38<15:41, 19.61s/it]INFO:extraction:Processing 103/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   6%|▌         | 3/50 [00:52<13:22, 17.06s/it]INFO:extraction:Processing 104/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:   8%|▊         | 4/50 [01:04<11:42, 15.27s/it]INFO:extraction:Processing 105/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 100-149:  10%|█         | 5/50 [01:24<12:41, 16.93s/it]INFO:extraction:Processing 106/427
INFO:httpx:HTTP Request: POST ht

In [14]:
# Records 150-199 (50 records)
run_batch(start=150, size=50, model_name='llama3')

Batch 150-199:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 151/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 150-199:   2%|▏         | 1/50 [00:12<10:25, 12.76s/it]INFO:extraction:Processing 152/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 150-199:   4%|▍         | 2/50 [00:27<10:55, 13.65s/it]INFO:extraction:Processing 153/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 150-199:   6%|▌         | 3/50 [00:38<09:56, 12.69s/it]INFO:extraction:Processing 154/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 150-199:   8%|▊         | 4/50 [01:00<12:28, 16.28s/it]INFO:extraction:Processing 155/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 150-199:  10%|█         | 5/50 [01:04<09:01, 12.03s/it]INFO:extraction:Processing 156/427
INFO:httpx:HTTP Request: POST ht

In [15]:
# Records 200-249 (50 records)
run_batch(start=200, size=50, model_name='llama3')

Batch 200-249:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 201/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 200-249:   2%|▏         | 1/50 [00:30<25:04, 30.70s/it]INFO:extraction:Processing 202/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 200-249:   4%|▍         | 2/50 [00:41<15:03, 18.83s/it]INFO:extraction:Processing 203/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 200-249:   6%|▌         | 3/50 [00:45<09:38, 12.32s/it]INFO:extraction:Processing 204/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 200-249:   8%|▊         | 4/50 [01:11<13:22, 17.45s/it]INFO:extraction:Processing 205/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 200-249:  10%|█         | 5/50 [01:31<13:59, 18.65s/it]INFO:extraction:Processing 206/427
INFO:httpx:HTTP Request: POST ht

In [17]:
# Records 250-299 (50 records)
run_batch(start=250, size=50, model_name='llama3')

Batch 250-299:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 251/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 250-299:   2%|▏         | 1/50 [00:17<14:12, 17.39s/it]INFO:extraction:Processing 252/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 250-299:   4%|▍         | 2/50 [00:30<11:52, 14.85s/it]INFO:extraction:Processing 253/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 250-299:   6%|▌         | 3/50 [00:39<09:41, 12.37s/it]INFO:extraction:Processing 254/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 250-299:   8%|▊         | 4/50 [01:13<15:47, 20.59s/it]INFO:extraction:Processing 255/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 250-299:  10%|█         | 5/50 [01:28<14:02, 18.72s/it]INFO:extraction:Processing 256/427
INFO:httpx:HTTP Request: POST ht

In [18]:
# Records 300-349 (50 records)
run_batch(start=300, size=50, model_name='llama3')

Batch 300-349:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 301/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 300-349:   2%|▏         | 1/50 [00:23<19:29, 23.87s/it]INFO:extraction:Processing 302/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 300-349:   4%|▍         | 2/50 [00:36<13:49, 17.28s/it]INFO:extraction:Processing 303/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 300-349:   6%|▌         | 3/50 [00:51<12:48, 16.34s/it]INFO:extraction:Processing 304/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 300-349:   8%|▊         | 4/50 [01:13<14:07, 18.42s/it]INFO:extraction:Processing 305/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 300-349:  10%|█         | 5/50 [01:30<13:21, 17.81s/it]INFO:extraction:Processing 306/427
INFO:httpx:HTTP Request: POST ht

In [22]:
# Records 350-399 (50 records)
run_batch(start=350, size=50, model_name='llama3')

Batch 350-399:   0%|          | 0/50 [00:00<?, ?it/s]INFO:extraction:Processing 351/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 350-399:   2%|▏         | 1/50 [00:06<05:38,  6.91s/it]INFO:extraction:Processing 352/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 350-399:   4%|▍         | 2/50 [00:25<10:47, 13.49s/it]INFO:extraction:Processing 353/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 350-399:   6%|▌         | 3/50 [00:36<10:00, 12.77s/it]INFO:extraction:Processing 354/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 350-399:   8%|▊         | 4/50 [00:48<09:34, 12.48s/it]INFO:extraction:Processing 355/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 350-399:  10%|█         | 5/50 [00:54<07:27,  9.95s/it]INFO:extraction:Processing 356/427
INFO:httpx:HTTP Request: POST ht

In [21]:
# Records 400-426 (the remaining 27 records)
run_batch(start=400, size=27, model_name='llama3')

Batch 400-426:   0%|          | 0/27 [00:00<?, ?it/s]INFO:extraction:Processing 401/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 400-426:   4%|▎         | 1/27 [00:26<11:21, 26.21s/it]INFO:extraction:Processing 402/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 400-426:   7%|▋         | 2/27 [00:41<08:16, 19.87s/it]INFO:extraction:Processing 403/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 400-426:  11%|█         | 3/27 [00:53<06:28, 16.17s/it]INFO:extraction:Processing 404/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 400-426:  15%|█▍        | 4/27 [01:08<06:03, 15.79s/it]INFO:extraction:Processing 405/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 400-426:  19%|█▊        | 5/27 [01:26<06:06, 16.64s/it]INFO:extraction:Processing 406/427
INFO:httpx:HTTP Request: POST ht

In [24]:
# Merge all batch outputs
import glob
import re

# glob returns matches in arbitrary order. Sort them "naturally" so the numeric
# batch ranges in the file names come out as 0, 50, 100, ... and the merged
# file has a stable record order. re.split with a capturing group alternates
# text (even positions) and digit runs (odd positions), so two keys always
# compare str with str and int with int.
# NOTE: every file matching the pattern is merged, so a batch that was run more
# than once (different timestamps) contributes its records more than once.
output_files = sorted(
    glob.glob("data/entity-extraction/extractions_llama3_*.json"),
    key=lambda path: [
        int(token) if index % 2 else token
        for index, token in enumerate(re.split(r"(\d+)", path))
    ],
)
merged = []
for file in output_files:
    # Batch files are written as UTF-8 with ensure_ascii=False; read them the
    # same way instead of relying on the platform default encoding.
    with open(file, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))

merged_path = "data/entity-extraction/merged_llama3_extractions_full_content.json"
# ensure_ascii=False emits non-ASCII characters verbatim, so the target
# encoding must be explicit as well.
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"✅ Merged {len(merged)} records into {merged_path}")
print("🎉 All batches completed with FULL CONTENT!")

✅ Merged 427 records into data/entity-extraction/merged_llama3_extractions_full_content.json
🎉 All batches completed with FULL CONTENT!
