# Optimized Entity and Relationship Extraction with Ollama (Llama3)

This notebook extracts entities and relationships from threat intelligence data using Llama3 via Ollama.

- Optimized for speed: Shorter prompts, fewer few-shots, content limit 500 chars.
- Fixed parsing/normalization to preserve data.
- Input: data/raw/merged_threat_intelligence.json
- Output: data/entity-extraction/extractions_llama3_{start}_{end}_{timestamp}.json


In [1]:
import os
import json
import datetime as dt
from typing import List, Dict, Any, Optional
from pathlib import Path
from tqdm import tqdm
import logging

import ollama

# Setup
os.environ['OLLAMA_NUM_GPU_LAYERS'] = '999'  # Force GPU offload
# NOTE(review): this only changes the environment of the notebook process —
# confirm the Ollama server actually picks the setting up.

# All relative data/ paths in this notebook are resolved from the project root.
# NOTE(review): absolute, machine-specific path — edit before running elsewhere.
PROJECT_ROOT = Path('/Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG')
os.chdir(PROJECT_ROOT)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("extraction")

# Pull Llama3 if not present.
# Only an error *response* from the server (e.g. unknown model) triggers a pull;
# Ctrl-C and unrelated failures are no longer swallowed by a bare `except:`.
try:
    ollama.show('llama3')
except ollama.ResponseError as err:
    logger.info(f"llama3 not available locally ({err}); pulling it")
    ollama.pull('llama3')


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/show "HTTP/1.1 200 OK"


In [2]:
# Prompt configuration (kept small for speed: short system prompt, one few-shot example)

# System message: sets the annotator role, the JSON schema and the allowed entity types.
SYSTEM_PROMPT = """
You are an expert cyber threat intelligence annotator. Extract named entities and relationships from the given document.
Return strict JSON with keys: entities, relationships.
- entities: list of {"text": str, "type": str} where type in [Malware, Tool, ThreatActor, CVE, IPAddress, Domain, URL, Email, File, Crypto, Platform, Technique, Vendor, Device, Severity, GeographicLocation].
- relationships: list of [subject, predicate, object]. Predicates: uses, exploits, targets, etc.
No extra commentary.
"""

# Per-document user message; {title}, {link} and {content} are filled with str.format.
USER_TEMPLATE = (
    "Title: {title}\n"
    "Link: {link}\n"
    "Content:\n{content}\n\n"
    "Return JSON only."
)

# Single worked example (input fields plus the expected JSON answer under "output").
FEW_SHOT_EXAMPLES = [
    {
        "title": "Example Title",
        "link": "example.com",
        "content": "APT29 uses Mimikatz to exploit CVE-2019-1234 in Windows.",
        "output": {
            "entities": [
                {"text": text, "type": etype}
                for text, etype in (
                    ("APT29", "ThreatActor"),
                    ("Mimikatz", "Tool"),
                    ("CVE-2019-1234", "CVE"),
                    ("Windows", "Platform"),
                )
            ],
            "relationships": [
                ["APT29", "uses", "Mimikatz"],
                ["Mimikatz", "exploits", "CVE-2019-1234"],
                ["APT29", "targets", "Windows"],
            ],
        },
    },
]


In [3]:
# Functions
def build_prompt(title, link, content):
    """Assemble the chat message list for one document.

    The list starts with the system prompt, replays every few-shot example as a
    user/assistant exchange, and ends with the user message for the document
    itself. Only the first 500 characters of `content` are sent.

    Args:
        title: Document title inserted into the user template.
        link: Document link inserted into the user template.
        content: Document body; sliced to 500 characters.

    Returns:
        A list of {"role", "content"} dicts.
    """
    shots = []
    for example in FEW_SHOT_EXAMPLES:
        shots += [
            {"role": "user", "content": USER_TEMPLATE.format(**example)},
            {"role": "assistant", "content": json.dumps(example['output'])},
        ]
    question = USER_TEMPLATE.format(title=title, link=link, content=content[:500])
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        *shots,
        {"role": "user", "content": question},
    ]

def extract_with_model(messages, model_name='llama3'):
    """Send `messages` to the named Ollama model and return the reply text.

    Args:
        messages: Chat messages to pass to `ollama.chat`.
        model_name: Model identifier passed to `ollama.chat`.

    Returns:
        The `content` field of the response's `message`.
    """
    reply = ollama.chat(messages=messages, model=model_name)
    message = reply['message']
    return message['content']


In [4]:
# Fixed safe_parse_json (more robust: extract JSON block if extra text)
def safe_parse_json(s: str) -> Optional[Dict[str, Any]]:
    """Extract the first decodable JSON object from a model reply.

    The reply may wrap the JSON in commentary or code fences. Every `{` in the
    text is tried as a possible start of an object and the first one that
    decodes to a dict is returned; text after the object is ignored. This
    replaces the previous first-`{`-to-last-`}` span, which failed whenever
    the surrounding commentary itself contained braces.

    Caveat: if the outer object is truncated, a complete inner object (for
    example a single entity) may be returned instead of None.

    Args:
        s: Raw model output.

    Returns:
        The decoded dict, or None (with a logged warning) when `s` is not a
        string or contains no decodable JSON object. Lists and scalars are
        never returned, so callers can rely on `.get`.
    """
    import logging

    log = logging.getLogger("extraction")
    if not isinstance(s, str):
        log.warning(f"JSON parse error: expected str, got {type(s).__name__}")
        return None

    decoder = json.JSONDecoder()
    last_error = None
    pos = s.find("{")
    while pos != -1:
        try:
            # raw_decode parses one JSON value starting at `pos` and tolerates
            # trailing text after it.
            candidate, _ = decoder.raw_decode(s, pos)
        except json.JSONDecodeError as e:
            last_error = e
        else:
            if isinstance(candidate, dict):
                return candidate
        pos = s.find("{", pos + 1)

    log.warning(f"JSON parse error: {last_error or 'no JSON object found'}")
    return None


In [5]:
# Fixed normalize_output (less strict, keep originals, convert rel to dict)
def normalize_output(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Clean and de-duplicate a parsed model response.

    Args:
        parsed: Decoded model output. Expected to hold an "entities" list of
            {"text", "type"} dicts and a "relationships" list of
            [subject, predicate, object] triples; missing or null keys are
            treated as empty.

    Returns:
        {"entities": [{"text", "type"}, ...],
         "relationships": [{"subject", "predicate", "object"}, ...]}
        Surrounding whitespace is stripped, original casing is kept, and exact
        duplicates are dropped (first occurrence wins).

    Malformed items are skipped instead of raising: non-dict entities, triples
    that are not 3-element lists/tuples, and fields that are empty, null or not
    a string/number.
    """
    def _as_text(value: Any) -> str:
        # Strings are stripped, plain numbers are stringified; anything else
        # (None, bool, nested containers) counts as missing.
        if isinstance(value, str):
            return value.strip()
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            return str(value)
        return ""

    if not isinstance(parsed, dict):
        parsed = {}
    # `or []` also covers an explicit JSON null for either key.
    entities = parsed.get("entities") or []
    relationships = parsed.get("relationships") or []
    if not isinstance(entities, (list, tuple)):
        entities = []
    if not isinstance(relationships, (list, tuple)):
        relationships = []

    norm_entities: List[Dict[str, str]] = []
    seen_e = set()
    for e in entities:
        if not isinstance(e, dict):
            continue
        text = _as_text(e.get("text"))
        etype = _as_text(e.get("type"))  # No capitalization
        key = (text, etype)
        if text and etype and key not in seen_e:
            seen_e.add(key)
            norm_entities.append({"text": text, "type": etype})

    norm_rels: List[Dict[str, str]] = []
    seen_r = set()
    for r in relationships:
        if not isinstance(r, (list, tuple)) or len(r) != 3:
            continue
        triple = tuple(_as_text(x) for x in r)
        if all(triple) and triple not in seen_r:
            seen_r.add(triple)
            sub, pred, obj = triple
            norm_rels.append({"subject": sub, "predicate": pred, "object": obj})

    return {"entities": norm_entities, "relationships": norm_rels}


In [6]:
# Load the raw threat-intelligence records (path is relative to the working directory)
INPUT_PATH = "data/raw/merged_threat_intelligence.json"
data = json.loads(Path(INPUT_PATH).read_text(encoding="utf-8"))
print(f"Loaded {len(data)} records")


Loaded 427 records


In [7]:
# Smoke test: run prompt -> model -> parse -> normalize on the first record
sample = data[0]
messages = build_prompt(*(sample[field] for field in ('title', 'link', 'content')))
raw = extract_with_model(messages, 'llama3')
print("Raw Output:", raw)

parsed = safe_parse_json(raw)
if not parsed:
    # Unparseable (or empty) reply: continue with an empty extraction
    parsed = {"entities": [], "relationships": []}
normalized = normalize_output(parsed)
print("Normalized:", normalized)


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Raw Output: {"entities": [{"text": "NailaoLocker", "type": "Malware"}, {"text": "SM2", "type": "Crypto"}, {"text": "Fortinet", "type": "Vendor"}], "relationships": []}
Normalized: {'entities': [{'text': 'NailaoLocker', 'type': 'Malware'}, {'text': 'SM2', 'type': 'Crypto'}, {'text': 'Fortinet', 'type': 'Vendor'}], 'relationships': []}


In [8]:
# Batch annotation function
def run_batch(start: int, size: int, model_name: str):
    """Annotate `data[start:start+size]` with a model and save the results as JSON.

    Each record is sent through build_prompt -> extract_with_model ->
    safe_parse_json -> normalize_output. Replies that cannot be parsed yield an
    empty extraction; the raw reply is kept alongside in every case.

    Args:
        start: Index of the first record in the module-level `data` list.
        size: Maximum number of records to process.
        model_name: Ollama model name; also embedded in the output file name.

    Returns:
        Path of the written file
        (data/entity-extraction/extractions_{model}_{first}_{last}_{timestamp}.json),
        or None when the slice is empty and nothing was written.
    """
    batch = data[start:start + size]
    if not batch:
        logger.warning(f"No records in range starting at {start}; nothing to do")
        return None

    # The slice may be shorter than `size` near the end of the data, so label
    # the progress bar and the file with the index actually reached.
    last = start + len(batch) - 1

    results = []
    for offset, rec in enumerate(tqdm(batch, desc=f"Batch {start}-{last}")):
        pos = start + offset + 1
        logger.info(f"Processing {pos}/{len(data)}")
        # Read each field once with a default, so a record missing a key is
        # stored the same way it was prompted instead of raising KeyError
        # after the model call.
        title = rec.get("title", "")
        link = rec.get("link", "")
        content = rec.get("content", "")
        messages = build_prompt(title, link, content)
        raw = extract_with_model(messages, model_name)
        parsed = safe_parse_json(raw) or {"entities": [], "relationships": []}
        norm = normalize_output(parsed)
        results.append({
            "title": title,
            "link": link,
            "content": content,
            "extraction": {
                "model": model_name,
                "raw_output": raw,
                "entities": norm["entities"],
                "relationships": norm["relationships"]
            }
        })

    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = Path("data/entity-extraction")
    # Create the directory up front: otherwise a missing folder would discard
    # the whole batch of model calls at write time.
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / f"extractions_{model_name}_{start}_{last}_{timestamp}.json"
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)
    logger.info(f"Saved batch to {output_path}")
    return output_path


In [9]:
# Run batch 0-99 with llama3
# (results are written to data/entity-extraction/ as a timestamped extractions_llama3_* file)
run_batch(0, 100, "llama3")


Batch 0-99:   0%|          | 0/100 [00:00<?, ?it/s]INFO:extraction:Processing 1/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-99:   1%|          | 1/100 [00:06<10:06,  6.13s/it]INFO:extraction:Processing 2/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-99:   2%|▏         | 2/100 [00:10<08:23,  5.14s/it]INFO:extraction:Processing 3/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-99:   3%|▎         | 3/100 [00:13<06:32,  4.05s/it]INFO:extraction:Processing 4/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-99:   4%|▍         | 4/100 [00:21<08:49,  5.52s/it]INFO:extraction:Processing 5/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
Batch 0-99:   5%|▌         | 5/100 [00:24<07:25,  4.69s/it]INFO:extraction:Processing 6/427
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api

In [10]:
# Merge outputs
import glob

# Collect every per-batch file; sorted so the merged record order is the same
# on every run (glob itself does not guarantee an order).
# NOTE(review): all timestamped files matching the pattern are merged, so a
# batch range that was run more than once is included more than once.
output_files = sorted(glob.glob("data/entity-extraction/extractions_llama3_*.json"))
merged = []
for file in output_files:
    # Batch files are written as UTF-8 with ensure_ascii=False, so read them
    # back with the same encoding rather than the platform default.
    with open(file, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))

merged_path = "data/entity-extraction/merged_llama3_extractions.json"
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"Merged {len(merged)} records into {merged_path}")


Merged 100 records into data/entity-extraction/merged_llama3_extractions.json


# Entity and Relationship Extraction with Ollama and GPT-OSS

This notebook extracts named entities and relationships from threat intelligence articles using local models via Ollama and GPT-OSS-20B.

- Input data: `data/raw/merged_threat_intelligence.json`
- Output data: `data/entity-extraction/extractions_{model}_{start}_{end}_{timestamp}.json` (one file per batch)

Follows LLM-TIKG methodology with clean prompts and structured outputs.


In [11]:
import os
from pathlib import Path

# Work from the project root so the relative data/ paths used later resolve.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
PROJECT_ROOT = Path('/Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG')
os.chdir(PROJECT_ROOT)
print(f"CWD set to: {Path.cwd()}")


CWD set to: /Users/huynguyen/Documents/UIT/2nd/NLP/LLM-TKIG


In [12]:
# Install required packages into the kernel's environment (%pip, quiet output).
# NOTE(review): versions are not pinned, so a fresh install may resolve to
# different releases than the ones this notebook was run with.
%pip install ollama transformers safetensors gpt-oss tqdm --quiet



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
# Setup Ollama client with GPU offload
import os
import ollama

# Force GPU usage (offload all layers to Metal/GPU on Mac)
# NOTE(review): this sets the variable in the notebook process only — confirm
# that the Ollama server actually reads it.
os.environ['OLLAMA_NUM_GPU_LAYERS'] = '999'  # 999 = all layers; adjust if OOM

# Models are already pulled: llama3 and gpt-oss:20b
# (nothing in this cell checks for them or pulls them)


In [14]:
import json
import datetime as dt
from typing import List, Dict, Any

# Prompts

# System message. Written as explicit pieces so the line breaks (and the
# trailing space before most of them) are visible in the source.
SYSTEM_PROMPT = (
    "\n"
    "You are an expert cyber threat intelligence annotator. \n"
    "Extract named entities and relationships from the given document. \n"
    "Return strict JSON with keys: entities, relationships. \n"
    "- entities: list of {text, type} where type in [Malware, Tool, ThreatActor, CVE, IPAddress, Domain, URL, Email, File, Crypto, Platform, Technique, Vendor, Device, Severity, GeographicLocation]. \n"
    "- relationships: list of [subject, predicate, object]. Predicates should be short snake_case verbs e.g., uses, exploits, targets, communicates_with, downloads, associated_with, attributed_to, mitigates, detects, distributes, drops. \n"
    "Follow the few-shot examples below exactly for output formatting. No extra commentary.\n"
)

# Per-document user message; {title}, {link} and {content} are filled with str.format.
USER_TEMPLATE = "Title: {title}\nLink: {link}\nContent:\n{content}\n\nReturn JSON only."

# Few-shot examples: input fields plus the expected JSON answer under "output".
FEW_SHOT_EXAMPLES: List[Dict[str, Any]] = [
    {
        "title": "Sample: NailaoLocker uses AES-256-CBC",
        "link": "https://example.com/1",
        "content": " ".join([
            "Fortinet reports that NailaoLocker ransomware uses AES-256-CBC on Microsoft Windows.",
            "Fortinet detects the threat early.",
        ]),
        "output": {
            "entities": [
                {"text": text, "type": etype}
                for text, etype in (
                    ("NailaoLocker", "Malware"),
                    ("AES-256-CBC", "Technique"),
                    ("Microsoft Windows", "Platform"),
                    ("Fortinet", "Vendor"),
                )
            ],
            "relationships": [
                ["NailaoLocker", "uses", "AES-256-CBC"],
                ["Fortinet", "detects", "NailaoLocker"],
            ],
        },
    },
    {
        "title": "Sample: Tool communicates with domain and downloads file",
        "link": "https://example.com/2",
        "content": "Cobalt Strike beacon communicates with c2.example.com and downloads payload.dll.",
        "output": {
            "entities": [
                {"text": text, "type": etype}
                for text, etype in (
                    ("Cobalt Strike", "Tool"),
                    ("c2.example.com", "Domain"),
                    ("payload.dll", "File"),
                )
            ],
            "relationships": [
                ["Cobalt Strike", "communicates_with", "c2.example.com"],
                ["Cobalt Strike", "downloads", "payload.dll"],
            ],
        },
    },
]


In [15]:
# Functions for building prompts and calling models
def build_prompt(title, link, content):
    """Build the chat messages for one document.

    Layout: system prompt, then one user/assistant pair per few-shot example,
    then the user message for the document. `content` is cut to its first
    800 characters before being inserted.

    Args:
        title: Document title for the user template.
        link: Document link for the user template.
        content: Document body; sliced to 800 characters.

    Returns:
        A list of {"role", "content"} dicts.
    """
    def _turn(role, text):
        return {"role": role, "content": text}

    conversation = [_turn("system", SYSTEM_PROMPT)]
    for example in FEW_SHOT_EXAMPLES:
        conversation.extend((
            _turn("user", USER_TEMPLATE.format(**example)),
            _turn("assistant", json.dumps(example['output'])),
        ))
    document_text = USER_TEMPLATE.format(title=title, link=link, content=content[:800])
    conversation.append(_turn("user", document_text))
    return conversation

def extract_with_model(messages, model_name):
    """Run one chat completion through Ollama and return the reply text.

    Args:
        messages: Chat messages passed to `ollama.chat`.
        model_name: Model identifier passed to `ollama.chat`.

    Returns:
        The `content` of the response's `message` entry.
    """
    reply = ollama.chat(messages=messages, model=model_name)
    return reply['message']['content']


In [16]:
from typing import Optional

def safe_parse_json(s: str) -> Optional[Dict[str, Any]]:
    """Parse a model reply into a JSON object, tolerating surrounding text.

    The whole string is tried first. If that fails, or yields something other
    than an object, every `{` is tried as the start of an object and the first
    one that decodes to a dict is returned; text after the object is ignored.

    Caveat: if the outer object is truncated, a complete inner object may be
    returned instead of None.

    Args:
        s: Raw model output.

    Returns:
        The decoded dict, or None when `s` is not a string or holds no
        decodable JSON object. Lists and scalars are never returned, so the
        result is always safe to pass to code that calls `.get` on it.
    """
    if not isinstance(s, str):
        return None

    try:
        whole = json.loads(s)
    except (ValueError, RecursionError):
        whole = None
    if isinstance(whole, dict):
        return whole

    decoder = json.JSONDecoder()
    brace = s.find("{")
    while brace != -1:
        try:
            # raw_decode reads one JSON value starting at `brace` and ignores
            # whatever follows it.
            candidate, _ = decoder.raw_decode(s, brace)
        except (ValueError, RecursionError):
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        brace = s.find("{", brace + 1)
    return None

def normalize_output(parsed: Dict[str, Any]) -> Dict[str, Any]:
    """Clean and de-duplicate a parsed model response.

    Args:
        parsed: Decoded model output. Expected to hold an "entities" list of
            {"text", "type"} dicts and a "relationships" list of
            [subject, predicate, object] triples; missing or null keys are
            treated as empty.

    Returns:
        {"entities": [{"text", "type"}, ...],
         "relationships": [[subject, predicate, object], ...]}
        Values are converted to stripped strings. Exact duplicates are removed
        from both lists (first occurrence wins).

    Malformed items are skipped instead of raising: non-dict entities, triples
    that are not 3-element lists/tuples, and fields that are empty or null.
    """
    def _as_text(value: Any) -> str:
        # None means "missing"; str(None) would otherwise leak the literal
        # "None" into the output. Other values are stringified.
        return "" if value is None else str(value).strip()

    if not isinstance(parsed, dict):
        parsed = {}
    entities = parsed.get("entities", []) or []
    relationships = parsed.get("relationships", []) or []
    if not isinstance(entities, (list, tuple)):
        entities = []
    if not isinstance(relationships, (list, tuple)):
        relationships = []

    norm_entities: List[Dict[str, str]] = []
    seen_e = set()
    for e in entities:
        if not isinstance(e, dict):
            continue
        text = _as_text(e.get("text"))
        etype = _as_text(e.get("type"))
        if not text or not etype:
            continue
        key = (text, etype)
        if key in seen_e:
            continue
        seen_e.add(key)
        norm_entities.append({"text": text, "type": etype})

    norm_rels: List[List[str]] = []
    seen_r = set()
    for r in relationships:
        if not isinstance(r, (list, tuple)) or len(r) != 3:
            continue
        triple = tuple(_as_text(part) for part in r)
        if not all(triple):
            continue
        # De-duplicate triples the same way entities are de-duplicated above.
        if triple in seen_r:
            continue
        seen_r.add(triple)
        norm_rels.append(list(triple))

    return {"entities": norm_entities, "relationships": norm_rels}


In [17]:
# Load the raw records (path is relative to the working directory)
INPUT_PATH = "data/raw/merged_threat_intelligence.json"
data = json.loads(Path(INPUT_PATH).read_text(encoding="utf-8"))

# Example extraction for the first record using Llama3
sample = data[0]
messages = build_prompt(*(sample[field] for field in ('title', 'link', 'content')))
llama_output = extract_with_model(messages, 'llama3')
print("Llama3 Output:", llama_output)

# Parse the reply (falling back to an empty extraction) and normalize it
parsed = safe_parse_json(llama_output)
if not parsed:
    parsed = {"entities": [], "relationships": []}
normalized = normalize_output(parsed)
print("Normalized:", normalized)


INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"


Llama3 Output: {"entities": [{"text": "FortiGuard Labs", "type": "Vendor"}, {"text": "NailaoLocker", "type": "Malware"}, {"text": "SM2", "type": "Technique"}, {"text": "Kuan-Yen Liu", "type": "ThreatActor"}, {"text": "Yen-Ting Lee", "type": "ThreatActor"}, {"text": "David Adamson", "type": "ThreatActor"}, {"text": "Akshat Pradhan", "type": "ThreatActor"}, {"text": "Lcrypt0rx", "type": "Malware"}, {"text": "H2Miner", "type": "Tool"}, {"text": "Monero", "type": "Crypto"}, {"text": "Dark 101", "type": "Malware"}], "relationships": [["FortiGuard Labs", "analyzes", "NailaoLocker"], ["NailaoLocker", "uses", "SM2"], ["FortiCNAPP", "links", "weak signals into clear timelines"], ["FortiCNAPP Composite Alerts", "detects", "cloud-native threats"], ["Lcrypt0rx", "used in", "H2Miner campaigns"], ["FortiGuard Labs Threat Research", "discovers", "Dark 101 ransomware"]]}
Normalized: {'entities': [{'text': 'FortiGuard Labs', 'type': 'Vendor'}, {'text': 'NailaoLocker', 'type': 'Malware'}, {'text': 'SM2'

In [18]:
import logging
from pathlib import Path
from tqdm import tqdm

# Emit INFO-level messages; "extraction" is the logger the batch functions in
# this cell write their progress to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("extraction")

def annotate_records(records: List[Dict[str, Any]], model_name: str, start_offset: int = 0) -> List[Dict[str, Any]]:
    """Run entity/relationship extraction over a list of records.

    For each record the title, link and the first 800 characters of the content
    are sent to the model; the reply is parsed and normalized, with an empty
    extraction used when parsing yields nothing.

    Args:
        records: Records to annotate; "title", "link" and "content" are read
            with "" as the default.
        model_name: Model passed to `extract_with_model` and stored in the result.
        start_offset: Index of the first record within the full dataset; used
            only for the position shown in log messages.

    Returns:
        One dict per record with "title", "link", the truncated "content", and
        an "extraction" dict (model, raw_output, entities, relationships).

    Note: the log messages report progress against the module-level `data` list.
    """
    annotated = []
    progress = tqdm(
        enumerate(records),
        total=len(records),
        desc=f"Annotating batch with {model_name}",
    )
    for index, record in progress:
        position = start_offset + index + 1
        logger.info(f"Processing record {position}/{len(data)} with {model_name}")

        title, link = record.get("title", ""), record.get("link", "")
        content = record.get("content", "")[:800]

        raw = extract_with_model(build_prompt(title, link, content), model_name)

        parsed = safe_parse_json(raw)
        if not parsed:
            parsed = {"entities": [], "relationships": []}
        norm = normalize_output(parsed)

        extraction = {
            "model": model_name,
            "raw_output": raw,
            "entities": norm["entities"],
            "relationships": norm["relationships"],
        }
        annotated.append({"title": title, "link": link, "content": content, "extraction": extraction})
        logger.info(f"Completed record {position}/{len(data)}")
    return annotated

def run_batch(start: int, size: int, model_name: str) -> Path:
    """Annotate one slice of the module-level `data` list and save it as JSON.

    Args:
        start: Index of the first record to process.
        size: Number of records requested; the slice is clipped to the end of `data`.
        model_name: Model used for annotation; also embedded in the file name.

    Returns:
        Path of the written file:
        data/entity-extraction/extractions_{model_name}_{start}_{last}_{timestamp}.json
    """
    stop = min(start + size, len(data))
    annotated = annotate_records(data[start:stop], model_name, start_offset=start)

    stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    out_dir = Path("data/entity-extraction")
    out_dir.mkdir(exist_ok=True)
    target = out_dir / f"extractions_{model_name}_{start}_{stop-1}_{stamp}.json"

    with open(target, "w", encoding="utf-8") as handle:
        json.dump(annotated, handle, ensure_ascii=False, indent=2)

    logger.info(f"Saved batch to {target}")
    return target


In [19]:
# Batch 1: Records 0-99 with llama3
# (run_batch returns the path of the JSON file it wrote)
run_batch(0, 100, "llama3")


Annotating batch with llama3:   0%|          | 0/100 [00:00<?, ?it/s]INFO:extraction:Processing record 1/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 1/427
Annotating batch with llama3:   1%|          | 1/100 [00:08<14:40,  8.89s/it]INFO:extraction:Processing record 2/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 2/427
Annotating batch with llama3:   2%|▏         | 2/100 [00:13<10:30,  6.43s/it]INFO:extraction:Processing record 3/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 3/427
Annotating batch with llama3:   3%|▎         | 3/100 [00:15<07:24,  4.58s/it]INFO:extraction:Processing record 4/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 4/427
Annotating batch

KeyboardInterrupt: 

In [20]:
# Batch 2: records 100-299 with llama3 — the second argument is the batch size
# (200 records), not an end index.
run_batch(100, 200, "llama3")


Annotating batch with llama3:   0%|          | 0/200 [00:00<?, ?it/s]INFO:extraction:Processing record 101/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 101/427
Annotating batch with llama3:   0%|          | 1/200 [00:07<25:06,  7.57s/it]INFO:extraction:Processing record 102/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 102/427
Annotating batch with llama3:   1%|          | 2/200 [00:16<28:02,  8.50s/it]INFO:extraction:Processing record 103/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 103/427
Annotating batch with llama3:   2%|▏         | 3/200 [00:23<25:05,  7.64s/it]INFO:extraction:Processing record 104/427 with llama3
INFO:httpx:HTTP Request: POST http://127.0.0.1:11434/api/chat "HTTP/1.1 200 OK"
INFO:extraction:Completed record 104/427


PosixPath('data/entity-extraction/extractions_llama3_100_299_20250816_150017.json')

In [None]:
# Final batch: 127 records starting at index 300 (300-426, assuming the dataset
# still has the 427 records shown in the logs above).
run_batch(300, 127, "llama3")


In [None]:
# Merge Llama3 batch outputs
import glob

# Collect every per-batch file; sorted so the merged record order is the same
# on every run (glob itself does not guarantee an order).
# NOTE(review): all timestamped files matching the pattern are merged, so a
# batch range that was run more than once (or interrupted and re-run) is
# included more than once.
output_files = sorted(glob.glob("data/entity-extraction/extractions_llama3_*.json"))
merged = []
for file in output_files:
    # Batch files are written as UTF-8 with ensure_ascii=False, so read them
    # back with the same encoding rather than the platform default.
    with open(file, "r", encoding="utf-8") as f:
        merged.extend(json.load(f))

merged_path = "data/entity-extraction/merged_llama3_extractions.json"
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"Merged {len(merged)} records into {merged_path}")
