In [1]:
import argparse
import json
import os
import re
import textwrap
from pathlib import Path
from typing import List

# --- PDF → text utility (PyMuPDF)
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None  # Will error later if preprocess is used

# --- Deduplication
import text_dedup.minhash
from datasketch import MinHash, MinHashLSH

# --- Hugging Face & PEFT
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

import pandas as pd
import torch
torch.cuda.empty_cache()

In [2]:
# preprocess config
pdf_dir = './pdf_swat'
out_dir = './data'
min_tokens = 128
max_tokens = 2048
overlap = 256

# train config
dataset_path = './data/train.jsonl'
output_dir = './checkpoints'
num_epochs = 3
per_device_train_batch = 2
grad_accum = 4
lr = 2e-4
warmup_steps = 100
lora_r = 64
lora_alpha = 128
lora_dropout = 0.05
max_seq_len = 2048
fp16 = True  # 또는 False

# evaluate config
adapter_dir = './checkpoints/adapter'
eval_dataset_path = './data/eval.json'
eval_questions = './data/questions.csv'  # CSV with columns 'question','answer_regex'


In [6]:
import re
import json
from pathlib import Path
from typing import List
from transformers import AutoTokenizer

# SECTION 제거용 정규표현식
SECTION_PATTERNS = [
    re.compile(r"^references$", re.I),
    re.compile(r"^bibliography$", re.I),
    re.compile(r"^acknowledg(e)?ments?$", re.I),
]

def clean_text(text: str) -> str:
    """Remove references/acknowledgment sections & excessive blank lines."""
    lines = [l.strip() for l in text.splitlines()]
    cleaned: List[str] = []
    skip = False
    for ln in lines:
        if any(p.match(ln.lower()) for p in SECTION_PATTERNS):
            skip = True
        if not skip and ln:
            cleaned.append(ln)
    return "\n".join(cleaned)

def chunk_text(text: str, tokenizer, max_tokens: int, overlap: int) -> List[str]:
    """Slice long text into overlapping chunks by token count."""
    tokens = tokenizer(text)["input_ids"]
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i: i + max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        if len(chunk_tokens) >= 32:
            chunks.append(chunk_text)
        i += max_tokens - overlap
    return chunks

# 경로 설정
swat_txt_path = Path("swat.txt")  # 각 줄이 하나의 논문 텍스트
out_dir = Path("data")
out_dir.mkdir(parents=True, exist_ok=True)

# 토크나이저 불러오기
with open("token.txt", "r") as f:
    token = f.read().strip()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

# 하이퍼파라미터
max_tokens = 512
overlap = 64
min_tokens = 64

# 처리 시작
raw_records = []
with swat_txt_path.open("r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        raw = line.strip()
        if not raw:
            continue
        cleaned = clean_text(raw)
        raw_records.append({"doc_id": f"swat_{i:03}", "text": cleaned})

# 청크 생성 및 필터링
all_chunks = []
for rec in raw_records:
    chunks = chunk_text(rec["text"], tokenizer, max_tokens, overlap)
    for idx, chunk in enumerate(chunks):
        all_chunks.append({"text": chunk, "source": f"{rec['doc_id']}§{idx}"})

def token_len(example):
    return len(tokenizer(example["text"])["input_ids"])

all_chunks = [c for c in all_chunks if token_len(c) >= min_tokens]
print(f"[i] Final chunks: {len(all_chunks)}")

# JSONL로 저장
jsonl_path = out_dir / "train.jsonl"
with jsonl_path.open("w", encoding="utf-8") as f:
    for rec in all_chunks:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"[✓] Saved {jsonl_path}")


[i] Final chunks: 121
[✓] Saved data/train.jsonl


In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract raw text from a single PDF using pdfplumber."""
    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text)
    return "\n".join(text_chunks)


# def extract_text_from_pdf(pdf_path: Path) -> str:
#     doc = fitz.open(pdf_path)
#     text_chunks = []
#     for page in doc:
#         blocks = page.get_text("blocks")
#         page_text = "\n".join([b[4] for b in blocks])
#         text_chunks.append(page_text)
#     return "\n".join(text_chunks)


SECTION_PATTERNS = [
    re.compile(r"^references$", re.I),
    re.compile(r"^bibliography$", re.I),
    re.compile(r"^acknowledg(e)?ments?$", re.I),
]


def clean_text(text: str) -> str:
    """Remove references/acknowledgment sections & excessive blank lines."""
    lines = [l.strip() for l in text.splitlines()]
    cleaned: List[str] = []
    skip = False
    for ln in lines:
        if any(p.match(ln.lower()) for p in SECTION_PATTERNS):
            skip = True
        if not skip and ln:
            cleaned.append(ln)
    return "\n".join(cleaned)


def chunk_text(text: str, tokenizer, max_tokens: int, overlap: int) -> List[str]:
    """Slice long text into overlapping chunks by token count."""
    tokens = tokenizer(text)["input_ids"]
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i : i + max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        if len(chunk_tokens) >= 32:  # minimal meaningful length
            chunks.append(chunk_text)
        i += max_tokens - overlap
    return chunks


pdf_dir = Path(pdf_dir)
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

with open("token.txt", "r") as f:
    token = f.read().strip()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

raw_records = []
for pdf_path in pdf_dir.rglob("*.pdf"):
    raw = extract_text_from_pdf(pdf_path)
    cleaned = clean_text(raw)
    raw_records.append({"doc_id": pdf_path.stem, "text": cleaned})
    print(f"[+] Extracted {pdf_path}")

# Deduplicate
# threshold = 0.88
# num_perm = 128

# texts = [r["text"] for r in raw_records]

# minhashes = []
# for text in texts:
#     m = MinHash(num_perm=num_perm)
#     for word in text.split():
#         m.update(word.encode('utf8'))
#     minhashes.append(m)

# lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
# unique_indices = []
# seen = set()

# for i, m in enumerate(minhashes):
#     duplicates = lsh.query(m)
#     if not duplicates:
#         lsh.insert(f"m{i}", m)
#         unique_indices.append(i)

# unique_records = [raw_records[i] for i in unique_indices]
# print(f"[i] Deduplicated: {len(texts)} → {len(unique_records)} docs")

# Chunking
all_chunks = []
for rec in raw_records:
    chunks = chunk_text(rec["text"], tokenizer, max_tokens, overlap)
    for idx, chunk in enumerate(chunks):
        all_chunks.append({"text": chunk, "source": f"{rec['doc_id']}§{idx}"})

# Filter by min_tokens
min_toks = min_tokens
def token_len(example):
    return len(tokenizer(example["text"])["input_ids"])

all_chunks = [c for c in all_chunks if token_len(c) >= min_toks]
print(f"[i] Final chunks: {len(all_chunks)}")

# Write JSONL
jsonl_path = out_dir / "train.jsonl"
with jsonl_path.open("w", encoding="utf-8") as f:
    for rec in all_chunks:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"[✓] Saved {jsonl_path}")

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf_swat/swat (1).pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf_swat/1911.04831v1.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf_swat/swat.pdf
[+] Extracted pdf_swat/16523-Article Text-20017-1-2-20210518 (4).pdf
[i] Final chunks: 25
[✓] Saved data/train.jsonl


In [4]:
# from pathlib import Path
# from datasets import load_dataset, Dataset
# import re
# import unicodedata
# from unidecode import unidecode

# def remove_weird_unicode(text):
#     text = ''.join(ch for ch in text if unicodedata.category(ch)[0] not in ['C'])
#     text = re.sub(r'[\u2000-\u200f\u2028-\u202f\u205f\u2060-\u206f\ufeff]', ' ', text)
#     text = re.sub(r'[\u2190-\u21ff\u2300-\u23ff\u2500-\u257f\u2700-\u27bf\ue000-\uf8ff\U0001F000-\U0001FAFF]', '', text)
#     return text

# def clean_text(text):
#     text = unicodedata.normalize('NFKC', text)
#     text = re.sub(r'(?<=[가-힣])\s+(?=[가-힣])', '', text)
#     text = re.sub(r'(?<=[a-zA-Z])\s+(?=[a-zA-Z])', '', text)
#     text = remove_weird_unicode(text)
#     text = re.sub(r'[�]', '', text)
#     text = unidecode(text)
#     text = re.sub(r'\s+', ' ', text)
#     return text.strip()

# dataset_path = Path(dataset_path)
# if not dataset_path.exists():
#     raise FileNotFoundError(dataset_path)

# dataset = load_dataset("json", data_files=str(dataset_path), split="train")

# dataset = dataset.map(lambda x: {"text": clean_text(x["text"])})

# dataset.to_json("train_cleaned.jsonl", force_ascii=False)

In [7]:
dataset_path = Path(dataset_path)
if not dataset_path.exists():
    raise FileNotFoundError(dataset_path)

# Load dataset
dataset = load_dataset("json", data_files=str(dataset_path), split="train")

with open("token.txt", "r") as f:
    token = f.read().strip()

# Tokenizer & model
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")

# PEFT config
lora_cfg = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# Tokenize dataset lazily
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_len)

tokenized_ds = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text", "source"])

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_train_batch,
    gradient_accumulation_steps=grad_accum,
    learning_rate=lr,
    weight_decay=0.1,
    warmup_steps=warmup_steps,
    logging_steps=20,
    save_strategy="epoch",
    bf16=not fp16,
    fp16=fp16,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model()
tokenizer.save_pretrained(output_dir)
print("[✓] Training complete - adapter+tokenizer saved.")


Generating train split: 0 examples [00:00, ? examples/s]



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


Map (num_proc=4):   0%|          | 0/121 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,2.3764
40,2.0435



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignorin

[✓] Training complete - adapter+tokenizer saved.


In [8]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='nltk')
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [9]:
from typing import List

def perplexity(eval_texts: List[str], model, tokenizer):
    ppl_list = []

    for text in eval_texts:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        if not torch.isnan(loss):
            ppl = torch.exp(loss).item()
            ppl_list.append(ppl)

    if len(ppl_list) == 0:
        return float("nan")

    return sum(ppl_list) / len(ppl_list)


pretrined = False

if pretrined==True:
    base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")
    # Load LoRA adapter
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, output_dir)
model.eval()

# 3-5 random chunks for perplexity
eval_ds = load_dataset("json", data_files=str(dataset_path), split="train") #[:1%]
sample_texts = [eval_ds[i]["text"] for i in range(min(5, len(eval_ds)))]
ppl = perplexity(sample_texts, model, tokenizer)
print(f"[i] Domain PPL ≈ {ppl:.2f}")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

if eval_questions:
    df = pd.read_csv(eval_questions)
    results = []

    for idx, row in df.iterrows():
        prompt = textwrap.dedent(
            f"""\
            [INST] {row['question']} [/INST]
            """
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        gen = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
        answer = tokenizer.decode(gen[0], skip_special_tokens=True).strip()
        gold = row["answer"].strip()

        bleu = sentence_bleu([gold.split()], answer.split())
        rouge_l = scorer.score(gold, answer)['rougeL'].fmeasure
        print(f"[Answer]: {gold}")
        print(f"[{idx}] BLEU: {bleu}, ROUGE-L: {rouge_l}")

[i] Domain PPL ≈ 8.24
[Answer]: CVD utilizes chemical reactions on substrates with gaseous precursors, while PVD relies on physical vaporization/condensation without chemical changes.
[0] BLEU: 8.614911585158347e-232, ROUGE-L: 0.07594936708860761
[Answer]: 300-400°C for substrate-specific deposition through surface-limited reactions.
[1] BLEU: 7.884916681118857e-232, ROUGE-L: 0.05797101449275362
[Answer]: 0.5-2 Torr achieves >95% conformality in high-aspect-ratio structures.
[2] BLEU: 6.441148769597431e-232, ROUGE-L: 0.02631578947368421
[Answer]: Growth rate increases logarithmically from 0.5 to 3 μm/min as H₂ flow rises from 10 to 50 slm.
[3] BLEU: 2.89177102115629e-155, ROUGE-L: 0.16470588235294117


KeyboardInterrupt: 

In [10]:
import pandas as pd
import torch
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

llm_pipeline = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    return_full_text=False,
    temperature=0.1,
    device_map="auto"
)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo

In [11]:
# 2. GNN 결과 로드
test_result = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")
attention = pd.read_csv("/home/mskim2/GDN/csv/swat/attention_result.csv")
anomaly_score = pd.read_csv("/home/mskim2/GDN/csv/swat/anomaly_score.csv")
raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")

feature_file = open(f'/home/mskim2/GDN/data/swat/list.txt', 'r')
feature_list = []
for ft in feature_file:
    feature_list.append(ft.strip())

attack_point = pd.read_csv("/home/mskim2/GDN/attack_point.csv")
attack_point = attack_point.iloc[5:, -1].tolist()
test_result['attack_point'] = attack_point

In [12]:
# 3. true positive 시점 필터링
tp_df = test_result[(test_result["ground truth label"] == 1.0) & (test_result["model prediction"] == 1.0)& (pd.notna(test_result["attack_point"]))]
tp_df = tp_df.drop(tp_df.index[68])

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(raw_df: pd.DataFrame, sensor: list, time_idx: int, window: int = 10) -> str:
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    lines = [f"{val}" for idx, val in block.items()]
    return ", ".join(lines)

def get_attention_data_block(df, sensor, time_idx, window):
    topk = 15
    node_num = 51
    block = df.loc[(time_idx)*node_num*topk:(time_idx+1)*node_num*topk, :].squeeze()
    sensor_graph = {}
    for _, row in block.iterrows():
        source = row['source']
        target = row['target']
        attn = row['attention']

        if source not in sensor_graph:
            sensor_graph[source] = {}
        
        sensor_graph[source][target] = attn

    return sensor_graph

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

In [17]:
import re
import json

root = None

# 6. 루트 원인 분석 루프
slide_win = 5
window = 30
prev_value = None
correct = 0
incorrect = 0

for _, row in tp_df.iterrows():
    current_value = row['attack_point']
    if current_value != prev_value:
        time_idx = int(row["timestamp"] + slide_win)
        print("Time Index: ", time_idx)
        sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
        sensors = [s.split(":")[0] for s in sensors_scores]
        print("Top 3 Sensors: ", sensors)

        output_json = {
            "raw_data": {},
            "anomaly_scores": {},
            "attention": {},
        }

        for sensor in feature_list:
            anomaly = get_sensor_data_block(anomaly_score, sensor, time_idx, window=window)
            output_json["anomaly_scores"][sensor] = anomaly

        for sensor in feature_list:
            raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
            output_json["raw_data"][sensor] = raw

        anomaly = get_attention_data_block(attention, sensor, time_idx, window=window)
        output_json["attention"] = anomaly

        root = row['attack_point']

        messages = [
        {
            "role": "system",
            "content": """You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly.

        TASK:
        1. Read the files provided in subsequent messages:
        - Sensor Manual: A textual guide containing descriptions of each sensor and actuator, their intended functionality.
        - Raw Sensor Data: A dictionary mapping each sensor name to a string containing comma-separated raw data over a time window (±30 time steps from the detected anomaly).
        - Attention Weights: A dictionary where each key is a source sensor name, and its value is another dictionary mapping target sensor names to attention values (floats between 0 and 1). The attention values represents the influence or correlation strength from the source sensor to the target sensor as learned by the Graph Neural Network.
        2. Return a JSON object with:
        {
            "root_causes": [
            {"cause": str, "evidence": [sensor_id], "confidence": 0-1 float}
            ],
            "supporting_detail": str (<=150 tokens)
        }
        CONSTRAINTS:
        - Use only the given data; do not hallucinate unseen equipment.
        - Be concise; no markdown, no additional text outside the JSON.
        - Identify the most plausible root cause by considering abnormal changes in raw data or attention weights, as well as the inter-sensor relationships and the system’s operational flow.
        """
        },
        {
            "role": "user",
            "content": f"""
        Top 3 Sensors (by anomaly score): {', '.join(sensors)}

        Sensor Manual:
        {manual_text}

        Raw Sensor Data (±{window} points around anomaly):
        {output_json['raw_data']}

        Attention Weights:
        {output_json['attention']}

        Please analyze and explain the most likely root cause of the anomaly (maximum 3 sensors). Respond only with the required JSON output."""
        }
        ]

        response = llm_pipeline(messages, max_new_tokens=512)
        print("\n--- Root Cause:", root, '---')
        print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
        parsed = json.loads(response[0]['generated_text'])
        
        predicted_root_sensors = []
        for i in range(len(parsed['root_causes'])):
            predicted_root = parsed['root_causes'][i]['evidence'][0]
            predicted_root = re.sub(r"([A-Za-z]+)(\d+)", r"\1-\2", predicted_root)            
            predicted_root_sensors.append(predicted_root)

        acc = 1 if any(sensor in root for sensor in predicted_root_sensors) else 0

        if acc:
            correct += 1
        else:
            incorrect += 1
        print("@@@@@@@@@@@@@@@@@@@@@@", predicted_root_sensors, root, "@@@@@@@@@@@@@@@@@@@@@@")

    prev_value = current_value

print(correct, incorrect)
print("Accuracy: ", correct / (correct + incorrect))
print('------------------------------------------------------------------------------')

Time Index:  1538
Top 3 Sensors:  ['FIT401', 'MV301', 'FIT501']

--- Root Cause: FIT-401 ---

--- Root Cause Analysis for time 1538 ---
{
  "root_causes": [
    {"cause": "FIT401 sensor anomaly", "evidence": ["FIT401"], "confidence": 0.99},
    {"cause": "P101 actuator anomaly", "evidence": ["P101"], "confidence": 0.95},
    {"cause": "MV101 actuator anomaly", "evidence": ["MV101"], "confidence": 0.92}
  ],
  "supporting_detail": "The anomaly is likely caused by a sensor or actuator malfunction in the ultrafiltration stage. The sensor data from FIT401 shows a sudden drop in water flow rate, which is not reflected in the other sensors. The attention weights from the Graph Neural Network indicate a strong correlation between FIT401 and P101, suggesting that P101 is likely to be involved in the anomaly. Additionally, the attention weights show a moderate correlation between MV101 and FIT401, indicating that MV101 may also be related to the anomaly. Further investigation is needed to deter