In [22]:
import pandas as pd
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# 📌 1. LLM setup (open-source model, loaded through HuggingFace)
# model_id = "tiiuae/falcon-7b-instruct"
# NOTE(review): this is the base checkpoint, not the "-Instruct" variant; the
# chat-style prompt used later may be followed more reliably by an
# instruction-tuned model — confirm this choice is intentional.
model_id = "Qwen/Qwen2.5-7B"
# model_id = "deepseek-ai/deepseek-llm-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# torch_dtype="auto" keeps the weights in the precision stored in the
# checkpoint rather than upcasting them to float32, which roughly halves the
# memory a 7B model needs. device_map="auto" lets accelerate place the layers.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
)
# return_full_text=False makes the pipeline return only the newly generated
# continuation, not the prompt followed by the continuation.
text_gen = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    return_full_text=False,
)
# Wrap the transformers pipeline so it can be composed into a LangChain chain.
llm = HuggingFacePipeline(pipeline=text_gen)

Fetching 4 files: 100%|██████████| 4/4 [02:12<00:00, 33.15s/it] 
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards: 100%|██████████| 4/4 [00:01<00:00,  2.76it/s]
Device set to use cuda:0


In [21]:
# 📌 2. Load the GNN results
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 📌 3. Keep only the true-positive time points:
#      rows where both the ground-truth label and the model prediction are 1.0
is_labelled_anomaly = df["ground truth label"] == 1.0
is_predicted_anomaly = df["model prediction"] == 1.0
tp_df = df[is_labelled_anomaly & is_predicted_anomaly]

# 📌 4-1. DB connection (SQLite / CSV example)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10, connection=None) -> str:
    """Return the raw readings of one sensor around a time index as plain text.

    Selects ``timestamp`` and ``value`` from the ``raw_data`` table for rows
    whose ``sensor_id`` equals ``sensor`` and whose ``time_index`` lies in
    ``[time_idx - window, time_idx + window]`` (inclusive, as SQL BETWEEN is),
    ordered by ``time_index``.

    Args:
        sensor: Sensor identifier matched against ``raw_data.sensor_id``.
        time_idx: Centre of the time window.
        window: Number of time steps to include on each side of ``time_idx``.
        connection: Optional DB-API connection to query. When omitted, the
            module-level ``conn`` is used.

    Returns:
        The query result rendered with ``DataFrame.to_string(index=False)``.
    """
    db = conn if connection is None else connection
    # Values are bound as parameters rather than formatted into the SQL text,
    # so a sensor name containing quotes cannot alter or break the query.
    query = """
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = ?
        AND time_index BETWEEN ? AND ?
        ORDER BY time_index
    """
    bounds = (sensor, int(time_idx - window), int(time_idx + window))
    result = pd.read_sql(query, db, params=bounds)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: "str | list", time_idx: int, window: int = 10, raw_df=None) -> str:
    """Return the rows of the raw test data around ``time_idx`` as plain text.

    The block covers row positions ``time_idx - window`` through
    ``time_idx + window`` inclusive, clipped to the bounds of the table.

    Args:
        sensor: Column name (or list of column names) to extract.
        time_idx: Row position at the centre of the window.
        window: Number of rows to include on each side of ``time_idx``.
        raw_df: Optional DataFrame to slice. When omitted, the SWaT test CSV is
            read from disk on the first call and reused on later calls.

    Returns:
        The selected values rendered with ``to_string(index=False)``.
    """
    if raw_df is None:
        # Cache the parsed CSV on the function object so that repeated calls
        # (one per sensor per anomaly) do not re-read the file every time.
        raw_df = getattr(get_sensor_data_block, "_raw_df_cache", None)
        if raw_df is None:
            raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
            get_sensor_data_block._raw_df_cache = raw_df

    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    # `end` is an exclusive upper bound, so slice positionally with .iloc;
    # label-based .loc includes its end label and would return one extra row.
    block = raw_df.iloc[start:end][sensor]
    return block.to_string(index=False)

# 📌 5. Load the domain manual
# The full text is read into memory; it is embedded into the LLM prompt below.
with open("./manual.txt") as manual_file:
    manual_text = manual_file.read()

# 📌 6. Root cause analysis loop
# Offset added to each result row's "timestamp" to obtain the raw-data row.
# NOTE(review): presumably this matches the sliding-window length used by the
# detector — confirm against the GDN configuration.
slide_win = 5

# The prompt and chain do not depend on the row, so build them once.
# Per-anomaly values are template variables filled in by `invoke`, not Python
# f-string interpolations: text substituted this way is inserted verbatim, so
# braces inside the manual or the sensor dump are not mistaken for
# placeholders.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", "You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to analyze sensor behavior and identify plausible causes of detected anomalies. Use the domain knowledge provided to reason about physical, logical, or cyber attack-induced failures. Respond concisely and clearly."),
    ("user", """Domain Knowledge:
{manual_text}

Time Point: {time_idx}
Top 3 Sensors (by anomaly score): {sensor_names}

Raw Sensor Data (±10 points around anomaly):
{sensor_data_str}

Please analyze this and explain the most likely root cause of the anomaly at time {time_idx}. Respond in bullet points."""),
])
chain = prompt_template | llm

for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"] + slide_win)

    # Columns "1".."3" hold "<sensor>:<score>" strings; keep the sensor name.
    # Non-string cells (e.g. NaN for a missing rank) are skipped.
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist() if isinstance(s, str)]
    sensors = [s.split(":")[0] for s in sensors_scores]

    # One text block per sensor, using get_sensor_data_block's default window.
    sensor_data_blocks = [
        f"Sensor: {sensor}\n{get_sensor_data_block(sensor, time_idx)}"
        for sensor in sensors
    ]
    sensor_data_str = "\n\n".join(sensor_data_blocks)

    response = chain.invoke({
        "manual_text": manual_text,
        "time_idx": time_idx,
        "sensor_names": ", ".join(sensors),
        "sensor_data_str": sensor_data_str,
    })
    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response}\n")
    # Only the first true-positive row is analysed; remove this `break` to
    # run the LLM on every true-positive time point.
    break

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



--- Root Cause Analysis for time 1533 ---

- The most likely root cause of the anomaly at time 1533 is a PLC compromise.
- The anomaly could be caused by a malicious actor attempting to gain unauthorized access to the plant's control system.
- The anomaly could be due to a software bug that allowed an attacker to manipulate sensor values.
- The anomaly could be caused by a hardware failure, such as a faulty sensor or valve, leading to incorrect sensor values.
- The anomaly could be due to a communication issue between the control system and the SCADA HMI, causing the system to misbehave.
- The anomaly could be caused by a power failure, leading to the control system losing power and the sensors failing.

