# Llama-3.1-8B (128,000)

In [1]:
import pandas as pd
import torch
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.1-8B-Instruct"
with open("token.txt", "r") as f:
    token = f.read().strip()

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", use_auth_token=token)
llm_pipeline = pipeline("text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    model_kwargs={"torch_dtype": torch.bfloat16},
                    device_map="auto",
                    return_full_text=False,
                    temperature=0.1,
                    )

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]
Device set to use cuda:0


In [2]:
# 2. GNN 결과 로드
test_result = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")
attention = pd.read_csv("/home/mskim2/GDN/csv/swat/attention_result.csv")
anomaly_score = pd.read_csv("/home/mskim2/GDN/csv/swat/anomaly_score.csv")
raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")

feature_file = open(f'/home/mskim2/GDN/data/swat/list.txt', 'r')
feature_list = []
for ft in feature_file:
    feature_list.append(ft.strip())

attack_point = pd.read_csv("/home/mskim2/GDN/attack_point.csv")
attack_point = attack_point.iloc[5:, -1].tolist()
test_result['attack_point'] = attack_point

In [3]:
# 3. true positive 시점 필터링
tp_df = test_result[(test_result["ground truth label"] == 1.0) & (test_result["model prediction"] == 1.0)& (pd.notna(test_result["attack_point"]))]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(raw_df: pd.DataFrame, sensor: list, time_idx: int, window: int = 10) -> str:
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    lines = [f"{val}" for idx, val in block.items()]
    return ", ".join(lines)

def get_attention_data_block(df, sensor, time_idx, window):
    topk = 15
    node_num = 51
    block = df.loc[(time_idx)*node_num*topk:(time_idx+1)*node_num*topk, :].squeeze()
    sensor_graph = {}
    for _, row in block.iterrows():
        source = row['source']
        target = row['target']
        attn = row['attention']

        if source not in sensor_graph:
            sensor_graph[source] = {}
        
        sensor_graph[source][target] = attn

    return sensor_graph

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

In [4]:
import json

root = None

# 6. 루트 원인 분석 루프
slide_win = 5
window = 10

for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"] + slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    output_json = {
        "raw_data": {},
        "anomaly_scores": {},
        "attention": {},
    }

    for sensor in feature_list:
        anomaly = get_sensor_data_block(anomaly_score, sensor, time_idx, window=window)
        output_json["anomaly_scores"][sensor] = anomaly

    for sensor in feature_list:
        raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
        output_json["raw_data"][sensor] = raw

    anomaly = get_attention_data_block(attention, sensor, time_idx, window=window)
    output_json["attention"] = anomaly

    root = row['attack_point']
    # break로 첫 번째 tp만 일단
    break

print(root)

FIT-401


In [5]:
messages = [
{
    "role": "system",
    "content": """You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly.

TASK:
1. Read the files provided in subsequent messages:
- Sensor Manual: A textual guide containing descriptions of each sensor and actuator, their intended functionality.
- Raw Sensor Data: A dictionary mapping each sensor name to a string containing comma-separated raw data over a time window (±30 time steps from the detected anomaly).
- Anomaly Scores: A dictionary mapping each sensor name to a string containing comma-separated anomaly scores, which is learned by the Graph Neural Network, over a time window (±30 time steps from the detected anomaly).
- Attention Weights: A dictionary where each key is a source sensor name, and its value is another dictionary mapping target sensor names to attention scores (floats between 0 and 1). The attention score represents the influence or correlation strength from the source sensor to the target sensor as learned by the Graph Neural Network.
2. Return a JSON object with:
   {
     "root_causes": [
       {"cause": str, "evidence": [sensor_id], "confidence": 0-1 float}
     ],
     "supporting_detail": str (<=150 tokens)
   }
CONSTRAINTS:
- Use only the given data; do not hallucinate unseen equipment.
- Be concise; no markdown, no additional text outside the JSON.
"""
},
{
    "role": "user",
    "content": f"""
Top 3 Sensors (by anomaly score): {', '.join(sensors)}

Sensor Manual:
{manual_text}

Raw Sensor Data (±{window} points around anomaly):
{output_json['raw_data']}

Anomaly Scores (±{window} points around anomaly):
{output_json['anomaly_scores']}

Attention Weights:
{output_json['attention']}

Please analyze and explain the most likely root cause of the anomaly. Respond only with the required JSON output."""
    }
]

In [None]:
response = llm_pipeline(messages, max_new_tokens=512)
print("\n--- Root Cause:", root, '---')
print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Root Cause: FIT-401

--- Root Cause Analysis for time 1538 ---
{
  "root_causes": [
    {"cause": "Possible malfunction of FIT401 sensor", "evidence": ["FIT401"], "confidence": 0.9},
    {"cause": "Possible malfunction of P101 actuator", "evidence": ["P101"], "confidence": 0.8},
    {"cause": "Possible malfunction of MV301 actuator", "evidence": ["MV301"], "confidence": 0.7}
  ],
  "supporting_detail": "The anomaly scores for FIT401 are significantly higher than other sensors, indicating a possible malfunction. The attention weights also show a strong correlation between FIT401 and other sensors, suggesting a potential issue with the sensor's measurement. The P101 actuator's anomaly score is also high, indicating a possible malfunction. The MV301 actuator's anomaly score is lower, but its attention weights show a strong correlation with other sensors, suggesting a potential issue."
}



In [7]:
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"]+slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window=30
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    messages = [
    {
        "role": "system",
        "content": "You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly."
    },
    {
        "role": "user",
        "content": f"""Domain Knowledge:
{manual_text}

Top 3 Sensors (by anomaly score): {', '.join(sensors)}

Raw Sensor Data (±{window} points around anomaly):
{sensor_data_str}

Please analyze and explain the most likely root cause of the anomaly. Respond concisely and clearly in bullet points."""
    }
]

    response = llm_pipeline(messages, max_new_tokens=512)
    print("Root Cause:", root)
    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
    break

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Root Cause: FIT-401

--- Root Cause Analysis for time 1538 ---
Based on the provided sensor data and domain knowledge, the most likely root cause of the anomaly is:

* **Sensor FIT401 anomaly**: The sensor data shows a sudden drop in value to -13.93543942 and then to -23.93044192, which is likely due to a **sensor failure or malfunction**. This is because the sensor is measuring flow rate, and a sudden drop in value is not physically possible in this context.
* **Sensor MV301 anomaly**: The sensor data shows a constant value of 0.5, which is likely due to a **stuck or faulty sensor**. This is because the motorized valve (MV301) is expected to have varying values based on its position, but the constant value suggests that the sensor is not accurately measuring the valve's position.
* **Sensor FIT501 anomaly**: The sensor data shows a sudden drop in value to -6.637492982 and then to -6.714059509, which is likely due to a **sensor failure or malfunction**. This is because the sensor is me

In [None]:
# 2. GNN 결과 로드
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 3. true positive 시점 필터링
tp_df = df[(df["ground truth label"] == 1.0) & (df["model prediction"] == 1.0)]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: list, time_idx: int, window: int = 10) -> str:
    raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    return block.to_string(index=False)

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

# 6. 루트 원인 분석 루프
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"]+slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window=30
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    messages = [
    {
        "role": "system",
        "content": "You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly."
    },
    {
        "role": "user",
        "content": f"""Domain Knowledge:
{manual_text}

Time Point: {time_idx}
Top 3 Sensors (by anomaly score): {', '.join(sensors)}

Raw Sensor Data (±{window} points around anomaly):
{sensor_data_str}

Please analyze and explain the most likely root cause of the anomaly at time {time_idx}. Respond concisely and clearly in bullet points."""
    }
]

    response = llm_pipeline(messages, max_new_tokens=512)
    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
    break

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause Analysis for time 1533 ---
Based on the provided data and rules, the most likely root cause of the anomaly at time 1533 is:

* **Sensor spoofing or malfunction**: 
  • FIT401, FIT501, and FIT502 show anomalies at time 1533, with FIT401 and FIT501 having negative values, which is physically impossible.
  • The sudden drop in flow rates is not consistent with the expected behavior of the system.
  • The rules state that a pump should never be ON when corresponding flow (FIT) is zero, which is likely violated in this case.
  • The conductivity (AIT201) and pH (AIT202) sensors do not show any anomalies, which suggests that the issue is likely related to the flow sensors.

* **Possible attack vector**: 
  • The anomaly could be caused by a PLC override or spoofing sensor values via simulation mode or HMI tag manipulation.
  • The attacker might have exploited the SCADA workstation or command injection into network devices to disrupt communication and manipulate sensor values

# Falcon-7B (2,048)

In [8]:
import pandas as pd
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_id = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
text_gen = pipeline("text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens=512,
                    return_full_text=False,
                    )
llm = HuggingFacePipeline(pipeline=text_gen)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.03s/it]
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=text_gen)


In [13]:
# 2. GNN 결과 로드
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 3. true positive 시점 필터링
tp_df = df[(df["ground truth label"] == 1.0) & (df["model prediction"] == 1.0)]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: list, time_idx: int, window: int = 10) -> str:
    raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    return block.to_string(index=False)

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

# 6. 루트 원인 분석 루프
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"]+slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window=20
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", f"You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly."),
        ("human", f"""Domain Knowledge:
{manual_text}

Time Point: {time_idx}
Top 3 Sensors (by anomaly score): {', '.join(sensors)}

Raw Sensor Data (±{window} points around anomaly):
{sensor_data_str}

Please analyze this and explain the most likely root cause of the anomaly at time {time_idx}. Respond concisely and clearly in bullet points.""")
    ])

    chain = prompt_template | llm
    response = chain.invoke({})
    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response}\n")
    break

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



--- Root Cause Analysis for time 1533 ---

- Spoofing sensor values (FIT401, AIT201) via simulation mode or HMI tag manipulation to damage RO membrane.
- Disabling UV401 via alarm setpoint manipulation to damage RO membrane.
- Exploiting SCADA workstation (e.g., EternalBlue) to alter plant operation logic.
- Command injection into network devices (e.g., MOXA access points) to disrupt communication.

The most likely root cause of the anomaly at time 1533 is spoofing sensor values. This is due to the fact that the anomaly occurred during a time when the sensor values were being manipulated through simulation mode or HMI tag manipulation. The spoofing could be intentional or unintentional, but it is likely that the root cause is related to the manipulation.



# DeepSeek-7B (4,096)

In [4]:
import torch
import pandas as pd
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.57s/it]


In [5]:
# 2. GNN 결과 로드
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 3. true positive 시점 필터링
tp_df = df[(df["ground truth label"] == 1.0) & (df["model prediction"] == 1.0)]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: list, time_idx: int, window: int = 10) -> str:
    raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    return block.to_string(index=False)

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

# 6. 루트 원인 분석 루프 (DeepSeek 스타일 프롬프트 적용)
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"] + slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window = 30
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    messages = [
        {"role": "user", "content": f"""Domain Knowledge:
    {manual_text}

    Time Point: {time_idx}
    Top 3 Sensors (by anomaly score): {', '.join(sensors)}

    Raw Sensor Data (±{window} points around anomaly):
    {sensor_data_str}

    Please analyze the anomaly and explain the most likely root cause at time {time_idx}. Respond concisely and clearly in bullet points."""}
    ]

    # 템플릿 적용
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    outputs = model.generate(inputs.to(model.device), max_new_tokens=512)

    # 응답 디코딩
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response}\n")
    break


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Root Cause Analysis for time 1533 ---
* FIT401 and FIT501 show significant spikes in values, reaching -30 in some cases, which is highly unusual for sensor values.
* FIT502 has more normal values but still shows some unusual fluctuations.
* The most likely root cause of these anomalies is a physical failure or malfunction of the sensors themselves, leading to incorrect or inconsistent readings.
* It is also possible that there is a communication issue between the sensors and the SCADA system, causing data corruption or delay.
* It is important to investigate these anomalies further by physically inspecting the sensors, checking their calibration, and verifying their connection to the system.

