# Llama-3.1-8B (128,000)

In [1]:
import pandas as pd
import torch
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_id = "meta-llama/Llama-3.1-8B-Instruct"
with open("token.txt", "r") as f:
    token = f.read().strip()

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=token)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", use_auth_token=token)
llm_pipeline = pipeline("text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    model_kwargs={"torch_dtype": torch.bfloat16},
                    device_map="auto",
                    return_full_text=False,
                    temperature=0.1,
                    )

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.74s/it]
Device set to use cuda:0


In [2]:
# import pandas as pd
# import torch
# import sqlite3
# from langchain.prompts import ChatPromptTemplate
# from langchain.llms import HuggingFacePipeline
# from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

# model = AutoModelForCausalLM.from_pretrained("./saved_model")
# tokenizer = AutoTokenizer.from_pretrained("./saved_model")

# llm_pipeline = pipeline("text-generation", 
#                     model=model, 
#                     tokenizer=tokenizer, 
#                     model_kwargs={"torch_dtype": torch.bfloat16},
#                     device_map="auto",
#                     return_full_text=False,
#                     temperature=0.1,
#                     )

In [3]:
# 2. GNN 결과 로드
test_result = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")
attention = pd.read_csv("/home/mskim2/GDN/csv/swat/attention_result.csv")
anomaly_score = pd.read_csv("/home/mskim2/GDN/csv/swat/anomaly_score.csv")
raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")

feature_file = open(f'/home/mskim2/GDN/data/swat/list.txt', 'r')
feature_list = []
for ft in feature_file:
    feature_list.append(ft.strip())

attack_point = pd.read_csv("/home/mskim2/GDN/attack_point.csv")
attack_point = attack_point.iloc[5:, -1].tolist()
test_result['attack_point'] = attack_point

In [4]:
# 3. true positive 시점 필터링
tp_df = test_result[(test_result["ground truth label"] == 1.0) & (test_result["model prediction"] == 1.0)& (pd.notna(test_result["attack_point"]))]
tp_df = tp_df.drop(tp_df.index[68])

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(raw_df: pd.DataFrame, sensor: list, time_idx: int, window: int = 10) -> str:
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    lines = [f"{val}" for idx, val in block.items()]
    return ", ".join(lines)

def get_attention_data_block(df, sensor, time_idx, window):
    topk = 15
    node_num = 51
    block = df.loc[(time_idx)*node_num*topk:(time_idx+1)*node_num*topk, :].squeeze()
    sensor_graph = {}
    for _, row in block.iterrows():
        source = row['source']
        target = row['target']
        attn = row['attention']

        if source not in sensor_graph:
            sensor_graph[source] = {}
        
        sensor_graph[source][target] = attn

    return sensor_graph

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

In [12]:
import re
import json

root = None

# 6. 루트 원인 분석 루프
slide_win = 5
window = 30
prev_value = None
correct = 0
incorrect = 0

for _, row in tp_df.iterrows():
    current_value = row['attack_point']
    if current_value != prev_value:
        time_idx = int(row["timestamp"] + slide_win)
        print("Time Index: ", time_idx)
        sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
        sensors = [s.split(":")[0] for s in sensors_scores]
        print("Top 3 Sensors: ", sensors)

        output_json = {
            "raw_data": {},
            "anomaly_scores": {},
            "attention": {},
        }

        for sensor in feature_list:
            anomaly = get_sensor_data_block(anomaly_score, sensor, time_idx, window=window)
            output_json["anomaly_scores"][sensor] = anomaly

        for sensor in feature_list:
            raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
            output_json["raw_data"][sensor] = raw

        anomaly = get_attention_data_block(attention, sensor, time_idx, window=window)
        output_json["attention"] = anomaly

        root = row['attack_point']

        messages = [
        {
            "role": "system",
            "content": """You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly.

        TASK:
        1. Read the files provided in subsequent messages:
        - Sensor Manual: A textual guide containing descriptions of each sensor and actuator, their intended functionality.
        - Raw Sensor Data: A dictionary mapping each sensor name to a string containing comma-separated raw data over a time window (±30 time steps from the detected anomaly).
        - Attention Weights: A dictionary where each key is a source sensor name, and its value is another dictionary mapping target sensor names to attention values (floats between 0 and 1). The attention values represents the influence or correlation strength from the source sensor to the target sensor as learned by the Graph Neural Network.
        2. Return a JSON object with:
        {
            "root_causes": [
            {"cause": str, "evidence": [sensor_id], "confidence": 0-1 float}
            ],
            "supporting_detail": str (<=150 tokens)
        }
        CONSTRAINTS:
        - Use only the given data; do not hallucinate unseen equipment.
        - Be concise; no markdown, no additional text outside the JSON.
        - Identify the most plausible root cause by considering abnormal changes in raw data or attention weights, as well as the inter-sensor relationships and the system’s operational flow.
        """
        },
        {
            "role": "user",
            "content": f"""
        Top 3 Sensors (by anomaly score): {', '.join(sensors)}

        Sensor Manual:
        {manual_text}

        Raw Sensor Data (±{window} points around anomaly):
        {output_json['raw_data']}

        Attention Weights:
        {output_json['attention']}

        Please analyze and explain the most likely root cause of the anomaly. Respond only with the required JSON output."""
        }
        ]

        response = llm_pipeline(messages, max_new_tokens=512)
        print("\n--- Root Cause:", root, '---')
        print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
        parsed = json.loads(response[0]['generated_text'])
        
        predicted_root_sensors = []
        for i in range(len(parsed['root_causes'])):
            predicted_root = parsed['root_causes'][i]['evidence'][0]
            predicted_root = re.sub(r"([A-Za-z]+)(\d+)", r"\1-\2", predicted_root)            
            predicted_root_sensors.append(predicted_root)

        acc = 1 if any(sensor in root for sensor in predicted_root_sensors) else 0

        if acc:
            correct += 1
        else:
            incorrect += 1
        print("@@@@@@@@@@@@@@@@@@@@@@", predicted_root_sensors, root, "@@@@@@@@@@@@@@@@@@@@@@")

    prev_value = current_value

print(correct, incorrect)
print("Accuracy: ", correct / (correct + incorrect))
print('------------------------------------------------------------------------------')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Time Index:  1538
Top 3 Sensors:  ['FIT401', 'MV301', 'FIT501']


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: FIT-401 ---

--- Root Cause Analysis for time 1538 ---
{
  "root_causes": [
    {"cause": "Possible malfunction of the UV dechlorination process (UV401) due to an unexpected drop in its operational status.", "evidence": ["UV401"], "confidence": 0.8},
    {"cause": "Possible issue with the reverse osmosis system (P501) due to an unexpected drop in its operational status.", "evidence": ["P501"], "confidence": 0.7},
    {"cause": "Possible malfunction of the FIT401 sensor due to an unexpected drop in its reading.", "evidence": ["FIT401"], "confidence": 0.6}
  ],
  "supporting_detail": "The anomaly is likely caused by a malfunction in one of the critical components of the water treatment system. The UV dechlorination process (UV401) and the reverse osmosis system (P501) are both critical components that play a crucial role in the water treatment process. The unexpected drop in their operational status suggests that there may be a malfunction in these components. Additional

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: AIT-504 ---

--- Root Cause Analysis for time 11614 ---
{
  "root_causes": [
    {"cause": "High conductivity in the water treatment process", "evidence": ["AIT504"], "confidence": 0.9},
    {"cause": "Abnormal flow rate in the ultrafiltration stage", "evidence": ["FIT401"], "confidence": 0.8},
    {"cause": "Unusual pressure reading after the reverse osmosis membrane", "evidence": ["PIT503"], "confidence": 0.7}
  ],
  "supporting_detail": "The high conductivity reading from AIT504 sensor suggests a potential issue with the water treatment process. The abnormal flow rate in the ultrafiltration stage, as indicated by FIT401 sensor, may be related to the high conductivity reading. The unusual pressure reading after the reverse osmosis membrane, as indicated by PIT503 sensor, may also be related to the issue. The attention weights suggest a strong correlation between AIT504 and FIT401 sensors, indicating that the high conductivity reading may be causing the abnormal flow 

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: AIT-502,P-501,UV-401 ---

--- Root Cause Analysis for time 13291 ---
{
  "root_causes": [
    {"cause": "Possible clogging or pump failure in the ultrafiltration stage", "evidence": ["FIT301", "MV302"], "confidence": 0.8},
    {"cause": "Possible malfunction of the reverse osmosis membrane", "evidence": ["PIT503", "FIT501"], "confidence": 0.7},
    {"cause": "Possible issue with the UV dechlorination process", "evidence": ["UV401", "P501"], "confidence": 0.6}
  ],
  "supporting_detail": "The anomaly is likely caused by a combination of factors, including clogging or pump failure in the ultrafiltration stage, malfunction of the reverse osmosis membrane, and issue with the UV dechlorination process. The evidence from the sensors and attention weights suggests that these factors are highly correlated and likely to be the root cause of the anomaly."
}

@@@@@@@@@@@@@@@@@@@@@@ ['FIT-301', 'PIT-503', 'UV-401'] AIT-502,P-501,UV-401 @@@@@@@@@@@@@@@@@@@@@@
Time Index:  23171
Top

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: P-302 ---

--- Root Cause Analysis for time 23171 ---
{
  "root_causes": [
    {"cause": "UV401 malfunction", "evidence": ["UV401"], "confidence": 0.8},
    {"cause": "P502 malfunction", "evidence": ["P502"], "confidence": 0.7},
    {"cause": "P206 malfunction", "evidence": ["P206"], "confidence": 0.6}
  ],
  "supporting_detail": "The anomaly is likely caused by a malfunction in one of the UV401, P502, or P206 sensors. The attention weights show a strong correlation between these sensors and other sensors in the system, indicating that they play a crucial role in the system's operation. The raw sensor data also shows abnormal changes in these sensors, which further supports the likelihood of a malfunction. The confidence levels are based on the strength of the evidence and the correlation between the sensors."
}

@@@@@@@@@@@@@@@@@@@@@@ ['UV-401', 'P-502', 'P-206'] P-302 @@@@@@@@@@@@@@@@@@@@@@
Time Index:  43654
Top 3 Sensors:  ['FIT502', 'P502', 'UV401']


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: FIT-502,P-501 ---

--- Root Cause Analysis for time 43654 ---
{
  "root_causes": [
    {"cause": "UV Disinfection Unit malfunction", "evidence": ["UV401"], "confidence": 0.8},
    {"cause": "Reverse Osmosis (RO) membrane failure", "evidence": ["PIT503"], "confidence": 0.7},
    {"cause": "Pump P501 malfunction", "evidence": ["P501"], "confidence": 0.6}
  ],
  "supporting_detail": "The anomaly is likely caused by a malfunction in the UV disinfection unit (UV401), which is indicated by the sudden drop in its output. This could be due to a failure in the UV lamp or a malfunction in the control system. The attention weights also suggest a strong correlation between UV401 and other sensors in the system, such as PIT503, which measures the pressure after the RO membrane. This suggests that the RO membrane may also be affected by the anomaly. Additionally, the pump P501, which is responsible for pumping water through the RO system, shows a sudden drop in its output, indicatin

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



--- Root Cause: AIT-502,FIT-401 ---

--- Root Cause Analysis for time 43814 ---
{
  "root_causes": [
    {"cause": "Chemical dosing pump P201 malfunction", "evidence": ["P201"], "confidence": 0.8},
    {"cause": "Sensor FIT501 malfunction", "evidence": ["FIT501"], "confidence": 0.7},
    {"cause": "Sensor FIT401 malfunction", "evidence": ["FIT401"], "confidence": 0.6}
  ],
  "supporting_detail": "The anomaly is likely caused by a malfunction in the chemical dosing pump P201, as indicated by the sudden change in its output. This is supported by the attention weights, which show a strong correlation between P201 and other sensors in the system. Additionally, the raw sensor data shows a sudden change in the output of sensor FIT501, which is also correlated with P201. The malfunction of P201 is likely the root cause of the anomaly, as it affects the chemical dosing process and can lead to changes in the water quality."
}

@@@@@@@@@@@@@@@@@@@@@@ ['P-201', 'FIT-501', 'FIT-401'] AIT-502,FIT-

In [None]:
# To use anomaly scores in the prompt

In [None]:
import json

root = None

# 6. 루트 원인 분석 루프
slide_win = 5
window = 30
prev_value = None
correct = 0
incorrect = 0

for _, row in tp_df.iterrows():
    current_value = row['attack_point']
    if current_value != prev_value:
        time_idx = int(row["timestamp"] + slide_win)
        sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
        sensors = [s.split(":")[0] for s in sensors_scores]

        output_json = {
            "raw_data": {},
            "anomaly_scores": {},
            "attention": {},
        }

        for sensor in feature_list:
            anomaly = get_sensor_data_block(anomaly_score, sensor, time_idx, window=window)
            output_json["anomaly_scores"][sensor] = anomaly

        for sensor in feature_list:
            raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
            output_json["raw_data"][sensor] = raw

        anomaly = get_attention_data_block(attention, sensor, time_idx, window=window)
        output_json["attention"] = anomaly

        root = row['attack_point']

        messages = [
        {
            "role": "system",
            "content": """You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly.

        TASK:
        1. Read the files provided in subsequent messages:
        - Sensor Manual: A textual guide containing descriptions of each sensor and actuator, their intended functionality.
        - Raw Sensor Data: A dictionary mapping each sensor name to a string containing comma-separated raw data over a time window (±30 time steps from the detected anomaly).
        - Anomaly Scores: A dictionary mapping each sensor name to a string containing comma-separated anomaly scores, which is learned by the Graph Neural Network, over a time window (±30 time steps from the detected anomaly).
        - Attention Weights: A dictionary where each key is a source sensor name, and its value is another dictionary mapping target sensor names to attention values (floats between 0 and 1). The attention values represents the influence or correlation strength from the source sensor to the target sensor as learned by the Graph Neural Network.
        2. Return a JSON object with:
        {
            "root_causes": [
            {"cause": str, "evidence": [sensor_id], "confidence": 0-1 float}
            ],
            "supporting_detail": str (<=150 tokens)
        }
        CONSTRAINTS:
        - Use only the given data; do not hallucinate unseen equipment.
        - Be concise; no markdown, no additional text outside the JSON.
        """
        },
        {
            "role": "user",
            "content": f"""
        Top 3 Sensors (by anomaly score): {', '.join(sensors)}

        Sensor Manual:
        {manual_text}

        Raw Sensor Data (±{window} points around anomaly):
        {output_json['raw_data']}

        Anomaly Scores (±{window} points around anomaly):
        {output_json['anomaly_scores']}

        Attention Weights:
        {output_json['attention']}

        Please analyze and explain the most likely root cause of the anomaly. Respond only with the required JSON output."""
        }
        ]

        response = llm_pipeline(messages, max_new_tokens=512)
        print("\n--- Root Cause:", root, '---')
        print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
        parsed = json.loads(response[0]['generated_text'])
        predicted_root = parsed['root_causes'][0]['evidence'][0]

        acc = predicted_root in root
        if acc:
            correct += 1
        else:
            incorrect += 1
        print("@@@@", predicted_root, root, "@@@@")

    prev_value = current_value


print(correct, incorrect)
print("Accuracy: ", correct / (correct + incorrect))

# Falcon-7B (2,048)

In [8]:
import pandas as pd
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

model_id = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
text_gen = pipeline("text-generation", 
                    model=model, 
                    tokenizer=tokenizer, 
                    max_new_tokens=512,
                    return_full_text=False,
                    )
llm = HuggingFacePipeline(pipeline=text_gen)

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.03s/it]
Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=text_gen)


In [13]:
# 2. GNN 결과 로드
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 3. true positive 시점 필터링
tp_df = df[(df["ground truth label"] == 1.0) & (df["model prediction"] == 1.0)]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: list, time_idx: int, window: int = 10) -> str:
    raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    return block.to_string(index=False)

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

# 6. 루트 원인 분석 루프
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"]+slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window=20
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    prompt_template = ChatPromptTemplate.from_messages([
        ("system", f"You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly."),
        ("human", f"""Domain Knowledge:
{manual_text}

Time Point: {time_idx}
Top 3 Sensors (by anomaly score): {', '.join(sensors)}

Raw Sensor Data (±{window} points around anomaly):
{sensor_data_str}

Please analyze this and explain the most likely root cause of the anomaly at time {time_idx}. Respond concisely and clearly in bullet points.""")
    ])

    chain = prompt_template | llm
    response = chain.invoke({})
    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response}\n")
    break

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.



--- Root Cause Analysis for time 1533 ---

- Spoofing sensor values (FIT401, AIT201) via simulation mode or HMI tag manipulation to damage RO membrane.
- Disabling UV401 via alarm setpoint manipulation to damage RO membrane.
- Exploiting SCADA workstation (e.g., EternalBlue) to alter plant operation logic.
- Command injection into network devices (e.g., MOXA access points) to disrupt communication.

The most likely root cause of the anomaly at time 1533 is spoofing sensor values. This is due to the fact that the anomaly occurred during a time when the sensor values were being manipulated through simulation mode or HMI tag manipulation. The spoofing could be intentional or unintentional, but it is likely that the root cause is related to the manipulation.



# DeepSeek-7B (4,096)

In [4]:
import torch
import pandas as pd
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name = "deepseek-ai/deepseek-llm-7b-chat"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.57s/it]


In [5]:
# 2. GNN 결과 로드
df = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")

# 3. true positive 시점 필터링
tp_df = df[(df["ground truth label"] == 1.0) & (df["model prediction"] == 1.0)]

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(sensor: list, time_idx: int, window: int = 10) -> str:
    raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    return block.to_string(index=False)

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

# 6. 루트 원인 분석 루프 (DeepSeek 스타일 프롬프트 적용)
slide_win = 5
for _, row in tp_df.iterrows():
    time_idx = int(row["timestamp"] + slide_win)
    sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
    sensors = [s.split(":")[0] for s in sensors_scores]

    window = 30
    sensor_data_blocks = []
    for sensor in sensors:
        raw = get_sensor_data_block(sensor, time_idx, window=window)
        sensor_data_blocks.append(f"Sensor: {sensor}\n{raw}")

    sensor_data_str = "\n\n".join(sensor_data_blocks)

    messages = [
        {"role": "user", "content": f"""Domain Knowledge:
    {manual_text}

    Time Point: {time_idx}
    Top 3 Sensors (by anomaly score): {', '.join(sensors)}

    Raw Sensor Data (±{window} points around anomaly):
    {sensor_data_str}

    Please analyze the anomaly and explain the most likely root cause at time {time_idx}. Respond concisely and clearly in bullet points."""}
    ]

    # 템플릿 적용
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    outputs = model.generate(inputs.to(model.device), max_new_tokens=512)

    # 응답 디코딩
    response = tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)

    print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response}\n")
    break


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Root Cause Analysis for time 1533 ---
* FIT401 and FIT501 show significant spikes in values, reaching -30 in some cases, which is highly unusual for sensor values.
* FIT502 has more normal values but still shows some unusual fluctuations.
* The most likely root cause of these anomalies is a physical failure or malfunction of the sensors themselves, leading to incorrect or inconsistent readings.
* It is also possible that there is a communication issue between the sensors and the SCADA system, causing data corruption or delay.
* It is important to investigate these anomalies further by physically inspecting the sensors, checking their calibration, and verifying their connection to the system.

