In [1]:
import argparse
import json
import os
import re
import textwrap
from pathlib import Path
from typing import List

# --- PDF → text utility (PyMuPDF)
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None  # Will error later if preprocess is used

# --- Deduplication
import text_dedup.minhash
from datasketch import MinHash, MinHashLSH

# --- Hugging Face & PEFT
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

import pandas as pd
import torch
torch.cuda.empty_cache()

In [2]:
# preprocess config
pdf_dir = './pdf'
out_dir = './data'
min_tokens = 128
max_tokens = 2048
overlap = 256

# train config
dataset_path = './data/train.jsonl'
output_dir = './checkpoints'
num_epochs = 3
per_device_train_batch = 2
grad_accum = 4
lr = 2e-4
warmup_steps = 100
lora_r = 64
lora_alpha = 128
lora_dropout = 0.05
max_seq_len = 2048
fp16 = True  # 또는 False

# evaluate config
adapter_dir = './checkpoints/adapter'
eval_dataset_path = './data/eval.json'
eval_questions = './data/questions.csv'  # CSV with columns 'question','answer_regex'


In [3]:
import pdfplumber

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract raw text from a single PDF using pdfplumber."""
    text_chunks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:
                text_chunks.append(page_text)
    return "\n".join(text_chunks)


# def extract_text_from_pdf(pdf_path: Path) -> str:
#     doc = fitz.open(pdf_path)
#     text_chunks = []
#     for page in doc:
#         blocks = page.get_text("blocks")
#         page_text = "\n".join([b[4] for b in blocks])
#         text_chunks.append(page_text)
#     return "\n".join(text_chunks)


SECTION_PATTERNS = [
    re.compile(r"^references$", re.I),
    re.compile(r"^bibliography$", re.I),
    re.compile(r"^acknowledg(e)?ments?$", re.I),
]


def clean_text(text: str) -> str:
    """Remove references/acknowledgment sections & excessive blank lines."""
    lines = [l.strip() for l in text.splitlines()]
    cleaned: List[str] = []
    skip = False
    for ln in lines:
        if any(p.match(ln.lower()) for p in SECTION_PATTERNS):
            skip = True
        if not skip and ln:
            cleaned.append(ln)
    return "\n".join(cleaned)


def chunk_text(text: str, tokenizer, max_tokens: int, overlap: int) -> List[str]:
    """Slice long text into overlapping chunks by token count."""
    tokens = tokenizer(text)["input_ids"]
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i : i + max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        if len(chunk_tokens) >= 32:  # minimal meaningful length
            chunks.append(chunk_text)
        i += max_tokens - overlap
    return chunks


pdf_dir = Path(pdf_dir)
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

with open("token.txt", "r") as f:
    token = f.read().strip()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

raw_records = []
for pdf_path in pdf_dir.rglob("*.pdf"):
    raw = extract_text_from_pdf(pdf_path)
    cleaned = clean_text(raw)
    raw_records.append({"doc_id": pdf_path.stem, "text": cleaned})
    print(f"[+] Extracted {pdf_path}")

# Deduplicate
# threshold = 0.88
# num_perm = 128

# texts = [r["text"] for r in raw_records]

# minhashes = []
# for text in texts:
#     m = MinHash(num_perm=num_perm)
#     for word in text.split():
#         m.update(word.encode('utf8'))
#     minhashes.append(m)

# lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
# unique_indices = []
# seen = set()

# for i, m in enumerate(minhashes):
#     duplicates = lsh.query(m)
#     if not duplicates:
#         lsh.insert(f"m{i}", m)
#         unique_indices.append(i)

# unique_records = [raw_records[i] for i in unique_indices]
# print(f"[i] Deduplicated: {len(texts)} → {len(unique_records)} docs")

# Chunking
all_chunks = []
for rec in raw_records:
    chunks = chunk_text(rec["text"], tokenizer, max_tokens, overlap)
    for idx, chunk in enumerate(chunks):
        all_chunks.append({"text": chunk, "source": f"{rec['doc_id']}§{idx}"})

# Filter by min_tokens
min_toks = min_tokens
def token_len(example):
    return len(tokenizer(example["text"])["input_ids"])

all_chunks = [c for c in all_chunks if token_len(c) >= min_toks]
print(f"[i] Final chunks: {len(all_chunks)}")

# Write JSONL
jsonl_path = out_dir / "train.jsonl"
with jsonl_path.open("w", encoding="utf-8") as f:
    for rec in all_chunks:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"[✓] Saved {jsonl_path}")



[+] Extracted pdf/Analysis of silane and nitrous oxide produced plasma enhanced chemical vapor deposition simulation.pdf
[+] Extracted pdf/Analysis-of-the-synergetic-effect-of-process-parameters-of-h_2025_Diamond-an.pdf
[+] Extracted pdf/A-novel-physical-vapor-deposition-setup-applying-high-frequency-cur_2025_Vac.pdf
[+] Extracted pdf/A-review-of-comprehensive-utilization-of-biomass-to-s_2024_Journal-of-Analyt.pdf
[+] Extracted pdf/A-transport-kinetic-model-development-for-polysili_2024_International-Journa.pdf
[+] Extracted pdf/Centimeter-level-MoS2-films-with-controllable-number-of-layers-by-f_2023_Vac.pdf
[+] Extracted pdf/Characteristics-of-Single-Crystalline-Rutile-GeO2-Film-Gro_2025_Journal-of-A.pdf
[+] Extracted pdf/Chemical vapor deposition growth of boron incorporated graphitic carbon nitride film for carbon based semiconductor systems.pdf
[+] Extracted pdf/Chemical-vapor-deposited-nanocarbon-Fe-Al2O3-composi_2025_Materials-Chemistr.pdf
[+] Extracted pdf/Chemical-vapor-deposit

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Effects-of-carbon-nanoparticle-insertion-on-stress-reducti_2024_Diamond-and-.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Electrochemical-polishing-of-chemical-vapor-deposited-niob_2023_Thin-Solid-F.pdf
[+] Extracted pdf/Enhanced-solar-water-splitting-using-bismuth-ferrite_2024_Materials-Science-.pdf
[+] Extracted pdf/Enhancement-of-crystalline-quality-of-homoepitaxial--100--_2025_Diamond-and-.pdf
[+] Extracted pdf/Enhancing-the-thermal-stability-of-active-contacts-in-AlGaN-G_2024_Current-A.pdf
[+] Extracted pdf/Epitaxially deposited SrVO-sub 3- conducting films by laser ablation and metal organic chemical vapor deposition.pdf
[+] Extracted pdf/Evaluation of Ga-Sn-O films fabricated using mist chemical vapor deposition.pdf
[+] Extracted pdf/Exploring-the-growth-and-optoelectronic-properties-of-PtSe_2025_Surfaces-and.pdf
[+] Extracted pdf/Flexible-and-lightweight-graphene-grown-by-rapid-thermal-proce_2023_New-Carb.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Growth of GaN nanowires on Si substrate using Ni catalyst in vertical chemical vapor deposition reactor.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/04_34015_rashid.pdf
[+] Extracted pdf/20220728105632_74178.pdf
[+] Extracted pdf/Growth-dynamics-dependence-of-high-quality---Ga2O3-thin-films-prepa_2025_Vac.pdf
[+] Extracted pdf/Growth-of-bilayer-MoS2-flakes-by-reverse-flow-chemical-va_2023_Materials-Let.pdf
[+] Extracted pdf/H2-mediated-reduction-of-GeO2-and-chemical-vapor-deposition-_2023_Thin-Solid.pdf
[+] Extracted pdf/--hbox-SnO-_-2--  Nanorods Prepared by Inductively Coupled Plasma-Enhanced Chemical Vapor Deposition.pdf
[+] Extracted pdf/Heavy-phosphorus-doping-of-diamond-by-hot-filament-_2023_Diamond-and-Related.pdf
[+] Extracted pdf/High-light-yield-and-fast-response---Ga2O3-Al2O3-thick-film-sc_2024_Material.pdf
[+] Extracted pdf/High-performance-solar-blind-photodetector-based-on-Si-d_2024_Journal-of-All.pdf
[+] Extracted pdf/High-quality-MoS2-monolayers-with-largely-enhanced-electrical_2024_Applied-S.pdf
[+] Extracted pdf/High-rate-growth-of-single-crystal-diamond-with-an-atomically-_2022_Thin-Sol.pdf
[+] 

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Metalorganic chemical vapor deposition for optoelectronic devices.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Microstructure-evolution-of-carbon-films--grown-by-physi_2024_Materials-Chem.pdf
[+] Extracted pdf/Mixed-phase-iron-oxides-thin-layers-by-atmospheric-pressure-chemi_2024_Metho.pdf
[+] Extracted pdf/Model reduction for a tungsten chemical vapor deposition system.pdf
[+] Extracted pdf/9781439862070.pdf
[+] Extracted pdf/fan2021.pdf
[+] Extracted pdf/bleakie2013.pdf
[+] Extracted pdf/Modulating-growth-of-graphene-on-sapphire-by-chemica_2024_Journal-of-Crystal.pdf
[+] Extracted pdf/Monolithically-grown-CSPbBr3-by-chemical-vapor-deposit_2024_Chemical-Enginee.pdf
[+] Extracted pdf/Multifunctional-prospects-of-physical-vapor-depo_2025_Journal-of-Science--Ad.pdf
[+] Extracted pdf/Multi-objective-optimization-of-4H-SiC-homoepitaxy-chemica_2025_Materials-To.pdf
[+] Extracted pdf/Multiscale-multiphysics-predictive-modeling-of-chemical-v_2025_Chemical-Engi.pdf
[+] Extracted pdf/Numerical-analysis-of-the-use-of-multiple-inlet-plates-to-impro_2024_Results.pdf
[+] Extracted pdf/Nume

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Optimizing-electromagnetic-interference-shielding-of-flexible_2025_Applied-S.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Optimizing-the-chemical-vapor-deposition-process-of-4H-S_2024_Case-Studies-i.pdf
[+] Extracted pdf/Oxygen-vacancies-modulating-performance-for-Ga2O3-solar-b_2024_Materials-Tod.pdf
[+] Extracted pdf/Phase-controlled-epitaxy-of-wurtzite-ZnS-thin-films-by-metal_2025_Thin-Solid.pdf
[+] Extracted pdf/Phase-controlled-growth-of-indium-selenide-by-metalorg_2024_Journal-of-Cryst.pdf
[+] Extracted pdf/Photodetector-based-on-2H-WSe2-grown-by-physical-vapor-de_2024_Materials-Let.pdf
[+] Extracted pdf/Photodetectors-based-on-chemical-vapor-deposition-or-liquid-_2023_Optical-Ma.pdf
[+] Extracted pdf/Post-growth-annealing-effect-of-Li-doped-NiO-thin-f_2024_Materials-Science-a.pdf
[+] Extracted pdf/Precise-surface-engineering--Leveraging-chemical-vapor-deposition-f_2024_Hel.pdf
[+] Extracted pdf/Preparation-and-energy-band-analysis-of-graphene-diamond-_2025_Diamond-and-R.pdf
[+] Extracted pdf/Preparation-of-Al-doped-SnO2-thin-films-via-ultrasonic-mist-_2025_Physica-B-.pdf
[+] Extrac

CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Two-Level_Nested_Control_Chart_for_Batch_Process_in_the_Semiconductor_Manufacturing.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Advanced_process_equipment_matching_methodology_in_semiconductor_manufacturing.pdf
[+] Extracted pdf/Spatial_Correlated_Data_Monitoring_in_Semiconductor_Manufacturing_Using_Gaussian_Process_Model.pdf
[+] Extracted pdf/Fault_Detection_Using_Principal_Components-Based_Gaussian_Mixture_Model_for_Semiconductor_Manufacturing_Processes.pdf
[+] Extracted pdf/The_implementation_of_AFM_for_process_monitoring_and_metrology_in_trench_MOSFET_device_manufacturing.pdf
[+] Extracted pdf/Semiconductor_Manufacturing_Process_Monitoring_Based_on_Adaptive_Substatistical_PCA.pdf
[+] Extracted pdf/Control_performance_monitoring_for_EWMA-based_run-to-run_control_in_semiconductor_manufacturing_processes.pdf
[+] Extracted pdf/Multi-objective_Fault_Monitoring_for_Semiconductor_Manufacturing_Process_with_DEWMA_Run-to-Run_Controller.pdf


CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Novel_assessment_of_process_control_monitor_in_advanced_semiconductor_manufacturing_a_complete_set_of_addressable_failure_site_test_structures_AFS-TS.pdf


CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/An_effective_SPC_approach_to_monitoring_semiconductor_manufacturing_processes_with_multiple_variation_sources.pdf
[+] Extracted pdf/Advance_Process_Control_solutions_for_semiconductor_manufacturing.pdf
[+] Extracted pdf/Handbook_of_Thin_Film_Deposition_Process.pdf
[+] Extracted pdf/Monitoring_and_control_of_semiconductor_manufacturing_processes.pdf
[+] Extracted pdf/fan2020.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/pan2011.pdf
[+] Extracted pdf/10.1109@TSM.2019.2929765.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/chouichi2020.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/04127ca86f60e87687a9358b8df79011.pdf
[+] Extracted pdf/watanabe2013.pdf


CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/a-virtual-metrology-scheme-for-predicting-cvd-thickness-in-semic.pdf


CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/A-Novel-Virtual-Metrology-Scheme-for-Predicting-CVD-Thickness-in-Semiconductor-Manufacturing.pdf
[+] Extracted pdf/CDC2008_4.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/document.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/applsci-08-00846.pdf
[+] Extracted pdf/allgood2003.pdf
[+] Extracted pdf/Thin_film_atomic_layer_deposition_equipm.pdf
[+] Extracted pdf/book-summary.pdf
[+] Extracted pdf/hong2012.pdf
[+] Extracted pdf/Adversarial_Defect_Detection_in_Semiconductor_Manufacturing_Process.pdf
[+] Extracted pdf/Key_Feature_Identification_for_Monitoring_Wafer-to-Wafer_Variation_in_Semiconductor_Manufacturing.pdf
[+] Extracted pdf/fundamentals_of_semiconductor.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/1-s2.0-S0967066106002048-main.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/file_1.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/AdvancedPlasmaProcessingforSemiconductorManufacturing.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/ch10.pdf
[+] Extracted pdf/0.6835543743623976.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/goldstone_plasma.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/SemiconductorDevices.pdf
[+] Extracted pdf/EA002AB461DFDFC92BCFAD35BC4_C62E9AA9_1335DE9.pdf
[+] Extracted pdf/preview-9781837671366_A48614035.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/Chemistry-in-Semiconductors-and-Electronics.pdf
[+] Extracted pdf/AngewChemie_09.pdf
[+] Extracted pdf/chemical-vapour-deposition-cvd-advances-technology-and-applications-1nbsped-1466597763-9781466597761_compress.pdf
[+] Extracted pdf/GUPTA_2023_diffusion.pdf
[+] Extracted pdf/Fundamentals of Semiconductor Manufacturing and Process Control G. May C. Spanos.pdf
[+] Extracted pdf/Perspective_on_defect_characterization_in_semicond.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/DOE_FES_PlasmaScience_Semiconductors_v27.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/Final-Heat-Transfer-Fluids-Paper.pdf
[+] Extracted pdf/Handbook_e_202004.pdf
[+] Extracted pdf/Introduction to Semiconductor Processing.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/tm-app-semiconductors_lowres.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/hartmann-VCI2010.2.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/Operatormanual_PMX_C_N_August_2014.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/swat.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

[+] Extracted pdf/1911.04831v1.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/2106.06947v1.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/ch01_overview.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox


[+] Extracted pdf/swat (1).pdf


Token indices sequence length is longer than the specified maximum sequence length for this model (243395 > 131072). Running this sequence through the model will result in indexing errors


[i] Final chunks: 1702
[✓] Saved data/train.jsonl


In [4]:
# from pathlib import Path
# from datasets import load_dataset, Dataset
# import re
# import unicodedata
# from unidecode import unidecode

# def remove_weird_unicode(text):
#     text = ''.join(ch for ch in text if unicodedata.category(ch)[0] not in ['C'])
#     text = re.sub(r'[\u2000-\u200f\u2028-\u202f\u205f\u2060-\u206f\ufeff]', ' ', text)
#     text = re.sub(r'[\u2190-\u21ff\u2300-\u23ff\u2500-\u257f\u2700-\u27bf\ue000-\uf8ff\U0001F000-\U0001FAFF]', '', text)
#     return text

# def clean_text(text):
#     text = unicodedata.normalize('NFKC', text)
#     text = re.sub(r'(?<=[가-힣])\s+(?=[가-힣])', '', text)
#     text = re.sub(r'(?<=[a-zA-Z])\s+(?=[a-zA-Z])', '', text)
#     text = remove_weird_unicode(text)
#     text = re.sub(r'[�]', '', text)
#     text = unidecode(text)
#     text = re.sub(r'\s+', ' ', text)
#     return text.strip()

# dataset_path = Path(dataset_path)
# if not dataset_path.exists():
#     raise FileNotFoundError(dataset_path)

# dataset = load_dataset("json", data_files=str(dataset_path), split="train")

# dataset = dataset.map(lambda x: {"text": clean_text(x["text"])})

# dataset.to_json("train_cleaned.jsonl", force_ascii=False)

In [5]:
dataset_path = Path(dataset_path)
if not dataset_path.exists():
    raise FileNotFoundError(dataset_path)

# Load dataset
dataset = load_dataset("json", data_files=str(dataset_path), split="train")

with open("token.txt", "r") as f:
    token = f.read().strip()

# Tokenizer & model
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")

# PEFT config
lora_cfg = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# Tokenize dataset lazily
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_len)

tokenized_ds = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text", "source"])

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_train_batch,
    gradient_accumulation_steps=grad_accum,
    learning_rate=lr,
    weight_decay=0.1,
    warmup_steps=warmup_steps,
    logging_steps=20,
    save_strategy="epoch",
    bf16=not fp16,
    fp16=fp16,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model()
tokenizer.save_pretrained(output_dir)
print("[✓] Training complete - adapter+tokenizer saved.")


Generating train split: 0 examples [00:00, ? examples/s]



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


Map (num_proc=4):   0%|          | 0/1702 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
20,2.1523
40,2.0159
60,1.9013
80,1.9575
100,1.9874
120,1.9944
140,1.9306
160,1.935
180,1.903
200,1.8683



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignorin

[✓] Training complete - adapter+tokenizer saved.


In [6]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='nltk')
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [7]:
from typing import List

def perplexity(eval_texts: List[str], model, tokenizer):
    ppl_list = []

    for text in eval_texts:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        if not torch.isnan(loss):
            ppl = torch.exp(loss).item()
            ppl_list.append(ppl)

    if len(ppl_list) == 0:
        return float("nan")

    return sum(ppl_list) / len(ppl_list)


pretrined = False

if pretrined==True:
    base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")
    # Load LoRA adapter
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, output_dir)
model.eval()

# 3-5 random chunks for perplexity
eval_ds = load_dataset("json", data_files=str(dataset_path), split="train[:1%]")
sample_texts = [eval_ds[i]["text"] for i in range(min(5, len(eval_ds)))]
ppl = perplexity(sample_texts, model, tokenizer)
print(f"[i] Domain PPL ≈ {ppl:.2f}")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

if eval_questions:
    df = pd.read_csv(eval_questions)
    results = []

    for idx, row in df.iterrows():
        prompt = textwrap.dedent(
            f"""\
            [INST] {row['question']} [/INST]
            """
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        gen = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
        answer = tokenizer.decode(gen[0], skip_special_tokens=True).strip()
        gold = row["answer"].strip()

        bleu = sentence_bleu([gold.split()], answer.split())
        rouge_l = scorer.score(gold, answer)['rougeL'].fmeasure
        print(f"[Answer]: {gold}")
        print(f"[{idx}] BLEU: {bleu}, ROUGE-L: {rouge_l}")

[i] Domain PPL ≈ 3.58
[Answer]: CVD utilizes chemical reactions on substrates with gaseous precursors, while PVD relies on physical vaporization/condensation without chemical changes.
[0] BLEU: 7.955640502424632e-232, ROUGE-L: 0.05405405405405405
[Answer]: 300-400°C for substrate-specific deposition through surface-limited reactions.
[1] BLEU: 8.231055179516831e-232, ROUGE-L: 0.06557377049180327
[Answer]: 0.5-2 Torr achieves >95% conformality in high-aspect-ratio structures.
[2] BLEU: 6.659801989051833e-232, ROUGE-L: 0.028985507246376812
[Answer]: Growth rate increases logarithmically from 0.5 to 3 μm/min as H₂ flow rises from 10 to 50 slm.
[3] BLEU: 8.726094729337945e-232, ROUGE-L: 0.051948051948051945
[Answer]: Mo(CO)₆ + H₂S at 800°C with 0.1 Torr base pressure.
[4] BLEU: 0, ROUGE-L: 0.03076923076923077
[Answer]: B₂H₆ provides higher incorporation efficiency \(8x10¹⁹ cm⁻³ vs 5x10¹⁸ cm⁻³\) at 600°C.
[5] BLEU: 6.545917251136179e-232, ROUGE-L: 0.052631578947368425
[Answer]: 500-700 hole

KeyboardInterrupt: 

In [11]:
import pandas as pd
import torch
import sqlite3
from langchain.prompts import ChatPromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM

llm_pipeline = pipeline(
    "text-generation",
    model=trainer.model,
    tokenizer=tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
    return_full_text=False,
    device_map="auto"
)

Device set to use cuda:0
The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['AriaTextForCausalLM', 'BambaForCausalLM', 'BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'Cohere2ForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'DeepseekV3ForCausalLM', 'DiffLlamaForCausalLM', 'ElectraForCausalLM', 'Emu3ForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FalconMambaForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'Gemma3ForConditionalGeneration', 'Gemma3ForCausalLM', 'GitForCausalLM', 'GlmForCausalLM', 'Glm4ForCausalLM', 'GotOcr2ForConditionalGeneration', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoFo

In [12]:
# 2. GNN 결과 로드
test_result = pd.read_csv("/home/mskim2/GDN/csv/swat/test_result.csv")
attention = pd.read_csv("/home/mskim2/GDN/csv/swat/attention_result.csv")
anomaly_score = pd.read_csv("/home/mskim2/GDN/csv/swat/anomaly_score.csv")
raw_df = pd.read_csv("/home/mskim2/GDN/data/swat/test.csv")

feature_file = open(f'/home/mskim2/GDN/data/swat/list.txt', 'r')
feature_list = []
for ft in feature_file:
    feature_list.append(ft.strip())

attack_point = pd.read_csv("/home/mskim2/GDN/attack_point.csv")
attack_point = attack_point.iloc[5:, -1].tolist()
test_result['attack_point'] = attack_point

In [13]:
# 3. true positive 시점 필터링
tp_df = test_result[(test_result["ground truth label"] == 1.0) & (test_result["model prediction"] == 1.0)& (pd.notna(test_result["attack_point"]))]
tp_df = tp_df.drop(tp_df.index[68])

# 4-1. DB 연결 (SQLite / CSV 예시)
conn = sqlite3.connect("sensor_data.db")
def get_raw_sensor_data(sensor: str, time_idx: int, window: int = 10) -> str:
    query = f"""
        SELECT timestamp, value
        FROM raw_data
        WHERE sensor_id = '{sensor}'
        AND time_index BETWEEN {time_idx - window} AND {time_idx + window}
        ORDER BY time_index
    """
    result = pd.read_sql(query, conn)
    return result.to_string(index=False)

def get_sensor_data_block(raw_df: pd.DataFrame, sensor: list, time_idx: int, window: int = 10) -> str:
    start = max(0, time_idx - window)
    end = min(len(raw_df), time_idx + window + 1)
    block = raw_df.loc[start:end, sensor]
    lines = [f"{val}" for idx, val in block.items()]
    return ", ".join(lines)

def get_attention_data_block(df, sensor, time_idx, window):
    topk = 15
    node_num = 51
    block = df.loc[(time_idx)*node_num*topk:(time_idx+1)*node_num*topk, :].squeeze()
    sensor_graph = {}
    for _, row in block.iterrows():
        source = row['source']
        target = row['target']
        attn = row['attention']

        if source not in sensor_graph:
            sensor_graph[source] = {}
        
        sensor_graph[source][target] = attn

    return sensor_graph

# 5. 도메인 매뉴얼 불러오기
with open("./manual.txt", "r") as f:
    manual_text = f.read()

In [14]:
import re
import json

root = None

# 6. 루트 원인 분석 루프
slide_win = 5
window = 30
prev_value = None
correct = 0
incorrect = 0

for _, row in tp_df.iterrows():
    current_value = row['attack_point']
    if current_value != prev_value:
        time_idx = int(row["timestamp"] + slide_win)
        print("Time Index: ", time_idx)
        sensors_scores = [s.strip() for s in row[["1", "2", "3"]].tolist()]
        sensors = [s.split(":")[0] for s in sensors_scores]
        print("Top 3 Sensors: ", sensors)

        output_json = {
            "raw_data": {},
            "anomaly_scores": {},
            "attention": {},
        }

        for sensor in feature_list:
            anomaly = get_sensor_data_block(anomaly_score, sensor, time_idx, window=window)
            output_json["anomaly_scores"][sensor] = anomaly

        for sensor in feature_list:
            raw = get_sensor_data_block(raw_df, sensor, time_idx, window=window)
            output_json["raw_data"][sensor] = raw

        anomaly = get_attention_data_block(attention, sensor, time_idx, window=window)
        output_json["attention"] = anomaly

        root = row['attack_point']

        messages = [
        {
            "role": "system",
            "content": """You are an expert in root cause analysis for cyber-physical systems, especially industrial water treatment systems. Your task is to identify plausible root causes of detected anomalies. Use domain knowledge and respond concisely and clearly.

        TASK:
        1. Read the files provided in subsequent messages:
        - Sensor Manual: A textual guide containing descriptions of each sensor and actuator, their intended functionality.
        - Raw Sensor Data: A dictionary mapping each sensor name to a string containing comma-separated raw data over a time window (±30 time steps from the detected anomaly).
        - Attention Weights: A dictionary where each key is a source sensor name, and its value is another dictionary mapping target sensor names to attention values (floats between 0 and 1). The attention values represents the influence or correlation strength from the source sensor to the target sensor as learned by the Graph Neural Network.
        2. Return a JSON object with:
        {
            "root_causes": [
            {"cause": str, "evidence": [sensor_id], "confidence": 0-1 float}
            ],
            "supporting_detail": str (<=150 tokens)
        }
        CONSTRAINTS:
        - Use only the given data; do not hallucinate unseen equipment.
        - Be concise; no markdown, no additional text outside the JSON.
        - Identify the most plausible root cause by considering abnormal changes in raw data or attention weights, as well as the inter-sensor relationships and the system’s operational flow.
        """
        },
        {
            "role": "user",
            "content": f"""
        Top 3 Sensors (by anomaly score): {', '.join(sensors)}

        Sensor Manual:
        {manual_text}

        Raw Sensor Data (±{window} points around anomaly):
        {output_json['raw_data']}

        Attention Weights:
        {output_json['attention']}

        Please analyze and explain the most likely root cause of the anomaly. Respond only with the required JSON output."""
        }
        ]

        response = llm_pipeline(messages, max_new_tokens=512)
        print("\n--- Root Cause:", root, '---')
        print(f"\n--- Root Cause Analysis for time {time_idx} ---\n{response[0]['generated_text']}\n")
        parsed = json.loads(response[0]['generated_text'])
        
        predicted_root_sensors = []
        for i in range(len(parsed['root_causes'])):
            predicted_root = parsed['root_causes'][i]['evidence'][0]
            predicted_root = re.sub(r"([A-Za-z]+)(\d+)", r"\1-\2", predicted_root)            
            predicted_root_sensors.append(predicted_root)

        acc = 1 if any(sensor in root for sensor in predicted_root_sensors) else 0

        if acc:
            correct += 1
        else:
            incorrect += 1
        print("@@@@@@@@@@@@@@@@@@@@@@", predicted_root_sensors, root, "@@@@@@@@@@@@@@@@@@@@@@")

    prev_value = current_value

print(correct, incorrect)
print("Accuracy: ", correct / (correct + incorrect))
print('------------------------------------------------------------------------------')

Time Index:  1538
Top 3 Sensors:  ['FIT401', 'MV301', 'FIT501']

--- Root Cause: FIT-401 ---

--- Root Cause Analysis for time 1538 ---
{
  "root_causes": [
    {"cause": "Possible leak in UF feed water tank", "evidence": ["FIT301"], "confidence": 0.97},
    {"cause": "Possible leak in UF feed water tank", "evidence": ["LIT301"], "confidence": 0.95},
    {"cause": "Possible leak in UF feed water tank", "evidence": ["DPIT301"], "confidence": 0.93},
    {"cause": "Possible leak in UF feed water tank", "evidence": ["AIT201"], "confidence": 0.91},
    {"cause": "Possible leak in UF feed water tank", "evidence": ["AIT202"], "confidence": 0.89},
    {"cause": "Possible leak in UF feed water tank", "evidence": ["AIT203"], "confidence": 0.87}
  ],
  "supporting_detail": "The anomaly is likely caused by a leak in the UF feed water tank, as indicated by the simultaneous increases in flow rate (FIT301), pressure (LIT301), and differential pressure (DPIT301). The conductivity sensor (AIT201) also 

JSONDecodeError: Unterminated string starting at: line 14 column 11 (char 1852)