In [None]:
import argparse
import json
import os
import re
import textwrap
from pathlib import Path
from typing import List

# --- PDF → text utility (PyMuPDF)
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None  # Will error later if preprocess is used

# --- Deduplication
import text_dedup.minhash
from datasketch import MinHash, MinHashLSH

# --- Hugging Face & PEFT
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

import pandas as pd
import torch
torch.cuda.empty_cache()

In [4]:
# preprocess config
pdf_dir = './pdf'
out_dir = './data'
min_tokens = 128
max_tokens = 2048
overlap = 256

# train config
dataset_path = './data/train.jsonl'
output_dir = './checkpoints'
num_epochs = 3
per_device_train_batch = 2
grad_accum = 4
lr = 2e-4
warmup_steps = 100
lora_r = 64
lora_alpha = 128
lora_dropout = 0.05
max_seq_len = 2048
fp16 = True  # 또는 False

# evaluate config
adapter_dir = './checkpoints/adapter'
eval_dataset_path = './data/eval.json'
eval_questions = './data/questions.csv'  # CSV with columns 'question','answer_regex'


In [3]:
def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract raw text from a single PDF page-by-page via PyMuPDF."""
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed. pip install pymupdf")
    doc = fitz.open(pdf_path)
    text_chunks = []
    for page in doc:
        page_text = page.get_text("text")
        text_chunks.append(page_text)
    raw = "\n".join(text_chunks)
    return raw

SECTION_PATTERNS = [
    re.compile(r"^references$", re.I),
    re.compile(r"^bibliography$", re.I),
    re.compile(r"^acknowledg(e)?ments?$", re.I),
]


def clean_text(text: str) -> str:
    """Remove references/acknowledgment sections & excessive blank lines."""
    lines = [l.strip() for l in text.splitlines()]
    cleaned: List[str] = []
    skip = False
    for ln in lines:
        if any(p.match(ln.lower()) for p in SECTION_PATTERNS):
            skip = True
        if not skip and ln:
            cleaned.append(ln)
    return "\n".join(cleaned)


def chunk_text(text: str, tokenizer, max_tokens: int, overlap: int) -> List[str]:
    """Slice long text into overlapping chunks by token count."""
    tokens = tokenizer(text)["input_ids"]
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i : i + max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        if len(chunk_tokens) >= 32:  # minimal meaningful length
            chunks.append(chunk_text)
        i += max_tokens - overlap
    return chunks


pdf_dir = Path(pdf_dir)
out_dir = Path(out_dir)
out_dir.mkdir(parents=True, exist_ok=True)

with open("token.txt", "r") as f:
    token = f.read().strip()

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

raw_records = []
for pdf_path in pdf_dir.rglob("*.pdf"):
    raw = extract_text_from_pdf(pdf_path)
    cleaned = clean_text(raw)
    raw_records.append({"doc_id": pdf_path.stem, "text": cleaned})
    print(f"[+] Extracted {pdf_path}")

# Deduplicate
threshold = 0.88
num_perm = 128

texts = [r["text"] for r in raw_records]

minhashes = []
for text in texts:
    m = MinHash(num_perm=num_perm)
    for word in text.split():
        m.update(word.encode('utf8'))
    minhashes.append(m)

lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
unique_indices = []
seen = set()

for i, m in enumerate(minhashes):
    duplicates = lsh.query(m)
    if not duplicates:
        lsh.insert(f"m{i}", m)
        unique_indices.append(i)

unique_records = [raw_records[i] for i in unique_indices]
print(f"[i] Deduplicated: {len(texts)} → {len(unique_records)} docs")

# Chunking
all_chunks = []
for rec in unique_records:
    chunks = chunk_text(rec["text"], tokenizer, max_tokens, overlap)
    for idx, chunk in enumerate(chunks):
        all_chunks.append({"text": chunk, "source": f"{rec['doc_id']}§{idx}"})

# Filter by min_tokens
min_toks = min_tokens
def token_len(example):
    return len(tokenizer(example["text"])["input_ids"])

all_chunks = [c for c in all_chunks if token_len(c) >= min_toks]
print(f"[i] Final chunks: {len(all_chunks)}")

# Write JSONL
jsonl_path = out_dir / "train.jsonl"
with jsonl_path.open("w", encoding="utf-8") as f:
    for rec in all_chunks:
        f.write(json.dumps(rec, ensure_ascii=False) + "\n")
print(f"[✓] Saved {jsonl_path}")



[+] Extracted pdf/Analysis of silane and nitrous oxide produced plasma enhanced chemical vapor deposition simulation.pdf
[+] Extracted pdf/Analysis-of-the-synergetic-effect-of-process-parameters-of-h_2025_Diamond-an.pdf
[+] Extracted pdf/A-novel-physical-vapor-deposition-setup-applying-high-frequency-cur_2025_Vac.pdf
[+] Extracted pdf/A-review-of-comprehensive-utilization-of-biomass-to-s_2024_Journal-of-Analyt.pdf
[+] Extracted pdf/A-transport-kinetic-model-development-for-polysili_2024_International-Journa.pdf
[+] Extracted pdf/Centimeter-level-MoS2-films-with-controllable-number-of-layers-by-f_2023_Vac.pdf
[+] Extracted pdf/Characteristics-of-Single-Crystalline-Rutile-GeO2-Film-Gro_2025_Journal-of-A.pdf
[+] Extracted pdf/Chemical vapor deposition growth of boron incorporated graphitic carbon nitride film for carbon based semiconductor systems.pdf
[+] Extracted pdf/Chemical-vapor-deposited-nanocarbon-Fe-Al2O3-composi_2025_Materials-Chemistr.pdf
[+] Extracted pdf/Chemical-vapor-deposit

In [6]:
dataset_path = Path(dataset_path)
if not dataset_path.exists():
    raise FileNotFoundError(dataset_path)

# Load dataset
dataset = load_dataset("json", data_files=str(dataset_path), split="train")

with open("token.txt", "r") as f:
    token = f.read().strip()

# Tokenizer & model
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=token)
tokenizer.pad_token = tokenizer.eos_token

# 4-bit QLoRA
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")

# PEFT config
lora_cfg = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=lora_dropout,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

# Tokenize dataset lazily
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=max_seq_len)

tokenized_ds = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text", "source"])

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=per_device_train_batch,
    gradient_accumulation_steps=grad_accum,
    learning_rate=lr,
    weight_decay=0.1,
    warmup_steps=warmup_steps,
    logging_steps=20,
    save_strategy="epoch",
    bf16=not fp16,
    fp16=fp16,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model()
tokenizer.save_pretrained(output_dir)
print("[✓] Training complete - adapter+tokenizer saved.")




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.0465


Step,Training Loss
20,1.7722
40,1.699
60,1.6674
80,1.6726
100,1.5501
120,1.5174
140,1.4655
160,1.4439
180,1.1113
200,1.1219



Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3.1-8B-Instruct.

Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignorin

[✓] Training complete - adapter+tokenizer saved.


In [17]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='nltk')
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

In [21]:
from typing import List

def perplexity(eval_texts: List[str], model, tokenizer):
    ppl_list = []

    for text in eval_texts:
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        ).to(model.device)

        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
            loss = outputs.loss

        if not torch.isnan(loss):
            ppl = torch.exp(loss).item()
            ppl_list.append(ppl)

    if len(ppl_list) == 0:
        return float("nan")

    return sum(ppl_list) / len(ppl_list)


pretrined = False

if pretrined==True:
    base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, use_auth_token=token, device_map="auto")
    # Load LoRA adapter
    from peft import PeftModel
    model = PeftModel.from_pretrained(base_model, output_dir)
model.eval()

# 3-5 random chunks for perplexity
eval_ds = load_dataset("json", data_files=str(dataset_path), split="train[:1%]")
sample_texts = [eval_ds[i]["text"] for i in range(min(5, len(eval_ds)))]
ppl = perplexity(sample_texts, model, tokenizer)
print(f"[i] Domain PPL ≈ {ppl:.2f}")

scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

if eval_questions:
    df = pd.read_csv(eval_questions)
    results = []

    for idx, row in df.iterrows():
        prompt = textwrap.dedent(
            f"""\
            [INST] {row['question']} [/INST]
            """
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        gen = model.generate(**inputs, max_new_tokens=64, pad_token_id=tokenizer.eos_token_id)
        answer = tokenizer.decode(gen[0], skip_special_tokens=True).strip()
        gold = row["answer"].strip()

        bleu = sentence_bleu([gold.split()], answer.split())
        rouge_l = scorer.score(gold, answer)['rougeL'].fmeasure
        print(f"[{idx}] BLEU: {bleu}, ROUGE-L: {rouge_l}")

[i] Domain PPL ≈ 2.29
[0] BLEU: 7.884916681118857e-232, ROUGE-L: 0.07792207792207792
[1] BLEU: 8.147480343967206e-232, ROUGE-L: 0.09523809523809523
[2] BLEU: 6.573479617511883e-232, ROUGE-L: 0.0547945205479452
[3] BLEU: 9.641193013181824e-232, ROUGE-L: 0.13513513513513514
[4] BLEU: 0, ROUGE-L: 0.0547945205479452
[5] BLEU: 6.995501686664742e-232, ROUGE-L: 0.08955223880597014
[6] BLEU: 0, ROUGE-L: 0.027777777777777776
[7] BLEU: 0, ROUGE-L: 0.0
[8] BLEU: 8.928691163795855e-232, ROUGE-L: 0.08450704225352113
[9] BLEU: 0, ROUGE-L: 0.0
[10] BLEU: 0, ROUGE-L: 0.08823529411764705
[11] BLEU: 6.784338172413661e-232, ROUGE-L: 0.02666666666666667
[12] BLEU: 6.752107625974243e-232, ROUGE-L: 0.06060606060606061
[13] BLEU: 0, ROUGE-L: 0.06896551724137931
[14] BLEU: 0, ROUGE-L: 0.0
[15] BLEU: 7.919883909890055e-232, ROUGE-L: 0.08955223880597016
[16] BLEU: 6.630398171726777e-232, ROUGE-L: 0.02941176470588235
[17] BLEU: 6.720628411503338e-232, ROUGE-L: 0.08695652173913043
[18] BLEU: 6.466558133769387e-23

In [None]:
#!/usr/bin/env python
"""
LoRA-DAPT Pipeline for Llama-3.1-8B-Instruct on Semiconductor Domain Text
==========================================================================

This single-file script provides three CLI sub-commands:
    1. preprocess - Convert PDFs to clean text, deduplicate, chunk, and build Hugging Face Dataset.
    2. train      - Run QLoRA continued pre-training on the processed dataset.
    3. evaluate   - Quick perplexity and closed-book QA evaluation.

Usage examples
--------------
# 1. Pre-process raw PDFs (stored in data/raw) and create data/processed/train.jsonl
python lora_dapt_pipeline.py preprocess --pdf_dir data/raw --out_dir data/processed --min_tokens 128 --max_tokens 2048 --overlap 256

# 2. Train QLoRA adapter (saved to checkpoints/lora)
python lora_dapt_pipeline.py train --dataset_path data/processed/train.jsonl --output_dir checkpoints/lora --num_epochs 3 --per_device_train_batch 8

# 3. Evaluate perplexity & domain quiz accuracy
python lora_dapt_pipeline.py evaluate --adapter_dir checkpoints/lora --eval_questions data/eval/domain_quiz.csv
"""

import argparse
import json
import os
import re
import textwrap
from pathlib import Path
from typing import List

# --- PDF → text utility (PyMuPDF)
try:
    import fitz  # PyMuPDF
except ImportError:
    fitz = None  # Will error later if preprocess is used

# --- Deduplication
try:
    from text_dedup.minhash import MinHashDeduper
except ImportError:
    MinHashDeduper = None

# --- Hugging Face & PEFT
from datasets import Dataset, load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model
import torch

########################
# 1. PRE-PROCESS STAGE #
########################

def extract_text_from_pdf(pdf_path: Path) -> str:
    """Extract raw text from a single PDF page-by-page via PyMuPDF."""
    if fitz is None:
        raise RuntimeError("PyMuPDF not installed. pip install pymupdf")
    doc = fitz.open(pdf_path)
    text_chunks = []
    for page in doc:
        page_text = page.get_text("text")
        text_chunks.append(page_text)
    raw = "\n".join(text_chunks)
    return raw

SECTION_PATTERNS = [
    re.compile(r"^references$", re.I),
    re.compile(r"^bibliography$", re.I),
    re.compile(r"^acknowledg(e)?ments?$", re.I),
]


def clean_text(text: str) -> str:
    """Remove references/acknowledgment sections & excessive blank lines."""
    lines = [l.strip() for l in text.splitlines()]
    cleaned: List[str] = []
    skip = False
    for ln in lines:
        if any(p.match(ln.lower()) for p in SECTION_PATTERNS):
            skip = True
        if not skip and ln:
            cleaned.append(ln)
    return "\n".join(cleaned)


def chunk_text(text: str, tokenizer, max_tokens: int, overlap: int) -> List[str]:
    """Slice long text into overlapping chunks by token count."""
    tokens = tokenizer(text)["input_ids"]
    chunks = []
    i = 0
    while i < len(tokens):
        chunk_tokens = tokens[i : i + max_tokens]
        chunk_text = tokenizer.decode(chunk_tokens, skip_special_tokens=True)
        if len(chunk_tokens) >= 32:  # minimal meaningful length
            chunks.append(chunk_text)
        i += max_tokens - overlap
    return chunks


def preprocess_cmd(args):
    pdf_dir = Path(args.pdf_dir)
    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    with open("token.txt", "r") as f:
        token = f.read().strip()

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=token)

    raw_records = []
    for pdf_path in pdf_dir.rglob("*.pdf"):
        raw = extract_text_from_pdf(pdf_path)
        cleaned = clean_text(raw)
        raw_records.append({"doc_id": pdf_path.stem, "text": cleaned})
        print(f"[+] Extracted {pdf_path}")

    # Deduplicate
    if MinHashDeduper is None:
        raise RuntimeError("text-dedup not installed. pip install text_dedup")
    texts = [r["text"] for r in raw_records]
    deduper = MinHashDeduper(threshold=0.88)
    uniques = deduper(texts)
    unique_records = [raw_records[i] for i in uniques]
    print(f"[i] Deduplicated: {len(texts)} → {len(unique_records)} docs")

    # Chunking
    all_chunks = []
    for rec in unique_records:
        chunks = chunk_text(rec["text"], tokenizer, args.max_tokens, args.overlap)
        for idx, chunk in enumerate(chunks):
            all_chunks.append({"text": chunk, "source": f"{rec['doc_id']}§{idx}"})

    # Filter by min_tokens
    min_toks = args.min_tokens
    def token_len(example):
        return len(tokenizer(example["text"])["input_ids"])

    all_chunks = [c for c in all_chunks if token_len(c) >= min_toks]
    print(f"[i] Final chunks: {len(all_chunks)}")

    # Write JSONL
    jsonl_path = out_dir / "train.jsonl"
    with jsonl_path.open("w", encoding="utf-8") as f:
        for rec in all_chunks:
            f.write(json.dumps(rec, ensure_ascii=False) + "\n")
    print(f"[✓] Saved {jsonl_path}")

########################
# 2. TRAINING STAGE    #
########################

def train_cmd(args):
    dataset_path = Path(args.dataset_path)
    if not dataset_path.exists():
        raise FileNotFoundError(dataset_path)

    # Load dataset
    dataset = load_dataset("json", data_files=str(dataset_path), split="train")
    
    with open("token.txt", "r") as f:
        token = f.read().strip()

    # Tokenizer & model
    model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True, use_auth_token=token)
    tokenizer.pad_token = tokenizer.eos_token

    # 4-bit QLoRA
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
    )
    model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

    # PEFT config
    lora_cfg = LoraConfig(
        r=args.lora_r,
        lora_alpha=args.lora_alpha,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        lora_dropout=args.lora_dropout,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()

    # Tokenize dataset lazily
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=args.max_seq_len)

    tokenized_ds = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text", "source"])

    data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        num_train_epochs=args.num_epochs,
        per_device_train_batch_size=args.per_device_train_batch,
        gradient_accumulation_steps=args.grad_accum,
        learning_rate=args.lr,
        weight_decay=0.1,
        warmup_steps=args.warmup_steps,
        logging_steps=20,
        save_strategy="epoch",
        bf16=not args.fp16,
        fp16=args.fp16,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()
    trainer.save_model()
    tokenizer.save_pretrained(args.output_dir)
    print("[✓] Training complete – adapter+tokenizer saved.")

########################
# 3. EVALUATION STAGE  #
########################

def perplexity(eval_texts: List[str], model, tokenizer):
    encodings = tokenizer(eval_texts, return_tensors="pt", padding=True)
    max_length = encodings.input_ids.shape[1]
    stride = 512
    ppl_list = []
    for i in range(0, max_length, stride):
        inputs = {k: v[:, i : i + stride].to(model.device) for k, v in encodings.items()}
        with torch.no_grad():
            loss = model(**inputs, labels=inputs["input_ids"]).loss
        ppl_list.append(torch.exp(loss).item())
    return sum(ppl_list) / len(ppl_list)


def evaluate_cmd(args):
    model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16)
    base_model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

    # Load LoRA adapter
    from peft import PeftModel

    model = PeftModel.from_pretrained(base_model, args.adapter_dir)
    model.eval()

    # 3-5 random chunks for perplexity
    eval_ds = load_dataset("json", data_files=str(args.dataset_path), split="train[:1%]")
    sample_texts = [eval_ds[i]["text"] for i in range(min(5, len(eval_ds)))]
    ppl = perplexity(sample_texts, model, tokenizer)
    print(f"[i] Domain PPL ≈ {ppl:.2f}")

    if args.eval_questions:
        import pandas as pd
        df = pd.read_csv(args.eval_questions)
        correct = 0
        for _, row in df.iterrows():
            prompt = textwrap.dedent(
                f"""\
                [INST] {row['question']} [/INST]
                """
            )
            inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
            gen = model.generate(**inputs, max_new_tokens=64)
            answer = tokenizer.decode(gen[0], skip_special_tokens=True)
            if re.search(row["answer_regex"], answer, re.I):
                correct += 1
        print(f"[i] Quiz accuracy: {correct}/{len(df)} ({100*correct/len(df):.1f}%)")

########################
# CLI / Main           #
########################

def main():
    parser = argparse.ArgumentParser(description="LoRA-DAPT pipeline")
    sub = parser.add_subparsers(dest="cmd", required=True)

    # preprocess
    p_pre = sub.add_parser("preprocess")
    p_pre.add_argument("--pdf_dir", default="./pdf", required=True)
    p_pre.add_argument("--out_dir", default="./result",  required=True)
    p_pre.add_argument("--min_tokens", type=int, default=128)
    p_pre.add_argument("--max_tokens", type=int, default=2048)
    p_pre.add_argument("--overlap", type=int, default=256)
    p_pre.set_defaults(func=preprocess_cmd)

    # train
    p_train = sub.add_parser("train")
    p_train.add_argument("--dataset_path", required=True)
    p_train.add_argument("--output_dir", required=True)
    p_train.add_argument("--num_epochs", type=int, default=3)
    p_train.add_argument("--per_device_train_batch", type=int, default=8)
    p_train.add_argument("--grad_accum", type=int, default=4)
    p_train.add_argument("--lr", type=float, default=2e-4)
    p_train.add_argument("--warmup_steps", type=int, default=100)
    p_train.add_argument("--lora_r", type=int, default=64)
    p_train.add_argument("--lora_alpha", type=int, default=128)
    p_train.add_argument("--lora_dropout", type=float, default=0.05)
    p_train.add_argument("--max_seq_len", type=int, default=2048)
    p_train.add_argument("--fp16", action="store_true")
    p_train.set_defaults(func=train_cmd)

    # evaluate
    p_eval = sub.add_parser("evaluate")
    p_eval.add_argument("--adapter_dir", required=True)
    p_eval.add_argument("--dataset_path", required=True)
    p_eval.add_argument("--eval_questions", help="CSV with columns 'question','answer_regex'")
    p_eval.set_defaults(func=evaluate_cmd)

    args = parser.parse_args()
    args.func(args)

if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] {preprocess} ...
ipykernel_launcher.py: error: the following arguments are required: cmd


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
