In [None]:
# Mount Google Drive at /content/drive; later cells read the CSV files from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# Read the training CSV straight from Drive and display its first rows as a sanity check.
df = pd.read_csv('/content/drive/MyDrive/AI_Human.csv')
df.head()

In [None]:
# Copy both CSV files from Drive to /content/, where the later cells expect them.
!cp '/content/drive/MyDrive/AI_Human.csv' "/content/" # training dataset
!cp '/content/drive/MyDrive/daigt.csv' "/content/" # test dataset

### 📦 Install / Imports & helpers

In [None]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,4,5,6,7"

import torch

# Use CUDA when PyTorch can see a GPU, otherwise fall back to the CPU,
# and report which of the two was picked.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print("사용 중인 GPU 이름:", torch.cuda.get_device_name(0))
else:
    print("GPU를 사용할 수 없습니다.")

In [None]:
import os, datetime as dt, json, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast, RobertaForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback, TrainerCallback,
)
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm
from datasets import disable_caching

# One shared seed for Python's `random`, NumPy and PyTorch.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

### 하이퍼파라미터 튜닝을 위한 조합 실험 (1) 데이터 로드

In [None]:
# Load the raw CSV, keep only the text/label columns and drop exact duplicates
# (compared on lower-cased, whitespace-stripped text).
RAW_PATH = "/content/AI_Human.csv"
assert os.path.exists(RAW_PATH), f"{RAW_PATH} not found!"

df_raw = (
    pd.read_csv(RAW_PATH, usecols=["Generation", "label"])
      .dropna(subset=["Generation"])
      .rename(columns={"Generation": "text"})
      .assign(
          # normalised copy of the text, used only as the de-duplication key
          text_norm=lambda frame: frame["text"].str.lower().str.strip(),
          label=lambda frame: frame["label"].astype(int),
      )
)

before = len(df_raw)
after = df_raw["text_norm"].nunique()
df_raw = df_raw.drop_duplicates(subset="text_norm")
print(f"Removed {before - after:,} exact duplicate rows.")

### 하이퍼파라미터 튜닝을 위한 조합 실험 (2) 실험 시작

In [None]:
# 1) Sub-sample 10% of the de-duplicated data to keep tuning runs short
SEED = 42
df_small = df_raw.sample(random_state=SEED, frac=0.10)

# 2) Train/validation split (8:2), stratified on the label
train_df_small, val_df_small = train_test_split(
    df_small, stratify=df_small["label"], test_size=0.2, random_state=SEED
)

In [None]:
# 3) Tokenizer plus the length budget read by head_tail_tokenize:
#    sequences longer than MAX_LEN keep a HEAD-sized leading slice and a TAIL-sized trailing slice.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

MAX_LEN, HEAD, TAIL = 512, 256, 254

def head_tail_tokenize(batch):
    """Tokenize a batch of texts, keeping the head and tail of over-long ones.

    Args:
        batch: mapping with a "text" entry holding an iterable of strings.

    Returns:
        dict with "input_ids" and "attention_mask", one list per input text.
        Texts that encode to more than MAX_LEN ids are reduced to the first
        HEAD + 1 ids followed by the last TAIL ids, capped at MAX_LEN.
        No padding is applied; every attention-mask entry is 1.
    """
    def _shorten(token_ids):
        if len(token_ids) <= MAX_LEN:
            return token_ids
        return (token_ids[: HEAD + 1] + token_ids[-TAIL:])[:MAX_LEN]

    all_ids = [
        _shorten(tokenizer.encode(doc, add_special_tokens=True, truncation=False))
        for doc in batch["text"]
    ]
    return {
        "input_ids": all_ids,
        "attention_mask": [[1] * len(seq) for seq in all_ids],
    }

# 4) Convert the pandas splits to HF Datasets and tokenize them
disable_caching()

# Same map() options for both splits; the raw "text" column is dropped after tokenizing.
_map_kwargs = dict(batched=True, remove_columns=["text"], num_proc=1)
train_ds_small = Dataset.from_pandas(train_df_small[["text", "label"]]).map(
    head_tail_tokenize, **_map_kwargs
)
val_ds_small = Dataset.from_pandas(val_df_small[["text", "label"]]).map(
    head_tail_tokenize, **_map_kwargs
)

# 5) Collator (the tokenize step above leaves sequences unpadded)
data_collator = DataCollatorWithPadding(tokenizer)

In [None]:
# Automated tuning: the hyper-parameter combinations to try.
# Each entry is (run name, learning rate, weight decay).
configs = [
    {"name": run_name, "lr": learning_rate, "wd": weight_decay}
    for run_name, learning_rate, weight_decay in (
        ("baseline",     2e-5, 0.01),
        ("high_lr",      3e-5, 0.01),
        ("strong_decay", 2e-5, 0.1),
    )
]

# Evaluation metrics
def compute_metrics(pred):
    """Score predictions with accuracy, precision, recall and F1.

    Args:
        pred: object exposing `label_ids` (gold labels) and `predictions`
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    gold, guess = pred.label_ids, pred.predictions.argmax(-1)
    scorers = {
        "accuracy":  accuracy_score,
        "precision": precision_score,
        "recall":    recall_score,
        "f1":        f1_score,
    }
    return {metric: scorer(gold, guess) for metric, scorer in scorers.items()}

# Logging callback
class LogCallback(TrainerCallback):
    """Print a compact, timestamped one-line summary for each Trainer log event.

    Only the watched keys present in `logs` are printed: "loss", "eval_loss",
    "eval_accuracy", "eval_f1" (4 decimals) and "learning_rate" (shown as "lr",
    6 decimals).
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the watched entries of `logs`, prefixed with wall-clock time and global step.

        Nothing is printed when `logs` is empty or holds none of the watched keys.
        """
        if not logs:
            return
        now = dt.datetime.now().strftime("%H:%M:%S")
        parts = [
            f"{key}: {logs[key]:.4f}"
            for key in ("loss", "eval_loss", "eval_accuracy", "eval_f1")
            if logs.get(key) is not None
        ]
        # No 0.0 default here: a log event without "learning_rate" would
        # otherwise be reported as a misleading "lr: 0.000000".
        lr = logs.get("learning_rate")
        if lr is not None:
            parts.append(f"lr: {lr:.6f}")
        if parts:
            print(f"[{now}] step {state.global_step} | {' | '.join(parts)}")


# Run one fine-tuning experiment per configuration and keep a summary row for each,
# so the configurations can be compared without scrolling through the logs.
tuning_results = []
for cfg in configs:
    print(f"\n🚀 실험 시작: {cfg['name']}")

    # Unique id so every run gets its own checkpoint and TensorBoard directory
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"{cfg['name']}_lr{cfg['lr']}_wd{cfg['wd']}_{timestamp}"

    # Re-seed before the model is built. roberta-base has no classification head,
    # so from_pretrained initialises one randomly; without this each config would
    # start from different head weights, because the RNG state carries over from
    # the previous run, and the comparison would mix in initialisation noise.
    random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

    # Fresh model for this run
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2,
    ).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # Training settings; only learning rate and weight decay vary between runs
    training_args = TrainingArguments(
        output_dir=f"/content/tune_outputs/{run_id}",
        per_device_train_batch_size=56,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=cfg['lr'],
        warmup_ratio=0.1,           # warm up over the first 10% of training steps
        lr_scheduler_type="linear",
        weight_decay=cfg['wd'],
        fp16=torch.cuda.is_available(),
        label_smoothing_factor=0.1,
        eval_strategy="epoch", save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f"/content/logs/{run_id}",
        logging_steps=10, logging_first_step=True,
        save_total_limit=1,
        run_name=run_id,
        report_to=["tensorboard"],
        ddp_find_unused_parameters=False,
    )

    # Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_small,
        eval_dataset=val_ds_small,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2), LogCallback()],
    )

    # Train
    trainer.train()

    # best_metric follows metric_for_best_model ("f1") on the validation split
    tuning_results.append({
        "name": cfg["name"],
        "lr": cfg["lr"],
        "wd": cfg["wd"],
        "best_f1": trainer.state.best_metric,
        "best_checkpoint": trainer.state.best_model_checkpoint,
    })

# Side-by-side comparison of all configurations
print(pd.DataFrame(tuning_results).to_string(index=False))

### 🧹 Load & clean raw data (dedup)

In [None]:
# Load the raw CSV for the final training run and drop exact duplicates.
RAW_PATH1 = "/content/AI_Human.csv"
assert os.path.exists(RAW_PATH1), f"{RAW_PATH1} not found!"

df_raw1 = pd.read_csv(RAW_PATH1, usecols=["Generation", "label"])
df_raw1 = df_raw1.dropna(subset=["Generation"]).rename(columns={"Generation": "text"})

# Lower-cased, stripped copy of the text: the key for de-duplication and for the split groups
df_raw1["text_norm"] = df_raw1["text"].str.lower().str.strip()
df_raw1["label"] = df_raw1["label"].astype(int)

before = len(df_raw1)
after = df_raw1["text_norm"].nunique()
df_raw1 = df_raw1.drop_duplicates(subset="text_norm")
print(f"Removed {before - after:,} exact duplicate rows.")

### ✂️ Split or load cached splits

In [None]:
# Train/val/test splits are stored as parquet files so that a re-run reuses
# the existing partition instead of drawing a new one.
CACHE_DIR = "splits_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
paths = {
    "train": f"{CACHE_DIR}/train.parquet",
    "val":   f"{CACHE_DIR}/val.parquet",
    "test":  f"{CACHE_DIR}/test.parquet",
}

if not all(map(os.path.exists, paths.values())):
    print("⚙️  Creating new splits.")
    # Stage 1: 80% train vs. 20% held-out, grouped on the normalised text
    gss1 = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=SEED)
    train_idx, temp_idx = next(gss1.split(df_raw1, groups=df_raw1["text_norm"]))
    train_df, temp_df = df_raw1.iloc[train_idx], df_raw1.iloc[temp_idx]

    # Stage 2: split the held-out part in half into validation and test
    gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=SEED)
    val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["text_norm"]))
    val_df, test_df = temp_df.iloc[val_idx], temp_df.iloc[test_idx]

    train_df.to_parquet(paths["train"])
    val_df.to_parquet(paths["val"])
    test_df.to_parquet(paths["test"])
    print("Splits saved to 'splits_cache/'.")
else:
    print("📂 Cached splits found – loading.")
    train_df = pd.read_parquet(paths["train"])
    val_df   = pd.read_parquet(paths["val"])
    test_df  = pd.read_parquet(paths["test"])

# Leakage check: normalised texts present in both train and validation
overlap = set(train_df["text_norm"]).intersection(val_df["text_norm"])
print("train ∩ val duplicates:", len(overlap))

### 🔠 Tokenize & build HF Datasets

In [None]:
# Tokenizer used only for the token-length statistics below
tok = RobertaTokenizerFast.from_pretrained("roberta-base")

# Use df_raw1, the frame loaded by this pipeline's own "Load & clean raw data" cell.
# The original read df_raw, which only exists if the hyper-parameter tuning cells ran first.
ds = Dataset.from_pandas(df_raw1[["text"]], preserve_index=False)

def add_len(batch):
    """Add a "tok_len" entry holding the token count of every text in the batch.

    Args:
        batch: mapping with a "text" entry; it is modified in place.

    Returns:
        The same `batch` object, now with "tok_len" (one int per text,
        counted with special tokens included).
    """
    encoded = tok(batch["text"], add_special_tokens=True)
    batch["tok_len"] = list(map(len, encoded["input_ids"]))
    return batch

disable_caching()

# Attach the token count of every text, then summarise the distribution.
ds = ds.map(
    add_len,
    batched=True,
    batch_size=1024,
    num_proc=1,
    desc="Adding token lengths",
)
lengths = ds["tok_len"]

pct = np.percentile(lengths, [50, 90, 95, 99])
print("median / p90 / p95 / p99 =", pct)
print("max =", max(lengths))

In [None]:
# Tokenizer for the final run, plus the head+tail truncation budget read by head_tail_tokenize
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

MAX_LEN = 512              # sequences longer than this are shortened
HEAD, TAIL = 256, 254      # sizes of the leading / trailing slices that are kept

def head_tail_tokenize(batch):
    """Encode every text in `batch["text"]`, keeping head and tail of over-long ones.

    A text whose encoding exceeds MAX_LEN ids is replaced by its first
    HEAD + 1 ids followed by its last TAIL ids, capped at MAX_LEN.

    Returns:
        dict with "input_ids" and an all-ones "attention_mask" of matching
        lengths (no padding is applied here).
    """
    input_ids, attention_mask = [], []
    for document in batch["text"]:
        token_ids = tokenizer.encode(document, add_special_tokens=True, truncation=False)
        if len(token_ids) > MAX_LEN:
            # token_ids[0] is <s> and token_ids[-1] is </s>, so both slices keep their special token
            head_part = token_ids[: HEAD + 1]
            tail_part = token_ids[-TAIL:]
            token_ids = (head_part + tail_part)[:MAX_LEN]
        input_ids.append(token_ids)
        attention_mask.append([1] * len(token_ids))
    return {"input_ids": input_ids, "attention_mask": attention_mask}

# Convert each split to an HF Dataset and tokenize it; the raw "text" column is dropped.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(split_df[["text", "label"]]).map(
        head_tail_tokenize, batched=True, remove_columns=["text"], num_proc=1
    )
    for split_df in (train_df, val_df, test_df)
)

# The tokenize step leaves sequences unpadded; the collator is built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

### 🏗️ Build model (RoBERTa)

In [None]:
# Binary classifier on top of roberta-base, with both dropout rates set to 0.2.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=2,
    hidden_dropout_prob=0.2,
    attention_probs_dropout_prob=0.2,
)
model = model.to(device)

### ⚙️ TrainingArguments (튜닝된 하이퍼파라미터)

In [None]:
training_args = TrainingArguments(
    output_dir="/content/roberta-output",
    # Batching: 56 samples x 4 accumulation steps per device and optimizer step
    per_device_train_batch_size=56,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    # Optimisation schedule
    num_train_epochs=2,           # 3 -> 2
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    weight_decay=0.1,             # 0.01 -> 0.1
    fp16=torch.cuda.is_available(),
    label_smoothing_factor=0.1,
    # Evaluate and checkpoint once per epoch; reload the best checkpoint by F1 at the end
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=1,
    # Logging
    logging_dir="./logs",
    logging_steps=10,
    logging_first_step=True,
    run_name="roberta-ai-vs-human",
    report_to=["tensorboard"],
    ddp_find_unused_parameters=False,
)

### 🚂 Trainer & train

In [None]:
# Logging callback. The learning rate is read from the `logs` payload;
# an earlier variant that looked it up on the optimizer kwarg was dropped.
class LogCallback(TrainerCallback):
    """Print a timestamped one-line summary of selected values for each Trainer log event.

    Watched keys: "loss", "eval_loss", "eval_accuracy", "eval_f1" (4 decimals)
    and "learning_rate" (printed as "lr" with 6 decimals). Keys missing from
    a log event are left out of the line.
    """

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Print the watched entries of `logs`; stay silent when none are present."""
        if not logs:
            return
        now = dt.datetime.now().strftime("%H:%M:%S")
        watched = {
            "loss": logs.get("loss"),
            "eval_loss": logs.get("eval_loss"),
            "eval_accuracy": logs.get("eval_accuracy"),
            "eval_f1": logs.get("eval_f1"),
            # No 0.0 fallback: it made log events without a learning rate
            # print a misleading "lr: 0.000000".
            "lr": logs.get("learning_rate"),
        }
        fields = [
            f"{k}: {v:.6f}" if k == "lr" else f"{k}: {v:.4f}"
            for k, v in watched.items()
            if v is not None
        ]
        if fields:
            print(f"[{now}] step {state.global_step} | {' | '.join(fields)}")

def compute_metrics(pred):
    """Return accuracy, precision, recall and F1 for a (scores, labels) pair.

    Args:
        pred: unpackable into (scores, labels); the arg-max of `scores`
            over the last axis is taken as the predicted class.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, gold = pred
    guess = scores.argmax(-1)
    return {
        "accuracy":  accuracy_score(gold, guess),
        "precision": precision_score(gold, guess),
        "recall":    recall_score(gold, guess),
        "f1":        f1_score(gold, guess),
    }

# Stop early after 2 evaluations without improvement; LogCallback prints one line per log event.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=2),
        LogCallback(),
    ],
)
trainer.train()

### 🧪 Evaluate on test set

In [None]:
# Evaluate on the held-out test split and print the metrics.
test_metrics = trainer.evaluate(test_ds)
print("📊 Test metrics:", test_metrics)

In [None]:
# 1) Load the model/tokenizer from disk
MODEL_DIR = "/content/roberta-ai-vs-human"
if not os.path.isdir(MODEL_DIR):
    # No visible cell writes MODEL_DIR (training checkpoints go to
    # /content/roberta-output), so on a fresh runtime from_pretrained(MODEL_DIR)
    # would fail. Export the fine-tuned model held by the training `trainer` first.
    # An existing MODEL_DIR is left untouched and loaded as before.
    trainer.save_model(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_DIR)
model     = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 2) Load and pre-process the DAIGT data: rename "generated" to "label",
#    drop rows without text, strip the text and cast the labels to int.
DAIGT_PATH = "/content/daigt.csv"
df = (
    pd.read_csv(DAIGT_PATH, usecols=["text", "generated"])
      .rename(columns={"generated": "label"})
      .dropna(subset=["text"])
      .assign(
          text=lambda frame: frame["text"].astype(str).str.strip(),
          label=lambda frame: frame["label"].astype(int),
      )
)

# 3) Head-tail tokenization
MAX_LEN, HEAD, TAIL = 512, 256, 254
def ht_tokenize(batch):
    """Tokenize `batch["text"]`, keeping the head and tail of over-long texts.

    A text whose encoding exceeds MAX_LEN ids is reduced to its first HEAD + 1
    ids plus its last TAIL ids, then capped at MAX_LEN.

    Returns:
        dict with "input_ids" and an all-ones "attention_mask" of matching
        lengths (no padding is applied here).
    """
    ids_all, attn_all = [], []
    for txt in batch["text"]:
        ids = tokenizer.encode(txt, add_special_tokens=True, truncation=False)
        if len(ids) > MAX_LEN:
            ids = ids[:HEAD+1] + ids[-TAIL:]
            # Same cap as head_tail_tokenize: the output can never exceed MAX_LEN,
            # even if HEAD/TAIL are later changed so that HEAD + 1 + TAIL > MAX_LEN.
            ids = ids[:MAX_LEN]
        ids_all.append(ids)
        attn_all.append([1]*len(ids))
    return {"input_ids": ids_all, "attention_mask": attn_all}

# Never start more tokenization workers than there are CPUs. The original
# hard-coded 20, which only oversubscribes on smaller machines; hosts with
# 20 or more CPUs still get 20.
n_workers = min(20, os.cpu_count() or 1)
ds_test = Dataset.from_pandas(df[["text","label"]]).map(
    ht_tokenize, batched=True, batch_size=1024,
    num_proc=n_workers, remove_columns=["text"], desc="Tokenizing(DAIGT)"
)
data_collator = DataCollatorWithPadding(tokenizer)

# 4) Trainer used for testing only
def metrics(p):
    """Compute accuracy, precision, recall and F1 for the DAIGT evaluation.

    Args:
        p: object exposing `label_ids` and `predictions`; the arg-max of
            `predictions` over the last axis is the predicted class.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    gold = p.label_ids
    guess = p.predictions.argmax(-1)
    named_scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {label: score_fn(gold, guess) for label, score_fn in named_scorers}

# Evaluation-only Trainer (no train/eval datasets are attached).
# NOTE(review): this rebinds the notebook-level name `trainer`, so a later
# `trainer.save_model(...)` saves the model loaded in this cell. Confirm that is intended.
eval_args = TrainingArguments(
    output_dir="tmp-eval",
    per_device_eval_batch_size=32,
    dataloader_drop_last=False,
    seed=42,
)
trainer = Trainer(
    model=model,
    args=eval_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics,
)

# 5) Run the test and print every "eval_*" entry (floats with 4 decimals)
results = trainer.evaluate(ds_test)
print("\n📊 DAIGT Test metrics")
for key, value in results.items():
    if not key.startswith("eval_"):
        continue
    if isinstance(value, float):
        print(f"{key:12s}: {value:.4f}")
    else:
        print(f"{key}: {value}")

### 💾 Save model/tokenizer

In [None]:
# Persist model and tokenizer to Google Drive.
# NOTE(review): `trainer` and `tokenizer` are whatever the most recently run cell bound them to.
# The DAIGT evaluation cell rebinds both, so confirm the intended model is the one saved.
SAVE_PATH = "/content/drive/MyDrive/final-tunned-roberta-ai-vs-human"
for persist in (trainer.save_model, tokenizer.save_pretrained):
    persist(SAVE_PATH)
print(f"Model & tokenizer saved to '{SAVE_PATH}'.")