In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/AI_Human.csv')
df.head()

Unnamed: 0,text,label,text_length
0,Cars. Cars have been around since they became ...,0.0,3289
1,Transportation is a large necessity in most co...,0.0,2738
2,"""America's love affair with it's vehicles seem...",0.0,4428
3,How often do you ride in a car? Do you drive a...,0.0,4013
4,Cars are a wonderful thing. They are perhaps o...,0.0,4698


In [3]:
!cp '/content/drive/MyDrive/AI_Human.csv' "/content/" # 학습용 데이터셋
!cp '/content/drive/MyDrive/daigt.csv' "/content/" # 테스트용 데이터셋

### 📦 Install / Imports & helpers

In [4]:
# import os
# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,4,5,6,7"

import torch

# GPU 사용 가능한지 확인
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("사용 중인 GPU 이름:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("GPU를 사용할 수 없습니다.")

사용 중인 GPU 이름: NVIDIA A100-SXM4-40GB


In [5]:
import os, datetime as dt, json, random, numpy as np, pandas as pd, torch
from datasets import Dataset
from transformers import (
    RobertaTokenizerFast, RobertaForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding,
    EarlyStoppingCallback, TrainerCallback,
)
from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.auto import tqdm
from datasets import disable_caching

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

<torch._C.Generator at 0x7975bd310530>

### 하이퍼파라미터 튜닝을 위한 조합 실험 (1) 데이터 로드

In [6]:
RAW_PATH = "/content/AI_Human.csv"
assert os.path.exists(RAW_PATH), f"{RAW_PATH} not found!"

df_raw = (
    pd.read_csv(RAW_PATH, usecols=["Generation", "label"])
      .dropna(subset=["Generation"])
      .rename(columns={"Generation": "text"})
)

df_raw["text_norm"] = df_raw["text"].str.lower().str.strip()
df_raw["label"] = df_raw["label"].astype(int)

before, after = len(df_raw), df_raw["text_norm"].nunique()
df_raw = df_raw.drop_duplicates(subset="text_norm")
print(f"Removed {before - after:,} exact duplicate rows.")

Removed 22,432 exact duplicate rows.


### 하이퍼파라미터 튜닝을 위한 조합 실험 (2) 실험 시작

In [7]:
# 1) 샘플링 (전체 10%)
SEED = 42
df_small = df_raw.sample(frac=0.10, random_state=SEED)

# 2) train/val 분할 (8:2)
train_df_small, val_df_small = train_test_split(
    df_small,
    test_size=0.2,
    stratify=df_small["label"],
    random_state=SEED,
)

In [8]:
# 3) Tokenizer 설정
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

MAX_LEN = 512
HEAD    = 256
TAIL    = 254

def head_tail_tokenize(batch):
    encodings = {"input_ids": [], "attention_mask": []}
    for text in batch["text"]:
        ids = tokenizer.encode(text, add_special_tokens=True, truncation=False)
        if len(ids) > MAX_LEN:
            ids = ids[: HEAD + 1] + ids[-TAIL:]
            ids = ids[:MAX_LEN]
        attn = [1] * len(ids)
        encodings["input_ids"].append(ids)
        encodings["attention_mask"].append(attn)
    return encodings

# 4) Dataset 변환 및 토크나이징
disable_caching()

train_ds_small = Dataset.from_pandas(train_df_small[["text", "label"]]).map(
    head_tail_tokenize, batched=True, remove_columns=["text"], num_proc=1
)
val_ds_small = Dataset.from_pandas(val_df_small[["text", "label"]]).map(
    head_tail_tokenize, batched=True, remove_columns=["text"], num_proc=1
)

# 5) Collator
data_collator = DataCollatorWithPadding(tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Map:   0%|          | 0/37182 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/9296 [00:00<?, ? examples/s]

In [11]:
# 튜닝 자동화
# 튜닝 대상 조합 정의
configs = [
    {"name": "baseline",       "lr": 2e-5, "wd": 0.01},
    {"name": "high_lr",        "lr": 3e-5, "wd": 0.01},
    {"name": "strong_decay",   "lr": 2e-5, "wd": 0.1},
]

# 성능 지표
def compute_metrics(pred):
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy":  accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "recall":    recall_score(y_true, y_pred),
        "f1":        f1_score(y_true, y_pred),
    }

# 로그 콜백
class LogCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            now = dt.datetime.now().strftime("%H:%M:%S")
            # 안정적으로 logs에서 가져오기
            lr = logs.get("learning_rate", 0.0)
            watched = {
                "loss": logs.get("loss"),
                "eval_loss": logs.get("eval_loss"),
                "eval_accuracy": logs.get("eval_accuracy"),
                "eval_f1": logs.get("eval_f1"),
                "lr": lr,
            }
            # msg = " | ".join(f"{k}: {v:.4f}" for k,v in watched.items() if v is not None)
            msg = " | ".join(f"{k}: {v:.6f}" if k == "lr" else f"{k}: {v:.4f}" for k,v in watched.items() if v is not None)
            print(f"[{now}] step {state.global_step} | {msg}")


# 실험 반복
for cfg in configs:
    print(f"\n🚀 실험 시작: {cfg['name']}")

    # 고유 로그/모델 저장 경로
    timestamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"{cfg['name']}_lr{cfg['lr']}_wd{cfg['wd']}_{timestamp}"

    # 모델 초기화
    model = RobertaForSequenceClassification.from_pretrained(
        "roberta-base",
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2,
    ).to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # 학습 설정
    training_args = TrainingArguments(
        output_dir=f"/content/tune_outputs/{run_id}",
        per_device_train_batch_size=56,
        per_device_eval_batch_size=16,
        gradient_accumulation_steps=4,
        num_train_epochs=3,
        learning_rate=cfg['lr'],
        warmup_ratio=0.1,           # 전체 학습 스텝 중 10%를 warmup
        lr_scheduler_type="linear",
        weight_decay=cfg['wd'],
        fp16=torch.cuda.is_available(),
        label_smoothing_factor=0.1,
        eval_strategy="epoch", save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir=f"/content/logs/{run_id}",
        logging_steps=10, logging_first_step=True,
        save_total_limit=1,
        run_name=run_id,
        report_to=["tensorboard"],
        ddp_find_unused_parameters=False,
    )

    # Trainer 구성
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds_small,
        eval_dataset=val_ds_small,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2), LogCallback()],
    )

    # 학습 실행
    trainer.train()


🚀 실험 시작: baseline


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2052,0.538811,0.842513,0.71112,0.998888,0.830791
2,0.2061,0.331399,0.938145,0.863429,0.998054,0.925873
3,0.2012,0.549722,0.84488,0.713974,0.999722,0.833025


[11:36:44] step 1 | loss: 0.7094 | lr: 0.000000
[11:36:52] step 10 | loss: 0.7002 | lr: 0.000004
[11:37:00] step 20 | loss: 0.6699 | lr: 0.000008
[11:37:08] step 30 | loss: 0.6320 | lr: 0.000011
[11:37:17] step 40 | loss: 0.4689 | lr: 0.000015
[11:37:25] step 50 | loss: 0.3058 | lr: 0.000019
[11:37:34] step 60 | loss: 0.3006 | lr: 0.000020
[11:37:42] step 70 | loss: 0.2541 | lr: 0.000019
[11:37:51] step 80 | loss: 0.2268 | lr: 0.000019
[11:37:59] step 90 | loss: 0.2217 | lr: 0.000018
[11:38:08] step 100 | loss: 0.2162 | lr: 0.000018
[11:38:16] step 110 | loss: 0.2181 | lr: 0.000017
[11:38:25] step 120 | loss: 0.2157 | lr: 0.000017
[11:38:33] step 130 | loss: 0.2171 | lr: 0.000017
[11:38:42] step 140 | loss: 0.2140 | lr: 0.000016
[11:38:50] step 150 | loss: 0.2180 | lr: 0.000016
[11:38:59] step 160 | loss: 0.2052 | lr: 0.000015
[11:39:19] step 166 | eval_loss: 0.5388 | eval_accuracy: 0.8425 | eval_f1: 0.8308 | lr: 0.000000
[11:39:25] step 170 | loss: 0.2122 | lr: 0.000015
[11:39:34] ste

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2057,0.481334,0.873924,0.754618,0.999166,0.859842
2,0.2066,0.301098,0.94815,0.882801,0.99861,0.937141
3,0.2011,0.453272,0.892104,0.782249,0.999444,0.877608


[11:44:42] step 1 | loss: 0.7388 | lr: 0.000000
[11:44:50] step 10 | loss: 0.7237 | lr: 0.000005
[11:44:58] step 20 | loss: 0.6630 | lr: 0.000011
[11:45:07] step 30 | loss: 0.5423 | lr: 0.000017
[11:45:15] step 40 | loss: 0.3776 | lr: 0.000023
[11:45:24] step 50 | loss: 0.2974 | lr: 0.000029
[11:45:32] step 60 | loss: 0.2435 | lr: 0.000029
[11:45:41] step 70 | loss: 0.2948 | lr: 0.000029
[11:45:49] step 80 | loss: 0.3215 | lr: 0.000028
[11:45:58] step 90 | loss: 0.2830 | lr: 0.000027
[11:46:06] step 100 | loss: 0.2451 | lr: 0.000027
[11:46:14] step 110 | loss: 0.2261 | lr: 0.000026
[11:46:23] step 120 | loss: 0.2137 | lr: 0.000025
[11:46:31] step 130 | loss: 0.2186 | lr: 0.000025
[11:46:40] step 140 | loss: 0.2198 | lr: 0.000024
[11:46:48] step 150 | loss: 0.2150 | lr: 0.000023
[11:46:57] step 160 | loss: 0.2057 | lr: 0.000023
[11:47:18] step 166 | eval_loss: 0.4813 | eval_accuracy: 0.8739 | eval_f1: 0.8598 | lr: 0.000000
[11:47:23] step 170 | loss: 0.2159 | lr: 0.000022
[11:47:32] ste

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2135,0.452145,0.872633,0.752934,0.99861,0.858542
2,0.2067,0.290719,0.955142,0.897327,0.998332,0.945139
3,0.2016,0.39651,0.914479,0.819467,0.999166,0.900438


[11:52:40] step 1 | loss: 0.7388 | lr: 0.000000
[11:52:48] step 10 | loss: 0.7275 | lr: 0.000004
[11:52:56] step 20 | loss: 0.6835 | lr: 0.000008
[11:53:05] step 30 | loss: 0.6048 | lr: 0.000012
[11:53:13] step 40 | loss: 0.4429 | lr: 0.000016
[11:53:22] step 50 | loss: 0.3449 | lr: 0.000020
[11:53:30] step 60 | loss: 0.2463 | lr: 0.000020
[11:53:39] step 70 | loss: 0.2457 | lr: 0.000019
[11:53:47] step 80 | loss: 0.2253 | lr: 0.000019
[11:53:56] step 90 | loss: 0.2136 | lr: 0.000018
[11:54:04] step 100 | loss: 0.2166 | lr: 0.000018
[11:54:13] step 110 | loss: 0.2121 | lr: 0.000017
[11:54:21] step 120 | loss: 0.2159 | lr: 0.000017
[11:54:30] step 130 | loss: 0.2200 | lr: 0.000017
[11:54:38] step 140 | loss: 0.2157 | lr: 0.000016
[11:54:47] step 150 | loss: 0.2150 | lr: 0.000016
[11:54:55] step 160 | loss: 0.2135 | lr: 0.000015
[11:55:16] step 166 | eval_loss: 0.4521 | eval_accuracy: 0.8726 | eval_f1: 0.8585 | lr: 0.000000
[11:55:22] step 170 | loss: 0.2137 | lr: 0.000015
[11:55:30] ste

### 🧹 Load & clean raw data (dedup)

In [12]:
RAW_PATH1 = "/content/AI_Human.csv"
assert os.path.exists(RAW_PATH1), f"{RAW_PATH1} not found!"

df_raw1 = (
    pd.read_csv(RAW_PATH1, usecols=["Generation", "label"])
      .dropna(subset=["Generation"])
      .rename(columns={"Generation": "text"})
)

df_raw1["text_norm"] = df_raw1["text"].str.lower().str.strip()
df_raw1["label"] = df_raw1["label"].astype(int)

before, after = len(df_raw1), df_raw1["text_norm"].nunique()
df_raw1 = df_raw1.drop_duplicates(subset="text_norm")
print(f"Removed {before - after:,} exact duplicate rows.")

Removed 22,432 exact duplicate rows.


### ✂️ Split or load cached splits

In [13]:
CACHE_DIR = "splits_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
paths = {name: f"{CACHE_DIR}/{name}.parquet" for name in ["train", "val", "test"]}

if all(os.path.exists(p) for p in paths.values()):
    print("📂 Cached splits found – loading.")
    train_df = pd.read_parquet(paths["train"])
    val_df   = pd.read_parquet(paths["val"])
    test_df  = pd.read_parquet(paths["test"])
else:
    print("⚙️  Creating new splits.")
    gss1 = GroupShuffleSplit(train_size=0.8, random_state=SEED, n_splits=1)
    train_idx, temp_idx = next(gss1.split(df_raw1, groups=df_raw1["text_norm"]))
    train_df = df_raw1.iloc[train_idx]
    temp_df  = df_raw1.iloc[temp_idx]

    gss2 = GroupShuffleSplit(train_size=0.5, random_state=SEED, n_splits=1)
    val_idx, test_idx = next(gss2.split(temp_df, groups=temp_df["text_norm"]))
    val_df  = temp_df.iloc[val_idx]
    test_df = temp_df.iloc[test_idx]

    train_df.to_parquet(paths["train"])
    val_df.to_parquet(paths["val"])
    test_df.to_parquet(paths["test"])
    print("Splits saved to 'splits_cache/'.")

overlap = set(train_df["text_norm"]) & set(val_df["text_norm"])
print("train ∩ val duplicates:", len(overlap))

⚙️  Creating new splits.
Splits saved to 'splits_cache/'.
train ∩ val duplicates: 0


### 🔠 Tokenize & build HF Datasets

In [14]:
tok = RobertaTokenizerFast.from_pretrained("roberta-base")

ds = Dataset.from_pandas(df_raw[["text"]], preserve_index=False)

def add_len(batch):
    batch["tok_len"] = [len(t) for t in tok(batch["text"], add_special_tokens=True)["input_ids"]]
    return batch

disable_caching()

ds = ds.map(add_len, batched=True, batch_size=1024, num_proc=1, desc="Adding token lengths")
lengths = ds["tok_len"]

pct = np.percentile(lengths, [50, 90, 95, 99])
print("median / p90 / p95 / p99 =", pct)
print("max =", max(lengths))

Adding token lengths:   0%|          | 0/464782 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (697 > 512). Running this sequence through the model will result in indexing errors


median / p90 / p95 / p99 = [ 439.  732.  854. 1121.]
max = 5121


In [15]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

MAX_LEN = 512
HEAD    = 256
TAIL    = 254

def head_tail_tokenize(batch):
    encodings = {"input_ids": [], "attention_mask": []}
    for text in batch["text"]:
        ids = tokenizer.encode(text, add_special_tokens=True, truncation=False)
        if len(ids) > MAX_LEN:
            # ids[0] : <s>, ids[-1] : </s>
            new_ids = ids[: HEAD + 1] + ids[-TAIL:]
            ids = new_ids[:MAX_LEN]
        attn = [1] * len(ids)
        encodings["input_ids"].append(ids)
        encodings["attention_mask"].append(attn)
    return encodings

train_ds = Dataset.from_pandas(train_df[["text", "label"]]).map(
    head_tail_tokenize, batched=True, remove_columns=["text"], num_proc = 1
)
val_ds = Dataset.from_pandas(val_df[["text", "label"]]).map(
    head_tail_tokenize, batched=True, remove_columns=["text"], num_proc = 1
)
test_ds = Dataset.from_pandas(test_df[["text", "label"]]).map(
    head_tail_tokenize, batched=True, remove_columns=["text"], num_proc = 1
)

data_collator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/371825 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (898 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/46478 [00:00<?, ? examples/s]

Map:   0%|          | 0/46479 [00:00<?, ? examples/s]

### 🏗️ Build model (RoBERTa)

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base", num_labels=2,
    hidden_dropout_prob=0.2, attention_probs_dropout_prob=0.2,
).to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### ⚙️ TrainingArguments (튜닝된 하이퍼파라미터)

In [17]:
training_args = TrainingArguments(
    output_dir="/content/roberta-output",
    per_device_train_batch_size=56,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=4,
    num_train_epochs=2,           # 3 -> 2
    learning_rate=2e-5,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    weight_decay=0.1,             # 0.01 -> 0.1
    fp16=torch.cuda.is_available(),
    label_smoothing_factor=0.1,
    eval_strategy="epoch", save_strategy="epoch",
    load_best_model_at_end=True, metric_for_best_model="f1",
    logging_dir="./logs", logging_steps=10, logging_first_step=True,
    save_total_limit=1, run_name="roberta-ai-vs-human", report_to=["tensorboard"],
    ddp_find_unused_parameters=False,
)

### 🚂 Trainer & train

In [18]:
# class LogCallback(TrainerCallback):
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs:
#             now = dt.datetime.now().strftime("%H:%M:%S")
#             lr  = kwargs.get("optimizer", {}).param_groups[0]["lr"] if "optimizer" in kwargs else logs.get("learning_rate")
#             watched = {**{k: logs.get(k) for k in ["loss","eval_loss","eval_accuracy","eval_f1"]}, "lr": lr}
#             msg = " | ".join(f"{k}: {v:.4f}" for k,v in watched.items() if v is not None)
#             print(f"[{now}] step {state.global_step} | {msg}")

# 로그 콜백 수정
class LogCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            now = dt.datetime.now().strftime("%H:%M:%S")
            # 안정적으로 logs에서 가져오기
            lr = logs.get("learning_rate", 0.0)
            watched = {
                "loss": logs.get("loss"),
                "eval_loss": logs.get("eval_loss"),
                "eval_accuracy": logs.get("eval_accuracy"),
                "eval_f1": logs.get("eval_f1"),
                "lr": lr,
            }
            msg = " | ".join(f"{k}: {v:.6f}" if k == "lr" else f"{k}: {v:.4f}" for k,v in watched.items() if v is not None)
            print(f"[{now}] step {state.global_step} | {msg}")

def compute_metrics(pred):
    logits, labels = pred
    preds = logits.argmax(-1)
    return dict(
        accuracy  = accuracy_score(labels, preds),
        precision = precision_score(labels, preds),
        recall    = recall_score(labels, preds),
        f1        = f1_score(labels, preds),
    )

trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_ds, eval_dataset=val_ds,
    tokenizer=tokenizer, data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(2), LogCallback()],
)
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2008,0.324381,0.944834,0.876396,0.999559,0.933935
2,0.1991,0.223512,0.989543,0.973895,1.0,0.986775


[12:34:29] step 1 | loss: 0.7265 | lr: 0.000000
[12:34:37] step 10 | loss: 0.7345 | lr: 0.000001
[12:34:45] step 20 | loss: 0.7262 | lr: 0.000001
[12:34:54] step 30 | loss: 0.7101 | lr: 0.000002
[12:35:02] step 40 | loss: 0.6953 | lr: 0.000002
[12:35:11] step 50 | loss: 0.6677 | lr: 0.000003
[12:35:19] step 60 | loss: 0.6039 | lr: 0.000004
[12:35:28] step 70 | loss: 0.5004 | lr: 0.000004
[12:35:37] step 80 | loss: 0.4115 | lr: 0.000005
[12:35:45] step 90 | loss: 0.3723 | lr: 0.000005
[12:35:54] step 100 | loss: 0.3265 | lr: 0.000006
[12:36:02] step 110 | loss: 0.2902 | lr: 0.000007
[12:36:11] step 120 | loss: 0.2359 | lr: 0.000007
[12:36:19] step 130 | loss: 0.2338 | lr: 0.000008
[12:36:28] step 140 | loss: 0.2332 | lr: 0.000008
[12:36:37] step 150 | loss: 0.2206 | lr: 0.000009
[12:36:45] step 160 | loss: 0.2238 | lr: 0.000010
[12:36:54] step 170 | loss: 0.2390 | lr: 0.000010
[12:37:02] step 180 | loss: 0.2200 | lr: 0.000011
[12:37:11] step 190 | loss: 0.2241 | lr: 0.000011
[12:37:19] 

TrainOutput(global_step=3320, training_loss=0.2148863543587995, metrics={'train_runtime': 3003.2608, 'train_samples_per_second': 247.614, 'train_steps_per_second': 1.105, 'total_flos': 1.953150601420914e+17, 'train_loss': 0.2148863543587995, 'epoch': 2.0})

### 🧪 Evaluate on test set

In [19]:
print("📊 Test metrics:", trainer.evaluate(test_ds))

[13:26:51] step 3320 | eval_loss: 0.2227 | eval_accuracy: 0.9899 | eval_f1: 0.9871 | lr: 0.000000
📊 Test metrics: {'eval_loss': 0.22267919778823853, 'eval_accuracy': 0.989887906366316, 'eval_precision': 0.974520221186165, 'eval_recall': 1.0, 'eval_f1': 0.9870957113832299, 'eval_runtime': 78.4727, 'eval_samples_per_second': 592.295, 'eval_steps_per_second': 37.019, 'epoch': 2.0}


In [24]:
# 1) 모델/토크나이저 불러오기
MODEL_DIR = "/content/roberta-ai-vs-human"
tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_DIR)
model     = RobertaForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

# 2) DAIGT 데이터 로드 & 전처리
DAIGT_PATH = "/content/daigt.csv"
df = (pd.read_csv(DAIGT_PATH, usecols=["text", "generated"])
        .rename(columns={"generated":"label"})
        .dropna(subset=["text"]))
df["text"]  = df["text"].astype(str).str.strip()
df["label"] = df["label"].astype(int)

# 3) Head-Tail 토크나이즈
MAX_LEN, HEAD, TAIL = 512, 256, 254
def ht_tokenize(batch):
    ids_all, attn_all = [], []
    for txt in batch["text"]:
        ids = tokenizer.encode(txt, add_special_tokens=True, truncation=False)
        if len(ids) > MAX_LEN:
            ids = ids[:HEAD+1] + ids[-TAIL:]
        ids_all.append(ids)
        attn_all.append([1]*len(ids))
    return {"input_ids": ids_all, "attention_mask": attn_all}

ds_test = Dataset.from_pandas(df[["text","label"]]).map(
    ht_tokenize, batched=True, batch_size=1024,
    num_proc=20, remove_columns=["text"], desc="Tokenizing(DAIGT)"
)
data_collator = DataCollatorWithPadding(tokenizer)

# 4) 테스트용 Trainer
def metrics(p):
    y, pred = p.label_ids, p.predictions.argmax(-1)
    return {
        "accuracy":  accuracy_score(y, pred),
        "precision": precision_score(y, pred),
        "recall":    recall_score(y, pred),
        "f1":        f1_score(y, pred),
    }

trainer = Trainer(
    model=model,
    args=TrainingArguments(
        output_dir="tmp-eval",
        per_device_eval_batch_size=32,
        dataloader_drop_last=False,
        seed=42,
    ),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=metrics,
)

# 5) 테스트 실행
results = trainer.evaluate(ds_test)
print("\n📊 DAIGT Test metrics")
for k,v in results.items():
    if k.startswith("eval_"):
        print(f"{k:12s}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

Tokenizing(DAIGT) (num_proc=20):   0%|          | 0/2730 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (846 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (594 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (786 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (812 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (715 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for thi



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33moneokiwa[0m ([33moneokiwa-pukyong-national-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin



📊 DAIGT Test metrics
eval_loss   : 0.0752
eval_model_preparation_time: 0.0033
eval_accuracy: 0.9861
eval_precision: 0.9743
eval_recall : 0.9985
eval_f1     : 0.9863
eval_runtime: 4.3530
eval_samples_per_second: 627.1530
eval_steps_per_second: 19.7560


### 💾 Save model/tokenizer

In [25]:
SAVE_PATH = "/content/drive/MyDrive/final-tunned-roberta-ai-vs-human"
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print(f"Model & tokenizer saved to '{SAVE_PATH}'.")

Model & tokenizer saved to '/content/drive/MyDrive/final-tunned-roberta-ai-vs-human'.
