# AI Text Detector — Colab Training
**Before running anything:** `Runtime → Change runtime type → GPU → A100 or H100 → Save`

In [1]:
# Attach Google Drive so the trained model outlives this Colab session.
import os

from google.colab import drive

drive.mount('/content/drive')

# Create the output folder on first use; exist_ok makes re-running this cell harmless.
os.makedirs('/content/drive/MyDrive/ai-detector', exist_ok=True)
print('Drive mounted. Model will save to: /content/drive/MyDrive/ai-detector')

Mounted at /content/drive
Drive mounted. Model will save to: /content/drive/MyDrive/ai-detector


In [2]:
# %pip (not !pip) installs into the environment the running kernel uses.
# The training cell passes `eval_strategy=` and `processing_class=`, which
# older transformers releases reject, so a minimum version is requested here.
%pip install -q "transformers>=4.46" datasets accelerate evaluate scikit-learn torch

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd

# Load HC3
df = pd.read_parquet(
    "hf://datasets/Hello-SimpleAI/HC3@refs/convert/parquet/all/train/0000.parquet"
)


def _answers_to_rows(frame, column, label):
    """Explode `column` into one row per answer, renamed to 'input', with a constant 'label'."""
    rows = frame[[column]].explode(column).rename(columns={column: "input"})
    rows["label"] = label
    return rows


# Binary rows of plain answer text only — no Q:/A: wrapper, so the format
# matches RAID, which carries no question context.
human = _answers_to_rows(df, "human_answers", 0)
ai = _answers_to_rows(df, "chatgpt_answers", 1)

hc3 = pd.concat([human, ai], ignore_index=True)
hc3["input"] = hc3["input"].astype(str).str.strip()

# Keep only texts longer than 30 characters.
long_enough = hc3["input"].str.len() > 30
hc3 = hc3[long_enough].reset_index(drop=True)

print("HC3 loaded:", hc3["label"].value_counts().to_dict())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


HC3 loaded: {0: 58492, 1: 26878}


In [None]:
from datasets import load_dataset

# Load RAID — multi-domain, multi-generator (news, essays, poems, etc.)
# RAID has 5.6M rows — sample before converting to pandas to avoid OOM
# Label comes from the 'model' column ('human' = human, anything else = AI)
RAID_SAMPLE_SIZE = 60_000

raid_raw = load_dataset("liamdugan/raid", split="train")

# Clamp the sample size so a smaller dataset revision cannot make select() fail.
raid_sample = raid_raw.shuffle(seed=42).select(
    range(min(RAID_SAMPLE_SIZE, len(raid_raw)))
)

# Only 'model' and 'generation' are used below; projecting before to_pandas()
# keeps the other nine columns (prompts, titles, ids, ...) out of memory.
raid_df = raid_sample.select_columns(["model", "generation"]).to_pandas()

raid_df["label"] = (raid_df["model"] != "human").astype(int)  # 0=human, 1=ai
raid_df["input"] = raid_df["generation"].astype(str).str.strip()
raid = raid_df[["input", "label"]]

# Same minimum-length filter as the HC3 rows: keep texts longer than 30 characters.
raid = raid[raid["input"].str.len() > 30].reset_index(drop=True)

print("RAID loaded:", raid["label"].value_counts().to_dict())

In [5]:
from datasets import load_dataset

# Inspection only: with no `split=` argument every split comes back in one DatasetDict.
ds = load_dataset("liamdugan/raid")
print(ds)  # split names, features and row counts

# Column names of each split that is printed above.
for split_name in ("train", "extra"):
    print(ds[split_name].column_names)

Loading dataset shards:   0%|          | 0/24 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation'],
        num_rows: 5615820
    })
    extra: Dataset({
        features: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation'],
        num_rows: 2039100
    })
})
['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']
['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation']


In [6]:
# Combine HC3 + RAID, then balance classes so neither dominates
combined = pd.concat([hc3, raid], ignore_index=True)

# Drop exact duplicate texts (if any) before sampling. The train/val/test
# split further down is random per row, so a text that appears twice could
# otherwise end up on both sides of the split and inflate the test metrics.
combined = combined.drop_duplicates(subset="input").reset_index(drop=True)

# Downsample every class to the size of the smallest one.
n = combined["label"].value_counts().min()
combined = (
    combined
    .groupby("label")
    .sample(n=n, random_state=42)
    .sample(frac=1, random_state=42)  # shuffle so the classes are interleaved
    .reset_index(drop=True)
)

print("Combined (balanced):", combined["label"].value_counts().to_dict())
print("Total examples:", len(combined))

NameError: name 'raid' is not defined

In [None]:
from sklearn.model_selection import train_test_split

# Stratified split: 20% of everything is held out as test, then 20% of the
# remainder becomes validation (64% / 16% / 20% overall).
train_val_df, test_df = train_test_split(
    combined, test_size=0.2, random_state=42, stratify=combined["label"]
)
train_df, val_df = train_test_split(
    train_val_df, test_size=0.2, random_state=42, stratify=train_val_df["label"]
)

# Give each split a fresh 0..n-1 index.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

print(f"Train: {len(train_df)}  Val: {len(val_df)}  Test: {len(test_df)}")

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Token budget per text; longer inputs are truncated to this length.
MAX_LEN = 256

# Checkpoint to fine-tune; the tokenizer is loaded from the same name.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    """Tokenize a batch of rows for the classifier.

    Args:
        batch: Mapping with an "input" key holding the texts to encode (the
            batch dict passed by `Dataset.map(..., batched=True)`).

    Returns:
        The result of `tokenizer(...)` for those texts, truncated to
        `MAX_LEN` tokens and left unpadded.
    """
    # No padding here: padding every row to MAX_LEN makes each batch as
    # expensive as the longest allowed text. The Trainer below is built with
    # `processing_class=tokenizer`, so its default collator is expected to pad
    # each batch to that batch's longest sequence instead.
    return tokenizer(
        batch["input"],
        truncation=True,
        max_length=MAX_LEN,
    )

# Wrap each split as a Dataset holding only the text and its label.
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame[["input", "label"]])
    for frame in (train_df, val_df, test_df)
)

# Tokenize in batches.
train_tok, val_tok, test_tok = (
    split.map(tokenize, batched=True) for split in (train_ds, val_ds, test_ds)
)

# Expose only these columns, as torch tensors.
cols = ["input_ids", "attention_mask", "label"]
for tokenized in (train_tok, val_tok, test_tok):
    tokenized.set_format("torch", columns=cols)

In [None]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification

# Two-class sequence classifier loaded from the pretrained checkpoint.
# The label names (0 = human, 1 = ai, as used when the data was built) are
# stored in the model config so that an exported checkpoint reports
# "human"/"ai" rather than the generic LABEL_0/LABEL_1.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    id2label={0: "human", 1: "ai"},
    label2id={"human": 0, "ai": 1},
)

# Metric objects used by compute_metrics.
acc = evaluate.load("accuracy")
f1  = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from a (logits, labels) pair.

    Args:
        eval_pred: Two-item sequence `(logits, labels)`; the predicted class
            is the argmax over the last axis of `logits`.

    Returns:
        Dict with keys "accuracy" and "f1_macro".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    accuracy = acc.compute(predictions=predicted, references=gold)["accuracy"]
    macro_f1 = f1.compute(predictions=predicted, references=gold, average="macro")["f1"]
    return {"accuracy": accuracy, "f1_macro": macro_f1}

In [None]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="/content/drive/MyDrive/ai-detector",  # saves directly to Drive
    learning_rate=2e-5,
    per_device_train_batch_size=32,   # larger batch is fine on A100/H100
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Checkpoints go to Drive, so cap how many are kept; with
    # load_best_model_at_end the best checkpoint is still retained.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",  # key returned by compute_metrics
    logging_steps=100,
    fp16=True,                        # ~2x faster on A100/H100
    # Disable experiment-tracker integrations. NOTE(review): on Colab an
    # installed W&B client can otherwise prompt for an API key and stall
    # "Run all" — confirm against the transformers version in use.
    report_to="none",
    seed=42,                          # same seed as the data sampling/splitting
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,             # evaluated each epoch to pick the best checkpoint
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Final check on the test split, which was split off before training and validation.
trainer.evaluate(eval_dataset=test_tok)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Temperature scaling is fitted on the validation split: collect the model's
# raw logits and the true labels for it as float32 / int64 tensors.
val_pred = trainer.predict(val_tok)
val_logits = torch.tensor(val_pred.predictions).float()
val_labels = torch.tensor(val_pred.label_ids).long()

class TemperatureScaler(nn.Module):
    """Divide logits by a single learnable temperature.

    The temperature is stored as its logarithm (`log_temp`, initialised to 0,
    i.e. temperature 1) so the value applied, exp(log_temp), is always positive.
    """

    def __init__(self):
        super().__init__()
        self.log_temp = nn.Parameter(torch.zeros(1))

    def forward(self, logits):
        """Return `logits` divided by exp(log_temp)."""
        temperature = self.log_temp.exp()
        return logits / temperature

scaler = TemperatureScaler()
# log_temp is the scaler's only parameter, so this optimises exactly that value.
opt = torch.optim.LBFGS(scaler.parameters(), lr=0.1, max_iter=50)


def loss_fn():
    """LBFGS closure: cross-entropy of the temperature-scaled validation logits."""
    opt.zero_grad()
    nll = F.cross_entropy(scaler(val_logits), val_labels)
    nll.backward()
    return nll


opt.step(loss_fn)

# Convert the learned log-temperature into a plain Python float.
T = torch.exp(scaler.log_temp).item()
print("Learned temperature:", T)

In [None]:
ID2LABEL = {0: "human", 1: "ai"}

def score(text: str, threshold: float = 0.85):
    """
    Classify one text with the trained model, print the result and return it.

    Args:
        text: The text to classify.
        threshold: Minimum calibrated probability the more likely class must
            reach; below it the prediction is reported as 'uncertain'.

    Returns:
        Dict with 'ai_prob' and 'human_prob' (softmax of logits / T, where T
        is the fitted temperature) and 'pred' ('ai', 'human' or 'uncertain').
    """
    m = trainer.model
    m.eval()

    # Truncate with the shared MAX_LEN rather than a second hard-coded 256, so
    # scoring keeps using the same token budget as training if MAX_LEN changes.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=MAX_LEN)
    inputs = {k: v.to(m.device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = m(**inputs).logits[0].detach().cpu()

    # Temperature-calibrated class probabilities; index 0 = human, 1 = ai.
    probs = torch.softmax(logits / T, dim=-1).numpy()
    ai_p, human_p = float(probs[1]), float(probs[0])

    if max(ai_p, human_p) < threshold:
        pred = "uncertain"
    else:
        pred = ID2LABEL[int(ai_p >= human_p)]

    print(f"AI:      {ai_p:.4f}")
    print(f"Human:   {human_p:.4f}")
    print(f"Pred:    {pred}")
    return {"ai_prob": ai_p, "human_prob": human_p, "pred": pred}

In [None]:
# Smoke test: score one sample sentence and show the returned dict.
sample_text = "Military pressure from outside groups, often referred to as barbarian tribes by the Romans, was another major cause."
score(sample_text)

In [None]:
# Download the trained model as a zip to your machine
import json
import shutil
from google.colab import files

EXPORT_DIR = '/content/drive/MyDrive/ai-detector/best-model'

# Save the best model + tokenizer cleanly
trainer.model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

# Save the fitted temperature next to the weights. score() divides the logits
# by T before the softmax, so without it the exported model cannot reproduce
# the calibrated probabilities.
with open(f'{EXPORT_DIR}/calibration.json', 'w') as fh:
    json.dump({'temperature': T}, fh)

# Zip and download (optional — skip if you're happy leaving it on Drive)
shutil.make_archive('/content/ai-detector-export', 'zip', EXPORT_DIR)
files.download('/content/ai-detector-export.zip')