# AI Text Detector — Upgraded Training (RoBERTa + Multi-Dataset)

**Before running:** `Runtime → Change runtime type → GPU → A100 → Save`

### What's improved over v1
| | v1 | v2 |
|---|---|---|
| Base model | DistilBERT (67M) | RoBERTa-base (125M) |
| Max tokens | 256 | 512 |
| Training data | ~85K HC3 + 60K RAID | ~85K HC3 + ~400K RAID (250K clean AI stratified across ALL generators + 50K adversarial AI + 100K human) |
| AI generators covered | ChatGPT only | GPT-4, GPT-3.5, GPT-2, Llama-2 (7/13/70B), Mistral, Cohere, BLOOM, + adversarial attacks |
| Epochs | 3 | 4 with early stopping |
| Precision | fp16 | bf16 (more stable on A100) |
| LR schedule | linear | cosine with warmup |
| Label smoothing | no | 0.05 (reduces overconfidence) |
| Temperature | hardcoded 1.8 | auto-calibrated + saved to JSON |

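**Expected training time on A100:** ~2–3 hours for 4 epochs. With the default sample sizes the combined dataset holds up to ~485K examples (~85K HC3 + 400K RAID), of which ~81% (roughly 390K) are used for training.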

In [None]:
# ── Persistent storage ────────────────────────────────────────────────────────
# Mount Google Drive and make sure the output folder exists before anything
# else tries to write into it.
import os

from google.colab import drive

drive.mount('/content/drive')

# Root folder for everything this notebook writes (checkpoints, plots, model).
SAVE_DIR = '/content/drive/MyDrive/ai-detector-v2'
os.makedirs(SAVE_DIR, exist_ok=True)
print(f'Drive mounted. Model will save to: {SAVE_DIR}')

In [None]:
# Install the third-party libraries imported by the cells below (-q = quiet pip output).
!pip install -q transformers datasets accelerate evaluate scikit-learn torch seaborn matplotlib

In [None]:
# ── Configuration ─────────────────────────────────────────────────────────────
# Single place for every hyperparameter used by the cells below.
#
# Change BASE_MODEL here to try different architectures:
#   'roberta-base'                 — best accuracy / speed tradeoff (recommended)
#   'microsoft/deberta-v3-base'    — highest accuracy, ~2x slower
#   'distilbert-base-uncased'      — fastest, lowest accuracy

BASE_MODEL   = 'roberta-base'  # checkpoint name passed to the tokenizer and the model loader
MAX_LEN      = 512  # tokenizer truncation length (training and inference)
EPOCHS       = 4    # upper bound; early stopping may end training sooner
BATCH_SIZE   = 32   # per device; effective batch = 32 * grad_accum_steps
GRAD_ACCUM   = 2    # effective batch = 64
LR           = 2e-5  # peak learning rate of the cosine schedule
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06  # passed to TrainingArguments(warmup_ratio=...)
LABEL_SMOOTH = 0.05  # passed to TrainingArguments(label_smoothing_factor=...)
SEED         = 42    # shared by RAID sampling, shuffling, the splits and the Trainer

# How many rows to sample from RAID (5.6M available — more = better, but slower)
# Each value is an upper bound: the sampling cell takes min(available, requested).
RAID_AI_SAMPLE    = 250_000   # AI-generated rows (stratified across all generators)
RAID_HUMAN_SAMPLE = 100_000   # Human rows from RAID
RAID_ADV_SAMPLE   =  50_000   # Adversarially-attacked AI rows (robustness)

# Echo the key settings; SAVE_DIR comes from the Drive-mount cell.
print('Config loaded.')
print(f'  Base model : {BASE_MODEL}')
print(f'  Max tokens : {MAX_LEN}')
print(f'  Epochs     : {EPOCHS}')
print(f'  Save dir   : {SAVE_DIR}')

In [None]:
import pandas as pd
import numpy as np

# ── HC3 — ChatGPT vs human answers (Reddit, medicine, finance, etc.) ──────────
HC3_PARQUET = 'hf://datasets/Hello-SimpleAI/HC3@refs/convert/parquet/all/train/0000.parquet'
hc3_raw = pd.read_parquet(HC3_PARQUET)


def _explode_answers(frame, column, label):
    """Explode `column` into one row per element, renamed to 'text' and tagged with `label`."""
    exploded = frame[[column]].explode(column)
    exploded = exploded.rename(columns={column: 'text'})
    return exploded.assign(label=label, source='hc3')


# label 0 = human, label 1 = AI (ChatGPT)
human_hc3 = _explode_answers(hc3_raw, 'human_answers', 0)
ai_hc3 = _explode_answers(hc3_raw, 'chatgpt_answers', 1)

# Stack both classes, trim surrounding whitespace, and keep only answers
# longer than 50 characters.
hc3 = pd.concat([human_hc3, ai_hc3], ignore_index=True)
hc3['text'] = hc3['text'].astype(str).str.strip()
long_enough = hc3['text'].str.len() > 50
hc3 = hc3[long_enough].reset_index(drop=True)

print('HC3:', hc3['label'].value_counts().to_dict())

In [None]:
from datasets import load_dataset

# ── RAID — multi-generator, multi-domain ──────────────────────────────────────
# Generators: gpt-4, gpt-3.5-turbo, gpt-2, llama-2 (7/13/70b), mistral-7b,
#             cohere, bloom-7b1, davinci-003, and more
# Domains: news, reddit, recipes, abstracts, reviews, wiki, poetry, etc.
# Attacks: none, homoglyph, paraphrase, whitespace, etc.

print('Loading RAID dataset (this may take a few minutes)...')
raid_raw = load_dataset('liamdugan/raid', split='train')

# Only these four columns are read by the rest of the notebook. Dropping every
# other column *before* converting keeps the in-memory pandas frame (millions
# of rows) as small as possible and lowers the risk of running out of RAM.
RAID_COLUMNS = ['model', 'attack', 'domain', 'generation']
raid_df = raid_raw.select_columns(RAID_COLUMNS).to_pandas()

print(f'RAID loaded: {len(raid_df):,} rows')
print('Generators:', raid_df['model'].value_counts().head(15).to_dict())
print('Domains:', raid_df['domain'].value_counts().to_dict())

In [None]:
# ── Sample RAID strategically ─────────────────────────────────────────────────
# Builds `raid`: a frame with columns text / label / source, where label 1 = AI
# (clean + adversarially attacked) and label 0 = human.

is_ai = raid_df['model'] != 'human'
is_clean = raid_df['attack'] == 'none'

# 1. Clean AI rows (no adversarial attack) — stratified across all generators
clean_ai = raid_df[is_ai & is_clean]
generators = clean_ai['model'].unique()
if len(generators) == 0:
    raise ValueError("RAID contains no clean AI rows (model != 'human', attack == 'none').")

# Equal quota per generator. A generator with fewer rows than the quota
# contributes everything it has, so the total can end up below RAID_AI_SAMPLE.
per_gen = RAID_AI_SAMPLE // len(generators)

# Sample each group explicitly rather than via groupby().apply(): letting
# apply() operate on the grouping column is deprecated in pandas 2.2, and newer
# pandas versions drop 'model' from the result, which would break the
# per-generator report printed below.
clean_ai_sample = pd.concat(
    [
        rows.sample(min(len(rows), per_gen), random_state=SEED)
        for _, rows in clean_ai.groupby('model')
    ],
    ignore_index=True,
)
print(f'Clean AI samples: {len(clean_ai_sample):,}')
print('Per generator:', clean_ai_sample['model'].value_counts().to_dict())

# 2. Adversarially-attacked AI rows — teaches model to resist paraphrasing tricks
adv_ai = raid_df[is_ai & ~is_clean]
adv_ai_sample = adv_ai.sample(min(len(adv_ai), RAID_ADV_SAMPLE), random_state=SEED)
print(f'Adversarial AI samples: {len(adv_ai_sample):,}')
print('Attack types:', adv_ai_sample['attack'].value_counts().to_dict())

# 3. Human rows from RAID
human_raid = raid_df[~is_ai]
human_raid_sample = human_raid.sample(min(len(human_raid), RAID_HUMAN_SAMPLE), random_state=SEED)
print(f'Human RAID samples: {len(human_raid_sample):,}')

# Build RAID dataframe
raid_ai = pd.concat([clean_ai_sample, adv_ai_sample], ignore_index=True)
raid_ai_df = raid_ai[['generation']].rename(columns={'generation': 'text'}).assign(label=1, source='raid')
raid_human_df = human_raid_sample[['generation']].rename(columns={'generation': 'text'}).assign(label=0, source='raid')

# Same cleaning as HC3: trim whitespace, keep only texts longer than 50 characters.
raid = pd.concat([raid_ai_df, raid_human_df], ignore_index=True)
raid['text'] = raid['text'].astype(str).str.strip()
raid = raid[raid['text'].str.len() > 50].reset_index(drop=True)

print(f'\nRAID final: {raid["label"].value_counts().to_dict()}')

# Free the huge raw dataframe (and the row masks derived from it)
del raid_df, raid_raw, clean_ai, adv_ai, clean_ai_sample, adv_ai_sample, human_raid, human_raid_sample
del is_ai, is_clean
import gc
gc.collect()

In [None]:
# ── Combine all datasets ──────────────────────────────────────────────────────
combined = pd.concat([hc3[['text','label','source']], raid], ignore_index=True)

# Drop rows whose stripped text is 50 characters or shorter
combined = combined[combined['text'].str.strip().str.len() > 50].reset_index(drop=True)

# Drop exact duplicate texts. If the same passage appeared more than once it
# could land in both the training and the held-out splits created later, which
# would leak test data into training and inflate the reported metrics.
rows_before = len(combined)
combined = combined.drop_duplicates(subset='text').reset_index(drop=True)
print(f'Dropped {rows_before - len(combined):,} duplicate texts')

# Shuffle
combined = combined.sample(frac=1, random_state=SEED).reset_index(drop=True)

print('=== Combined dataset ===')
print(f'Total rows  : {len(combined):,}')
print(f'Human (0)   : {(combined["label"]==0).sum():,}')
print(f'AI (1)      : {(combined["label"]==1).sum():,}')
print(f'Sources     : {combined["source"].value_counts().to_dict()}')

In [None]:
from sklearn.model_selection import train_test_split

def _stratified_split(frame):
    """Split `frame` 90/10 with a fixed seed, stratifying on its 'label' column."""
    return train_test_split(
        frame, test_size=0.1, random_state=SEED, stratify=frame['label']
    )


# Two successive 90/10 splits: first carve off the test set, then carve the
# validation set out of what remains (≈ 81% train / 9% val / 10% test).
train_val_df, test_df = _stratified_split(combined)
train_df, val_df = _stratified_split(train_val_df)

# Give every split a clean 0..n-1 index.
train_df, val_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, val_df, test_df)
)

print(f'Train : {len(train_df):,}  |  Val : {len(val_df):,}  |  Test : {len(test_df):,}')

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


def tokenize(batch):
    """Tokenize ``batch['text']``, truncating each sequence to MAX_LEN tokens.

    Sequences are deliberately NOT padded here. The Trainer built later in this
    notebook receives the tokenizer through ``processing_class``, so its default
    collator (DataCollatorWithPadding) pads each batch only up to that batch's
    longest sequence. Padding every row to MAX_LEN at this stage would give all
    rows the same length, which makes ``group_by_length=True`` a no-op and
    spends most of the compute and storage on pad tokens for short texts.

    Args:
        batch: mapping with a 'text' entry holding a list of strings
            (the batched form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch (includes ``input_ids`` and
        ``attention_mask``), with variable-length sequences.
    """
    return tokenizer(
        batch['text'],
        truncation=True,
        max_length=MAX_LEN,
    )

def _as_dataset(frame):
    """Wrap the text/label columns of a pandas split as a Hugging Face Dataset."""
    return Dataset.from_pandas(frame[['text','label']])


def _tokenized(dataset):
    """Run `tokenize` over `dataset` in batches of 1000 using 2 worker processes."""
    return dataset.map(tokenize, batched=True, batch_size=1000, num_proc=2)


train_ds, val_ds, test_ds = (
    _as_dataset(frame) for frame in (train_df, val_df, test_df)
)
train_tok, val_tok, test_tok = (
    _tokenized(dataset) for dataset in (train_ds, val_ds, test_ds)
)

# Expose only the model inputs and the label, as torch tensors.
cols = ['input_ids', 'attention_mask', 'label']
for split in (train_tok, val_tok, test_tok):
    split.set_format('torch', columns=cols)

print('Tokenization done.')

In [None]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification

# Class-index ↔ name mapping stored in the model config.
ID2LABEL = {0: 'human', 1: 'ai'}
LABEL2ID = {name: idx for idx, name in ID2LABEL.items()}

# Pretrained encoder with a freshly initialised 2-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    BASE_MODEL,
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id=LABEL2ID,
)

# Gradient checkpointing: lower activation memory in exchange for extra compute
# (the original author estimates roughly half the VRAM for ~20% less speed).
model.gradient_checkpointing_enable()

total_params = sum(weight.numel() for weight in model.parameters())
print(f'Model: {BASE_MODEL}  |  Params: {total_params/1e6:.1f}M')

# ── Metrics ──
acc_metric = evaluate.load('accuracy')
f1_metric  = evaluate.load('f1')


def compute_metrics(eval_pred):
    """Turn a (logits, labels) pair into the metrics reported at each evaluation.

    Returns a dict with overall accuracy, macro-averaged F1, and the binary F1
    of each class taken as the positive label (1 = AI, 0 = human).
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    def _f1(**options):
        # Same predictions/references every time; only the averaging mode differs.
        return f1_metric.compute(predictions=preds, references=labels, **options)['f1']

    return {
        'accuracy': acc_metric.compute(predictions=preds, references=labels)['accuracy'],
        'f1_macro': _f1(average='macro'),
        'f1_ai': _f1(average='binary', pos_label=1),
        'f1_human': _f1(average='binary', pos_label=0),
    }

In [None]:
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

# Training configuration. Checkpoints are written under SAVE_DIR (Google Drive).
args = TrainingArguments(
    output_dir=SAVE_DIR,

    # Learning rate: warm up over the first WARMUP_RATIO of steps, then cosine decay
    learning_rate=LR,
    lr_scheduler_type='cosine',
    warmup_ratio=WARMUP_RATIO,

    # Batch & gradient (effective train batch = BATCH_SIZE * GRAD_ACCUM per device)
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=64,
    gradient_accumulation_steps=GRAD_ACCUM,
    gradient_checkpointing=True,

    # Regularization
    weight_decay=WEIGHT_DECAY,
    label_smoothing_factor=LABEL_SMOOTH,

    # Epochs & evaluation: evaluate and checkpoint once per epoch, then reload
    # the checkpoint with the best validation f1_macro when training ends.
    num_train_epochs=EPOCHS,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    # Each checkpoint stores the weights plus optimizer state, so keeping one per
    # epoch quickly fills Drive. With load_best_model_at_end=True the best
    # checkpoint is always retained alongside the most recent one.
    save_total_limit=2,

    # Performance
    bf16=True,              # bf16 is more stable than fp16 on A100
    dataloader_num_workers=4,
    # Batches similar-length sequences together; this only helps when the
    # inputs are not already padded to one fixed length.
    group_by_length=True,

    # Logging
    logging_steps=200,
    report_to='none',       # disable wandb/mlflow
    seed=SEED,
)

# Stop training once the tracked metric has failed to improve for two
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the tokenized training split, evaluate on the validation split.
trainer = Trainer(
    args=args,
    model=model,
    processing_class=tokenizer,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print('Trainer ready.')

In [None]:
# ── TRAIN ─────────────────────────────────────────────────────────────────────
# On A100 with ~385K examples, expect ~2-3 hours for 4 epochs (author's estimate).
# Evaluation runs at the end of every epoch. Early stopping halts training when
# validation f1_macro has not improved for 2 consecutive evaluations, and the
# best checkpoint is reloaded afterwards (load_best_model_at_end=True).

trainer.train()
print('\nTraining complete!')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

# ── Full evaluation on held-out test set ─────────────────────────────────────
CLASS_NAMES = ['Human', 'AI']   # index 0 = human, index 1 = AI

test_pred = trainer.predict(test_tok)
labels    = test_pred.label_ids
preds     = np.argmax(test_pred.predictions, axis=-1)

print('=== Test Set Results ===')
print(classification_report(labels, preds, target_names=CLASS_NAMES, digits=4))

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(labels, preds)
fig, ax = plt.subplots(figsize=(5,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=CLASS_NAMES,
    yticklabels=CLASS_NAMES,
    ax=ax,
)
ax.set_title('Confusion Matrix — Test Set')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
fig.tight_layout()
fig.savefig(f'{SAVE_DIR}/confusion_matrix.png', dpi=150)
plt.show()

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import json

# ── Temperature scaling calibration on validation set ─────────────────────────
# A single scalar temperature is fitted on the validation logits so that the
# softmax confidences are better calibrated (intended to reduce overconfidence).

val_pred = trainer.predict(val_tok)
val_logits, val_labels = (
    torch.tensor(val_pred.predictions, dtype=torch.float32),
    torch.tensor(val_pred.label_ids, dtype=torch.long),
)

class TemperatureScaler(nn.Module):
    """Divide logits by one learnable temperature.

    The temperature is stored as ``log_temp`` (initialised to 0, i.e. T = 1),
    so the effective temperature ``exp(log_temp)`` is always positive.
    """

    def __init__(self):
        super().__init__()
        self.log_temp = nn.Parameter(torch.zeros(1))

    def forward(self, logits):
        """Return ``logits`` scaled by ``1 / exp(log_temp)``."""
        temperature = self.log_temp.exp()
        return logits / temperature

scaler = TemperatureScaler()
opt    = torch.optim.LBFGS([scaler.log_temp], lr=0.1, max_iter=100)


def calib_loss():
    """LBFGS closure: cross-entropy of the temperature-scaled validation logits."""
    opt.zero_grad()
    nll = F.cross_entropy(scaler(val_logits), val_labels)
    nll.backward()
    return nll


# One step() call runs up to max_iter LBFGS iterations, re-evaluating the closure.
opt.step(calib_loss)

# Convert the learned log-temperature back to a plain Python float.
T = scaler.log_temp.detach().exp().item()
print(f'Calibrated temperature: {T:.4f}')

# ── Save temperature alongside model ──────────────────────────────────────────
# Written next to the exported weights; presumably read by model_loader.py
# (that file is not visible from this notebook).
os.makedirs(f'{SAVE_DIR}/best-model', exist_ok=True)
config_path = f'{SAVE_DIR}/best-model/training_config.json'

training_config = dict(
    temperature=T,
    base_model=BASE_MODEL,
    max_length=MAX_LEN,
)
with open(config_path, 'w') as config_file:
    json.dump(training_config, config_file, indent=2)

print(f'Saved training_config.json: {training_config}')

In [None]:
# ── Save best model + tokenizer ───────────────────────────────────────────────
BEST_MODEL_DIR = f'{SAVE_DIR}/best-model'

# Weights first, then the tokenizer files, into the same folder.
for artefact in (trainer.model, tokenizer):
    artefact.save_pretrained(BEST_MODEL_DIR)

print(f'Model saved to: {BEST_MODEL_DIR}')

# List saved files with their size in MB
for entry in os.listdir(BEST_MODEL_DIR):
    size_mb = os.path.getsize(f'{BEST_MODEL_DIR}/{entry}') / 1e6
    print(f'  {entry}  ({size_mb:.1f} MB)')

In [None]:
# ── Sanity-check inference ────────────────────────────────────────────────────

def predict(text: str, threshold: float = 0.85):
    """Classify one text with the trained model and print the result.

    Logits are divided by the calibrated temperature ``T`` before the softmax.
    If neither class probability reaches ``threshold`` the verdict is
    'uncertain'; otherwise it is the more probable class (ties go to 'ai').

    Returns:
        dict with keys 'ai_prob', 'human_prob' and 'pred'.
    """
    net = trainer.model
    net.eval()

    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=MAX_LEN)
    encoded = {name: tensor.to(net.device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = net(**encoded).logits[0].detach().cpu()

    probs   = torch.softmax(logits / T, dim=-1).numpy()
    human_p = float(probs[0])
    ai_p    = float(probs[1])

    if max(ai_p, human_p) < threshold:
        pred = 'uncertain'
    elif ai_p >= human_p:
        pred = 'ai'
    else:
        pred = 'human'

    print(f'AI:    {ai_p:.4f}')
    print(f'Human: {human_p:.4f}')
    print(f'Pred:  {pred}')
    return {'ai_prob': ai_p, 'human_prob': human_p, 'pred': pred}

# Three hand-written smoke tests. predict() prints both probabilities and the
# verdict; 'uncertain' appears when neither class reaches the default 0.85 threshold.

# 1. Casual, informal writing — expected verdict: human.
print('--- Should be HUMAN ---')
predict("""The quick brown fox jumps over the lazy dog.
i was just kinda sitting there and then boom it happened lol idk man it was weird.
we went to the store and got some stuff, nothing special really.""")

# 2. Polished, generic essay-style paragraph — expected verdict: ai.
print('\n--- Should be AI ---')
predict("""In conclusion, the integration of artificial intelligence into modern healthcare
systems presents both significant opportunities and considerable challenges. By leveraging
machine learning algorithms, medical professionals can enhance diagnostic accuracy,
streamline patient care workflows, and ultimately improve patient outcomes.
However, it is essential to address ethical considerations, data privacy concerns,
and the need for robust validation frameworks to ensure the responsible deployment
of these transformative technologies.""")

# 3. Formal news prose — checks that formal human writing is not flagged as AI.
print('\n--- News article (should be HUMAN, not AI) ---')
predict("""The Federal Reserve raised interest rates by a quarter point Wednesday,
its 10th increase since March 2022, as officials signaled they may be nearing
the end of their aggressive campaign to bring inflation back under control.
The decision was unanimous. The benchmark federal funds rate now sits in a
target range of 5% to 5.25%, the highest level in 16 years.""")

In [None]:
# ── Download the trained model as a zip ───────────────────────────────────────
import shutil
from google.colab import files

# make_archive() takes the base name without extension and appends '.zip' itself.
zip_path = '/content/ai-detector-v2-export'
zip_file = zip_path + '.zip'
shutil.make_archive(zip_path, 'zip', BEST_MODEL_DIR)

zip_size = os.path.getsize(zip_file) / 1e6
print(f'Zip size: {zip_size:.0f} MB')

# Trigger a browser download of the archive from the Colab VM.
print('Downloading...')
files.download(zip_file)