In [None]:
import json
from src.mbmark import MbMark
from src.gaussmark import GaussMark
from src.kgw_distilled import KGWDistilled
from src.kgwmark import KGWMark
from sklearn.metrics import roc_auc_score
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset, load_from_disk
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats
torch.manual_seed(42)
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
from scipy.interpolate import interp1d


In [None]:
# Run configuration, taken from environment variables:
#   MODEL       - model identifier (falls back to Llama-2-7b when unset)
#   OUTPUT_FILE - path of the JSON results file; required
#   K           - integer used further down to build the "mb" selector-matrix path
model_name = os.environ.get("MODEL", "meta-llama/Llama-2-7b-hf")
# Last path component of the model id, e.g. "Llama-2-7b-hf".
model_suffix = model_name.rsplit("/", 1)[-1]
output_file = os.environ.get("OUTPUT_FILE")
K = int(os.environ.get("K", 16))

# Fail early: every later cell needs the results file.
assert output_file is not None, "Please set the OUTPUT_FILE environment variable to the path of your output file."


# Number of texts scored per batch; later cells reassign this value.
BATCH_SIZE = 64
print(f"Batch size: {BATCH_SIZE}")
print(f"Model name: {model_name}")


In [None]:

# Load the results file named by OUTPUT_FILE. JSON text is UTF-8 by
# specification, so decode it explicitly instead of relying on the
# platform's default encoding.
with open(output_file, "r", encoding="utf-8") as f:
    output_data = json.load(f)

# Later cells index this by column name (e.g. "human_text", "model_text")
# and slice each entry into batches of texts.
samples = output_data["samples"]

In [None]:
# Build the tokenizer, the language model and the watermark detector that
# matches the scheme recorded in the results file.

# `device_map` is a model-loading option; it is not passed to the tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # Reuse EOS as the padding token when the tokenizer defines none.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16)

# The results file records which watermark produced the samples and the
# hyper-parameters that were used, so detection is rebuilt from them.
watermark_type = output_data["watermark"]
config = output_data["config"]

if watermark_type == "gaussmark":
    watermark = GaussMark(sigma=config["sigma"],
                          tokenizer=tokenizer, model=model, seed=config["hash_key"], target_param_name=config["target_param_name"])
    # This scheme is scored with a smaller batch than the default.
    BATCH_SIZE = 8
elif watermark_type == "mb":
    dataset_suffix = "openwebtext"
    model_suffix = model_name.split("/")[-1]
    final_matrix_path = f"saved_models/{dataset_suffix}_{model_suffix}/selector_matrix_k{K}.pth"
    # NOTE(review): torch.load unpickles the file; only load matrices that
    # come from a trusted source.
    final_weight = torch.load(final_matrix_path)
    watermark = MbMark.mb(
        delta=config["delta"],
        gamma=config["gamma"],
        seed=config["hash_key"],
        final_weight=final_weight,
        model=model,
        tokenizer=tokenizer,
        unembedding_param_name=config["unembedding_param_name"]
    )
elif watermark_type == "noise":
    watermark = MbMark.noise_injection(
        delta=config["delta"],
        seed=config["hash_key"],
        model=model,
        tokenizer=tokenizer,
        unembedding_param_name=config["unembedding_param_name"],
        distribution=config["distribution"]
    )
elif watermark_type == "distilled":
    watermark = KGWDistilled(
        gamma=config["gamma"],
        seeding_scheme=config["seeding_scheme"],
        kgw_device=config["kgw_device"],
        model=model,
        tokenizer=tokenizer,
    )
elif watermark_type in ("kgw", "kgw_llr"):
    # Both names build the same detector; "kgw_llr" turns on its
    # llr_detection flag.
    llr_detection = watermark_type == "kgw_llr"
    watermark = KGWMark(
        gamma=config["gamma"],
        delta=config["delta"],
        hash_key=config["hash_key"],
        kgw_device=config["kgw_device"],
        model=model,
        tokenizer=tokenizer,
        llr_detection=llr_detection
    )
else:
    # Without this branch `watermark` would stay undefined and the failure
    # would surface later as an unrelated NameError in the scoring cells.
    raise ValueError(
        f"Unsupported watermark type {watermark_type!r}; expected one of "
        "'gaussmark', 'mb', 'noise', 'distilled', 'kgw', 'kgw_llr'."
    )

In [None]:
def get_zcores(column):
    """Score every text stored under ``samples[column]`` with the active watermark.

    The texts are cut into consecutive chunks of ``BATCH_SIZE`` (a module-level
    value read when the function is called) and each chunk is handed to
    ``watermark.score_text_batch`` with gradient tracking switched off.

    Args:
        column: Key into the module-level ``samples`` mapping.

    Returns:
        The per-chunk results of ``score_text_batch`` joined with ``torch.cat``.
    """
    texts = samples[column]
    chunk_starts = range(0, len(texts), BATCH_SIZE)

    # Scoring is inference only, so no autograd graph is needed.
    with torch.no_grad():
        per_chunk_scores = [
            watermark.score_text_batch(texts[start:start + BATCH_SIZE])
            for start in chunk_starts
        ]

    return torch.cat(per_chunk_scores)

#### Human text / negative sample scores

In [None]:
# Scores of the human-written texts; later cells use them as the null
# (negative) distribution when computing detection metrics.
negative_z = get_zcores("human_text")
mean_negative_z, std_negative_z = negative_z.mean().item(), negative_z.std().item()
print(f"Mean negative z value: {mean_negative_z}")
print(f"Std negative z value: {std_negative_z}")

#### Watermarked text score

In [None]:
def compute_metrics(watermark_scores, null_scores):
    """Summarise how well detection scores separate watermarked from null text.

    Higher scores count as stronger evidence of a watermark: watermarked
    samples are labelled 1, null samples 0, and a sample is predicted as
    watermarked when its score is ``>=`` the threshold.

    Args:
        watermark_scores: Array-like of scores for watermarked text
            (assumed 1-D; the caller passes numpy arrays).
        null_scores: Array-like of scores for un-watermarked text
            (assumed 1-D; the caller passes numpy arrays).

    Returns:
        dict of plain Python floats with the keys
        ``auroc``, ``best_f1_score``, ``tpr_1_fpr``, ``tpr_0.1_fpr``,
        ``tpr_0.05_fpr``, ``tpr_5_fpr``, ``best_threshold`` and
        ``threshold_at_1_fpr``.

    Raises:
        ValueError: If either score array is empty.
    """
    watermark_scores = np.asarray(watermark_scores)
    null_scores = np.asarray(null_scores)
    if watermark_scores.size == 0 or null_scores.size == 0:
        raise ValueError(
            "compute_metrics needs at least one watermarked and one null score.")

    # Sweep range for the F1 search, padded by 1 on both sides so that the
    # "predict everything" and "predict nothing" thresholds are included.
    min_sweep = min(watermark_scores.min().item(),
                    null_scores.min().item()) - 1
    max_sweep = max(watermark_scores.max().item(),
                    null_scores.max().item()) + 1

    # Label 1 = watermarked, label 0 = null.
    y_true = np.concatenate([
        np.ones_like(watermark_scores),
        np.zeros_like(null_scores)
    ])
    y_score = np.concatenate([watermark_scores, null_scores])

    # Scores are used directly: a higher score means "more likely watermarked".
    auroc = roc_auc_score(y_true, y_score)

    # F1 sweep. Precision is kept alongside F1 so that ties between
    # thresholds can be broken without a second pass over the data.
    thresholds = np.linspace(min_sweep, max_sweep, 1000)
    f1_scores = np.zeros(len(thresholds))
    precisions = np.zeros(len(thresholds))
    for idx, threshold in enumerate(thresholds):
        y_pred = (y_score >= threshold).astype(int)  # 1 = predicted watermarked
        tp = np.sum((y_true == 1) & (y_pred == 1))
        fp = np.sum((y_true == 0) & (y_pred == 1))
        fn = np.sum((y_true == 1) & (y_pred == 0))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1_score = 2 * (precision * recall) / (precision +
                                               recall) if (precision + recall) > 0 else 0
        f1_scores[idx] = f1_score
        precisions[idx] = precision

    best_f1_score = f1_scores.max()
    best_indices = np.flatnonzero(f1_scores == best_f1_score)

    # Among the thresholds that reach the best F1, keep those with the
    # highest precision, then take the largest such threshold.
    best_precisions = precisions[best_indices]
    precision_indices = best_indices[best_precisions == best_precisions.max()]
    best_threshold = thresholds[precision_indices].max()

    # TPR at fixed FPR levels, linearly interpolated along the ROC curve.
    fpr_array, tpr_array, roc_thresholds = roc_curve(y_true, y_score)
    tpr_interp = interp1d(fpr_array, tpr_array, kind='linear',
                          bounds_error=False, fill_value=(tpr_array[0], tpr_array[-1]))
    tpr_at_1_fpr = float(tpr_interp(0.01))
    tpr_at_01_fpr = float(tpr_interp(0.001))
    tpr_at_005_fpr = float(tpr_interp(0.0005))
    tpr_at_5_fpr = float(tpr_interp(0.05))

    # Threshold of the ROC point whose FPR is closest to 1%. The first entry
    # returned by roc_curve is a sentinel for "predict nothing" (its threshold
    # is +inf in recent scikit-learn releases), so it is excluded: it is not a
    # usable score cut-off and +inf does not serialise to standard JSON.
    fpr_diff = np.abs(fpr_array - 0.01)
    if len(fpr_diff) > 1:
        nearest_idx = 1 + int(fpr_diff[1:].argmin())
    else:
        nearest_idx = 0
    threshold_at_1_fpr = float(roc_thresholds[nearest_idx])

    # Cast everything to built-in floats so the dict can be dumped as JSON.
    return {
        "auroc": float(auroc),
        "best_f1_score": float(best_f1_score),
        "tpr_1_fpr": tpr_at_1_fpr,
        "tpr_0.1_fpr": tpr_at_01_fpr,
        "tpr_0.05_fpr": tpr_at_005_fpr,
        "tpr_5_fpr": tpr_at_5_fpr,
        "best_threshold": float(best_threshold),
        "threshold_at_1_fpr": threshold_at_1_fpr,
    }


def compute_scores(column):
    """Score one sample column, plot it against the human baseline, return metrics.

    The column is scored with ``get_zcores``; its histogram is drawn together
    with the histogram of the module-level ``negative_z`` scores, the mean and
    standard deviation of the column's scores are printed, and both score sets
    are passed to ``compute_metrics``.

    Args:
        column: Key into ``samples`` naming the texts to evaluate.

    Returns:
        The metrics dict produced by ``compute_metrics``.
    """
    positive_z = get_zcores(column)
    mean_positive_z = positive_z.mean().item()
    std_positive_z = positive_z.std().item()

    # Overlay both score distributions on one figure.
    plt.figure(figsize=(8, 6))
    histogram_series = (
        (negative_z, 'Human Text', 'blue'),
        (positive_z, 'Watermarked Text', 'orange'),
    )
    for scores, series_label, series_color in histogram_series:
        plt.hist(scores.cpu().numpy(), bins=50, alpha=0.5,
                 label=series_label, color=series_color)
    plt.title('Avg LLR Scores')
    plt.xlabel('Avg LLR Score')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid()
    plt.show()

    print(f"Mean positive z value: {mean_positive_z}")
    print(f"Std positive z value: {std_positive_z}")

    # compute_metrics works on numpy arrays, so move the tensors to the CPU first.
    return compute_metrics(positive_z.cpu().numpy(),
                           negative_z.cpu().numpy())

In [None]:
# Evaluate the watermarked generations and record the result next to the
# data that was loaded from the results file.
output_data["metrics"] = metrics = compute_scores("model_text")

# One "name: value" line per metric.
print("Metrics:")
for key, value in metrics.items():
    print(f"{key}: {value}")

In [None]:
# Clear PyTorch's CUDA cache before the remaining columns are scored.
torch.cuda.empty_cache()

#### Paraphrased text score

In [None]:
# Score the paraphrased columns with half the batch size. The clamp keeps the
# value at 1 or more: this cell rewrites BATCH_SIZE in place, so running it
# repeatedly would otherwise reach 0 and make range(..., 0) in get_zcores fail.
BATCH_SIZE = max(1, BATCH_SIZE // 2)

# The paraphrase columns are optional; evaluate this one only when present.
if "dipper_text_lex60_order0" in samples:
    metrics = compute_scores("dipper_text_lex60_order0")
    output_data["metrics_dipper_text_lex60_order0"] = metrics
    print("Dipper 60 Metrics:")
    for key, value in metrics.items():
        print(f"    {key}: {value}")

In [None]:
if "dipper_text_lex20_order0" in samples:
    metrics = compute_scores("dipper_text_lex20_order0")
    output_data["metrics_dipper_text_lex20_order0"] = metrics
    print("Dipper 20 Metrics:")
    for key, value in metrics.items():
        print(f"    {key}: {value}")

In [None]:
# Save the output data (now including the metrics) back to the output file.
# Serialise to a string *before* opening the file: output_file is the same
# file the samples were loaded from, and opening it in "w" mode truncates it
# immediately, so an error raised part-way through json.dump would wipe the
# original data.
serialized_output = json.dumps(output_data, indent=4)
with open(output_file, "w") as f:
    f.write(serialized_output)