In [None]:
import os
import sys

# Location of the checkout that contains the `os-watermarking` directory, taken from the
# ROOT_PATH environment variable.
root_dir_path = os.getenv("ROOT_PATH")

if root_dir_path is None:
    # Without ROOT_PATH the entries below would be the bogus "None/os-watermarking" paths.
    # Skip them and say so, so that a failing MarkLLM import further down is easy to diagnose.
    print(
        "ROOT_PATH is not set; not adding the os-watermarking directories to sys.path.",
        file=sys.stderr,
    )
else:
    sys.path.append(f"{root_dir_path}/os-watermarking") #! add path for the watermarks
    sys.path.append(f"{root_dir_path}/os-watermarking/MarkLLM") #! add path for the watermarks

In [None]:
import json
import torch
import math
import scipy.stats
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from scipy.interpolate import interp1d
from sklearn.metrics import roc_auc_score, roc_curve
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# Seed PyTorch's global random number generator so torch-based randomness is repeatable across runs.
torch.manual_seed(42)

from MarkLLM.watermark.auto_watermark import AutoWatermarkForVLLM
from MarkLLM.utils.transformers_config import TransformersConfig

In [None]:
# Run configuration, all driven by environment variables.
#   MODEL       -- Hugging Face model identifier (defaults to Llama-2-7b).
#   OUTPUT_FILE -- JSON file holding the generated samples; required.
model_name = os.environ.get("MODEL", "meta-llama/Llama-2-7b-hf")
# Last path component of the identifier, e.g. "Llama-2-7b-hf".
model_suffix = model_name.rsplit("/", 1)[-1]
output_file = os.environ.get("OUTPUT_FILE")

assert output_file is not None, "Please set the OUTPUT_FILE environment variable to the path of your output file."

BATCH_SIZE = 64
print(f"Batch size: {BATCH_SIZE}")
print(f"Model name: {model_name}")

In [None]:

# Parse the JSON results file; the texts to score live under its "samples" key.
with open(output_file, "r") as f:
    output_data = json.loads(f.read())

samples = output_data["samples"]

In [None]:
# Hugging Face model configuration, read here only for its vocabulary size. It gets its own
# name so it is not confused with the watermark settings dict bound to `config` below.
model_config = AutoConfig.from_pretrained(model_name)

# Bundle the model, tokenizer, vocabulary size and device for the watermark implementation.
transformers_config = TransformersConfig(
        model=AutoModelForCausalLM.from_pretrained(model_name),
        tokenizer=AutoTokenizer.from_pretrained(model_name),
        vocab_size=model_config.vocab_size,
        # Same as before on GPU machines; falls back to CPU instead of failing when CUDA is absent.
        device="cuda" if torch.cuda.is_available() else "cpu"
    )

# Watermark name and hyper-parameters recorded alongside the samples.
watermark_type = output_data["watermark"]
config = output_data["config"]

#* Load the relevant watermark
# The config file path is derived from the recorded hyper-parameters.
wtm_config = f'config/{watermark_type}/prefix_{config["prefix_length"]}_gamma_{config["gamma"]}_delta_{config["delta"]}_key_{config["hash_key"]}.json'
# NOTE(review): the algorithm name is hard-coded to "KGW" while the config directory uses
# `watermark_type` -- confirm this is intended for every watermark stored in the results.
watermark = AutoWatermarkForVLLM(algorithm_name="KGW", 
                        algorithm_config=wtm_config, 
                        transformers_config=transformers_config)

In [None]:
def get_zcores(column):
    """Run the watermark detector over every text stored under ``samples[column]``.

    Args:
        column: Key into the module-level ``samples`` mapping selecting the texts to score.

    Returns:
        A ``torch`` tensor built from the ``'score'`` entry of each detection result,
        in the same order as the texts.
    """
    scores = [
        watermark.detect_watermark(text)['score']
        for text in samples[column]
    ]
    return torch.tensor(scores)

#### Human text / negative sample scores

In [None]:
# Detector scores for the human-written texts; these serve as the negative (null) samples.
negative_z = get_zcores("human_text")
mean_negative_z, std_negative_z = negative_z.mean().item(), negative_z.std().item()
print(f"Mean negative z value: {mean_negative_z}")
print(f"Std negative z value: {std_negative_z}")

#### Watermarked text score

In [None]:
def compute_metrics(watermark_scores, null_scores):
    """Summarise how well a score separates watermarked samples from null samples.

    Label convention used throughout: watermarked samples are class 0 and null samples
    are class 1, and a sample is predicted as class 1 (null) when its score is at or
    above the threshold. F1, TPR and FPR are therefore all measured with the *null*
    class as the positive class.
    NOTE(review): confirm this polarity is what the reported "tpr_*_fpr" values are
    meant to express.

    Args:
        watermark_scores: Array of scores for the watermarked samples (must support
            ``.min()`` / ``.max()`` returning something with ``.item()``).
        null_scores: Array of scores for the null samples, same requirements.

    Returns:
        dict with
        - ``"auroc"``: area under the ROC curve,
        - ``"best_f1_score"``: highest F1 over 1000 evenly spaced thresholds,
        - ``"tpr_1_fpr"``: linearly interpolated TPR at FPR = 0.01,
        - ``"tpr_0.1_fpr"``: linearly interpolated TPR at FPR = 0.001.
    """
    # Threshold sweep range: one unit beyond the extreme scores on either side.
    sweep_lo = min(watermark_scores.min().item(), null_scores.min().item()) - 1
    sweep_hi = max(watermark_scores.max().item(), null_scores.max().item()) + 1

    labels = np.concatenate([
        np.zeros_like(watermark_scores),
        np.ones_like(null_scores),
    ])
    scores = np.concatenate([watermark_scores, null_scores])

    # Original logic: p-values → low = positive = watermark
    auroc = roc_auc_score(labels, scores)

    def _f1_at(threshold):
        # F1 of the "null" class when everything scoring >= threshold is predicted null.
        predicted = (scores >= threshold).astype(int)
        tp = np.sum((labels == 1) & (predicted == 1))
        fp = np.sum((labels == 0) & (predicted == 1))
        fn = np.sum((labels == 1) & (predicted == 0))
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        if (precision + recall) > 0:
            return 2 * (precision * recall) / (precision + recall)
        return 0

    best_f1_score = max(_f1_at(t) for t in np.linspace(sweep_lo, sweep_hi, 1000))

    # Read TPR off the ROC curve at fixed FPR levels via linear interpolation; outside the
    # curve's FPR range the first / last TPR value is used.
    fpr, tpr, _ = roc_curve(labels, scores)
    tpr_given_fpr = interp1d(fpr, tpr, kind='linear',
                             bounds_error=False, fill_value=(tpr[0], tpr[-1]))

    return {
        "auroc": auroc,
        "best_f1_score": best_f1_score,
        "tpr_1_fpr": float(tpr_given_fpr(0.01)),
        "tpr_0.1_fpr": float(tpr_given_fpr(0.001)),
    }


def compute_scores(column):
    """Score the texts in ``samples[column]``, plot them against the negatives, and return metrics.

    The detector scores of ``column`` are treated as the positive samples and the
    module-level ``negative_z`` scores as the negatives. A histogram of both is shown,
    the mean and standard deviation of the positive scores are printed, and the scores
    are negated before being handed to ``compute_metrics``.

    Args:
        column: Key into ``samples`` selecting the texts to evaluate.

    Returns:
        The metrics dict produced by ``compute_metrics``.
    """
    positive_z = get_zcores(column)
    positive_mean = positive_z.mean().item()
    positive_std = positive_z.std().item()

    # Overlay the two score distributions on a single axis.
    fig, ax = plt.subplots(figsize=(8, 6))
    histogram_specs = (
        (negative_z, 'Negative Z-Scores', 'blue'),
        (positive_z, 'Positive Z-Scores', 'orange'),
    )
    for z_values, label, color in histogram_specs:
        ax.hist(z_values.cpu().numpy(), bins=50, alpha=0.5,
                label=label, color=color)
    ax.set_title('Z-Score Distribution')
    ax.set_xlabel('Z-Score')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid()
    plt.show()

    print(f"Mean positive z value: {positive_mean}")
    print(f"Std positive z value: {positive_std}")

    # Both score arrays are sign-flipped before the metrics are computed.
    return compute_metrics(-positive_z.cpu().numpy(), -negative_z.cpu().numpy())

In [None]:
# Metrics for the model-generated text, stored in the results under "metrics".
metrics = output_data["metrics"] = compute_scores("model_text")

In [None]:
# Clear the cache
# Releases cached GPU memory held by PyTorch's allocator. No gradient-mode context is
# needed for this call, so it is made directly.
torch.cuda.empty_cache()

#### Paraphrased text score

In [None]:
# NOTE(review): this halves BATCH_SIZE every time the cell is run, so re-running the cell
# keeps shrinking it. BATCH_SIZE is not read by any code visible in this notebook.
BATCH_SIZE //= 2

# Evaluate the paraphrased texts only when the results file contains them.
if "dipper_text_lex60_order0" in samples:
    metrics = output_data["metrics_dipper_text_lex60_order0"] = compute_scores("dipper_text_lex60_order0")

In [None]:
# Evaluate the second paraphrase column only when the results file contains it.
if "dipper_text_lex20_order0" in samples:
    metrics = output_data["metrics_dipper_text_lex20_order0"] = compute_scores("dipper_text_lex20_order0")

#### Save the results

In [None]:
# Save the output data to the output file
# `output_file` is also the file the samples were read from. Serialise to a string first,
# so a serialisation error is raised *before* the file is opened for writing (and thereby
# truncated), instead of leaving a half-written results file behind.
serialized_output = json.dumps(output_data, indent=4)
with open(output_file, "w") as f:
    f.write(serialized_output)