In [1]:
import os
import sys

# Directory added to the import path so the project-local `src.*` modules
# imported later in the notebook can be found (presumably the repository
# checkout -- confirm). Override with WATERMARK_REPO_ROOT on other machines;
# the default keeps the original location.
REPO_ROOT = os.getenv(
    "WATERMARK_REPO_ROOT", "/data/users/miroojin/saksham/watermark-adapters"
)
# Guarded so that re-running this cell does not pile up duplicate entries.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

# Restrict the process to GPUs 0-2. This has to happen before any library
# initialises CUDA, which is why it sits in the very first cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"

In [2]:
# Notebook-wide imports: standard library, third-party packages, and the
# project-local watermark implementations under `src`.
import json
# Watermark implementations (project-local modules).
from src.mbmark import MbMark, MbMark2, Mode
from src.gaussmark import GaussMark
from sklearn.metrics import roc_auc_score
import os  # NOTE(review): already imported in the first cell; redundant here
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from datasets import load_dataset, load_from_disk
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.stats
# Seed PyTorch's global RNG with a fixed value for repeatable runs.
torch.manual_seed(42)
import numpy as np
# NOTE(review): roc_auc_score is imported a second time here (also a few lines above).
from sklearn.metrics import roc_auc_score, roc_curve
from scipy.interpolate import interp1d


In [3]:
# --- Run configuration (each value can be overridden via an environment variable) ---

# Identifier of the base model to load.
model_name = os.getenv("MODEL", "mistralai/Mistral-7B-v0.3")
# Last "/"-separated component of the model identifier.
model_suffix = model_name.split("/")[-1]
# Path of the JSON file that a later cell reads for the watermark type and config.
output_file = os.getenv("OUTPUT_FILE", "../output/Mistral-7B-v0.3/output_align=0_delta=1.2_gamma=0.3_k=4_seed=12997009_watermark=mb_dataset=arxiv.json")

# os.getenv() is called with a default, so it can never return None; checking
# truthiness instead also rejects an explicitly empty OUTPUT_FILE="".
assert output_file, "Please set the OUTPUT_FILE environment variable to the path of your output file."


# Default batch size; the loading cell lowers it for the "gaussmark" watermark.
BATCH_SIZE = 64
print(f"Batch size: {BATCH_SIZE}")
print(f"Model name: {model_name}")

Batch size: 64
Model name: mistralai/Mistral-7B-v0.3


In [4]:
# Load the tokenizer. `device_map` is a model-loading option and has no meaning
# for a tokenizer, so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to the EOS token when the tokenizer defines no padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model in bfloat16, letting `device_map="auto"` place it on the
# visible devices.
model = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", torch_dtype=torch.bfloat16)

# Read the watermark type and its configuration from the run's JSON output.
with open(output_file, "r", encoding="utf-8") as f:
    output_data = json.load(f)

watermark_type = output_data["watermark"]
config = output_data["config"]

# Rebuild the watermark object that matches the recorded type.
if watermark_type == "gaussmark":
    watermark = GaussMark(sigma=config["sigma"],
                          tokenizer=tokenizer, model=model, seed=config["hash_key"], target_param_name=config["target_param_name"])
    # GaussMark runs use a smaller batch size than the default set earlier.
    BATCH_SIZE = 8
elif watermark_type == "mb":
    # The "mb" variant restores its matrix from the values stored in the JSON.
    final_weight = torch.tensor(output_data["final_matrix"])
    watermark = MbMark.from_params(
        delta=config["delta"],
        gamma=config["gamma"],
        seed=config["hash_key"],
        final_weight=final_weight,
        model=model,
        tokenizer=tokenizer,
        unembedding_param_name=config["unembedding_param_name"],
        mode=Mode.Generate,
    )
elif watermark_type == "mb2":
    watermark = MbMark2.from_seed(
        seed=config["hash_key"],
        model=model,
        tokenizer=tokenizer,
        unembedding_param_name=config["unembedding_param_name"],
        mode=Mode.Generate
    )
else:
    # Fail here with a clear message instead of leaving `watermark` undefined
    # and hitting a NameError in a later cell.
    raise ValueError(
        f"Unsupported watermark type {watermark_type!r}; "
        "expected 'gaussmark', 'mb' or 'mb2'."
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Model object exposed by the watermark wrapper; this is what gets saved.
watermarked_model = watermark.model

# Root directory for saved checkpoints. Override with WATERMARKED_MODELS_DIR;
# the default keeps the original location.
models_root = os.getenv("WATERMARKED_MODELS_DIR", "/pool.ssd/assets/models/")

# The id embeds the base model name, watermark type and seed. When model_name
# contains "/", the checkpoint ends up in a nested sub-directory of models_root.
watermarked_model_id = f"{model_name}-watermarked-{watermark_type}-seed{config['hash_key']}"

# Kept under the original variable name: after this cell it holds the full
# output path, exactly as before.
watermarked_model_name = os.path.join(models_root, watermarked_model_id)

# exist_ok=True replaces the separate existence check and is safe on re-runs.
os.makedirs(watermarked_model_name, exist_ok=True)

# Write model weights and tokenizer files side by side in the same directory.
watermarked_model.save_pretrained(watermarked_model_name)
tokenizer.save_pretrained(watermarked_model_name)
print(f"Watermarked model saved to {watermarked_model_name}")

Watermarked model saved to /pool.ssd/assets/models/mistralai/Mistral-7B-v0.3-watermarked-mb-seed12997009
