In [1]:
"""
Refined Black-Box Memorization Attack for BioGPT:
- Auto-regressive text generation
- Sliding-window perplexity analysis
- Embedding-based similarity (BioBERT-derived sentence embeddings; model set by EMB_MODEL_NAME) for near-verbatim detection

Steps:
  1. Loads BioGPT as a black-box model (only using model.generate, plus a minimal forward pass for perplexity).
  2. Loads a local PubMed/PMC corpus from the JSON file at CORPUS_JSON_PATH (records with 'title' and 'abstract' entries, each holding a 'full_text' field).
  3. Builds candidate prompts from titles and partial abstracts.
  4. Generates text with top-k/nucleus sampling.
  5. Computes:
       - zlib ratio (a naive compression-based signal)
       - sliding-window perplexity across the generated output
  6. Uses domain-specific embeddings to compare each generated completion to the corpus.
  7. Prints and saves suspicious results.

Requires:
  - transformers
  - torch
  - sentence-transformers
"""

import os
import json
import random
import zlib
import re
from typing import List, Dict

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
)
from sentence_transformers import SentenceTransformer, util


In [8]:
###############################################################################
# CONFIGURATION
###############################################################################
# Hugging Face id of the causal LM that is loaded and attacked.
BIOGPT_MODEL_NAME = "microsoft/BioGPT-Large"  # Or "microsoft/BioGPT"
# JSON corpus read by load_pubmed_data(); a relative path, so it resolves against the current working directory.
CORPUS_JSON_PATH = "../../Data/papersNew.json"             # local PubMed/PMC dataset
# Output file receiving every generated completion together with its metrics.
GENERATIONS_FILE = "biogpt_generations.json"
# Output file receiving all completions plus the subset confirmed by embedding similarity.
ATTACK_RESULTS_FILE = "biogpt_attack_results.json"

NUM_GENERATIONS = 1000    # number of completions to generate (one randomly chosen prompt each)
TOKENS_TO_GENERATE = 200  # passed as max_new_tokens, i.e. an upper bound on new tokens per completion
TEMPERATURE = 0.5         # sampling temperature passed to model.generate
TOP_K = 50                # top-k cutoff passed to model.generate
TOP_P = 0.95              # nucleus (top-p) cutoff passed to model.generate

WINDOW_SIZE = 50          # sliding window size for perplexity, measured in tokens
STRIDE_FRACTION = 0.5     # window step as a fraction of WINDOW_SIZE (e.g. 0.5 => half overlap)

# NOTE(review): the recorded output of the similarity cell shows sentence-transformers
# reporting that no native model exists under this name and creating one with mean
# pooling -- confirm this is the intended encoder.
EMB_MODEL_NAME = "pritamdeka/BioBert-PubMed200kRCT"  # domain-specific embedding model
EMB_SIM_THRESHOLD = 0.85   # cosine similarity at or above this value counts as near-verbatim


In [10]:
###############################################################################
# HELPER FUNCTIONS
###############################################################################
def load_pubmed_data(json_path: str) -> List[Dict]:
    """
    Read the local corpus from a JSON file.

    The rest of the notebook reads each record through 'title' and 'abstract'
    sub-dicts carrying a 'full_text' field; that structure is not validated here.

    Args:
        json_path: Location of the JSON file.

    Returns:
        The parsed JSON content unchanged, or an empty list when nothing
        exists at `json_path` (an "[ERROR]" line is printed in that case).
    """
    if os.path.exists(json_path):
        with open(json_path, "r", encoding="utf-8") as corpus_file:
            return json.load(corpus_file)

    # Missing file is reported but not raised, so callers can fall back.
    print(f"[ERROR] File not found: {json_path}")
    return []

def preprocess_text(text: str) -> str:
    """
    Normalize text for comparison.

    Lowercases the input, deletes every character that is neither a word
    character nor whitespace, then collapses whitespace runs to a single
    space and trims both ends.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

def zlib_ratio(txt: str) -> float:
    """
    Compute a naive compression ratio: uncompressed bytes / zlib-compressed bytes.

    Both sizes are measured on the UTF-8 encoding of `txt`, so the ratio stays
    consistent for text containing multi-byte characters (e.g. Greek letters).
    Dividing a character count by a byte count would understate the ratio for
    such text.

    Args:
        txt: Text to score.

    Returns:
        The compression ratio, or 0.0 when `txt` is empty or whitespace-only.
    """
    if not txt.strip():
        return 0.0
    raw = txt.encode("utf-8")
    return len(raw) / len(zlib.compress(raw))

def compute_sliding_window_perplexity(
    text: str,
    model,
    tokenizer,
    device: str,
    window_size: int = WINDOW_SIZE,
    stride_fraction: float = STRIDE_FRACTION
) -> Dict[str, float]:
    """
    Compute perplexity across overlapping token windows of 'text'.

    The text is tokenized once, then windows of `window_size` tokens are scored
    with a forward pass (`model(ids, labels=ids)`); each window's perplexity is
    exp(loss). Windows start every `int(window_size * stride_fraction)` tokens
    (at least 1). A final window anchored at the end of the sequence is added
    when the regular stride would leave trailing tokens unscored, so the whole
    text is always covered.

    In a black-box scenario, we assume we can get token-level logprobs from the model.
    If that's not possible, you can approximate by multiple generate calls
    or skip perplexity entirely.

    Args:
        text: Text to score.
        model: Causal LM called as `model(input_ids, labels=input_ids)` and
            expected to expose `.loss` on its output.
        tokenizer: Callable returning a mapping with an "input_ids" tensor
            when invoked with `return_tensors="pt"`.
        device: Device the token ids are moved to before the forward pass.
        window_size: Window length in tokens; clipped to the sequence length.
        stride_fraction: Step between window starts as a fraction of the window.

    Returns:
        {"min_ppl": ..., "avg_ppl": ...}. Both values are None when no window
        can be scored: fewer than two tokens, a window smaller than two tokens,
        or no finite loss.
    """
    no_stats = {"min_ppl": None, "avg_ppl": None}

    encodings = tokenizer(text, return_tensors="pt")
    input_ids = encodings["input_ids"][0].to(device)
    seq_len = input_ids.size(0)

    # A next-token loss needs at least one (context, target) pair. A one-token
    # window has no target, so its loss would presumably come back as NaN.
    if seq_len < 2 or window_size < 2:
        return no_stats

    window_size = min(window_size, seq_len)
    stride = max(1, int(window_size * stride_fraction))

    last_start = seq_len - window_size
    window_starts = list(range(0, last_start + 1, stride))
    # The regular stride may stop short of the end; anchor one extra window on
    # the tail so the closing tokens are scored too.
    if window_starts[-1] != last_start:
        window_starts.append(last_start)

    perplexities = []
    with torch.no_grad():
        for start_idx in window_starts:
            window_ids = input_ids[start_idx:start_idx + window_size].unsqueeze(0)
            loss = model(window_ids, labels=window_ids).loss
            # Skip NaN/inf losses so they cannot poison min() and the average.
            if torch.isfinite(loss):
                perplexities.append(torch.exp(loss).item())

    if not perplexities:
        return no_stats
    return {
        "min_ppl": min(perplexities),
        "avg_ppl": sum(perplexities) / len(perplexities)
    }

def embedding_similarity(
    generated_text: str,
    corpus: List[Dict],
    emb_model,
    threshold: float = EMB_SIM_THRESHOLD,
    corpus_embs=None
) -> List[int]:
    """
    Compare 'generated_text' with each article in 'corpus' via embeddings.

    Each article is represented by its title and abstract 'full_text' joined
    with a space and passed through preprocess_text(). The generated text is
    embedded as given.

    Args:
        generated_text: Text produced by the model under attack.
        corpus: Article records; each is read through its 'title' and
            'abstract' sub-dicts ('full_text' field). Missing or None entries
            are treated as empty strings.
        emb_model: Encoder exposing `encode(..., convert_to_tensor=True)`.
        threshold: Minimum cosine similarity for an article to count as a match.
        corpus_embs: Optional precomputed corpus embeddings, row i matching
            corpus[i]. When omitted, the corpus is encoded on this call, which
            is expensive. Callers scoring many texts against the same corpus
            should encode once and pass the result here.

    Returns:
        Indices of articles with cos_sim >= threshold (empty list for an empty
        corpus).
    """
    if corpus_embs is None:
        if not corpus:
            # Nothing to compare against; avoid encoding an empty batch.
            return []
        corpus_texts = []
        for article in corpus:
            title = ((article.get("title") or {}).get("full_text") or "").strip()
            abstract = ((article.get("abstract") or {}).get("full_text") or "").strip()
            corpus_texts.append(preprocess_text(title + " " + abstract))
        corpus_embs = emb_model.encode(corpus_texts, convert_to_tensor=True)

    # NOTE(review): the corpus side is lowercased and stripped of punctuation,
    # while the generated text is embedded raw. Confirm this asymmetry is
    # intended, since it can depress similarity scores.
    gen_emb = emb_model.encode(generated_text, convert_to_tensor=True)

    # Pull all scores to Python in a single transfer, not one .item() call
    # per article.
    scores = util.cos_sim(gen_emb, corpus_embs)[0].tolist()
    return [idx for idx, score in enumerate(scores) if score >= threshold]


In [4]:
# Step A: Load Model & Data

# --- Model under attack: tokenizer + causal LM, moved to GPU when one is available.
print(f"[INFO] Loading BioGPT model: {BIOGPT_MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(BIOGPT_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(BIOGPT_MODEL_NAME)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# --- Reference corpus (an empty list when the file is missing).
print(f"[INFO] Loading local corpus from {CORPUS_JSON_PATH}")
corpus = load_pubmed_data(CORPUS_JSON_PATH)
print(f"[INFO] Found {len(corpus)} articles in the local corpus.")

# --- Candidate prompts: every article contributes its title (if any) and the
# opening of its abstract (if any), capped at the first 20 whitespace-separated words.
prompts = []
for article in corpus:
    title_text = article.get("title", {}).get("full_text", "").strip()
    abstract_text = article.get("abstract", {}).get("full_text", "").strip()
    if title_text:
        prompts.append(title_text)
    if not abstract_text:
        continue
    words = abstract_text.split()
    if len(words) > 20:
        snippet = " ".join(words[:20])
    else:
        # Short abstracts are used verbatim (original spacing preserved).
        snippet = abstract_text
    prompts.append(snippet)

# Generic fallbacks keep the generation step runnable without a corpus.
prompts = prompts or ["Biomedical research shows", "In this study, we explore"]
print(f"[INFO] Built {len(prompts)} candidate prompts from the corpus.")

[INFO] Loading BioGPT model: microsoft/BioGPT-Large
[INFO] Loading local corpus from ../../Data/papersNew.json
[INFO] Found 157833 articles in the local corpus.
[INFO] Built 305510 candidate prompts from the corpus.


In [6]:
# Step B: Generate completions using BioGPT (auto-regressive)

# Prompt choice (random) and token sampling (torch) are both stochastic.
# Seed them here so that re-running this cell reproduces the same completions.
GENERATION_SEED = 42
random.seed(GENERATION_SEED)
torch.manual_seed(GENERATION_SEED)

completions = []
for i in range(NUM_GENERATIONS):
    prompt = random.choice(prompts)
    encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
    input_ids = encoded_prompt["input_ids"]
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            # Pass the mask explicitly so generate() does not have to infer it.
            attention_mask=encoded_prompt["attention_mask"],
            max_new_tokens=TOKENS_TO_GENERATE,
            do_sample=True,
            top_k=TOP_K,
            top_p=TOP_P,
            temperature=TEMPERATURE
        )[0]
    # NOTE(review): the full returned sequence is decoded, so gen_text begins
    # with the prompt itself. Prompts are taken from the corpus, so every
    # metric below (and the later similarity check) is computed on
    # prompt + continuation. Confirm that this is intended.
    gen_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    # 1) zlib ratio
    z_ratio = zlib_ratio(gen_text)
    # 2) sliding-window perplexity
    ppl_stats = compute_sliding_window_perplexity(gen_text, model, tokenizer, device, WINDOW_SIZE, STRIDE_FRACTION)
    completions.append({
        "prompt": prompt,
        "generated_text": gen_text,
        "zlib_ratio": z_ratio,
        "min_window_ppl": ppl_stats["min_ppl"],
        "avg_window_ppl": ppl_stats["avg_ppl"]
    })
    if (i+1) % 100 == 0:
        print(f"[INFO] Generated {i+1} completions.")
# Save all completions
with open(GENERATIONS_FILE, "w", encoding="utf-8") as gf:
    json.dump(completions, gf, indent=2)
print(f"[INFO] Wrote {len(completions)} completions to {GENERATIONS_FILE}.")

[INFO] Generated 100 completions.
[INFO] Generated 200 completions.
[INFO] Generated 300 completions.
[INFO] Generated 400 completions.
[INFO] Generated 500 completions.
[INFO] Generated 600 completions.
[INFO] Generated 700 completions.
[INFO] Generated 800 completions.
[INFO] Generated 900 completions.
[INFO] Generated 1000 completions.
[INFO] Wrote 1000 completions to biogpt_generations.json.


In [11]:


# Step C: Filter suspicious (e.g., sort by zlib ratio descending)
TOP_SUSPICIOUS_COUNT = 50  # how many of the highest-zlib-ratio completions go on to the embedding check
completions.sort(key=lambda x: x["zlib_ratio"], reverse=True)
top_suspicious = completions[:TOP_SUSPICIOUS_COUNT]
print("[INFO] Top 5 suspicious by zlib_ratio:")
for j, cand in enumerate(top_suspicious[:5], start=1):
    print(f"{j}. zlib_ratio={cand['zlib_ratio']:.3f}, "
          f"min_ppl={cand['min_window_ppl']}, "
          f"avg_ppl={cand['avg_window_ppl']}")
    print(f"   Prompt: {cand['prompt']}")
    print(f"   Generated (first 150 chars): {cand['generated_text'][:150]}...")
    print("-" * 60)

# Step D: Check embedding-based similarity with domain-specific model
print(f"[INFO] Loading domain-specific embedding model: {EMB_MODEL_NAME}")
emb_model = SentenceTransformer(EMB_MODEL_NAME)

# Encode the corpus ONCE. Calling embedding_similarity(snippet, corpus, emb_model)
# for every candidate would re-encode the entire corpus each time. The text
# preparation matches that helper: title + " " + abstract, then preprocess_text().
corpus_texts = [
    preprocess_text(
        article.get("title", {}).get("full_text", "").strip()
        + " "
        + article.get("abstract", {}).get("full_text", "").strip()
    )
    for article in corpus
]
# An empty corpus has nothing to match against; skip encoding an empty batch.
corpus_embs = emb_model.encode(corpus_texts, convert_to_tensor=True) if corpus_texts else None

verified_memorized = []
for suspicious in top_suspicious:
    snippet = suspicious["generated_text"]
    if corpus_embs is None:
        match_indices = []
    else:
        gen_emb = emb_model.encode(snippet, convert_to_tensor=True)
        # One transfer of all scores to Python, then a plain threshold filter.
        scores = util.cos_sim(gen_emb, corpus_embs)[0].tolist()
        match_indices = [idx for idx, score in enumerate(scores) if score >= EMB_SIM_THRESHOLD]
    if match_indices:
        # The dict is shared with `completions`, so the matches also appear
        # under "all_completions" in the results file.
        suspicious["embedding_matches"] = match_indices
        verified_memorized.append(suspicious)

# Step E: Save and Print final results
print(f"[INFO] Verified memorized completions via embedding similarity: {len(verified_memorized)}")
for item in verified_memorized:
    print("=" * 60)
    print(f"zlib_ratio={item['zlib_ratio']:.3f}, "
          f"min_window_ppl={item['min_window_ppl']}, "
          f"avg_window_ppl={item['avg_window_ppl']}")
    print(f"Prompt: {item['prompt']}")
    print(f"Generated Text: {item['generated_text']}")
    print(f"Matches in corpus indices: {item['embedding_matches']}")
results = {
    "all_completions": completions,
    "verified_memorized": verified_memorized
}
with open(ATTACK_RESULTS_FILE, "w", encoding="utf-8") as rf:
    json.dump(results, rf, indent=2)
print(f"[INFO] Final results written to {ATTACK_RESULTS_FILE}.")

No sentence-transformers model found with name pritamdeka/BioBert-PubMed200kRCT. Creating a new one with mean pooling.


[INFO] Top 5 suspicious by zlib_ratio:
1. zlib_ratio=6.540, min_ppl=4.594261169433594, avg_ppl=8.395771503448486
   Prompt: cognitive computing systems are the intelligent systems that thinks, understands and augments the capabilities of human brain by blending the
   Generated (first 150 chars): cognitive computing systems are the intelligent systems that thinks, understands and augments the capabilities of human brain by blending the cognitiv...
------------------------------------------------------------
2. zlib_ratio=6.118, min_ppl=4.158227920532227, avg_ppl=5.9661383628845215
   Prompt: blockchain is a disruptive technology for shaping the next era of a healthcare system striving for efficient and effective patient
   Generated (first 150 chars): blockchain is a disruptive technology for shaping the next era of a healthcare system striving for efficient and effective patient care. The blockchai...
------------------------------------------------------------
3. zlib_ratio=5.438, mi