In [None]:
"""
Black-Box Memorization Attack with BioGPT

This script simulates a black-box setting using BioGPT:
  1. It loads BioGPT (auto-regressive) and your local PMC data (pmc_fulltext.json).
  2. It builds a list of candidate prompts from the local data (titles and abstract snippets).
  3. It generates many completions from BioGPT using sampling (top-k, top-p, temperature).
  4. It computes two membership-inference metrics for each generation:
       - A naive zlib compression ratio.
       - The perplexity (computed using the model?s output probabilities).
  5. It ranks the completions (here, we assume higher zlib ratio is more suspicious).
  6. It performs a naive substring search in your local corpus to verify if any completion
     appears verbatim.
     
All interactions with the model are via generate() (and a separate forward pass for scoring),
so we're simulating a black-box interface.

Authors: Nilesh Rijhwani & Bhavana Krishna

"""

import os
import json
import random
import zlib
from typing import List, Dict
import re
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [39]:
###############################################################################
# Configuration
###############################################################################
MODEL_NAME = "microsoft/BioGPT-Large"  # or "microsoft/BioGPT"
PMC_JSON_PATH = "../Data/pubmed_2010_2024_intelligence.json"    # local PMC data JSON file
OUTPUT_GENERATIONS = "biogpt_generations.json"
ATTACK_RESULTS = "attack_results.json"

NUM_GENERATIONS = 4000    # Total number of completions to generate
TOKENS_TO_GENERATE = 400  # Number of tokens to generate for each completion
TOP_K = 50
TOP_P = 0.95
TEMPERATURE = 0.6
SUBSTRING_SEARCH_MAX = 2  # Max matching articles per candidate


In [48]:
###############################################################################
# Helper Functions
###############################################################################
def load_pmc_data(json_path: str) -> List[Dict]:
    """Load local PMC data from JSON file."""
    if not os.path.exists(json_path):
        print(f"[ERROR] File not found: {json_path}")
        return []
    data=[]
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip empty lines
                data.append(json.loads(line))
    return data

def zlib_ratio(txt: str) -> float:
    """Compute a naive zlib compression ratio as a membership inference metric."""
    if not txt.strip():
        return 0.0
    compressed = zlib.compress(txt.encode("utf-8"))
    return len(txt) / len(compressed)

def preprocess_text(text: str) -> str:
    """
    Lowercase the text, remove punctuation, and extra whitespace.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def fuzzy_ngram_search(snippet: str, corpus: List[Dict], n: int = 2, threshold: float = 0.3, max_results: int = 2) -> List[int]:
    """
    Compute n-gram overlap similarity between the snippet and the combined text
    (title + abstract) from each article in the corpus.
    
    Returns indices of articles where the overlap ratio (intersection/union) 
    is at least the threshold.
    
    Adjust n and threshold as needed.
    """
    # Preprocess snippet to remove punctuation and normalize text.
    snippet = preprocess_text(snippet)
    snippet_tokens = snippet.split()
    if len(snippet_tokens) < n:
        snippet_ngrams = set([tuple(snippet_tokens)])
    else:
        snippet_ngrams = set(zip(*[snippet_tokens[i:] for i in range(n)]))
    
    matches = []
    for i, article in enumerate(corpus):
        title = article.get("title", {}) or ""
        abstract = article.get("abstract", {}) or ""
        combined = preprocess_text(title + " " + abstract)
        combined_tokens = combined.split()
        if len(combined_tokens) < n:
            combined_ngrams = set([tuple(combined_tokens)])
        else:
            combined_ngrams = set(zip(*[combined_tokens[i:] for i in range(n)]))
        
        if not snippet_ngrams or not combined_ngrams:
            continue

        intersection = snippet_ngrams.intersection(combined_ngrams)
        union = snippet_ngrams.union(combined_ngrams)
        similarity = len(intersection) / len(union) if union else 0.0

        if similarity >= threshold:
            matches.append(i)
            if len(matches) >= max_results:
                break

    return matches

def compute_perplexity(text: str, model, tokenizer, device: str) -> float:
    """
    Compute perplexity for a given text using the model.
    This function uses the model in a black-box way: we simply pass the text and get loss.
    Note: In a real black-box API you might not have this ability.
    """
    inputs = tokenizer(text, return_tensors="pt").to(device)
    # Use labels identical to inputs for computing loss
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    loss = outputs.loss  # average negative log likelihood per token
    perplexity = torch.exp(loss)
    return perplexity.item()

In [42]:
# --- Step A: Load Model & Local Data ---
print(f"[INFO] Loading BioGPT model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"[INFO] Loading local PMC data from {PMC_JSON_PATH}")
corpus = load_pmc_data(PMC_JSON_PATH)
print(f"[INFO] Loaded {len(corpus)} articles from local corpus.")

[INFO] Loading BioGPT model: microsoft/BioGPT-Large
[INFO] Loading local PMC data from ../Data/pubmed_2010_2024_intelligence.json
[INFO] Loaded 54583 articles from local corpus.


In [43]:
# --- Step B: Build Candidate Prompts ---
prompts = []
for article in corpus:
    title = article.get("title", "").strip()
    if title:
        prompts.append(title)
    abstract = article.get("abstract", "").strip()
    if abstract:
        words = abstract.split()
        prompt_abstract = " ".join(words[:20]) if len(words) > 20 else abstract
        prompts.append(prompt_abstract)
if not prompts:
    prompts = ["Biomedical research shows", "In this study, we explore"]
print(f"[INFO] Using {len(prompts)} candidate prompts.")


[INFO] Using 106151 candidate prompts.


In [44]:
# --- Step C: Generate Text Completions (Black-Box) ---
generations = []
for i in range(NUM_GENERATIONS):
    prompt = random.choice(prompts)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=TOKENS_TO_GENERATE,
            do_sample=True,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P
        )[0]
    gen_text = tokenizer.decode(output_ids, skip_special_tokens=True)
    ppl = compute_perplexity(gen_text, model, tokenizer, device)
    generations.append({
        "prompt": prompt,
        "generated_text": gen_text,
        "perplexity": ppl,
        "zlib_ratio": zlib_ratio(gen_text)
    })
    if (i+1) % 100 == 0:
        print(f"[INFO] Generated {i+1} completions.")

with open(OUTPUT_GENERATIONS, "w", encoding="utf-8") as f:
    json.dump(generations, f, indent=2)
print(f"[INFO] Saved {len(generations)} completions to {OUTPUT_GENERATIONS}.")

[INFO] Generated 100 completions.
[INFO] Generated 200 completions.
[INFO] Generated 300 completions.
[INFO] Generated 400 completions.
[INFO] Generated 500 completions.
[INFO] Generated 600 completions.
[INFO] Generated 700 completions.
[INFO] Generated 800 completions.
[INFO] Generated 900 completions.
[INFO] Generated 1000 completions.
[INFO] Generated 1100 completions.
[INFO] Generated 1200 completions.
[INFO] Generated 1300 completions.
[INFO] Generated 1400 completions.
[INFO] Generated 1500 completions.
[INFO] Generated 1600 completions.
[INFO] Generated 1700 completions.
[INFO] Generated 1800 completions.
[INFO] Generated 1900 completions.
[INFO] Generated 2000 completions.
[INFO] Generated 2100 completions.
[INFO] Generated 2200 completions.
[INFO] Generated 2300 completions.
[INFO] Generated 2400 completions.
[INFO] Generated 2500 completions.
[INFO] Generated 2600 completions.
[INFO] Generated 2700 completions.
[INFO] Generated 2800 completions.
[INFO] Generated 2900 complet

In [45]:
# --- Step D: Membership Inference Filtering ---
# Here we combine two metrics: perplexity and zlib_ratio.
# In this example, we simply rank by zlib_ratio (higher means more repeated structure)
generations.sort(key=lambda x: x["zlib_ratio"], reverse=True)
top_suspicious = generations[:50]  # Top 50 candidates by zlib_ratio
print("[INFO] Top 5 suspicious completions by zlib_ratio:")
for j, cand in enumerate(top_suspicious[:5], start=1):
    print(f"{j}. zlib_ratio: {cand['zlib_ratio']:.4f}, perplexity: {cand['perplexity']:.2f}")
    print(f"Prompt: {cand['prompt']}")
    print(f"Generated (first 100 chars): {cand['generated_text'][:100]}...")
    print("-" * 60)

[INFO] Top 5 suspicious completions by zlib_ratio:
1. zlib_ratio: 19.1169, perplexity: 5.93
Prompt: CT ​Evaluation ​by ​Artificial ​Intelligence ​for ​Atherosclerosis, Stenosis and Vascular ​Morphology ​(CLARIFY): ​A ​Multi-center, international study.
Generated (first 100 chars): CT Evaluation by Artificial Intelligence for Atherosclerosis, Stenosis and Vascular Morphology (CLAR...
------------------------------------------------------------
2. zlib_ratio: 8.4358, perplexity: 1.73
Prompt: Access to care has always been at the heart of the concerns of the actors of psychiatry. History reminds us
Generated (first 100 chars): Access to care has always been at the heart of the concerns of the actors of psychiatry. History rem...
------------------------------------------------------------
3. zlib_ratio: 7.5184, perplexity: 1.91
Prompt: Molecular variants of vitamin B<sub>12</sub>, siderophores, and glycans occur. To take up variant forms, bacteria may express an array of
Generated (first 

In [49]:
top_suspicious[1]

{'prompt': 'Access to care has always been at the heart of the concerns of the actors of psychiatry. History reminds us',
 'generated_text': 'Access to care has always been at the heart of the concerns of the actors of psychiatry. History reminds us that access to care is a political, social and cultural issue. Access to care is a concept that is not new, but it has been increasingly used in the last 2 0 years. In this context, access to care is a question that has been raised in the context of deinstitutionalisation and of the growth of the community mental health teams. In this context, access to care is a question that has been raised in the context of deinstitutionalisation and of the growth of the community mental health teams. Access to care is a question that has been raised in the context of deinstitutionalisation and of the growth of the community mental health teams. Access to care is a question that has been raised in the context of deinstitutionalisation and of the growth o

In [50]:
# --- Step E: Verification via Substring Search ---
verified_memorized = []
for suspicious in top_suspicious:
    snippet = suspicious["generated_text"]
    # Use fuzzy matching (with trigrams and threshold of 0.5, adjust as needed)
    matches = fuzzy_ngram_search(snippet, corpus, n=2, threshold=0.3, max_results=SUBSTRING_SEARCH_MAX)
    if matches:
        suspicious["matches"] = matches
        verified_memorized.append(suspicious)

print(f"[INFO] Verified memorized samples (exact substring matches): {len(verified_memorized)}")
for v in verified_memorized:
    print("=" * 60)
    print(f"zlib_ratio: {v['zlib_ratio']:.4f}, perplexity: {v['perplexity']:.2f}")
    print(f"Prompt: {v['prompt']}")
    print(f"Generated Text: {v['generated_text']}")
    print(f"Found in corpus indices: {v['matches']}")

[INFO] Verified memorized samples (exact substring matches): 0


In [51]:
# --- Save Final Attack Results ---
results = {
    "generations": generations,
    "verified_memorized": verified_memorized
}
with open(ATTACK_RESULTS, "w", encoding="utf-8") as rf:
    json.dump(results, rf, indent=2)
print(f"[INFO] Attack results saved to {ATTACK_RESULTS}")

[INFO] Attack results saved to attack_results.json
