In [7]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

In [8]:
"""
Final Integrated Pipeline for PubMed Domain LLM Memorization Attack

Fixes common pitfalls:
  - Uses partial real prefixes from actual pubmed data (pubmedPapers.json).
  - Generates long completions with enough sampling (30k or more).
  - Applies a 50-token membership check (suffix array or fuzzy search).
  - Includes single-token "divergence" prompts (if using an auto-regressive model like BioGPT).

If no memorization is found, it logs stats (like average length of completions) to see what might be going wrong.
"""

import os
import json
import random
import matplotlib.pyplot as plt
from collections import Counter
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM
    # or AutoModelForMaskedLM and pipeline if using masked LM
)

In [9]:
# ---------------------------------------------------------------------------
# 1. CONFIGURATION
# ---------------------------------------------------------------------------

PUBMED_JSON_PATH = "../../Data/papersNew.json"  # your local pubmed data in the structure you provided
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Domain LLM (BioGPT as example). If you want masked LM, adapt below.
MODEL_NAME = "microsoft/BioGPT-Large"
MODEL_TYPE = "auto-regressive"  # or "masked-lm"

# Large scale generation
NUM_GENERATIONS = 30000       # adjust higher if you can
MAX_NEW_TOKENS = 512          # encourage longer completions
TEMPERATURE = 1.2             # slightly higher than default, fosters memorization
TOP_K = 50
TOP_P = 0.9

# Minimum tokens for a "real prefix" (we only want lines with at least 40 tokens)
MIN_TOKENS_PREFIX = 40
REAL_PREFIX_LIMIT = 1000  # how many real lines to store as candidate prefixes

# Divergence single tokens (for auto-reg LMs)
SINGLE_TOKEN_LIST = ["gene", "cells", "protein", "analysis", "dna"]

# Random domain seeds
DOMAIN_SEEDS = [
    "The disease outbreak was caused by",
    "We discovered new gene expression patterns in",
    "Using CRISPR, we tested the effect on cell lines",
    "The study had a p-value of"
]

# Minimum length for membership check (e.g. 50 tokens)
MIN_MEMORIZATION_TOKENS = 50

# Suffix array or fuzzy approach
USE_SUFFIX_ARRAY = True
SUFFIX_ARRAY_PATH = "/path/to/pubmed_suffix_array.bin"

# Output
OUTPUT_GENERATIONS_FILE = "pubmed_generations.json"
OUTPUT_MEMORIZED_FILE   = "pubmed_memorized.json"


In [10]:
# ---------------------------------------------------------------------------
# 2. LOADING pubmedPapers.json & GATHERING REAL PREFIXES
# ---------------------------------------------------------------------------

def gather_real_prefixes_from_pubmed(json_path, max_samples=1000, min_tokens=40):
    """
    Load pubmedPapers.json and gather lines from the abstracts 
    that have >= min_tokens. We'll store the 'full_text' as a single line.
    Return up to max_samples random lines.
    """
    all_lines = []
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)  # expects a list of items

    for item in data:
        # item structure as user described
        # we want the 'abstract' -> 'full_text' if it has enough tokens
        abstr = item.get("abstract", {})
        text = abstr.get("full_text", "")
        tokens = abstr.get("tokens", [])
        if len(tokens) >= min_tokens:
            all_lines.append(text.strip())

        # Optionally, you could also consider the 'title' if it?s big enough:
        # title_dict = item.get("title", {})
        # ttext = title_dict.get("full_text", "")
        # if len(title_dict.get("tokens", [])) >= min_tokens:
        #     all_lines.append(ttext.strip())

    random.shuffle(all_lines)
    return all_lines[:max_samples]

# ---------------------------------------------------------------------------
# 3. GENERATION HELPERS
# ---------------------------------------------------------------------------

def generate_ar_text(model, tokenizer, prompt):
    """Auto-regressive generation for e.g. BioGPT."""
    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        out_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P,
            pad_token_id=tokenizer.eos_token_id
        )[0]
    return tokenizer.decode(out_ids, skip_special_tokens=True)

def generate_single_token_divergence(model, tokenizer, token="gene"):
    """
    Repeated single-token prompt to attempt 'divergence' for auto-reg LMs like BioGPT.
    """
    repeated_prompt = (token + " ") * 300  # repeat it 300 times
    return generate_ar_text(model, tokenizer, repeated_prompt)

# ---------------------------------------------------------------------------
# 4. MEMBERSHIP CHECK
#    For demonstration, we show a suffix_array approach 
#    and a stub for fuzzy approach.
# ---------------------------------------------------------------------------

def check_membership_suffixarray(text, min_len=50):
    """
    Pseudocode: do a sliding window of 50 tokens 
    or pass the entire text to your suffix search if it can handle it.
    """
    from suffix_array_tool import suffix_search  # user-provided
    found = suffix_search(text, SUFFIX_ARRAY_PATH, min_len_tokens=min_len)
    return found

def check_membership_fuzzy(text, min_len=50):
    """Placeholder for fuzzy n-gram approach if suffix array yields zero hits."""
    return False  # implement if needed


In [11]:
# Load domain model
print(f"[INFO] Loading model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME).to(DEVICE)
model.eval()
# Gather partial real prefixes from pubmedPapers.json
real_prefixes = gather_real_prefixes_from_pubmed(
    PUBMED_JSON_PATH,
    max_samples=REAL_PREFIX_LIMIT,
    min_tokens=MIN_TOKENS_PREFIX
)
print(f"[INFO] Found {len(real_prefixes)} real domain lines (prefix candidates).")

[INFO] Loading model: microsoft/BioGPT-Large


OutOfMemoryError: CUDA out of memory. Tried to allocate 354.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 25.19 MiB is free. Process 1585050 has 16.50 GiB memory in use. Including non-PyTorch memory, this process has 7.14 GiB memory in use. Of the allocated memory 6.00 GiB is allocated by PyTorch, and 3.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
all_generations = []
i = 0
while i < NUM_GENERATIONS:
    # 1) partial-real prefix
    if real_prefixes:
        prefix = random.choice(real_prefixes)
    else:
        prefix = random.choice(DOMAIN_SEEDS)
    gen_text = generate_ar_text(model, tokenizer, prefix)
    all_generations.append({
        "prompt": prefix,
        "generated_text": gen_text,
        "method": "partial-real"
    })
    i += 1
    if i >= NUM_GENERATIONS:
        break
    # 2) single-token approach
    single_tok = random.choice(SINGLE_TOKEN_LIST)
    div_text = generate_single_token_divergence(model, tokenizer, token=single_tok)
    all_generations.append({
        "prompt": single_tok,
        "generated_text": div_text,
        "method": "divergence"
    })
    i += 1
    if i >= NUM_GENERATIONS:
        break
    # 3) random domain approach
    domain_seed = random.choice(DOMAIN_SEEDS)
    r_text = generate_ar_text(model, tokenizer, domain_seed)
    all_generations.append({
        "prompt": domain_seed,
        "generated_text": r_text,
        "method": "domain-seed"
    })
    i += 1
# Save raw completions
print(f"[INFO] Generated {len(all_generations)} completions. Saving to {OUTPUT_GENERATIONS_FILE}")
with open(OUTPUT_GENERATIONS_FILE, "w", encoding="utf-8") as f:
    json.dump(all_generations, f, indent=2)

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 23.69 GiB of which 5.19 MiB is free. Process 1585050 has 16.50 GiB memory in use. Including non-PyTorch memory, this process has 7.16 GiB memory in use. Of the allocated memory 6.01 GiB is allocated by PyTorch, and 21.01 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [5]:
# Membership check
memorized = []
found_cnt = 0
for entry in all_generations:
    txt = entry["generated_text"]
    found = False
    if USE_SUFFIX_ARRAY:
        found = check_membership_suffixarray(txt, min_len=MIN_MEMORIZATION_TOKENS)
    else:
        found = check_membership_fuzzy(txt, min_len=MIN_MEMORIZATION_TOKENS)
    if found:
        memorized.append(entry)
        found_cnt += 1
print(f"[RESULT] Found {found_cnt} memorized completions out of {len(all_generations)}.")
with open(OUTPUT_MEMORIZED_FILE, "w", encoding="utf-8") as mf:
    json.dump(memorized, mf, indent=2)
if found_cnt == 0:
    # Additional analysis
    print("[WARN] Zero memorized strings found.")
    method_counts = Counter(e["method"] for e in all_generations)
    print("Method usage stats:", method_counts)
    lengths = [len(e["generated_text"].split()) for e in all_generations]
    avg_len = sum(lengths)/len(lengths) if lengths else 0
    print(f"Average generation length: {avg_len:.2f} tokens")
    plt.hist(lengths, bins=50)
    plt.title("Generation Output Length Distribution")
    plt.xlabel("Tokens per generation")
    plt.ylabel("Count")
    plt.savefig("pubmed_generation_lengths.png")
    plt.close()
    print("Suggestions if truly zero:")
    print("- Increase NUM_GENERATIONS further (e.g. 100k or more).")
    print("- Increase max_new_tokens (e.g. 512 -> 1024).")
    print("- Verify you have the same snapshot of data. If data is mismatched, membership checks won't find hits.")
    print("- Possibly use fuzzy approach in case small differences appear.")
else:
    print(f"[INFO] Memorized samples saved to {OUTPUT_MEMORIZED_FILE}")

NameError: name 'all_generations' is not defined