In [None]:
"""
Full Pipeline: BioGPT Data Extraction Example

This script demonstrates:
  1. Loading BioGPT (a large language model).
  2. Loading a local subset of PubMed data (papers.json).
  3. Generating prompts with first 20 words from abstract.
  4. Generating cpmpletions from BioGPT without any sampling or black-box approach.
  5. Applying naive approach- calculation zlib ratio for all the generations and selecting top-50.
  5. Searching the local PubMed data to see if the text is indeed memorized
     (naive_substring_search).

DISCLAIMER:
  - The membership inference here is simplified. Real approaches might
    compare perplexities from multiple models or do more advanced metrics.
  - The substring search is naive and may need optimization or fuzzy matching.
  - This code is a proof-of-concept. We will Modify and expand to suit our needs.

Authors: Nilesh Rijhwani and Bhavana Krishna

Version: 1.0
"""

import os
import json
import random
import zlib
import torch
from typing import List, Dict
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline
)

In [8]:
###############################################################################
# Configuration
###############################################################################
MODEL_NAME = "microsoft/BioGPT"  # Or "microsoft/BioGPT"
PMC_JSON_PATH = "./Data/pubmed_2010_2024_intelligence.json"   # file with your downloaded PMC data
OUTPUT_GENERATIONS = "biogpt_generations.json"
ATTACK_RESULTS = "attack_results.json"

NUM_GENERATIONS = 15000  # total completions to generate
TOKENS_TO_GENERATE = 512  # tokens each time
TOP_K = 50
TOP_P = 0.95
TEMPERATURE = 0.8
SUBSTRING_SEARCH_MAX = 2


In [23]:
###############################################################################
# Functions
###############################################################################

def load_pmc_data(json_path: str) -> List[Dict]:
    """
    Loads the local PMC data from a JSON file.
    Expects a list of dicts each with 'full_text', 'abstract', etc.
    """
    if not os.path.exists(json_path):
        print(f"[ERROR] PMC data file not found: {json_path}")
        return []
    data=[]
    with open(json_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:  # skip empty lines
                data.append(json.loads(line))
    return data

def zlib_ratio(txt: str) -> float:
    """
    Naive measure of how well the text compresses.
    ratio = len(txt) / len(zlib.compress(txt.encode('utf-8')))
    Lower ratio might indicate random/unstructured text
    or higher ratio might indicate repeated structure.
    """
    if not txt.strip():
        return 0.0
    c = zlib.compress(txt.encode("utf-8"))
    return len(txt) / len(c)

def naive_substring_search(snippet: str, corpus: List[Dict], max_results: int = 2) -> List[int]:
    """
    Check if 'snippet' appears verbatim in the 'full_text' or 'abstract' or 'title'
    for each article in the corpus. Return indices of up to max_results matches.

    Note: Very naive substring match, ignoring punctuation/casing differences.
    """
    snippet_lower = snippet.lower()
    matches = []
    for i, article in enumerate(corpus):
        # combine multiple fields
        combined_text = (
            (article.get("title", "") + " ") +
            (article.get("abstract", "") + " ") +
            (article.get("full_text") or "")
        ).lower()
        if snippet_lower in combined_text:
            matches.append(i)
            if len(matches) >= max_results:
                break
    return matches

In [15]:
############################################################################
# Step A: Load Model & Data
############################################################################
print(f"[INFO] Loading BioGPT model: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(f"[INFO] Loading local PMC data from {PMC_JSON_PATH}")
pmc_data = load_pmc_data(PMC_JSON_PATH)
pmc_data = [record for record in pmc_data if record.get("abstract", "").strip()]
print(f"[INFO] Found {len(pmc_data)} articles in local corpus.")
# We'll create a small list of "seed prompts" from the PMC data
# e.g., random lines from 'title' or 'full_text'
# We'll store them in a list of strings
prompts = []
for article in pmc_data:
    t = article.get("title", "").strip()
    if t:
        prompts.append(t)
    ab = article.get("abstract", "").strip()
    if ab:
        # maybe take first 20 words
        words = ab.split()
        partial_ab = " ".join(words[:20])
        prompts.append(partial_ab)
if not prompts:
    prompts = ["Biomedical research indicates", "In this study, we explore"]  # fallback
print(f"[INFO] Built {len(prompts)} candidate seed prompts from the data.")


[INFO] Loading BioGPT model: microsoft/BioGPT
[INFO] Loading local PMC data from ./Data/pubmed_2010_2024_intelligence.json
[INFO] Found 51568 articles in local corpus.
[INFO] Built 103136 candidate seed prompts from the data.


In [16]:
############################################################################
# Step B: Generate Text
############################################################################
# We'll do a simple approach: pick random prompts from 'prompts' and generate
# completions from BioGPT. Then store them in a list.
all_generations = []
for _ in range(NUM_GENERATIONS):
    # pick random prompt
    prompt = random.choice(prompts)
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    with torch.no_grad():
        output_seq = model.generate(
            **inputs,
            max_new_tokens=TOKENS_TO_GENERATE,
            do_sample=True,
            temperature=TEMPERATURE,
            top_k=TOP_K,
            top_p=TOP_P
        )[0]
    generated_text = tokenizer.decode(output_seq, skip_special_tokens=True)
    all_generations.append({
        "prompt": prompt,
        "generated_text": generated_text
    })
print(f"[INFO] Generated {len(all_generations)} completions total.")
# Save raw generations
with open(OUTPUT_GENERATIONS, "w", encoding="utf-8") as f:
    json.dump(all_generations, f, indent=2)
print(f"[INFO] Wrote raw generations to {OUTPUT_GENERATIONS}.")


[INFO] Generated 15000 completions total.
[INFO] Wrote raw generations to biogpt_generations.json.


In [17]:
############################################################################
# Step C: Membership Inference (zlib ratio)
############################################################################
# Let's do a naive approach: compute zlib_ratio for each generation. 
# Then pick the top-50 or top-100 "most suspicious" (lowest or highest ratio?).
# Typically, "lowest perplexity" => "lowest ratio" might be suspicious, 
# but this is very heuristic. We'll just pick the top 50 with the "highest ratio"
# to demonstrate. (Carlini's approach can vary.)
# Let's define "highest ratio" as suspicious (lots of repeated patterns => bigger compress).
# Alternatively, do "lowest ratio" if you interpret random text compresses worse. 
# You can experiment either direction.
extended_gens = []
for g in all_generations:
    txt = g["generated_text"]
    ratio = zlib_ratio(txt)
    g["zlib_ratio"] = ratio
    extended_gens.append(g)
# Sort by ratio descending (could do ascending if you prefer).
extended_gens.sort(key=lambda x: x["zlib_ratio"], reverse=True)
# Let's pick top 50 as suspicious
top_suspicious = extended_gens[:50]
print("[INFO] Top 50 suspicious by zlib ratio (descending):")
for i, sus in enumerate(top_suspicious[:5], start=1):
    print(f"{i}. ratio={sus['zlib_ratio']:.4f} => {sus['generated_text'][:100]}...")


[INFO] Top 50 suspicious by zlib ratio (descending):
1. ratio=2.5192 => To summarize recently published key articles on the topics of biomedical engineering, biotechnology ...
2. ratio=2.2910 => Insomnia, intelligence and neuroticism are three typical traits and dysfunctions mainly regulated by...
3. ratio=2.2821 => To explore the cross-level relationships between group organisational citizenship behaviour, emotion...
4. ratio=2.1987 => We demonstrate a link between preschoolers' quantitative competencies and their school-entry knowled...
5. ratio=2.1543 => In this piece, Daniel Leufer introduces his project, aimyths.org, a website that tackles eight of th...


In [24]:
############################################################################
# Step D: Verify by Searching in Local Data
############################################################################
verified_memorized = []
for sus in top_suspicious:
    snippet = sus["generated_text"]
    matches = naive_substring_search(snippet, pmc_data, max_results=SUBSTRING_SEARCH_MAX)
    if matches:
        sus["corpus_matches"] = matches
        verified_memorized.append(sus)
print(f"[INFO] Found {len(verified_memorized)} 'verified' memorized samples (exact substring).")
for vm in verified_memorized:
    print("----------------------------------------------------")
    print(f"zlib_ratio={vm['zlib_ratio']:.4f}")
    print(f"Prompt: {vm['prompt']}")
    print(f"Generated: {vm['generated_text']}")
    print(f"Matches in corpus indices: {vm['corpus_matches']}")
# Save final results
results_dict = {
    "all_generations": extended_gens,
    "verified_memorized": verified_memorized
}
with open(ATTACK_RESULTS, "w", encoding="utf-8") as rf:
    json.dump(results_dict, rf, indent=2)
print(f"[INFO] Attack complete. Results in {ATTACK_RESULTS}.")

[INFO] Found 0 'verified' memorized samples (exact substring).
[INFO] Attack complete. Results in attack_results.json.
