In [16]:
import random
from textwrap import fill
from datasets import load_dataset

# ---------- Config ---------- #
BUFFER_SIZE = 1000  # how many rows to read from the stream
SAMPLES = 20        # how many random examples to show
RNG_SEED = 42


# ---------- Helpers ---------- #

def buffer_and_sample(stream, buffer_size=1000, k=10, seed=42):
    """
    Take the first `buffer_size` examples from a streaming dataset,
    then randomly sample `k` of them.
    """
    buf = []
    for i, ex in enumerate(stream.take(buffer_size)):
        buf.append(ex)

    if not buf:
        return []

    rng = random.Random(seed)
    k = min(k, len(buf))
    return rng.sample(buf, k)


def pretty_print_example(example, idx, source_name, wrap_width=100, max_chars=800):
    print(f"\n--- {source_name} Example {idx} ---")
    print("-" * 80)

    for key, value in example.items():
        print(f"{key}:")
        if isinstance(value, str):
            text = value.strip()
            if len(text) > max_chars:
                text = text[:max_chars] + "..."
            text = fill(text, width=wrap_width)
            print(text)
        else:
            print(value)
        print()


# ---------- Load datasets (streaming) ---------- #

textbooks_stream = load_dataset("MedRAG/textbooks", split="train", streaming=True)
pubmed_stream = load_dataset("MedRAG/pubmed", split="train", streaming=True)

# Get random samples from first BUFFER_SIZE rows
textbook_samples = buffer_and_sample(textbooks_stream, BUFFER_SIZE, SAMPLES, seed=0)
pubmed_samples   = buffer_and_sample(pubmed_stream,   BUFFER_SIZE, SAMPLES, seed=1)


# ---------- Print samples ---------- #

print("=" * 80)
print(f"TEXTBOOKS - {len(textbook_samples)} Random Examples (from first {BUFFER_SIZE})")
print("=" * 80)
for i, ex in enumerate(textbook_samples, start=1):
    pretty_print_example(ex, i, "Textbook")

print("\n" + "=" * 80)
print(f"PUBMED - {len(pubmed_samples)} Random Examples (from first {BUFFER_SIZE})")
print("=" * 80)
for i, ex in enumerate(pubmed_samples, start=1):
    pretty_print_example(ex, i, "PubMed")


TEXTBOOKS - 20 Random Examples (from first 1000)

--- Textbook Example 1 ---
--------------------------------------------------------------------------------
id:
Anatomy_Gray_864

title:
Anatomy_Gray

content:
The greater omentum is also an important site for metastatic tumor spread. Direct omental spread by
a transcoelomic route is common for carcinoma of the ovary. As the metastases develop within the
greater omentum, it becomes significantly thickened. In computed tomography imaging and during
laparotomy, the thickened omentum is referred to as an “omental cake.” In the clinic Epithelial
transition between the abdominal esophagus and stomach

contents:
Anatomy_Gray. The greater omentum is also an important site for metastatic tumor spread. Direct
omental spread by a transcoelomic route is common for carcinoma of the ovary. As the metastases
develop within the greater omentum, it becomes significantly thickened. In computed tomography
imaging and during laparotomy, the thickened omen

In [15]:
ANIMAL_KEYWORDS = [
    # Rodents
    " rat ", " rats ",
    " mouse ", " mice ", " murine ", " rodent ", " rodents ",
    " sprague-dawley", " wistar", " c57bl", " balb/c",

    # Mammals
    " rabbit ", " rabbits ",
    " dog ", " dogs ", " canine ",
    " pig ", " pigs ", " porcine ", " swine ",
    " sheep ", " goat ",
    " macaque ", " primate ", " monkey ", " monkeys ",

    # Other species
    " zebrafish ", " drosophila ", " xenopus ",

    # Model-specific phrases
    " animal model", " animal models",
    " murine model", " rodent model",
    " in vivo", " preclinical", " transgenic mouse", " transgenic mice",
    " knockout mouse", " knockout mice"
]

def is_animal_study(text: str) -> bool:
    t = " " + text.lower() + " "
    return any(keyword in t for keyword in ANIMAL_KEYWORDS)


In [14]:
import random
from datasets import load_dataset
from collections import Counter

# ---------- Config ---------- #
BUFFER_SIZE = 100000  # how many rows to read from the stream


# ---------- Labeling helper ---------- #

def get_study_type(example) -> str:
    """
    Very simple heuristic labeling for PubMed examples.
    """
    text = (example.get("contents") or example.get("content") or "").lower()

    if is_animal_study(text):
        return "animal"
    elif ("patient" in text):
        return "case_report"
    else:
        return "other"


# ---------- Load dataset (streaming) ---------- #

pubmed_stream = load_dataset("MedRAG/pubmed", split="train", streaming=True)

# ---------- Label first BUFFER_SIZE rows ---------- #

label_counts = Counter()
total = 0

for i, ex in enumerate(pubmed_stream.take(BUFFER_SIZE), start=1):
    label = get_study_type(ex)
    label_counts[label] += 1
    total += 1

# ---------- Print results ---------- #

print("=" * 80)
print(f"PUBMED - Label counts for first {total} examples")
print("=" * 80)
for label in ["animal", "case_report", "other"]:
    print(f"{label}: {label_counts[label]}")


PUBMED - Label counts for first 100000 examples
animal: 26412
case_report: 21042
other: 52546


In [17]:
import random
from datasets import load_dataset
from collections import Counter

# ---------- Config ---------- #
BUFFER_SIZE = 100000  # how many rows to read from the stream


# ---------- Keyword sets ---------- #

ANIMAL_KEYWORDS = [
    # Rodents
    " rat ", " rats ",
    " mouse ", " mice ", " murine ", " rodent ", " rodents ",
    " sprague-dawley", " wistar", " c57bl", " balb/c",

    # Mammals / others
    " rabbit ", " rabbits ",
    " dog ", " dogs ", " canine ",
    " pig ", " pigs ", " porcine ", " swine ",
    " sheep ", " goat ",
    " camel", " bovine ",
    " macaque ", " primate ", " monkey ", " monkeys ",

    # Birds / fish / misc lab species
    " chicken ", " turkey ", " turkeys ", " avian ",
    " zebrafish ", " drosophila ", " xenopus ",

    # Model phrases
    " animal model", " animal models",
    " murine model", " rodent model",
    " in vivo", " preclinical", " transgenic mouse", " transgenic mice",
    " knockout mouse", " knockout mice"
]

# Keywords suggesting basic biochemistry / cell-bio / enzyme work
BASIC_SCIENCE_KEYWORDS = [
    "microsome", "mitochondria", "phosphatase", "ribonuclease",
    "poly(a)", "rna polymerase", "atpase", "carboxykinase",
    "fatty acid synthetase", "enzyme activity", "subcellular distribution",
    "cell line", "fibroblast", "erythrocyte"
]

# Many hard-core lab / assay markers
LAB_UNIT_KEYWORDS = [
    " mmol", " μmol", " umol", " mM ", " µM", " uM ",
    " mg/kg", " g/liter", " g/l ", " ml/min",
    "51cr", "32p", " radioactive", "labelled", "labeled",
]

# Microbiology / bacteria-focused
MICROBIO_KEYWORDS = [
    " bacillus ", " lactobacillus ", " thuringiensis",
    " sporulation", " fermentor", " fermentation",
    " bacterial", " bacteria "
]

# Clinical / human cues
CLINICAL_KEYWORDS = [
    " patient", " patients", " clinical", " burn", "burned patients",
    " disease", " syndrome", " symptoms", " treated", " therapy",
    " risk", " mortality", " complication", "diagnosis", " follow-up",
    "normal human", "human stratum corneum", "subjects"
]

CASE_REPORT_PATTERNS = [
    "a patient is reported",
    "we describe a patient",
    "one case is reported",
    "case report",
    "a case is reported"
]


# ---------- Helper functions ---------- #

def is_animal_study(text: str) -> bool:
    t = " " + text.lower() + " "
    return any(kw in t for kw in ANIMAL_KEYWORDS)


def is_microbiology_basic(text: str) -> bool:
    t = " " + text.lower() + " "
    return any(kw in t for kw in MICROBIO_KEYWORDS)


def is_basic_science_biochem(text: str) -> bool:
    t = " " + text.lower() + " "
    return any(kw in t for kw in BASIC_SCIENCE_KEYWORDS) or any(kw in t for kw in LAB_UNIT_KEYWORDS)


def is_case_report(text: str) -> bool:
    t = text.lower()
    return any(pat in t for pat in CASE_REPORT_PATTERNS)


def is_clinical_human(text: str) -> bool:
    t = text.lower()
    # Clinical language AND at least some human/subject notion, without being obviously animal-only
    has_clinical = any(kw in t for kw in CLINICAL_KEYWORDS)
    return has_clinical and not is_animal_study(t)


# ---------- Labeling helper ---------- #

def get_study_type(example) -> str:
    """
    Heuristic labeling for PubMed examples.
    Priority order:
      animal > microbiology_basic > basic_science_biochem > case_report > clinical_human > other
    """
    text = (example.get("contents") or example.get("content") or "").lower()

    if not text.strip():
        return "other"

    if is_animal_study(text):
        return "animal"
    elif is_microbiology_basic(text):
        return "microbiology_basic"
    elif is_basic_science_biochem(text):
        return "basic_science_biochem"
    elif is_case_report(text):
        return "case_report"
    elif is_clinical_human(text):
        return "clinical_human"
    else:
        return "other"


# ---------- Load dataset (streaming) ---------- #

pubmed_stream = load_dataset("MedRAG/pubmed", split="train", streaming=True)

# ---------- Label first BUFFER_SIZE rows ---------- #

label_counts = Counter()
total = 0

for i, ex in enumerate(pubmed_stream.take(BUFFER_SIZE), start=1):
    label = get_study_type(ex)
    label_counts[label] += 1
    total += 1

# ---------- Print results ---------- #

print("=" * 80)
print(f"PUBMED - Label counts for first {total} examples")
print("=" * 80)

# Fixed order for readability
labels_order = [
    "animal",
    "microbiology_basic",
    "basic_science_biochem",
    "clinical_human",
    "case_report",
    "other"
]

for label in labels_order:
    print(f"{label}: {label_counts[label]}")


PUBMED - Label counts for first 100000 examples
animal: 31113
microbiology_basic: 2619
basic_science_biochem: 9574
clinical_human: 29152
case_report: 531
other: 27011


In [18]:
from datasets import load_dataset
from collections import Counter

# ---------- Config ---------- #
BUFFER_SIZE = 100000  # how many rows to read from the stream


# ---------- Keyword sets ---------- #

TEXTBOOK_CLINICAL_KEYWORDS = [
    "in the clinic",
    "patient", "patients",
    "symptom", "symptoms",
    "disease", "disorder", "syndrome",
    "risk", "complication", "complications",
    "bleeding", "edema", "ascites", "cancer", "tumor", "lymphoma",
    "diagnosis", "diagnostic", "examination", "exam", "assessment",
    "treatment", "therapy", "management", "follow-up", "prognosis",
    "reflex", "hernia", "mass", "lesion", "injury", "fracture",
    "burn", "trauma", "clinical", "radiograph", "computed tomography",
    "ct ", "mri ", "ultrasound"
]

def is_figure_caption(text: str) -> bool:
    """
    Heuristic: lots of 'Fig.' and label-like words, usually low sentence structure.
    """
    t = text
    # Count occurrences of "Fig."
    fig_count = t.count("Fig.")
    # also treat 'Fig ' or 'Figure ' as hint
    figure_like = fig_count >= 2 or "Fig. " in t[:40] or "Figure " in t[:40]
    # crude check: lots of short label-like chunks separated by spaces (no verbs)
    has_many_numbers = sum(ch.isdigit() for ch in t) > 10
    return figure_like and has_many_numbers


def is_textbook_clinical(text: str) -> bool:
    """
    Clinical/“In the clinic” style chunks vs pure anatomy.
    """
    t = text.lower()
    return any(kw in t for kw in TEXTBOOK_CLINICAL_KEYWORDS)


# ---------- Labeling helper ---------- #

def get_textbook_type(example) -> str:
    """
    Label textbooks chunks as:
      - figure_caption
      - textbook_clinical
      - textbook_anatomy
      - other
    """
    text = (example.get("contents") or example.get("content") or "").strip()
    if not text:
        return "other"

    # Strip the 'Anatomy_Gray.' prefix if present, for cleaner heuristics
    title = (example.get("title") or "").strip()
    if title and text.startswith(title + "."):
        text = text[len(title) + 1:].lstrip()

    if is_figure_caption(text):
        return "figure_caption"
    elif is_textbook_clinical(text):
        return "textbook_clinical"
    else:
        # default: normal anatomy/description
        return "textbook_anatomy"


# ---------- Load dataset (streaming) ---------- #

textbooks_stream = load_dataset("MedRAG/textbooks", split="train", streaming=True)

# ---------- Label first BUFFER_SIZE rows ---------- #

label_counts = Counter()
total = 0

for i, ex in enumerate(textbooks_stream.take(BUFFER_SIZE), start=1):
    label = get_textbook_type(ex)
    label_counts[label] += 1
    total += 1

# ---------- Print results ---------- #

print("=" * 80)
print(f"TEXTBOOKS - Label counts for first {total} examples")
print("=" * 80)

for label in ["figure_caption", "textbook_clinical", "textbook_anatomy", "other"]:
    print(f"{label}: {label_counts[label]}")


TEXTBOOKS - Label counts for first 100000 examples
figure_caption: 2340
textbook_clinical: 77814
textbook_anatomy: 19846
other: 0
