In [2]:
from datasets import load_dataset

# Fetch the annotation dataset with `datasets.load_dataset`; the returned
# object is bound to `dataset` for the cells that follow.
dataset = load_dataset(
    path="jo-mengr/cellxgene_pseudo_bulk_3_5k_multiplets_natural_language_annotation",
)

In [3]:
import anndata as ad
import requests

# Use the first training row as a worked example and pull out the two fields
# needed later: the download link and the sample identifier.
row = dataset["train"][0]
share_link, sample_idx = row["share_link"], row["sample_idx"]

In [7]:
# Show the share link taken from the example row (rendered as the cell output).
share_link

'https://nxc-fredato.imbi.uni-freiburg.de/s/YKjrgdnbdNeYTsL'

In [8]:
# Download the file behind the share link and load it as an AnnData object.
#
# The local filename is taken from the Content-Disposition header when the
# server sends one; otherwise it falls back to the last path component of the
# share link, or to "downloaded_file". Files ending in ".zarr.zip" are opened
# as a zipped Zarr store; anything else is read as h5ad.
#
# On success this cell defines `filename`, `adata` (the full object) and
# `sample_data` (the rows whose obs index equals `sample_idx`). If the download
# or the read fails, a message is printed and `adata` is left undefined.

import os
import re

# Seconds to wait for the connection and for each pause between received bytes.
# Without a timeout a stalled server would hang the cell indefinitely.
DOWNLOAD_TIMEOUT_S = 60

filename = None
# The context manager releases the streamed connection even if writing fails.
with requests.get(
    f"{share_link}/download", stream=True, timeout=DOWNLOAD_TIMEOUT_S
) as response:
    if response.status_code == 200:
        cd = response.headers.get("Content-Disposition", "")
        # Accept either a quoted value (which may itself contain ';') or an
        # unquoted token that stops at the next ';' or whitespace, so trailing
        # header parameters never end up in the filename.
        match = re.search(r'filename="([^"]+)"|filename=([^;"\s]+)', cd)
        header_name = (match.group(1) or match.group(2)) if match else ""
        # The header is server-controlled: keep only the final path component
        # so the file cannot be written outside the working directory.
        filename = os.path.basename(header_name.strip())
        if filename in ("", ".", ".."):
            # No usable name in the header; derive one from the link instead.
            filename = os.path.basename(share_link) or "downloaded_file"
        # Stream the body to disk in chunks rather than holding it in memory.
        with open(filename, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

if filename is None:
    print("Failed to download AnnData object")
else:
    try:
        if filename.endswith(".zarr.zip"):
            # `zarr.storage.ZipStore` is importable in both zarr 2.x and 3.x,
            # whereas the top-level `zarr.ZipStore` alias is 2.x-only.
            from zarr.storage import ZipStore

            store = ZipStore(filename, mode="r")
            try:
                adata = ad.read_zarr(store)
            finally:
                # Always release the zip file handle, even if reading fails.
                store.close()
        else:
            adata = ad.read_h5ad(filename)
        # The sample_idx corresponds to adata.obs.index
        sample_data = adata[adata.obs.index == sample_idx]
        if sample_data.shape[0] == 0:
            # An empty selection is not a hit; say so instead of "Found".
            print(f"Sample {sample_idx!r} not found in {filename}")
        else:
            print(f"Found sample: {sample_data.shape}")
    except Exception as e:
        print(f"Failed to read AnnData object: {e}")

Found sample: (1, 15310)


In [9]:
# Shape of the full AnnData object loaded by the download cell.
# `adata` only exists if that cell read the file successfully.
adata.shape

(2825, 15310)