In [2]:
import requests
from tqdm import tqdm

def download_file(url, save_path, timeout=(10, 60), chunk_size=1024 * 1024):
    """
    Download a file with a progress bar.

    The payload is streamed into ``<save_path>.part`` and only renamed to
    ``save_path`` once the transfer has finished, so ``save_path`` never holds
    a truncated file.

    Parameters
    ----------
    url : str
        The direct URL to the file.
    save_path : str
        The local file path to save the downloaded file. Missing parent
        directories are created.
    timeout : float or tuple, optional
        Passed to ``requests.get``; a ``(connect, read)`` tuple in seconds.
        The read timeout applies to each socket read, not to the whole
        transfer, so it does not limit the size of the download.
    chunk_size : int, optional
        Number of bytes requested per iteration of the streamed body.

    Returns
    -------
    bool
        True if the download was successful, False otherwise (request error,
        or fewer/more bytes received than the server announced).

    Raises
    ------
    OSError
        Local filesystem errors (e.g. disk full, permission denied) are not
        swallowed; the temporary ``.part`` file is removed before they
        propagate.
    """
    import os

    part_path = f"{save_path}.part"

    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    try:
        # Context manager releases the connection even if the loop is interrupted.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for failed requests (e.g., 404, 403)

            # Get total file size from headers (0 means "unknown")
            try:
                total_size = int(response.headers.get("content-length", 0))
            except (TypeError, ValueError):
                total_size = 0

            # With a Content-Encoding, Content-Length counts the encoded bytes
            # while iter_content yields decoded bytes, so the two are not comparable.
            encoding = response.headers.get("content-encoding")
            size_is_checkable = total_size > 0 and encoding in (None, "", "identity")

            written = 0
            with open(part_path, "wb") as file, tqdm(
                desc=save_path,
                total=total_size or None,  # None lets tqdm show a plain counter
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if not chunk:
                        continue
                    file.write(chunk)
                    written += len(chunk)
                    bar.update(len(chunk))  # Update progress bar

        if size_is_checkable and written != total_size:
            print(f"\nDownload failed: received {written} of {total_size} bytes")
            return False

        # Atomic on the same filesystem: readers see either the old file or the full new one.
        os.replace(part_path, save_path)
    except requests.exceptions.RequestException as e:
        print(f"\nDownload failed: {e}")
        return False
    finally:
        # Best-effort cleanup of the temporary file on any failure path.
        if os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                pass

    print(f"\nDownload complete: {save_path}")
    return True

In [3]:
import os

# Remote location of the full dataset and where it is stored locally.
url = "https://medical-epigenomics.org/papers/schaefer2024/data/datasets/archs4_geo/full_data.h5ad"
save_path = "../data/RNA/geo_full.h5ad"

# The download is very large, so re-running the notebook reuses a file that is
# already on disk. Delete the file manually to force a fresh download.
if os.path.exists(save_path):
    print(f"Using existing file: {save_path}")
elif not download_file(url, save_path):
    # Stop here instead of letting later cells fail on a missing or partial file.
    raise RuntimeError(f"Could not download {url} to {save_path}")

../data/RNA/geo_full.h5ad: 100%|██████████| 163G/163G [21:52<00:00, 134MB/s]    



Download complete: ../data/RNA/geo_full.h5ad


True

In [5]:
# Read the file and save a reproducible random subset of (up to) 10k observations.
import scanpy as sc
import numpy as np

n_subset = 10_000   # number of rows (observations) to keep
subset_seed = 0     # fixed seed so the saved subset is the same on every run

# backed="r" leaves the data matrix on disk; only the selected rows are loaded below.
adata_full = sc.read_h5ad(save_path, backed="r")

# Sample row positions rather than obs labels so non-unique labels cannot
# change the number of selected rows; cap at n_obs so small files do not raise.
rng = np.random.default_rng(subset_seed)
subset_idx = rng.choice(
    adata_full.n_obs,
    size=min(n_subset, adata_full.n_obs),
    replace=False,
)
# Sorted positions keep the on-disk row order (increasing indices are the safe
# form for on-disk selection); the subset is therefore not in shuffled order.
subset_idx = np.sort(subset_idx)

adata = adata_full[subset_idx].to_memory()
adata_full.file.close()  # release the handle on the large backing file

# NOTE(review): the output name mentions "cellxgene" while the input is the
# archs4_geo download — confirm the intended file name before changing it.
adata.write_h5ad("../data/RNA/cellxgene_pseudo_bulk_10k.h5ad")