In [None]:
%%autoreload 2

In [1]:
from omicidx_etl.sra.extract import get_sra_urls

In [2]:
urls = get_sra_urls()

[32m2025-12-11 15:34:42.055[0m | [1mINFO    [0m | [36momicidx_etl.sra.extract[0m:[36mget_sra_urls[0m:[36m148[0m - [1mFound URLs in https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211: 5[0m
[32m2025-12-11 15:34:42.340[0m | [1mINFO    [0m | [36momicidx_etl.sra.extract[0m:[36mget_sra_urls[0m:[36m148[0m - [1mFound URLs in https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210: 5[0m
[32m2025-12-11 15:34:42.597[0m | [1mINFO    [0m | [36momicidx_etl.sra.extract[0m:[36mget_sra_urls[0m:[36m148[0m - [1mFound URLs in https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209: 5[0m
[32m2025-12-11 15:34:42.852[0m | [1mINFO    [0m | [36momicidx_etl.sra.extract[0m:[36mget_sra_urls[0m:[36m148[0m - [1mFound URLs in https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251208: 5[0m
[32m2025-12-11 15:34:43.106[0m | [1mINFO    [0m | [36momicidx_etl.sra.extract[0m:

In [3]:
urls

['https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_experiment_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_run_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_sample_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_study_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_experiment_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_run_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_sample_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_study_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209/meta_experiment_set.xml.gz',
 'https://ftp.ncbi.nlm.nih.gov/sra/reports

In [None]:
from dataclasses import dataclass

In [None]:
import datetime
from dataclasses import dataclass
import re

class SRAMirrorEntry():
    """Represents an entry in the SRA mirror file list
    
    And entry will have a url like:
    https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251206_Full/meta_study_set.xml.gz
    
    From the URL, we can extract the following information:
    - SRA entity (study, sample, experiment, run)
    - Full or incremental file
    - Date of the file
    """
    def __init__(self, url):
        self.url = url
        self._extract_sra_entity()
        self._is_full_file()
        self._extract_date()
        self._in_current_batch()
        
    def __repr__(self):
        return f"SRAMirrorEntry(url={self.url}, entity={self.entity}, is_full={self.is_full}, date={self.date}, in_current_batch={self.in_current_batch})"

    def _extract_sra_entity(self):
        # Implement the logic to extract the SRA entity from the URL
        if 'study' in self.url:
            self.entity = 'study'
        elif 'sample' in self.url:
            self.entity = 'sample'
        elif 'experiment' in self.url:
            self.entity = 'experiment'
        elif 'run' in self.url:
            self.entity = 'run'
        else:
            raise ValueError("Unknown SRA entity")
    
    def _is_full_file(self):
        # Implement the logic to determine if the file is full or incremental
        if 'Full' in self.url:
            self.is_full = True
        else:
            self.is_full = False
    
    import re
    from datetime import datetime
    
    def _extract_date(self):
        """Extract the date from the URL in format NCBI_SRA_Mirroring_YYYYMMDD_Full"""
        # Match the date pattern: NCBI_SRA_Mirroring_(\d{8})
        match = re.search(r'NCBI_SRA_Mirroring_(\d{8})', self.url)
        if not match:
            raise ValueError(f"Could not extract date from URL: {self.url}")
        
        date_str = match.group(1)  # e.g., '20251206'
        # Parse the date string (YYYYMMDD format)
        date_obj = datetime.datetime.strptime(date_str, '%Y%m%d').date()
        self.date = date_obj  # Store the date object
    
    def _in_current_batch(self):
        # Implement the logic to determine if the entry is in the current batch
        self.in_current_batch = False

In [19]:
s = SRAMirrorEntry(urls[0])
s

SRAMirrorEntry(url=https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_experiment_set.xml.gz, entity=experiment, is_full=False, date=2025-12-11, in_current_batch=False)

In [65]:
from upath import UPath
from typing import List

def get_sra_mirror_entries() -> List[SRAMirrorEntry]:
    """Fetch the SRA mirror entries from the SRA mirror file urls
    
    The idea is to get the latest full file and then the incremental files that follow it.
    
    The function will return a list of SRAMirrorEntry objects, with all the 
    entries in the current batch marked as such. 
    
    Returns:
    - List of SRAMirrorEntry objects
    """
    up = UPath("https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/")
    l = list(reversed([str(f) for f in up.glob("**/*set.xml.gz")]))
    found_full = False
    out_of_full = False
    entries = []
    for url in l:
        try:
            sra_mirror_entry = SRAMirrorEntry(url)
        except ValueError:
            continue
        if "Full" in url and not found_full:
            found_full = True
        if found_full and "Full" not in url:
            out_of_full = True
        if out_of_full:
            sra_mirror_entry.in_current_batch = False
            entries.append(sra_mirror_entry)
            continue
        sra_mirror_entry.in_current_batch = True
        entries.append(sra_mirror_entry)
    return entries


In [None]:
import gzip
import shutil
import tempfile

import orjson
from omicidx_etl.path_provider import PathProvider
from omicidx.sra.parser import sra_object_generator
from omicidx_etl.sra.mirror_parquet import process_mirror_entry_to_parquet_parts
from omicidx_etl.sra.schema import PYARROW_SCHEMAS

from upath import UPath

class SRACatalog:
    def __init__(self, path_provider: PathProvider):
        self.path_provider = path_provider
        
    def path_for_mirror_entry(self, mirror_entry: SRAMirrorEntry):
        """Return the path where the SRA mirror entry should be stored
        
        Stores the files in a directory structure like:
        {base_path}/study/date=2025-12-06/stage=Full/data_0.ndjson.gz
        """
        return self.path_provider.get_path(
            mirror_entry.entity,
            f"date={mirror_entry.date.strftime('%Y-%m-%d')}",
            f"stage={'Full' if mirror_entry.is_full else 'Incremental'}",
            "data_0.ndjson.gz"
        )

    def parquet_dir_for_mirror_entry(self, mirror_entry: SRAMirrorEntry):
        return self.path_provider.get_path(
            mirror_entry.entity,
            f"date={mirror_entry.date.strftime('%Y-%m-%d')}",
            f"stage={'Full' if mirror_entry.is_full else 'Incremental'}",
        )

    def _rm_tree(self, p: UPath) -> None:
        """
        Remove a directory/prefix recursively (works for local + fsspec-backed remotes like S3).
        """
        # UPath exposes the fsspec filesystem as `.fs` and the fs-native path as `.path`
        # For s3://bucket/prefix -> p.path == "bucket/prefix"
        # For local paths -> p.path is the local filesystem path
        try:
            p.fs  # touch to ensure provider is initialized (optional)
        except Exception:
            pass

        if not p.exists():
            return

        try:
            p.fs.rm(p.path, recursive=True)
        except TypeError:
            # Some FS implementations use `rm(path, recursive=True)` but may not accept kwarg
            p.fs.rm(p.path, True)

    def cleanup_one(self, mirror_entry: SRAMirrorEntry):
        """Remove all stored artifacts for a mirror entry (entire directory/prefix)."""
        # New parquet layout: delete the whole directory for this entry
        out_dir = self.parquet_dir_for_mirror_entry(mirror_entry)
        self._rm_tree(out_dir)

        # Optional: remove legacy single-file landing path if you still generate it anywhere
        legacy = self.path_for_mirror_entry(mirror_entry)
        try:
            legacy.unlink(missing_ok=True)
        except Exception:
            pass

        print(f"Cleaned up {mirror_entry.url} -> removed {out_dir}")
        
    def cleanup(self, mirror_entries: List[SRAMirrorEntry]):
        """Clean up the catalog by removing old files"""
        for entry in mirror_entries:
            if not entry.in_current_batch:
                self.cleanup_one(entry)
                
    def process_one(self, mirror_entry: SRAMirrorEntry):
        out_dir = self.parquet_dir_for_mirror_entry(mirror_entry)
        process_mirror_entry_to_parquet_parts(
            url=mirror_entry.url,
            out_dir=out_dir,
            entity=mirror_entry.entity,
            schema=PYARROW_SCHEMAS[mirror_entry.entity],
            CHUNK_SIZE=100_000,
            basename="data",
        )
    
    def process(self, mirror_entries: List[SRAMirrorEntry]):
        """Process the SRA mirror entries and store them in the catalog"""
        for entry in mirror_entries:
            if entry.in_current_batch:
                # Implement the logic to process and store the entry
                print(f"Processing and storing {entry.url} in {self.path_for_mirror_entry(entry)}")
                self.process_one(entry)

In [None]:
from omicidx_etl.path_provider import get_path_provider
pp = get_path_provider("s3://omicidx/sra")
sc = SRACatalog(pp)
sc.process(get_sra_mirror_entries())

Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251212/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-12/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:15:53.913[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-12/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251212/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-12/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:15:59.303[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-12/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251212/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-12/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:01.221[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-12/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251212/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-12/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:04.377[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-12/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-11/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:04.731[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-11/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-11/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:12.895[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-11/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-11/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:14.864[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-11/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251211/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-11/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:17.914[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-11/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-10/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:18.399[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-10/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-10/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:28.968[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-10/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-10/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:31.464[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-10/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251210/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-10/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:35.353[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-10/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-09/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:35.701[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-09/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-09/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:43.184[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-09/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-09/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:47.098[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-09/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251209/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-09/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:50.991[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-09/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251208/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-08/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:52.549[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-08/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251208/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-08/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:16:58.183[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-08/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251208/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-08/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:01.351[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-08/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251208/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-08/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:05.191[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-08/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251207/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-07/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:06.469[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-07/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251207/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-07/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:13.035[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-07/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251207/meta_run_set.xml.gz in s3://omicidx/sra/run/date=2025-12-07/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:20.181[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-07/stage=Incremental/data_00000.parquet[0m
[32m2025-12-12 04:17:21.904[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/run/date=2025-12-07/stage=Incremental/data_00001.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251207/meta_experiment_set.xml.gz in s3://omicidx/sra/experiment/date=2025-12-07/stage=Incremental/data_0.ndjson.gz


[32m2025-12-12 04:17:25.281[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/experiment/date=2025-12-07/stage=Incremental/data_00000.parquet[0m


Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251206_Full/meta_study_set.xml.gz in s3://omicidx/sra/study/date=2025-12-06/stage=Full/data_0.ndjson.gz


[32m2025-12-12 04:17:33.353[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-06/stage=Full/data_00000.parquet[0m
[32m2025-12-12 04:17:40.991[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-06/stage=Full/data_00001.parquet[0m
[32m2025-12-12 04:17:48.775[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-06/stage=Full/data_00002.parquet[0m
[32m2025-12-12 04:17:56.895[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/study/date=2025-12-06/stage=Full/data_00003.parquet[0m
[32m2025-12-12 04:18:05.301[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m

Processing and storing https://ftp.ncbi.nlm.nih.gov/sra/reports/Mirroring/NCBI_SRA_Mirroring_20251206_Full/meta_sample_set.xml.gz in s3://omicidx/sra/sample/date=2025-12-06/stage=Full/data_0.ndjson.gz


[32m2025-12-12 04:18:29.788[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-06/stage=Full/data_00000.parquet[0m
[32m2025-12-12 04:18:43.126[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-06/stage=Full/data_00001.parquet[0m
[32m2025-12-12 04:18:57.993[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-06/stage=Full/data_00002.parquet[0m
[32m2025-12-12 04:19:17.840[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81[0m - [1mWrote parquet part: s3://omicidx/sra/sample/date=2025-12-06/stage=Full/data_00003.parquet[0m
[32m2025-12-12 04:19:36.220[0m | [1mINFO    [0m | [36momicidx_etl.sra.mirror_parquet[0m:[36mflush[0m:[36m81