In [None]:
# Development convenience: load IPython's autoreload extension and use mode 2,
# which re-imports changed modules before each cell is executed.
%load_ext autoreload
%autoreload 2

# download_processed

> Download processed RRBS (reduced-representation bisulfite sequencing) data from GEO. Processed files are usually in BED format.

In [None]:
#| default_exp download_processed

In [None]:
#| hide
# nbdev convention: the star import brings the documentation helpers into the
# notebook namespace for this hidden cell.
from nbdev.showdoc import *

# Kept at column 0: an indented top-level import raises
# "IndentationError: unexpected indent" when the cell is executed.
import pkg_resources,importlib


# Import

In [None]:
#| export
from geofetch import Geofetcher as OriginalGeofetcher
from tenacity import retry, wait_exponential, stop_after_attempt 
from joblib import Memory

# joblib store used to memoize project lookups on disk. The location ".cache"
# is a relative path, so it resolves against the current working directory;
# verbose=0 asks joblib not to print cache messages.
memory = Memory(".cache", verbose=0)

In [None]:
#| export
def _fetch_projects(*a, **kw):
    """Look up projects with a freshly built, default-configured fetcher.

    Every positional and keyword argument is handed to
    `OriginalGeofetcher.get_projects` unchanged, and whatever that call
    returns is returned to the caller.
    """
    fetcher = OriginalGeofetcher()
    return fetcher.get_projects(*a, **kw)


# Retry policy: stop after three attempts, wait with exponential backoff
# between them, and re-raise the underlying exception once attempts run out.
_with_retries = retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=0.5, min=0.1, max=10),
    reraise=True,
)
_fetch_projects_retry = _with_retries(_fetch_projects)

# Memoized wrapper around the retrying fetch, backed by the module-level store.
_get_projects_cached = memory.cache(_fetch_projects_retry)

In [None]:
#| export
class Geofetcher(OriginalGeofetcher):
    """`OriginalGeofetcher` whose `get_projects` is retried and cached on disk.

    The constructor arguments are remembered so that every lookup runs on a
    fetcher configured like this instance. Previously `get_projects` always
    used a default-constructed fetcher, so options such as `processed`,
    `data_source` or `metadata_folder` given to the constructor were ignored.

    Note: only the arguments passed to the constructor are honoured; attributes
    changed on the instance afterwards are not seen by `get_projects`.
    """

    def __init__(self, *args, **kwargs):
        # Keep the raw constructor arguments: they are used to rebuild an
        # equivalent fetcher inside the cached function and, being arguments of
        # that function, they also become part of the cache key.
        self._init_args = args
        self._init_kwargs = kwargs
        super().__init__(*args, **kwargs)

    def get_projects(self, *a, ignore_cache=False, **kw):
        """Return the projects for the given input, using the on-disk cache.

        Parameters
        ----------
        *a, **kw
            Forwarded unchanged to `OriginalGeofetcher.get_projects`.
        ignore_cache : bool, default False
            When true, clear the cached results and fetch again. The fresh
            result is returned directly and is not written back to the cache.

        Returns
        -------
        Whatever `OriginalGeofetcher.get_projects` returns.

        Raises
        ------
        The underlying exception of the last attempt, once the three retry
        attempts are exhausted.
        """
        if ignore_cache:
            # clear() drops every cached entry of this function, not only the
            # entry matching the current arguments.
            _get_configured_projects_cached.clear()
            return _fetch_configured_projects_retry(
                self._init_args, self._init_kwargs, *a, **kw
            )
        return _get_configured_projects_cached(
            self._init_args, self._init_kwargs, *a, **kw
        )


# Helpers used by Geofetcher.get_projects. They are looked up at call time, so
# defining them after the class is fine.
def _fetch_configured_projects(init_args, init_kwargs, *a, **kw):
    """Build a fetcher from the given constructor arguments and list projects."""
    fetcher = OriginalGeofetcher(*init_args, **init_kwargs)
    return fetcher.get_projects(*a, **kw)


# Up to three attempts with exponential backoff; the original error is
# re-raised when the attempts run out.
_fetch_configured_projects_retry = retry(
    wait=wait_exponential(multiplier=0.5, min=0.1, max=10),
    stop=stop_after_attempt(3),
    reraise=True,
)(_fetch_configured_projects)

# Cached on the constructor arguments plus the call arguments.
_get_configured_projects_cached = memory.cache(_fetch_configured_projects_retry)

In [None]:
#| show_doc
# Fetcher used by the listing example, and the GEO series accession that the
# following cells pass to get_projects.
geo=Geofetcher(just_metadata=True)
acc = 'GSE51239'

[INFO] [18:08:41] Metadata folder: /mnt/idms/home/magyary/bs-dna-methyl/nbs/project_name


List files

In [None]:
#| show_doc
#|eval: false
# Look up the projects for the accession (served from the on-disk cache when an
# entry exists); the bare name on the last line displays the result.
projects = geo.get_projects(acc)
projects

[INFO] [18:08:41] Metadata folder: /mnt/idms/home/magyary/bs-dna-methyl/nbs/project_name
[INFO] [18:08:41] Trying GSE51239 (not a file) as accession...
[INFO] [18:08:41] Trying GSE51239 (not a file) as accession...
[INFO] [18:08:41] Skipped 0 accessions. Starting now.
[INFO] [18:08:41] [38;5;200mProcessing accession 1 of 1: 'GSE51239'[0m
[INFO] [18:08:43] Processed 48 samples.
[INFO] [18:08:43] Expanding metadata list...
[INFO] [18:08:43] Found SRA Project accession: SRP030612
[INFO] [18:08:43] Downloading SRP030612 sra metadata
[INFO] [18:08:46] Parsing SRA file to download SRR records
[INFO] [18:08:46] Dry run, no data will be downloaded
[INFO] [18:08:46] Finished processing 1 accession(s)
[INFO] [18:08:46] Cleaning soft files ...
[INFO] [18:08:46] Creating complete project annotation sheets and config file...


{'GSE51239_raw': Project
 48 samples (showing first 20): hsperm-524-90, hsperm-530-90, hsperm-533-90, hsperm-534-90, h8c-1, h8c-2, hblast-1, hblast-2, hblast-3, hblastsingle-2, hblastsingle-5, hicm-1, hicm-2, hte-1, hte-2, hesp0-e1, hesp0-e4, hesp0-e5, hesp1-e1, hesp1-e4
 Sections: name, pep_version, sample_table, experiment_metadata, sample_modifiers, description}

Download files

In [None]:
#|eval: false
# Deliberately not exported: this is notebook-only usage that relies on `acc`
# from an earlier, non-exported cell. Exporting it would put an undefined name
# and a fetcher instantiation into the generated module at import time.
# NOTE(review): the metadata folder is a machine-specific absolute path.
geof = Geofetcher(
    processed=True,
    data_source="all",
    metadata_folder="/mnt/idms/home/magyary/sra-data/metadata/",
    name=acc,
)

[INFO] [18:09:46] Metadata folder: /mnt/idms/home/magyary/sra-data/metadata/GSE51239


In [None]:
#|eval: false
# ignore_cache=True clears the cached lookups and fetches again instead of
# reusing a stored result. just_metadata=False is forwarded to the underlying
# get_projects call; per the captured log, this run fetches SRR records.
projects_files=geof.get_projects(acc, just_metadata=False, ignore_cache=True)

[INFO] [18:09:55] Metadata folder: /mnt/idms/home/magyary/bs-dna-methyl/nbs/project_name
[INFO] [18:09:55] Trying GSE51239 (not a file) as accession...
[INFO] [18:09:55] Trying GSE51239 (not a file) as accession...
[INFO] [18:09:55] Skipped 0 accessions. Starting now.
[INFO] [18:09:55] [38;5;200mProcessing accession 1 of 1: 'GSE51239'[0m
[INFO] [18:09:57] Processed 48 samples.
[INFO] [18:09:57] Expanding metadata list...
[INFO] [18:09:57] Found SRA Project accession: SRP030612
[INFO] [18:09:57] Downloading SRP030612 sra metadata
[INFO] [18:09:58] Parsing SRA file to download SRR records
[INFO] [18:09:58] Getting SRR: SRR1003182  in (GSE51239)


2025-07-28T16:09:58 prefetch.3.2.1: 1) Resolving 'SRR1003182'...
2025-07-28T16:09:59 prefetch.3.2.1: Current preference is set to retrieve SRA Normalized Format files with full base quality scores


[INFO] [18:10:00] Getting SRR: SRR1003183  in (GSE51239)


2025-07-28T16:10:00 prefetch.3.2.1: 1) 'SRR1003182' is found locally 
2025-07-28T16:10:00 prefetch.3.2.1: 1) Resolving 'SRR1003183'...
2025-07-28T16:10:01 prefetch.3.2.1: Current preference is set to retrieve SRA Normalized Format files with full base quality scores
2025-07-28T16:10:02 prefetch.3.2.1: 1) Downloading 'SRR1003183'...
2025-07-28T16:10:02 prefetch.3.2.1:  SRA Normalized Format file is being retrieved
2025-07-28T16:10:02 prefetch.3.2.1:  Downloading via HTTPS...
2025-07-28T16:10:02 prefetch.3.2.1:    Continue download of 'SRR1003183' from 154660408


Explore files

In [None]:
#|eval: false
# Deliberately not exported: `projects_files` only exists in this notebook
# session, so exporting this line would break importing the generated module.
# Show the first five rows of the sample table of the fetched project.
projects_files['GSE51239_raw'].sample_table.head()

Unnamed: 0_level_0,sample_name,protocol,organism,read_type,data_source,srr,srx,sample_title,sample_geo_accession,sample_status,...,passage,sequenced_molecule,genome_build,biosample,gsm_id,sra,reanalyzed_by,sample_treatment_protocol_ch1,sample_extract_protocol_ch1,supplementary_files_format_and_content
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
hsperm-524-90,hsperm-524-90,,,,,,,hSperm-524-90,GSM1240860,Public on Jul 23 2014,...,0,Msp1 digested bisulfite converted genomic DNA,hg19,https://www.ncbi.nlm.nih.gov/biosample/SAMN023...,,,,"hICM, hTE,mICM and mTE were isolated using sta...",DNA was extracted from all tissue using 14 hou...,".cpgs: (chromosome,position,seenCount,methylat..."
hsperm-530-90,hsperm-530-90,,,,,,,hSperm-530-90,GSM1240861,Public on Jul 23 2014,...,0,Msp1 digested bisulfite converted genomic DNA,hg19,https://www.ncbi.nlm.nih.gov/biosample/SAMN023...,,,,"hICM, hTE,mICM and mTE were isolated using sta...",DNA was extracted from all tissue using 14 hou...,".cpgs: (chromosome,position,seenCount,methylat..."
hsperm-533-90,hsperm-533-90,,,,,,,hSperm-533-90,GSM1240862,Public on Jul 23 2014,...,0,Msp1 digested bisulfite converted genomic DNA,hg19,https://www.ncbi.nlm.nih.gov/biosample/SAMN023...,,,,"hICM, hTE,mICM and mTE were isolated using sta...",DNA was extracted from all tissue using 14 hou...,".cpgs: (chromosome,position,seenCount,methylat..."
hsperm-534-90,hsperm-534-90,,,,,,,hSperm-534-90,GSM1240863,Public on Jul 23 2014,...,0,Msp1 digested bisulfite converted genomic DNA,hg19,https://www.ncbi.nlm.nih.gov/biosample/SAMN023...,,,,"hICM, hTE,mICM and mTE were isolated using sta...",DNA was extracted from all tissue using 14 hou...,".cpgs: (chromosome,position,seenCount,methylat..."
h8c-1,h8c-1,,,,,,,h8c-1,GSM1240864,Public on Jul 23 2014,...,0,Msp1 digested bisulfite converted genomic DNA,hg19,https://www.ncbi.nlm.nih.gov/biosample/SAMN023...,,,,"hICM, hTE,mICM and mTE were isolated using sta...",DNA was extracted from all tissue using 14 hou...,".cpgs: (chromosome,position,seenCount,methylat..."


In [None]:
#| hide
# Run nbdev's export step, which writes the cells marked for export into the
# library module named by the `default_exp` directive.
import nbdev; nbdev.nbdev_export()