<a href="https://colab.research.google.com/github/marcusvdl/healthier/blob/main/healthier_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DOWNLOADING DATA

In [15]:
#!/usr/local/bin/python3

"""---------------------------------------------------------------------------
A utility script for downloading data from the Harmonizome, with the ability
to configure which datasets and which file types to download.
Note that all content, once decompressed, is roughly 30GB. By default, files
are not decompressed on download.
Dependencies:
- requests is an HTTP library with an easy-to-use API:
  http://docs.python-requests.org/en/latest/
---------------------------------------------------------------------------"""

import requests
import os
import zlib

def _download_file(response, filename):
    """Save the body of ``response`` to ``filename`` exactly as served.

    The body is pulled through ``response.iter_content`` in 1024-byte
    chunks and written in binary mode, so no decoding or decompression is
    applied by this function.
    """
    with open(filename, 'wb') as out:
        out.writelines(response.iter_content(chunk_size=1024))

def _download_and_decompress_file(response, filename):
    """Stream a gzip-compressed response to disk, decompressing on the fly.

    The raw (still compressed) body is read from ``response.raw`` in
    1024-byte chunks and fed through a zlib decompressor. The output file is
    ``filename`` with a trailing ``.gz`` removed (``foo.txt.gz`` ->
    ``foo.txt``); a name that does not end in ``.gz`` is used unchanged.
    """
    # 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    if filename.endswith('.gz'):
        filename = filename[:-3]
    # decompress() returns bytes, so the file must be opened in binary mode;
    # text mode raises TypeError on the first write under Python 3.
    with open(filename, 'wb') as f:
        while True:
            chunk = response.raw.read(1024)
            if not chunk:
                break
            f.write(decompressor.decompress(chunk))
        # Emit anything the decompressor is still buffering.
        f.write(decompressor.flush())

def download_datasets(selected_datasets, selected_downloads, decompress=False):
    """Download the selected Harmonizome files for each selected dataset.

    Args:
        selected_datasets: iterable of ``(dataset, path)`` pairs. ``dataset``
            is used as the local output directory and in the progress
            message; ``path`` is the dataset's segment of the download URL.
        selected_downloads: iterable of file names requested for every
            dataset. A file whose request does not return HTTP 200 is
            skipped silently, since not every dataset has every file.
        decompress: when True, files whose name ends in ``.txt.gz`` are
            decompressed while downloading and saved without the ``.gz``
            suffix. When False (the default) files are stored as served.
    """
    for dataset, path in selected_datasets:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(dataset, exist_ok=True)

        for downloadable in selected_downloads:
            url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/%s/%s' %\
                  (path, downloadable)
            filename = '%s/%s' % (dataset, downloadable)

            # A streamed response holds its connection until it is closed;
            # the context manager releases it on every path, including the
            # skip below.
            with requests.get(url, stream=True) as response:
                # Not every dataset has all downloadables.
                if response.status_code != 200:
                    continue

                # Test the file name itself so the directory part of the
                # path cannot trigger decompression by accident.
                if decompress and downloadable.endswith('.txt.gz'):
                    _download_and_decompress_file(response, filename)
                else:
                    _download_file(response, filename)

        print('%s downloaded.' % dataset)


# Script entry point. download_datasets() is called with two lists: the
# (directory name, URL path) pairs of the datasets to fetch, and the file
# names requested for each of them. `decompress` is left at its default
# (False), so files are stored as served.
if __name__ == '__main__':
    # Uncomment a dataset to download it. With every entry commented out the
    # first list is empty and the call downloads nothing.
    download_datasets([
        # ('Achilles Cell Line Gene Essentiality Profiles', 'achilles'),
        # ('Allen Brain Atlas Adult Human Brain Tissue Gene Expression Profiles', 'brainatlasadulthuman'),
        # ('Allen Brain Atlas Adult Mouse Brain Tissue Gene Expression Profiles', 'brainatlasadultmouse'),
        # ('Allen Brain Atlas Developing Human Brain Tissue Gene Expression Profiles by Microarray', 'brainatlasdevelopmentalhumanmicroarray'),
        # ('Allen Brain Atlas Developing Human Brain Tissue Gene Expression Profiles by RNA-seq', 'brainatlasdevelopmentalhumanrnaseq'),
        # ('Allen Brain Atlas Prenatal Human Brain Tissue Gene Expression Profiles', 'brainatlasprenatalhuman'),
        # ('BIND Biomolecular Interactions', 'bind'),
        # ('BioGPS Cell Line Gene Expression Profiles', 'biogpsnci60'),
        # ('BioGPS Human Cell Type and Tissue Gene Expression Profiles', 'biogpshuman'),
        # ('BioGPS Mouse Cell Type and Tissue Gene Expression Profiles', 'biogpsmouse'),
        # ('BioGRID Protein-Protein Interactions', 'biogrid'),
        # ('Biocarta Pathways', 'biocarta'),
        # ('CCLE Cell Line Gene CNV Profiles', 'cclecnv'),
        # ('CCLE Cell Line Gene Expression Profiles', 'cclemrna'),
        # ('CCLE Cell Line Gene Mutation Profiles', 'cclemut'),
        # ('CHEA Transcription Factor Binding Site Profiles', 'chea'),
        # ('CHEA Transcription Factor Targets', 'cheappi'),
        # ('CMAP Signatures of Differentially Expressed Genes for Small Molecules', 'cmap'),
        # ('COMPARTMENTS Curated Protein Localization Evidence Scores', 'jensencompartmentcurated'),
        # ('COMPARTMENTS Experimental Protein Localization Evidence Scores', 'jensencompartmentexpts'),
        # ('COMPARTMENTS Text-mining Protein Localization Evidence Scores', 'jensencompartmenttextmining'),
        # ('CORUM Protein Complexes', 'corum'),
        # ('COSMIC Cell Line Gene CNV Profiles', 'cosmiccnv'),
        # ('COSMIC Cell Line Gene Mutation Profiles', 'cosmicmut'),
        # ('CTD Gene-Chemical Interactions', 'ctdchemical'),
        # ('CTD Gene-Disease Associations', 'ctddisease'),
        # ('ClinVar SNP-Phenotype Associations', 'clinvar'),
        # ('Combined Pathways Pathways', 'combinedpathways'),
        # ('dbGAP Gene-Trait Associations', 'dbgap'),
        # ('DEPOD Substrates of Phosphatases', 'depod'),
        # ('DIP Protein-Protein Interactions', 'dip'),
        # ('DISEASES Curated Gene-Disease Assocation Evidence Scores', 'jensendiseasecurated'),
        # ('DISEASES Experimental Gene-Disease Assocation Evidence Scores', 'jensendiseaseexpts'),
        # ('DISEASES Text-mining Gene-Disease Assocation Evidence Scores', 'jensendiseasetextmining'),
        # ('DrugBank Drug Targets', 'drugbank'),
        # ('ENCODE Histone Modification Site Profiles', 'encodehm'),
        # ('ENCODE Transcription Factor Binding Site Profiles', 'encodetf'),
        # ('ENCODE Transcription Factor Targets', 'encodetfppi'),
        # ('ESCAPE Omics Signatures of Genes and Proteins for Stem Cells', 'escape'),
        # ('GAD Gene-Disease Associations', 'gad'),
        # ('GAD High Level Gene-Disease Associations', 'gadhighlevel'),
        # ('GDSC Cell Line Gene Expression Profiles', 'gdsc'),
        # ('GEO Signatures of Differentially Expressed Genes for Diseases', 'geodisease'),
        # ('GEO Signatures of Differentially Expressed Genes for Gene Perturbations', 'geogene'),
        # ('GEO Signatures of Differentially Expressed Genes for Kinase Perturbations', 'geokinase'),
        # ('GEO Signatures of Differentially Expressed Genes for Small Molecules', 'geochemical'),
        # ('GEO Signatures of Differentially Expressed Genes for Transcription Factor Perturbations', 'geotf'),
        # ('GEO Signatures of Differentially Expressed Genes for Viral Infections', 'geovirus'),
        # ('GO Biological Process Annotations', 'gobp'),
        # ('GO Cellular Component Annotations', 'gocc'),
        # ('GO Molecular Function Annotations', 'gomf'),
        # ('GTEx Tissue Gene Expression Profiles', 'gtextissue'),
        # ('GTEx Tissue Sample Gene Expression Profiles', 'gtexsample'),
        # ('GTEx eQTL', 'gtexeqtl'),
        # ('GWAS Catalog SNP-Phenotype Associations', 'gwascatalog'),
        # ('GWASdb SNP-Disease Associations', 'gwasdbdisease'),
        # ('GWASdb SNP-Phenotype Associations', 'gwasdbphenotype'),
        # ('GeneRIF Biological Term Annotations', 'generif'),
        # ('GeneSigDB Published Gene Signatures', 'genesigdb'),
        # ('Graph of Medicine EHR Text-mining Clinical Term Annotations', 'graphofmedicine'),
        # ('Guide to Pharmacology Chemical Ligands of Receptors', 'guidetopharmchemical'),
        # ('Guide to Pharmacology Protein Ligands of Receptors', 'guidetopharmprotein'),
        # ('HMDB Metabolites of Enzymes', 'hmdb'),
        # ('HPA Cell Line Gene Expression Profiles', 'hpacelllines'),
        # ('HPA Tissue Gene Expression Profiles', 'hpatissuesmrna'),
        # ('HPA Tissue Protein Expression Profiles', 'hpatissuesprotein'),
        # ('HPA Tissue Sample Gene Expression Profiles', 'hpasamples'),
        # ('HPM Cell Type and Tissue Protein Expression Profiles', 'hpm'),
        # ('HPO Gene-Disease Associations', 'hpo'),
        # ('HPRD Protein-Protein Interactions', 'hprd'),
        # ('Heiser et al., PNAS, 2011 Cell Line Gene Expression Profiles', 'heiser'),
        # ('HuGE Navigator Gene-Phenotype Associations', 'hugenavigator'),
        # ('Hub Proteins Protein-Protein Interactions', 'hubs'),
        # ('HumanCyc Biomolecular Interactions', 'humancycppi'),
        # ('HumanCyc Pathways', 'humancyc'),
        # ('IntAct Biomolecular Interactions', 'intact'),
        # ('InterPro Predicted Protein Domain Annotations', 'interpro'),
        # ('JASPAR Predicted Transcription Factor Targets', 'jasparpwm'),
        # ('KEA Substrates of Kinases', 'kea'),
        # ('KEGG Biomolecular Interactions', 'keggppi'),
        # ('KEGG Pathways', 'kegg'),
        # ('Kinativ Kinase Inhibitor Bioactivity Profiles', 'kinativ'),
        # ('KinomeScan Kinase Inhibitor Targets', 'kinomescan'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene CNV Profiles', 'klijncnv'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene Expression Profiles', 'klijnmrna'),
        # ('Klijn et al., Nat. Biotechnol., 2015 Cell Line Gene Mutation Profiles', 'klijnmut'),
        # ('LINCS L1000 CMAP Signatures of Differentially Expressed Genes for Gene Knockdowns', 'lincscmapgene'),
        # ('LINCS L1000 CMAP Signatures of Differentially Expressed Genes for Small Molecules', 'lincscmapchemical'),
        # ('LOCATE Curated Protein Localization Annotations', 'locate'),
        # ('LOCATE Predicted Protein Localization Annotations', 'locatepredicted'),
        # ('MPO Gene-Phenotype Associations', 'mgimpo'),
        # ('MSigDB Cancer Gene Co-expression Modules', 'msigdbcomp'),
        # ('MSigDB Signatures of Differentially Expressed Genes for Cancer Gene Perturbations', 'msigdbonc'),
        # ('MiRTarBase microRNA Targets', 'mirtarbase'),
        # ('MotifMap Predicted Transcription Factor Targets', 'motifmap'),
        # ('NURSA Protein Complexes', 'nursa'),
        # ('NURSA Protein-Protein Interactions', 'nursappi'),
        # ('OMIM Gene-Disease Associations', 'omim'),
        # ('PANTHER Biomolecular Interactions', 'pantherppi'),
        # ('PANTHER Pathways', 'panther'),
        # ('PID Biomolecular Interactions', 'pidppi'),
        # ('PID Pathways', 'pid'),
        # ('Pathway Commons Protein-Protein Interactions', 'pc'),
        # ('PhosphoSitePlus Phosphosite-Disease Associations', 'phosphositeplusdisease'),
        # ('PhosphoSitePlus Substrates of Kinases', 'phosphositeplus'),
        # ('Phosphosite Textmining Biological Term Annotations', 'phosphositetextmining'),
        # ('ProteomicsDB Cell Type and Tissue Protein Expression Profiles', 'proteomicsdb'),
        # ('Reactome Biomolecular Interactions', 'reactomeppi'),
        # ('Reactome Pathways', 'reactome'),
        # ('Recon X Predicted Biomolecular Interactions', 'reconx'),
        # ('Roadmap Epigenomics Cell and Tissue DNA Accessibility Profiles', 'epigenomicsdnaaccessibility'),
        # ('Roadmap Epigenomics Cell and Tissue DNA Methylation Profiles', 'epigenomicsdnamethylation'),
        # ('Roadmap Epigenomics Cell and Tissue Gene Expression Profiles', 'epigenomicsmrna'),
        # ('Roadmap Epigenomics Histone Modification Site Profiles', 'epigenomicshm'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Drugs', 'silacdrug'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Gene Perturbations', 'silacgene'),
        # ('SILAC Phosphoproteomics Signatures of Differentially Phosphorylated Proteins for Protein Ligands', 'silacligand'),
        # ('SNPedia SNP-Phenotype Associations', 'snpedia'),
        # ('TCGA Signatures of Differentially Expressed Genes for Tumors', 'tcga'),
        # ('TISSUES Curated Tissue Protein Expression Evidence Scores', 'jensentissuecurated'),
        # ('TISSUES Experimental Tissue Protein Expression Evidence Scores', 'jensentissueexpts'),
        # ('TISSUES Text-mining Tissue Protein Expression Evidence Scores', 'jensentissuetextmining'),
        # ('TRANSFAC Curated Transcription Factor Targets', 'transfac'),
        # ('TRANSFAC Predicted Transcription Factor Targets', 'transfacpwm'),
        # ('TargetScan Predicted Conserved microRNA Targets', 'targetscan'),
        # ('TargetScan Predicted Nonconserved microRNA Targets', 'targetscannonconserved'),
        # ('Virus MINT Protein-Viral Protein Interactions', 'virusmintppi'),
        # ('Virus MINT Protein-Virus Interactions', 'virusmint'),
        # ('Wikipathways Pathways', 'wikipathways'),
    ], [
         # File names requested for every selected dataset; files a dataset
         # does not provide are skipped by download_datasets().
         'gene_attribute_matrix.txt.gz',
         'gene_attribute_edges.txt.gz',
         'gene_set_library_crisp.txt.gz',
         'gene_set_library_up_crisp.txt.gz',
         'gene_set_library_dn_crisp.txt.gz',
         'attribute_set_library_crisp.txt.gz',
         'attribute_set_library_up_crisp.txt.gz',
         'attribute_set_library_dn_crisp.txt.gz',
         'gene_similarity_matrix_cosine.txt.gz',
         'attribute_similarity_matrix_cosine.txt.gz',
         'gene_list_terms.txt.gz',
         'attribute_list_entries.txt.gz',
         'processing_script.m'
    ])

Achilles Cell Line Gene Essentiality Profiles downloaded.


In [4]:
# aging_datasets_downloader.py

# pip install anndata

import os
import requests
from pathlib import Path
import pandas as pd
import gzip
import zipfile
import anndata

# Root directory under which every dataset is saved. The path is relative,
# so it resolves against the current working directory; it is created here
# if it does not exist yet.
BASE_DIR = Path("datasets")
BASE_DIR.mkdir(exist_ok=True)

# Helper for downloading files
def download_file(name, url, subdir="general", timeout=60):
    """Download ``url`` into ``BASE_DIR / subdir`` and return the local path.

    Args:
        name: human-readable label, used only in the status messages.
        url: address to fetch. The local file name is the text after the
            last "/" of the URL.
        subdir: sub-directory of ``BASE_DIR`` to save into; created if
            missing.
        timeout: seconds passed to ``requests.get`` as its ``timeout``, so a
            stalled server cannot hang the script indefinitely.

    Returns:
        The destination path as a string. It is returned even when the
        download fails (failures are printed, not raised), so the file is
        not guaranteed to exist.
    """
    folder = BASE_DIR / subdir
    folder.mkdir(parents=True, exist_ok=True)
    filename = url.split("/")[-1]
    path = folder / filename
    # Write to a sibling temporary file first so that a transfer which fails
    # midway never leaves a truncated file at `path`.
    partial = folder / (filename + ".part")

    try:
        # Stream the body in chunks instead of holding it all in memory.
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        partial.replace(path)
        print(f"[OK] {name} salvo em {path}")
    except (requests.RequestException, OSError) as e:
        # Best-effort cleanup of the incomplete temporary file.
        try:
            partial.unlink()
        except OSError:
            pass
        print(f"[ERRO] {name}: {e}")

    return str(path)

# Helper for summarising datasets

def summarize_dataset(filepath):
    """Load a downloaded file, print a one-line summary and return its data.

    The loader is chosen from the file name suffix:

    * ``.xlsx`` / ``.csv``: read with pandas.
    * ``.txt.gz``: read as gzip-compressed, tab-separated text with pandas.
    * ``.zip``: extracted into ``<name>_extracted`` next to the archive;
      nothing is loaded.
    * ``.h5ad``: read with ``anndata.read_h5ad``.

    Args:
        filepath: path of the file, as ``str`` or any ``os.PathLike`` such
            as ``pathlib.Path``.

    Returns:
        A DataFrame for tabular files, an AnnData object for ``.h5ad``, or
        ``None`` for archives, unsupported suffixes and files that could not
        be read (the error is printed, not raised).
    """
    try:
        # The suffix checks below need a string; converting here lets callers
        # pass the pathlib.Path objects used elsewhere in this file.
        filepath = os.fspath(filepath)

        if filepath.endswith(".xlsx"):
            df = pd.read_excel(filepath)
        elif filepath.endswith(".csv"):
            df = pd.read_csv(filepath)
        elif filepath.endswith(".txt.gz"):
            with gzip.open(filepath, 'rt') as f:
                df = pd.read_csv(f, sep='\t')
        elif filepath.endswith(".zip"):
            with zipfile.ZipFile(filepath, 'r') as zip_ref:
                extract_path = Path(filepath).parent / (Path(filepath).stem + "_extracted")
                zip_ref.extractall(extract_path)
                print(f"[INFO] Arquivos extraídos para {extract_path}")
                return None
        elif filepath.endswith(".h5ad"):
            adata = anndata.read_h5ad(filepath)
            print(f"[INFO] {filepath} | AnnData: {adata.shape} | Obs: {adata.obs.shape} | Vars: {adata.var.shape}")
            return adata
        else:
            print(f"[WARN] Formato não suportado: {filepath}")
            return None

        # Only the first five column names are shown to keep the line short.
        print(f"[INFO] {filepath} | shape: {df.shape} | colunas: {list(df.columns)[:5]}...")
        return df
    except Exception as e:
        # Deliberately best-effort: one unreadable file must not stop the run.
        print(f"[ERRO ao ler] {filepath}: {e}")
        return None

# -----------------------------
# MULTI-OMICS / MULTI-TISSUES
# -----------------------------
# Sources with a direct URL are downloaded and summarised; for the others
# only a message pointing to the provider's site is printed.

# Aging Atlas
# Maps a display label to the URL of each spreadsheet to fetch.
aging_atlas_files = {
    "Transcriptome Bulk": "https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Bulk_Transcriptome.xlsx",
    "Single-cell Transcriptome": "https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Single_Cell.xlsx",
    "Proteome": "https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Proteome.xlsx",
    "Pharmacogenomics": "https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Pharmacogenomics.xlsx"
}

# download_file() returns the target path even when the request fails, so
# summarize_dataset() is still called and reports a read error in that case.
for name, url in aging_atlas_files.items():
    path = download_file(name, url, subdir="aging_atlas")
    summarize_dataset(path)

# GTEx
print("[INFO] GTEx requires registration and must be downloaded manually: https://gtexportal.org")

# Tabula Muris
# NOTE(review): the saved file name is the last URL segment ("22921744"),
# which has no extension, so summarize_dataset() reports it as an
# unsupported format.
path = download_file("Tabula Muris Senis Raw Data", "https://figshare.com/ndownloader/files/22921744", subdir="tabula_muris")
summarize_dataset(path)

# Tabula Sapiens
print("[INFO] Tabula Sapiens: acesse https://tabula-sapiens-portal.ds.czbiohub.org para download personalizado.")

# ENCODE
print("[INFO] ENCODE datasets devem ser baixados via interface: https://www.encodeproject.org")

# Human Cell Atlas
print("[INFO] HCA oferece API e interface: https://data.humancellatlas.org")

# Roadmap Epigenomics
print("[INFO] Roadmap Epigenomics: baixe por https://egg2.wustl.edu/roadmap/web_portal/")

# AFCA
# The archive is a .zip, so summarize_dataset() extracts it next to the
# download instead of loading it.
path = download_file("Aging Fly Cell Atlas (AFCA)", "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-022-05597-y/MediaObjects/41586_2022_5597_MOESM3_ESM.zip", subdir="afca")
summarize_dataset(path)

# Single Cell Portal
print("[INFO] Broad Institute Single Cell Portal: https://singlecell.broadinstitute.org/single_cell")

# -----------------------------
# LONGITUDINAL / AGE CLOCKS
# -----------------------------
# Only the GEO and MACA entries are downloaded; the rest print a pointer to
# the provider's site.

# ClockBase
print("[INFO] ClockBase requer acesso via https://clockbase.org")

# ROSMAP
print("[INFO] ROSMAP disponível via Synapse: https://www.synapse.org/#!Synapse:syn3219045")

# GEO (example: GSE201338)
# Saved as datasets/geo/GSE201338_series_matrix.txt.gz and read by
# summarize_dataset() as gzip-compressed, tab-separated text.
path = download_file("GSE201338 metadata", "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE201nnn/GSE201338/matrix/GSE201338_series_matrix.txt.gz", subdir="geo")
summarize_dataset(path)

# Mammalian Aging Cell Atlas (MACA)
# NOTE(review): the saved file name is the last URL segment ("33835075"),
# which has no extension, so summarize_dataset() reports it as an
# unsupported format.
path = download_file("MACA (Mouse Aging Single-Cell)", "https://figshare.com/ndownloader/files/33835075", subdir="maca")
summarize_dataset(path)

# SenNet
print("[INFO] SenNet data hub: https://sennetconsortium.org")

# Multi-Omics Longitudinal Fibroblasts
print("[INFO] Download pelo repositório suplementar do estudo: https://www.nature.com/articles/s41586-020-2326-1")

# BrainSpan / Allen Brain Atlas
print("[INFO] BrainSpan download: https://www.brainspan.org/static/download.html")

# LifeTime Initiative
print("[INFO] LifeTime resources: https://lifetime-fetflagship.eu")

# Framingham Heart Study
print("[INFO] FHS data via dbGaP: https://www.nhlbi.nih.gov/science/framingham-heart-study")

# -----------------------------
# REJUVENATION / INTERVENTIONS
# -----------------------------

# None of these sources is fetched by the script. One pointer line per
# resource is printed, in the order listed.
print("\n".join([
    # SINGULAR (Cell Rejuvenation Atlas)
    "[INFO] Acesse o Cell Rejuvenation Atlas (SINGULAR): https://rejuvenome.org",
    # TPE-IVIG Study
    "[INFO] Dataset suplementar via artigo original: https://www.nature.com/articles/s41467-023-38028-0",
    # Rejuvenation Roadmap
    "[INFO] Rejuvenation Roadmap datasets: https://www.lifespan.io/road-maps/the-rejuvenation-roadmap/",
    # iPSC datasets
    "[INFO] Acesse repositórios iPSC via GEO ou https://stemcellcommons.org",
    # DGIdb
    "[INFO] DGIdb API e downloads: https://www.dgidb.org",
    # DrugBank
    "[INFO] DrugBank requer registro para download: https://go.drugbank.com/releases/latest",
]))

# -----------------------------
# EXTRA DATABASES ADDED
# -----------------------------
# None of these sources is fetched by the script; each line printed below
# only points to where the resource can be obtained.

# agingbiotech.info
print("[INFO] agingbiotech.info (empresas e intervenções): https://agingbiotech.info")

# OpenGenes
print("[INFO] OpenGenes database: https://open-genes.org")

# Biomarkers of Aging Consortium
# The previous link contained a space and could not be followed.
# NOTE(review): replaced with the consortium's site as best known — confirm.
print("[INFO] Biomarkers of Aging Consortium (data access upon request): https://www.agingconsortium.org")

# CZI Cellxgene Discover (Census)
print("[INFO] CZI cellxgene Discover portal: https://census.cellxgene.cziscience.com")

# gnomAD
print("[INFO] gnomAD data (genome/variant frequency): https://gnomad.broadinstitute.org/downloads")

# -----------------------------
# Wrap-up
# -----------------------------
# Printed unconditionally once every section above has run; it does not
# check whether the individual downloads succeeded.
print("\n[✔] Script finalizado. Datasets disponíveis foram baixados e resumidos.")


[ERROR] Transcriptome Bulk: 404 Client Error:  for url: https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Bulk_Transcriptome.xlsx
[ERROR] Single-cell Transcriptome: 404 Client Error:  for url: https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Single_Cell.xlsx
[ERROR] Proteome: 404 Client Error:  for url: https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Proteome.xlsx
[ERROR] Pharmacogenomics: 404 Client Error:  for url: https://ngdc.cncb.ac.cn/aging/download/data/Aging_Atlas_Pharmacogenomics.xlsx
[INFO] GTEx requires registration and must be downloaded manually: https://gtexportal.org
[ERROR] Tabula Muris Senis Raw Data: 404 Client Error: Not Found for url: https://figshare.com/ndownloader/files/22921744
[INFO] Tabula Sapiens: access https://tabula-sapiens-portal.ds.czbiohub.org for personalized download.
[INFO] ENCODE datasets must be downloaded via the interface: https://www.encodeproject.org
[INFO] HCA offers API and interface: https://data.humancellatlas.org
[INFO

# PARSING DATA

In [19]:
import gzip

def read_gz_file(filepath):
    """Print the contents of a gzip-compressed text file, line by line.

    The file is opened in text mode and each line is echoed unchanged to
    standard output. Nothing is returned. A missing file and any other
    failure are reported with a printed message instead of being raised.
    """
    try:
        with gzip.open(filepath, mode='rt') as stream:
            # readline() returns '' only at end of file, which ends the loop.
            for text_line in iter(stream.readline, ''):
                print(text_line, end='')  # process each line as needed
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
    except Exception as err:
        print(f"An error occurred: {err}")

# Dump the GEO series matrix stored under datasets/geo.
# NOTE(review): the absolute /content prefix is presumably the Colab working
# directory — adjust when running elsewhere.
read_gz_file('/content/datasets/geo/GSE201338_series_matrix.txt.gz')

!Series_title	"Combining Stem Cell Rejuvenation and Senescence Targeting to Synergistically Extend Lifespan"
!Series_geo_accession	"GSE201338"
!Series_status	"Public on Apr 24 2022"
!Series_submission_date	"Apr 22 2022"
!Series_last_update_date	"Sep 21 2022"
!Series_summary	"We combine transient stem cell rejuvenation with targeted removal of senescent cells to test the hypothesis that simultaneously targeting both cell-fate based aging mechanisms will maximize life and health span benefits."
!Series_overall_design	"Drosophila midgut RNA profiles of tubGal80ts > UAS-TdTomato (WT), armGal4; tubGal80ts > UAS-OKSM (OKSM), armGal4; tubGal80ts > UAS-Sen (Sen) and armGal4; tubGal80ts > UAS-Sen; UAS-OKSM (Sen_OKSM); tubGal80ts > UAS-TdTomato (WT), esgGal4; tubGal80ts > UAS-OKSM (OKSM), esgGal4; tubGal80ts > UAS-Sen (Sen) and esgGal4; tubGal80ts > UAS-Sen; UAS-OKSM (Sen_OKSM)"
!Series_overall_design	""
!Series_overall_design	"Updates: [Sept. 20, 2022] The GSM6058923 and GSM6058926 metadata wer