In [221]:
import textwrap
import re
import sys
from collections import defaultdict
import requests
import unicodedata
from rapidfuzz import process, fuzz
import requests
import pandas as pd
    
def fetch_tracks(assembly):
    resp = requests.get(f"https://api.genome.ucsc.edu/list/tracks?genome={assembly}")
    resp.raise_for_status()
    tracks = resp.json()[assembly]
    tups = [(name, data['longLabel']) for name, data in tracks.items()]
    return sorted(tups, key=lambda t: t[0].upper())

def list_ucsc_tracks(assembly=None, label_wrap=80):
    search_ucsc_tracks(assembly=assembly,label_wrap=label_wrap)

def search_ucsc_tracks(*queries, assembly=None, label_wrap=80):

    assert assembly is not None
    
    def normalize(name):
        name = name.lower()
        name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode()
        name = re.sub(r"[^\w\s]", "", name)
        return name.strip()

    tracks = fetch_tracks(assembly=assembly)
    track_names, track_labels = zip(*tracks)

    if not queries:  
        entries = tracks #names, labels = track_names, track_labels
    else:        
        normalized_track_names = [normalize(n) for n in track_names]
        matches = defaultdict(float)
        for query in queries:
            for word in query.split():
                # match = process.extractOne(query_normalized, normalized_names, scorer=fuzz.WRatio)
                # match = process.extractOne(query_normalized, normalized_names, scorer=fuzz.WRatio, score_cutoff=90.0)
                search = process.extract(normalize(word), normalized_track_names, scorer=fuzz.WRatio, score_cutoff=80.0, limit=100)
                for name, score, index in search:
                    matches[(name, index)] += score
        
        sorted_hits = sorted([(v, k) for k, v in matches.items()], reverse=True)

        entries = []
        for score, (name, index) in sorted_hits:
            entries.append((track_names[index], track_labels[index]))

    if entries:
        ljust = max([len(e[0]) for e in entries]) + 2
        for name, label in entries:
            print(name.ljust(ljust) + '\n'.join(textwrap.wrap(label, width=label_wrap, subsequent_indent=' '*ljust)))

def get_ucsc_track(track_name, assembly, chrom=None, start=None, end=None):
    url = "https://api.genome.ucsc.edu/getData/track"
    params = {"genome": assembly, "track": track_name}
    if chrom is not None:
        assert chrom.startswith('chr')
        params['chrom'] = chrom
        if start is not None and end is not None:
            params['start'] = str(start)
            params['end'] = str(end)            
    response = requests.get(url, params=params)
    if response.ok:
    #    response.raise_for_status()
        try:
            track_data = response.json().get(track_name, [])
        except json.JSONDecodeError:
            print("Paramters does not represent a valid query", file=sys.stderr)
            return

        if type(track_data) is list:
            print("Track has heterogenous data records. Returning only attributes (columns) shared by all entries.", file=sys.stderr)
            
            shared_keys = list(set([k for d in track_data for k in d]))
            _track_data = defaultdict(list)
            for d in track_data:
                for k, v in d.items():
                    _track_data[k].append(v)
            track_data = _track_data
                
        try:
            return pd.DataFrame(track_data)
        except ValueError:
            print(df)
            raise

    
df = get_ucsc_track('ensGene', assembly='rheMac10', chrom='chr19')
df.head()

Track has heterogenous data records. Returning only attributes (columns) shared by all entries.


Unnamed: 0,bin,name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds,score,name2,cdsStartStat,cdsEndStat,exonFrames
0,585,ENSMMUT00000088937.1,chr19,-,38545,43443,41531,43443,2,3854543286,4155743443,0,ENSMMUG00000064388.1,cmpl,cmpl,10
1,585,ENSMMUT00000033378.4,chr19,-,61940,113720,61940,113720,16,"61940,62572,63310,69364,69910,88526,88750,9221...","62087,62749,63368,69642,70062,88615,88875,9231...",0,ENSMMUG00000023717.4,cmpl,cmpl,20120101010010
2,585,ENSMMUT00000069568.2,chr19,-,62351,113720,62560,113720,15,"62351,63310,69364,69910,88526,88750,92219,9239...","62749,63368,69642,70062,88615,88875,92314,9247...",0,ENSMMUG00000023717.4,cmpl,cmpl,20120101010010
3,585,ENSMMUT00000085572.1,chr19,-,83828,123100,86660,123073,14,"83828,87065,88526,88750,92219,92394,92865,1035...","86682,87483,88615,88875,92314,92476,93017,1036...",0,ENSMMUG00000023717.4,cmpl,cmpl,21201010100100
4,585,ENSMMUT00000089193.1,chr19,-,83828,122781,86660,122314,14,"83828,87065,88526,88750,92219,92394,92865,1035...","86682,87483,88615,88875,92314,92476,93017,1036...",0,ENSMMUG00000023717.4,cmpl,cmpl,21201010100100


In [208]:
list_ucsc_tracks(assembly='rheMac10')

augustusGene          AUGUSTUS ab initio gene predictions v3.1
chainNetHg19          Human (Feb. 2009 (GRCh37/hg19)), Chain and Net Alignments
chainNetHg38          Human (Dec. 2013 (GRCh38/hg38)), Chain and Net Alignments
chainNetMm10          Mouse (Dec. 2011 (GRCm38/mm10)), Chain and Net Alignments
chainNetMm39          Mouse (Jun. 2020 (GRCm39/mm39)), Chain and Net Alignments
chainNetNeoSch1       Hawaiian monk seal (Jun. 2017 (ASM220157v1/neoSch1)), Chain and Net Alignments
clinvarLift           Human ClinVar variants lifted to Rhesus
cpgIslandExt          CpG Islands (Islands < 300 Bases are Light Green)
cpgIslandExtUnmasked  CpG Islands on All Sequence (Islands < 300 Bases are Light Green)
cytoBandIdeo          Ideogram for Orientation
ensGene               Ensembl Genes
est                   Rhesus ESTs Including Unspliced
evaSnpContainer       Short Genetic Variants from European Variant Archive
gap                   Gap Locations
gc5BaseBw             GC Percent in 5-Base Win

In [209]:
get_ucsc_track('ensGene', assembly='rheMac10')

       bin                  name  chrom strand   txStart     txEnd  cdsStart  \
0      585  ENSMMUT00000088937.1  chr19      -     38545     43443     41531   
1      585  ENSMMUT00000033378.4  chr19      -     61940    113720     61940   
2      585  ENSMMUT00000069568.2  chr19      -     62351    113720     62560   
3      585  ENSMMUT00000085572.1  chr19      -     83828    123100     86660   
4      585  ENSMMUT00000089193.1  chr19      -     83828    122781     86660   
...    ...                   ...    ...    ...       ...       ...       ...   
3510  1029  ENSMMUT00000099271.1  chr19      +  58249911  58271005  58249911   
3511  1029  ENSMMUT00000089503.1  chr19      +  58249911  58271005  58249911   
3512  1029  ENSMMUT00000038349.3  chr19      -  58285362  58312850  58285875   
3513  1029  ENSMMUT00000085828.1  chr19      -  58285362  58297696  58285875   
3514  1029  ENSMMUT00000030783.4  chr19      -  58285369  58312847  58285875   

        cdsEnd  exonCount              

ValueError: All arrays must be of the same length

In [210]:
get_ucsc_track('ucscToRefSeq', assembly='rheMac10')

Unnamed: 0,chr1,chrUn_NW_021162293v1,chrUn_NW_021162233v1,chrUn_NW_021162201v1,chrUn_NW_021162242v1,chrUn_NW_021162064v1,chrUn_NW_021162018v1,chrUn_NW_021162284v1,chrUn_NW_021162073v1,chrUn_NW_021162254v1,...,chr11,chr9,chr8,chrX,chr7,chr4,chr6,chr3,chr5,chr2
0,"{'chrom': 'chr1', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chrUn_NW_021162293v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162233v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162201v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162242v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162064v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162018v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162284v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162073v1', 'chromStart'...","{'chrom': 'chrUn_NW_021162254v1', 'chromStart'...",...,"{'chrom': 'chr11', 'chromStart': 0, 'chromEnd'...","{'chrom': 'chr9', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr8', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chrX', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr7', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr4', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr6', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr3', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr5', 'chromStart': 0, 'chromEnd':...","{'chrom': 'chr2', 'chromStart': 0, 'chromEnd':..."


In [133]:
import requests
from requests.auth import HTTPBasicAuth


key = '4TB2XFCS'
secret = 'prgc6t46emkfuknd'

url = "https://data.4dnucleome.org/search/"
params = {
    "type": "File",
    "file_format": "bedpe",
    "output_type": "loop list",
    "status": "released",
    "assay_title": "Hi-C",
    "biosample_ontology.term_name": "GM12878",
    "limit": "5",
    "format": "json"
}
headers = {"accept": "application/json"}

r = requests.get(url, headers=headers, params=params, auth=HTTPBasicAuth(key, secret))
r.raise_for_status()
results = r.json()["@graph"]

for file in results:
    print(file["accession"], file["href"])

HTTPError: 404 Client Error: Not Found for url: https://data.4dnucleome.org/search/?type=File&file_format=bedpe&output_type=loop+list&status=released&assay_title=Hi-C&biosample_ontology.term_name=GM12878&limit=5&format=json

In [135]:
import os
import requests
from requests.auth import HTTPBasicAuth
import json
from tqdm.notebook import tqdm

def download_4dn(identifier, dowload_dir=os.getcwd(), pgbar=False):
    
    def download_file(url, dowload_dir=dowload_dir):
        if not os.path.exists(dowload_dir):
            os.makedirs(dowload_dir)
        file_name = url.split('/')[-1]
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(file_name, 'wb') as f:
                if pgbar:
                    pbar = tqdm(total=int(r.headers['Content-Length']))
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)         
                        if pgbar:
                            pbar.update(len(chunk))

    url = f"https://data.4dnucleome.org/ga4gh/drs/v1/objects/{identifier}/access/https"
    response = requests.get(url, auth=HTTPBasicAuth(key, secret))
    if not response.ok:
        assert 0
    info = response.json()    
    download_file(info["url"])

download_4dn('4DNFIA85JYD7', pgbar=True)


# 4DNESUB35TII 
# 4DNFI8HHP7VN
# 4DNFIDWGZLHX
# 4DNFIWCH6ADP
# 4DNFII3DT59U
# 4DNFILKF6IA2
# 4DNFIA85JYD7
# 4DNEXOTMVPJN 
# 4DNFIMPBTBKI
# 4DNFIEBHQU8O

  0%|          | 0/211524 [00:00<?, ?it/s]

In [24]:
def fetch_loop_files(cell_type="GM12878", limit=10):
    """Query 4DN for Hi-C loop files in the given human cell type"""
    url = f"{API_BASE}/search/"
    params = {
        "type": "File",
        "file_format": "bedpe",
        "output_type": "loop list",
        "status": "released",
        "assay_term_name": "Hi-C",
        "biosample_ontology.term_name": cell_type,
        "limit": str(limit),
        "format": "json"
    }
    r = requests.get(url, headers=HEADERS, params=params, auth=HTTPBasicAuth(key, secret))
    r.raise_for_status()
    results = r.json()["@graph"]
    print(f"Found {len(results)} loop files for {cell_type}")
    return results

fetch_loop_files()

HTTPError: 404 Client Error: Not Found for url: https://data.4dnucleome.org/search/?type=File&file_format=bedpe&output_type=loop+list&status=released&assay_term_name=Hi-C&biosample_ontology.term_name=GM12878&limit=10&format=json