In [191]:
import json
import os
import requests
import subprocess

import pandas as pd
pd.set_option('display.max_colwidth', None)

In [93]:
DNASE_TRACK_IDXS = {
    "Pancreas": 257,
    "Ovary": 439,
    "Liver": 452,
    "Uterus": 283,
    "Testis": 665,
    "Spleen": 594,
    "Lung": 245,
    "Thyroid": 241,
    "Prostate": 653,
    "Vagina": 382,
    "Stomach": 204,
    "Adrenal_Gland": 265,
    "Cells_EBV-transformed_lymphocytes": 69
}

CRE_ANNOTATIONS_PATH = "CRE_annotation_files.tsv"
TARGETS_PATH = "/clusterfs/nilah/ruchir/tools/repos/basenji/manuscripts/cross2020/targets_human.txt"

In [112]:
targets_df = pd.read_csv(TARGETS_PATH, sep="\t", index_col=0)
cre_annotations_df = pd.read_csv(CRE_ANNOTATIONS_PATH, sep="\t", skiprows=1, header=0)

In [97]:
def get_encode_assay_id_of_basenji2_target_idx(target_idx: int):
    row = targets_df.iloc[target_idx]
    fname = row["file"]
    return fname.split("/")[-3]

In [101]:
encode_assay_ids = {
    tissue: get_encode_assay_id_of_basenji2_target_idx(target_idx)
    for tissue, target_idx in DNASE_TRACK_IDXS.items()
}

In [102]:
encode_assay_ids

{'Pancreas': 'ENCSR178JBL',
 'Ovary': 'ENCSR542KIX',
 'Liver': 'ENCSR562FNN',
 'Uterus': 'ENCSR237WJY',
 'Testis': 'ENCSR978QUT',
 'Spleen': 'ENCSR850YHJ',
 'Lung': 'ENCSR164WOF',
 'Thyroid': 'ENCSR158VAT',
 'Prostate': 'ENCSR958QXU',
 'Vagina': 'ENCSR437AYW',
 'Stomach': 'ENCSR082XEU',
 'Adrenal_Gland': 'ENCSR191FOV',
 'Cells_EBV-transformed_lymphocytes': 'ENCSR000EMT'}

In [162]:
def get_tissue_donor(tissue, encode_assay_ids):
    # Get biosamples
    headers = {"accept": "application/json"}
    experiment_url = f"https://www.encodeproject.org/experiments/{encode_assay_ids[tissue]}"
    response = requests.get(experiment_url, headers=headers)
    experiment = response.json()
    biosamples = set()
    for replicate in experiment["replicates"]:
        biosamples.add(replicate["library"]["biosample"]["accession"])
    
    # Get donors
    donors = set()
    for biosample in biosamples:
        biosample_url = f"https://www.encodeproject.org/experiments/{biosample}"
        response = requests.get(biosample_url, headers=headers)
        biosample = response.json()
        donors.add(biosample["donor"]["accession"])
        
    # Could have more than one donor if there is time series data for that assay
    if len(donors) > 1:
        print(f"Found more than 1 donor for {tissue}: {donors}. Keeping first.")
    donor = donors.pop()
    return donor

In [163]:
donors = {
    t: get_tissue_donor(t, encode_assay_ids)
    for t in encode_assay_ids
}

Found more than 1 donor for Liver: {'ENCDO060OTP', 'ENCDO987XTQ'}. Keeping first.


In [164]:
def get_chromhmm_encode_id(biosample_name: str, donor: str):
    chromhmm_df = cre_annotations_df[
        (cre_annotations_df["Biosample term name"].str.contains(biosample_name))
        & (cre_annotations_df["Description"].str.contains(donor))
    ]
    assert chromhmm_df.shape[0] == 1
    return chromhmm_df.iloc[0]["Accession"]

tissue_syns = {"Adrenal_Gland": "adrenal gland", "Cells_EBV-transformed_lymphocytes": "GM12878"}

chromhmm_encode_assay_ids = {
    t: get_chromhmm_encode_id(tissue_syns.get(t, t.lower()), donors[t])
    for t in DNASE_TRACK_IDXS
}

In [165]:
chromhmm_encode_assay_ids

{'Pancreas': 'ENCSR886WUC',
 'Ovary': 'ENCSR949DNG',
 'Liver': 'ENCSR867PUL',
 'Uterus': 'ENCSR148AJV',
 'Testis': 'ENCSR334BAX',
 'Spleen': 'ENCSR912KIG',
 'Lung': 'ENCSR576UIZ',
 'Thyroid': 'ENCSR510VAP',
 'Prostate': 'ENCSR988AHU',
 'Vagina': 'ENCSR369IGY',
 'Stomach': 'ENCSR788NNJ',
 'Adrenal_Gland': 'ENCSR233NHL',
 'Cells_EBV-transformed_lymphocytes': 'ENCSR988QYW'}

In [193]:
def download_chromhmm_file(assay_id, output_dir="chromHMM_bed"):
    headers = {"accept": "application/json"}
    annotations_url = f"https://www.encodeproject.org/annotations/{assay_id}"
    response = requests.get(annotations_url, headers=headers)
    annotations = response.json()
    
    file_id = None
    for f in annotations["files"]:
        if f["file_type"] == "bed bed9" and f["assembly"] == "GRCh38":
            file_id = f["accession"]
            break
    assert file_id is not None
    
    # Download file
    os.makedirs(output_dir, exist_ok=True)
    url = f"https://www.encodeproject.org/files/{file_id}/@@download/{file_id}.bed.gz"
    command = ["wget", "-P", output_dir, url]
    try:
        subprocess.run(command, check=True)
        print(f"File downloaded successfully to {output_dir}")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
    
    return file_id

In [195]:
chromhmm_encode_file_ids = {
    t: download_chromhmm_file(chromhmm_encode_assay_ids[t])
    for t in chromhmm_encode_assay_ids
}

wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:22:54--  https://www.encodeproject.org/files/ENCFF419IJB/@@download/ENCFF419IJB.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/7b30e95d-d25e-46bb-a35b-a725985705ec/ENCFF419IJB.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF419IJB.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=ldmYHFrZf5O5RVWZ%2Fxf5vDXawsA%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVD

  2900K .......... .......... .......... .......... .......... 61% 2.36M 1s
  2950K .......... .......... .......... .......... .......... 62% 21.2M 1s
  3000K .......... .......... .......... .......... .......... 63% 2.34M 1s
  3050K .......... .......... .......... .......... .......... 64% 2.35M 1s
  3100K .......... .......... .......... .......... .......... 65% 2.30M 1s
  3150K .......... .......... .......... .......... .......... 66% 18.8M 1s
  3200K .......... .......... .......... .......... .......... 67% 2.32M 1s
  3250K .......... .......... .......... .......... .......... 68% 2.34M 1s
  3300K .......... .......... .......... .......... .......... 69% 2.31M 1s
  3350K .......... .......... .......... .......... .......... 70% 22.7M 1s
  3400K .......... .......... .......... .......... .......... 71% 2.31M 1s
  3450K .......... .......... .......... .......... .......... 72% 2.33M 1s
  3500K .......... .......... .......... .......... .......... 73% 20.8M 1s
  3550K ....

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:22:57--  https://www.encodeproject.org/files/ENCFF332FBB/@@download/ENCFF332FBB.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/cb29f055-ff7b-46de-94c4-e41eb3f05d90/ENCFF332FBB.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF332FBB.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=NPfqaUuXNLGfplpTI76VfaeR5ks%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:22:58--  https://www.encodeproject.org/files/ENCFF277NKO/@@download/ENCFF277NKO.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/826bf764-ec86-41c8-95c9-5b13547c09ba/ENCFF277NKO.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF277NKO.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=00uLRlvL5gYHKq3Ffr7pDJNy7Ck%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:22:59--  https://www.encodeproject.org/files/ENCFF980UPG/@@download/ENCFF980UPG.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/30/4450c815-c2b9-4b00-913a-720fd93a1acd/ENCFF980UPG.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF980UPG.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=TKQh1ZmmEvpYKWkYNbbTOrCvlqs%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:00--  https://www.encodeproject.org/files/ENCFF240CKW/@@download/ENCFF240CKW.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/05/11/92e45c68-5608-4429-8226-4369f2f239bb/ENCFF240CKW.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF240CKW.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=xT%2FH4yLS0mR6rlo0%2Bd%2BKPYvSqaw%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:01--  https://www.encodeproject.org/files/ENCFF833BKF/@@download/ENCFF833BKF.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/7dd952e1-791c-4128-9dc2-49e305bcde74/ENCFF833BKF.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF833BKF.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=KI0nQfngYnoat4sxm6%2FVS1nSQx8%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVD

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:02--  https://www.encodeproject.org/files/ENCFF114BMO/@@download/ENCFF114BMO.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/30/1e51e906-1b50-48b1-8a46-60efc7b1282f/ENCFF114BMO.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF114BMO.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=N3C23WHe8330Qpzjhg8u3RIC0Po%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:03--  https://www.encodeproject.org/files/ENCFF907VNS/@@download/ENCFF907VNS.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/30/3604baa0-405c-492f-8ead-d9d16bb10be4/ENCFF907VNS.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF907VNS.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=fp55rftoC7L6l8oxmh2Ke0Vd86Y%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:04--  https://www.encodeproject.org/files/ENCFF900CEM/@@download/ENCFF900CEM.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/d1486576-651d-4f96-bea5-090e58206f5d/ENCFF900CEM.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF900CEM.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=cHKRKI32ZGsHXemy3J5SJL%2BwFKE%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVD

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:05--  https://www.encodeproject.org/files/ENCFF460IJD/@@download/ENCFF460IJD.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/30/cceeb53e-c290-4294-987f-836e2979f43e/ENCFF460IJD.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF460IJD.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=0WpJjYBbM3xjNh8wnf%2FRt%2FsSlB0%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2B

  2800K .......... .......... .......... .......... .......... 77% 2.20M 1s
  2850K .......... .......... .......... .......... .......... 78% 2.24M 1s
  2900K .......... .......... .......... .......... .......... 80% 17.5M 0s
  2950K .......... .......... .......... .......... .......... 81% 2.27M 0s
  3000K .......... .......... .......... .......... .......... 83% 2.29M 0s
  3050K .......... .......... .......... .......... .......... 84% 18.1M 0s
  3100K .......... .......... .......... .......... .......... 85% 2.24M 0s
  3150K .......... .......... .......... .......... .......... 87% 2.22M 0s
  3200K .......... .......... .......... .......... .......... 88% 2.25M 0s
  3250K .......... .......... .......... .......... .......... 89% 2.26M 0s
  3300K .......... .......... .......... .......... .......... 91% 19.1M 0s
  3350K .......... .......... .......... .......... .......... 92% 2.21M 0s
  3400K .......... .......... .......... .......... .......... 93% 2.30M 0s
  3450K ....

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:08--  https://www.encodeproject.org/files/ENCFF439HZQ/@@download/ENCFF439HZQ.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/9e3a321f-a12b-4400-aa22-1fc2e4275de2/ENCFF439HZQ.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF439HZQ.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=MDpjQI9mWURoJfto7d1u%2BbC4Z7k%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVD

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:09--  https://www.encodeproject.org/files/ENCFF884SEY/@@download/ENCFF884SEY.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/c63e654c-1959-46f0-9ff3-078bd282ca87/ENCFF884SEY.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF884SEY.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=5imidzfN5nfPBhQzVEig4gDc8n4%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


wget: /global/software/sl-7.x86_64/modules/langs/python/3.7/lib/libuuid.so.1: no version information available (required by wget)
--2023-12-11 19:23:10--  https://www.encodeproject.org/files/ENCFF338RIC/@@download/ENCFF338RIC.bed.gz
Resolving www.encodeproject.org (www.encodeproject.org)... 34.211.244.144
Connecting to www.encodeproject.org (www.encodeproject.org)|34.211.244.144|:443... connected.
HTTP request sent, awaiting response... 307 Temporary Redirect
Location: https://encode-public.s3.amazonaws.com/2021/03/29/4ea8f378-7643-44f4-8a2b-8ac310790e27/ENCFF338RIC.bed.gz?response-content-disposition=attachment%3B%20filename%3DENCFF338RIC.bed.gz&AWSAccessKeyId=ASIATGZNGCNXQCCBUVF3&Signature=UGygGw7HwvYRS7sDkJrcptAWNzo%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEKv%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FwEaCXVzLXdlc3QtMiJIMEYCIQD9crdLkYSEO2K6uph1b1rHPkBoBX2wUW72U8yAaWXSdQIhAJ9jYK%2FJr0jwmMo%2F4AnC30hsgh4lPEJaqwwdh2FhYiU%2FKrMFCCQQABoMMjIwNzQ4NzE0ODYzIgxSIZkag7ShLJhxIjEqkAUfQ%2B2nD9CTC0xBe0TG%2BVDep

File downloaded successfully to chromHMM_bed


......... .......... .......... .......... .......... 12%  143M 1s
   600K .......... .......... .......... .......... .......... 13% 2.55M 1s
   650K .......... .......... .......... .......... .......... 14% 73.7M 1s
   700K .......... .......... .......... .......... .......... 15% 80.9M 1s
   750K .......... .......... .......... .......... .......... 16% 79.1M 1s
   800K .......... .......... .......... .......... .......... 17% 79.0M 1s
   850K .......... .......... .......... .......... .......... 19% 80.4M 0s
   900K .......... .......... .......... .......... .......... 20% 83.3M 0s
   950K .......... .......... .......... .......... .......... 21% 93.2M 0s
  1000K .......... .......... .......... .......... .......... 22% 91.6M 0s
  1050K .......... .......... .......... .......... .......... 23% 2.79M 0s
  1100K .......... .......... .......... .......... .......... 24% 73.6M 0s
  1150K .......... .......... .......... .......... .......... 25% 79.8M 0s
  1200K .......... ..

In [197]:
# Combine dicts into one dataframe
tissues = list(DNASE_TRACK_IDXS.keys())
files_df = pd.DataFrame(
    {
        "basenji2_target_idx": [DNASE_TRACK_IDXS[t] for t in tissues],
        "DNase_encode_assay_id": [encode_assay_ids[t] for t in tissues],
        "donor_id": [donors[t] for t in tissues],
        "chromHMM_encode_assay_id": [chromhmm_encode_assay_ids[t] for t in tissues],
        "chromHMM_encode_file_id": [chromhmm_encode_file_ids[t] for t in tissues],
    },
    index=tissues
)
files_df.to_csv("chromhmm_file_metadata.tsv", sep="\t")

In [198]:
files_df

Unnamed: 0,basenji2_target_idx,DNase_encode_assay_id,donor_id,chromHMM_encode_assay_id,chromHMM_encode_file_id
Pancreas,257,ENCSR178JBL,ENCDO058AAA,ENCSR886WUC,ENCFF419IJB
Ovary,439,ENCSR542KIX,ENCDO793LXB,ENCSR949DNG,ENCFF332FBB
Liver,452,ENCSR562FNN,ENCDO060OTP,ENCSR867PUL,ENCFF277NKO
Uterus,283,ENCSR237WJY,ENCDO793LXB,ENCSR148AJV,ENCFF980UPG
Testis,665,ENCSR978QUT,ENCDO451RUA,ENCSR334BAX,ENCFF240CKW
Spleen,594,ENCSR850YHJ,ENCDO451RUA,ENCSR912KIG,ENCFF833BKF
Lung,245,ENCSR164WOF,ENCDO845WKR,ENCSR576UIZ,ENCFF114BMO
Thyroid,241,ENCSR158VAT,ENCDO845WKR,ENCSR510VAP,ENCFF907VNS
Prostate,653,ENCSR958QXU,ENCDO845WKR,ENCSR988AHU,ENCFF900CEM
Vagina,382,ENCSR437AYW,ENCDO793LXB,ENCSR369IGY,ENCFF460IJD
