# Parse JSON

In [16]:
import requests, json
import pandas as pd
from pandas import json_normalize
from functools import reduce

In [17]:
# Input and output locations (relative paths).
# https://www.encodeproject.org/matrix/?type=Experiment
# NOTE(review): the URL above is presumably where the metadata TSV was exported from — confirm.
# Tab-separated metadata table, loaded with pandas in the next cell.
metadata_file = "../assets/encode_test.tsv"
# Destination CSV for the final summary table (written in the save cell).
outfile = "../assets/encode_test.csv"

In [18]:
# Load the metadata table. read_table parses tab-separated text by default;
# skiprows=1 drops the file's first line so parsing starts at the second line.
# NOTE(review): presumably the first line is a non-header preamble — confirm against the TSV.
metadata = pd.read_table(metadata_file, skiprows=1)


# File formats kept when filtering the fetched file records (used by get_files_df).
file_formats = ["bed", "bigWig", "bigBed"]
# NOTE(review): output_types is not referenced by any cell visible here — confirm whether it is still needed.
output_types = ["fold change over control", "IDR thresholded peaks"]

# Base URL prepended to each file's "href" to build a full link (used by filter_files).
prefix_link = "https://www.encodeproject.org"
# Columns kept from each file record. An earlier variant of this list that also
# contained "file_format_type" was overwritten on the very next line and never
# took effect, so only the effective definition is kept.
file_columns = ["accession", "file_format", "file_type", "output_type", "assay_title", "assembly", "href"]

# Columns copied from each metadata row into the summary (used by extract_summary).
experiment_columns = ["Accession", "Biosample term name", "Biosample accession", "Organism", "Biosample ontology", "Perturbed", "Biological replicate", "Technical replicate"]

# Optional metadata filters (currently disabled).
# NOTE: the cells below read `metadata`, not `metadata_filtered`, so uncommenting
# these lines alone does not change the result.
# Filter values
# assay_title = "ATAC-seq"
# organism = "Homo sapiens"
# biosample_classification = "tissue"
# perturbation = False
# Filter metadata
#metadata_filtered = metadata[metadata['Assay title'] == assay_title]
#metadata_filtered = metadata_filtered[metadata_filtered['Organism'] == organism]
#metadata_filtered = metadata_filtered[metadata_filtered["Biosample ontology"].str.contains(biosample_classification)]
#metadata_filtered = metadata_filtered[metadata_filtered["Perturbed"] == perturbation]
#metadata_filtered = metadata_filtered.reset_index().drop("index", axis = 1)


In [19]:
# Extract file ids for each experiment 
def get_file_ids(row):
    """Return the file identifiers listed in a metadata row's "Files" field.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide a "Files" entry holding comma-separated paths. The
        identifier of each path is its third "/"-separated piece, i.e. the
        text between the second and third slash.

    Returns
    -------
    list of str
        One identifier per non-blank path, in the order listed. An empty list
        is returned when "Files" is not a string (for example a missing value)
        or holds no paths.

    Raises
    ------
    IndexError
        If a non-blank path has fewer than three "/"-separated pieces.
    """
    raw_files = row["Files"]
    # A missing cell is not a string and has nothing to split.
    if not isinstance(raw_files, str):
        return []

    # Whitespace is stripped only to detect blank entries (empty field,
    # trailing comma); it never affects the extracted identifier.
    paths = (entry.strip() for entry in raw_files.split(","))
    return [path.split("/")[2] for path in paths if path]

def get_biosample(biosample_id, timeout=30):
    """Fetch one object from the ENCODE portal as JSON and flatten it.

    NOTE(review): despite the name, get_files_df calls this function with the
    file ids produced by get_file_ids, and the result is filtered on
    "file_format" — the name and the "/biosample/" path segment look like
    leftovers; confirm before renaming.

    Parameters
    ----------
    biosample_id : str
        Identifier inserted into the request URL.
    timeout : float or tuple, optional
        Passed to requests.get so a stalled connection raises instead of
        blocking the notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        The JSON payload flattened by json_normalize.

    Notes
    -----
    The HTTP status code is not checked: whatever JSON body the server
    returns is flattened as-is.
    """
    # Ask the server for JSON and request the "object" frame of the record.
    headers = {'accept': 'application/json'}
    url = 'https://www.encodeproject.org/biosample/' + biosample_id + '/?frame=object'
    response = requests.get(url, headers=headers, timeout=timeout)
    # response.json() already yields plain Python containers, so no
    # dumps/loads round-trip is needed before flattening.
    return json_normalize(response.json())

def get_files_df(row):
    """Fetch the records of all files listed in a metadata row and keep the wanted formats.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Metadata row; its file ids are obtained with get_file_ids.

    Returns
    -------
    pandas.DataFrame
        The fetched records (one get_biosample result per file id, stacked)
        restricted to rows whose "file_format" is in the module-level
        `file_formats` list. When the row lists no file ids, an empty frame
        with only a "file_format" column is returned.
    """
    # Fetch everything first and concatenate once; growing a DataFrame with
    # concat inside the loop re-copies all previous rows on every iteration.
    records = [get_biosample(file_id) for file_id in get_file_ids(row)]
    if not records:
        return pd.DataFrame(columns=["file_format"])

    all_files = pd.concat(records)
    return all_files[all_files["file_format"].isin(file_formats)]

def filter_files(row, prefix_link, file_columns):
    """Return the selected file columns for a metadata row, with a full link per file.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Metadata row passed on to get_files_df.
    prefix_link : str
        Text prepended to each file's "href" value to build the "link" column.
    file_columns : list of str
        Columns to keep from the file records; must include "href".

    Returns
    -------
    pandas.DataFrame
        The selected columns with "href" replaced by "link" and "accession"
        renamed to "File accession". If get_files_df returns no rows, the
        result is an empty frame with the same columns.
    """
    files = get_files_df(row)
    if files.empty:
        # No retained files: the requested columns may not exist at all, so
        # start from an empty frame that has them instead of indexing.
        selected = pd.DataFrame(columns=file_columns)
    else:
        # Explicit copy so that adding "link" below does not write into a
        # slice of `files` (avoids pandas' SettingWithCopyWarning).
        selected = files.loc[:, file_columns].copy()

    selected["link"] = prefix_link + selected["href"]
    return (
        selected
        .drop(columns="href")
        .rename(columns={"accession": "File accession"})
    )

def extract_summary(row):
    """Pair every retained file of a metadata row with that row's experiment fields.

    The file table comes from filter_files (called with the module-level
    `prefix_link` and `file_columns`); the experiment table is the single row
    formed from the `experiment_columns` entries of `row`. Both receive a
    constant "tmp" key so that merging on it yields every file/experiment
    combination; the key is dropped from the result.

    Returns
    -------
    pandas.DataFrame
        One row per retained file, carrying the file columns followed by the
        experiment columns.
    """
    file_table = filter_files(row, prefix_link, file_columns).assign(tmp=1)
    experiment_table = row[experiment_columns].to_frame().T.assign(tmp=1)
    joined = file_table.merge(experiment_table, on=['tmp'])
    return joined.drop(columns='tmp')

In [20]:
# Build one summary table per metadata row and stack them. Rows are iterated
# explicitly rather than through DataFrame.apply, which is not designed for
# functions that return DataFrames. Each table keeps its own index, so index
# values repeat across experiments.
summary = pd.concat([extract_summary(row) for _, row in metadata.iterrows()])

In [24]:
# Display the assembled summary table.
# NOTE(review): the saved output below already contains "Experiment accession" and
# "Biosample type", which the visible code only creates in the later "finish prepping"
# cell — this output presumably comes from an out-of-order run; re-run the notebook
# top to bottom to refresh it.
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Experiment accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate,Biosample type
0,ENCFF452QGQ,bigWig,bigWig,minus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF452QG...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
1,ENCFF062JVN,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF062JV...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
2,ENCFF347XMG,bigWig,bigWig,plus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF347XM...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
3,ENCFF155ZDL,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF155ZD...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
0,ENCFF096UXA,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF096UX...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
1,ENCFF310TKX,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF310TK...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
2,ENCFF074CQZ,bigWig,bigWig,minus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF074CQ...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
3,ENCFF648EZR,bigWig,bigWig,plus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF648EZ...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
4,ENCFF849WRD,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF849WR...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
5,ENCFF126WNO,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF126WN...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue


In [None]:
# NOTE(review): this unexecuted cell (In [None]) repeats the display cell above and
# carries the same saved output; it looks redundant — confirm before removing.
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Experiment accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate,Biosample type
0,ENCFF452QGQ,bigWig,bigWig,minus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF452QG...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
1,ENCFF062JVN,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF062JV...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
2,ENCFF347XMG,bigWig,bigWig,plus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF347XM...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
3,ENCFF155ZDL,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF155ZD...,ENCSR748DUR,adrenal gland,ENCBS999WBI,Mus musculus,/biosample-types/tissue_UBERON_0002369/,False,1,1,tissue
0,ENCFF096UXA,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF096UX...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
1,ENCFF310TKX,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF310TK...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
2,ENCFF074CQZ,bigWig,bigWig,minus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF074CQ...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
3,ENCFF648EZR,bigWig,bigWig,plus strand signal of unique reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF648EZ...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
4,ENCFF849WRD,bigWig,bigWig,plus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF849WR...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue
5,ENCFF126WNO,bigWig,bigWig,minus strand signal of all reads,total RNA-seq,mm10,https://www.encodeproject.org/files/ENCFF126WN...,ENCSR795WFC,heart,"ENCBS540JOT,ENCBS897MIA",Mus musculus,/biosample-types/tissue_UBERON_0000948/,False,12,1,tissue


In [33]:
# Finish preparing the summary table.
# "Biosample type" is the third "/"-separated piece of "Biosample ontology",
# cut at its first "_".
# NOTE(review): only the text before the first underscore is kept, so a type
# whose own name contains an underscore would be truncated — confirm this is intended.
summary["Biosample type"] = (
    summary["Biosample ontology"]
    .str.split("/", expand = True)[2]
    .str.split("_", expand = True)[0]
)
summary = (
    summary
    .rename(columns = {"Accession": "Experiment accession"})
    .drop_duplicates()
    # retain only the plus-strand, unique-reads signal in output_type
    .loc[lambda table: table["output_type"] == "plus strand signal of unique reads"]
    # keep only the mm10 and GRCh38 assemblies
    .loc[lambda table: table["assembly"].isin(["mm10", "GRCh38"])]
    # when several rows share an experiment accession, keep the first one
    .drop_duplicates(subset = ["Experiment accession"])
)


# SAVE

In [36]:
# Save the summary as CSV without the row index.
# Spaces in the column headers are substituted with "_" first.
summary = summary.rename(columns=lambda header: header.replace(" ", "_"))
summary.to_csv(outfile, index=False)