# Parse ENCODE JSON metadata into a per-file summary table

In [5]:
import requests, json
import pandas as pd
from pandas import json_normalize
from functools import reduce

In [6]:
# Input report, presumably exported from the ENCODE experiment matrix:
# https://www.encodeproject.org/matrix/?type=Experiment
# To process the other report kept under ../assets/, point these at
# "../assets/encode_test.tsv" and "../assets/encode_test.csv" instead.
metadata_file = "../assets/gopher_2.tsv"
outfile = "../assets/gopher_2.csv"

In [7]:
# Load Metadata
# The file is read as tab-separated text. skiprows=1 drops the first line, so
# the second line of the file supplies the column names (presumably the first
# line is a report preamble rather than data -- TODO confirm against the file).
metadata = pd.read_csv(metadata_file, sep = "\t", skiprows=1)


# File formats kept when collecting the files of an experiment
file_formats = ["bed", "bigWig", "bigBed"]
# Output types of interest (not referenced by the cells visible in this notebook)
output_types = ["fold change over control", "IDR thresholded peaks"]

# Extract columns of interest
# Download links are built as prefix_link + the file's "href" field
prefix_link = "https://www.encodeproject.org"
# Per-file fields to keep; note that "file_format_type" is not selected
file_columns = [
    "accession",
    "file_format",
    "file_type",
    "output_type",
    "assay_title",
    "assembly",
    "href",
]

# Columns of the metadata table that are copied onto every file row
experiment_columns = [
    "Accession",
    "Biosample term name",
    "Biosample accession",
    "Organism",
    "Biosample ontology",
    "Perturbed",
    "Biological replicate",
    "Technical replicate",
]

# Filter values (optional; the filters below are currently commented out)
# assay_title = "ATAC-seq"
# organism = "Homo sapiens"
# biosample_classification = "tissue"
# perturbation = False
# Filter metadata
#metadata_filtered = metadata[metadata['Assay title'] == assay_title]
#metadata_filtered = metadata_filtered[metadata_filtered['Organism'] == organism]
#metadata_filtered = metadata_filtered[metadata_filtered["Biosample ontology"].str.contains(biosample_classification)]
#metadata_filtered = metadata_filtered[metadata_filtered["Perturbed"] == perturbation]
#metadata_filtered = metadata_filtered.reset_index().drop("index", axis = 1)


In [8]:
# Extract file ids for each experiment 
def get_file_ids(row):
    """Return the file accessions listed in a row's "Files" field.

    The field is read as a comma-separated string of paths; the accession is
    taken as the third "/"-separated piece of each path (e.g. a path of the
    form "/files/<accession>/").

    Parameters
    ----------
    row : mapping or pandas.Series
        One experiment record; only ``row["Files"]`` is read.

    Returns
    -------
    list of str
        One accession per listed path, in the original order. Empty when the
        field is not a string (e.g. a missing cell read as NaN) or is blank.

    Raises
    ------
    IndexError
        If a non-blank entry has fewer than three "/"-separated pieces.
    """
    raw_files = row["Files"]
    # A missing cell is not a string (pandas reads it as NaN) and has no .split()
    if not isinstance(raw_files, str):
        return []

    def get_file_name(name):
        return name.split("/")[2]

    # Tolerate spaces around the commas and blank entries (e.g. a trailing comma)
    entries = (entry.strip() for entry in raw_files.split(","))
    return [get_file_name(entry) for entry in entries if entry]

def get_biosample(biosample_id, timeout=60):
    """Fetch one ENCODE object as JSON and flatten it into a one-row DataFrame.

    NOTE(review): the URL path says ``/biosample/``, but the caller in this
    notebook (``get_files_df``) passes file accessions. Presumably the server
    resolves the accession regardless of the path -- confirm.

    Parameters
    ----------
    biosample_id : str
        Accession inserted into the request URL.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The JSON object flattened by ``json_normalize``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Force return from the server in JSON format
    headers = {'accept': 'application/json'}
    url = 'https://www.encodeproject.org/biosample/'+biosample_id+'/?frame=object'
    # GET the object
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an HTTP error instead of flattening the error payload into
    # a frame that lacks the expected columns
    response.raise_for_status()
    # response.json() already yields plain Python objects; no re-serialisation needed
    return json_normalize(response.json())

def get_files_df(row):
    """Fetch every file listed for an experiment and keep the wanted formats.

    Parameters
    ----------
    row : pandas.Series
        One experiment record, passed to ``get_file_ids``.

    Returns
    -------
    pandas.DataFrame
        One row per fetched file whose ``file_format`` is in the module-level
        ``file_formats`` list. When the experiment lists no files, an empty
        frame with only a ``file_format`` column is returned.
    """
    # Get files
    file_ids = get_file_ids(row)
    # Fetch everything first and concatenate once: growing a frame inside the
    # loop re-copies all previously collected rows on every iteration
    records = [get_biosample(file_id) for file_id in file_ids]
    if not records:
        # pd.concat() refuses an empty list; hand back an empty frame instead
        return pd.DataFrame(columns=["file_format"])
    biosamples = pd.concat(records)
    filtered_files = biosamples[biosamples["file_format"].isin(file_formats)]
    return filtered_files

def filter_files(row, prefix_link, file_columns):
    """Select the file columns of interest and build a full link per file.

    Parameters
    ----------
    row : pandas.Series
        One experiment record, passed to ``get_files_df``.
    prefix_link : str
        Prefix prepended to each file's ``href`` to form the ``link`` column.
    file_columns : list of str
        Columns to keep from the fetched files; expected to contain ``"href"``.

    Returns
    -------
    pandas.DataFrame
        The selected columns with ``href`` replaced by a trailing ``link``
        column and ``accession`` renamed to ``"File accession"``. Empty (but
        with those columns) when the experiment has no matching files.

    Raises
    ------
    KeyError
        If matching files exist but one of ``file_columns`` is absent.
    """
    # Filter files
    files = get_files_df(row)
    if files.empty:
        # With no matching files, the requested columns may not exist at all;
        # return an empty frame with the same layout as the normal result.
        kept = [column for column in file_columns if column != "href"] + ["link"]
        return pd.DataFrame(columns=kept).rename(columns={"accession": "File accession"})
    # assign() works on a fresh copy, so no column is set on a slice of `files`
    files_filtered = (
        files[file_columns]
        .assign(link=lambda selected: prefix_link + selected["href"])
        .drop(columns="href")
        .rename(columns={"accession": "File accession"})
    )
    return files_filtered

def extract_summary(row):
    """Attach an experiment's metadata to each of its selected files.

    Parameters
    ----------
    row : pandas.Series
        One experiment record; it must contain the module-level
        ``experiment_columns`` and whatever ``filter_files`` reads.

    Returns
    -------
    pandas.DataFrame
        One row per selected file, followed by the experiment columns.
    """
    file_rows = filter_files(row, prefix_link, file_columns).assign(tmp=1)
    experiment_row = row[experiment_columns].to_frame().T.assign(tmp=1)
    # Both sides carry the same constant "tmp" key, so the merge pairs every
    # file row with the single experiment row; the helper key is then dropped.
    return file_rows.merge(experiment_row, on=['tmp']).drop(columns='tmp')

In [13]:
# Build one summary frame per experiment (one per row of `metadata`) and stack
# them. The row index is not reset, so labels restart for every experiment.
summary = pd.concat(
    list(metadata.apply(extract_summary, axis=1))
)

In [14]:
# Display the combined per-file summary table
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate
0,ENCFF062JUV,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF062JU...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
1,ENCFF800DVG,bigBed,bigBed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF800DV...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
2,ENCFF980BJM,bed,bed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF980BJ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
3,ENCFF712PMG,bigBed,bigBed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF712PM...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
4,ENCFF650GDO,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF650GD...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
5,ENCFF766BZE,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF766BZ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
6,ENCFF802CQD,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF802CQ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
0,ENCFF581SQD,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF581SQ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
1,ENCFF239SMJ,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF239SM...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
2,ENCFF705QZC,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF705QZ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1


In [15]:
# deep copy of the dataframe, kept under a separate name
# (summary_safe is not read again in the cells shown in this notebook)
summary_safe = summary.copy(deep=True)
# Display the summary table
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate
0,ENCFF062JUV,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF062JU...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
1,ENCFF800DVG,bigBed,bigBed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF800DV...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
2,ENCFF980BJM,bed,bed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF980BJ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
3,ENCFF712PMG,bigBed,bigBed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF712PM...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
4,ENCFF650GDO,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF650GD...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
5,ENCFF766BZE,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF766BZ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
6,ENCFF802CQD,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF802CQ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
0,ENCFF581SQD,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF581SQ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
1,ENCFF239SMJ,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF239SM...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
2,ENCFF705QZC,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF705QZ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1


In [21]:
# Display the summary table again
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate
0,ENCFF062JUV,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF062JU...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
1,ENCFF800DVG,bigBed,bigBed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF800DV...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
2,ENCFF980BJM,bed,bed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF980BJ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
3,ENCFF712PMG,bigBed,bigBed narrowPeak,pseudoreplicated peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF712PM...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
4,ENCFF650GDO,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF650GD...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
5,ENCFF766BZE,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF766BZ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
6,ENCFF802CQD,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF802CQ...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1
0,ENCFF581SQD,bigWig,bigWig,signal p-value,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF581SQ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
1,ENCFF239SMJ,bed,bed narrowPeak,IDR thresholded peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF239SM...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1
2,ENCFF705QZC,bed,bed idr_ranked_peak,IDR ranked peaks,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF705QZ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1


In [22]:
# finish prepping experiment 
summary["Biosample type"] = summary["Biosample ontology"].str.split("/", expand = True)[2].str.split("_", expand = True)[0]
summary = summary.rename(columns = {"Accession": "Experiment accession"})
summary = summary.drop_duplicates()

In [24]:
# Preview the rows whose output type is "fold change over control"
summary[summary["output_type"] == "fold change over control"]

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Experiment accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate,Biosample type
0,ENCFF062JUV,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF062JU...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1,cell
5,ENCFF276QZQ,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF276QZ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1,cell
9,ENCFF095DJH,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF095DJ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1,cell
11,ENCFF957TWY,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF957TW...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1,cell
12,ENCFF378ZGI,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF378ZG...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1,cell


In [25]:

# Retain a single output_type, chosen per assay.
# Total RNA-seq would use "plus strand signal of unique reads":
#summary = summary[summary["output_type"] == "plus strand signal of unique reads"]
# ATAC-seq uses "fold change over control":
summary = (
    summary
    .loc[lambda df: df["output_type"] == "fold change over control"]
    # Keep only mm10 and GRCh38 assemblies
    .loc[lambda df: df["assembly"].isin(["mm10", "GRCh38"])]
    # When several rows share an experiment accession, only the first is kept
    .drop_duplicates(subset=["Experiment accession"])
)


In [26]:
# Display the filtered table (one row per experiment accession)
summary

Unnamed: 0,File accession,file_format,file_type,output_type,assay_title,assembly,link,Experiment accession,Biosample term name,Biosample accession,Organism,Biosample ontology,Perturbed,Biological replicate,Technical replicate,Biosample type
0,ENCFF062JUV,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF062JU...,ENCSR512YXO,GM21381,ENCBS295WIH,Homo sapiens,/biosample-types/cell_line_CLO_0015895/,False,1,1,cell
5,ENCFF276QZQ,bigWig,bigWig,fold change over control,ATAC-seq,GRCh38,https://www.encodeproject.org/files/ENCFF276QZ...,ENCSR485TLP,GM23338,"ENCBS057LKL,ENCBS994YIB,ENCBS247UHY",Homo sapiens,/biosample-types/cell_line_EFO_0007950/,False,132,1,cell


# SAVE

In [27]:
# Save
# Substitute spaces with "_" in the header, then write the table as CSV
# without the row index
summary.columns = [column_name.replace(" ", "_") for column_name in summary.columns]
summary.to_csv(outfile, index=False)