In [2]:
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import os
import random

## Get histone metadata

In [3]:
metadata=pd.read_csv("../encode-histone/metadata.tsv",delimiter="\t")
print(metadata.shape)

(1861, 56)


In [4]:
metadata["Output type"].value_counts()

peaks               1314
stable peaks         243
replicated peaks     236
hotspots              68
Name: Output type, dtype: int64

In [5]:
metadata_stable=metadata[metadata["Output type"].isin(["stable peaks","replicated peaks"])]

In [6]:
metadata_stable_noaudits=metadata_stable[metadata_stable["Audit ERROR"].isnull()]

In [7]:
metadata_stable_noaudits.head()

Unnamed: 0,File accession,File format,File type,File format type,Output type,File assembly,Experiment accession,Assay,Biosample term id,Biosample term name,...,Assembly,Genome annotation,Platform,Controlled by,File Status,s3_uri,Audit WARNING,Audit INTERNAL_ACTION,Audit NOT_COMPLIANT,Audit ERROR
3,ENCFF243QBZ,bed narrowPeak,bed,narrowPeak,replicated peaks,hg19,ENCSR692ICP,ChIP-seq,CL:0000624,"CD4-positive, alpha-beta T cell",...,,,,released,s3://encode-public/2018/01/20/34fb463a-0625-4e...,,,,,
4,ENCFF660BIT,bed narrowPeak,bed,narrowPeak,stable peaks,hg19,ENCSR681OSD,ChIP-seq,CL:0000625,"CD8-positive, alpha-beta T cell",...,,,,released,s3://encode-public/2018/01/21/2590a4a6-cc7b-4a...,,,,,
8,ENCFF912DJF,bed narrowPeak,bed,narrowPeak,replicated peaks,hg19,ENCSR161XBV,ChIP-seq,CL:0000895,"naive thymus-derived CD4-positive, alpha-beta ...",...,,,,released,s3://encode-public/2018/01/26/d9c2eb95-d7ff-47...,,,,,
10,ENCFF277QQY,bed narrowPeak,bed,narrowPeak,stable peaks,hg19,ENCSR347HBG,ChIP-seq,NTR:0003079,fibroblast of breast,...,,,,released,s3://encode-public/2018/01/21/8390eefe-b18e-4d...,,,,,
13,ENCFF874WPQ,bed narrowPeak,bed,narrowPeak,stable peaks,hg19,ENCSR311XVL,ChIP-seq,NTR:0003079,fibroblast of breast,...,,,,released,s3://encode-public/2018/01/21/337bb77b-8f78-4f...,,,,,


In [8]:
df_exp=metadata_stable_noaudits[["Biosample term name","Experiment target"]].groupby(["Biosample term name","Experiment target"]).size().reset_index()

In [9]:
len(df_exp["Biosample term name"].unique())

52

In [30]:
len(df_exp["Experiment target"].unique())

11

In [31]:
df_exp[0].sum()

479

In [32]:
def create_combine_script(metadata):
    new_script_file = "/cellar/users/mpagadal/Data/projects/germline-immune/chip-seq/scripts/combine-bed-no-gz.sh"
    with open(new_script_file, 'w') as out_file:
        # header 
        out_file.write('#! /bin/bash\n')
        out_file.write('#SBATCH --mem=10G\n')
        out_file.write('#SBATCH -o ./out/%A.%x.%a.out # STDOUT\n')
        out_file.write('#SBATCH -e ./err/%A.%x.%a.err # STDERR\n')
        out_file.write("\n")
        # list of genes
        out_file.write("date\n")
        out_file.write("\n")
        for x in metadata["Biosample term name"].unique():
            df=metadata[metadata["Biosample term name"]==x]
            for y in df["Experiment target"].unique():
                files=df[df["Experiment target"]==y]["File accession"].tolist()
                files=["../encode-histone/"+x+".bed" for x in files]
                print(x)
                print(y)
                x=x.replace(" ","")
                x=x.replace("'","")
                x=x.replace(",","")
                y=y.split("-")[0]
                y=y.replace(" ","")
                out_file.write("cat {} > ../combined-beds/histone/{}.{}.bed".format(" ".join(files),x,y))
                out_file.write("\n")
                out_file.write("\n")
        
        out_file.write("date\n")
        #out_file.write("\n")

In [33]:
create_combine_script(metadata_stable_noaudits)

CD4-positive, alpha-beta T cell
H3K9me3-human
CD4-positivealpha-betaTcell
H3K27ac-human
CD4-positivealpha-betaTcell
H3K4me3-human
CD4-positivealpha-betaTcell
H3K27me3-human
CD4-positivealpha-betaTcell
H3K36me3-human
CD4-positivealpha-betaTcell
H3K4me1-human
CD8-positive, alpha-beta T cell
H3K36me3-human
CD8-positivealpha-betaTcell
H3K27ac-human
CD8-positivealpha-betaTcell
H3K9me3-human
CD8-positivealpha-betaTcell
H3K4me3-human
CD8-positivealpha-betaTcell
H3K4me1-human
CD8-positivealpha-betaTcell
H3K27me3-human
CD8-positivealpha-betaTcell
H3K9ac-human
naive thymus-derived CD4-positive, alpha-beta T cell
H3K4me3-human
naivethymus-derivedCD4-positivealpha-betaTcell
H3K4me1-human
naivethymus-derivedCD4-positivealpha-betaTcell
H3K27me3-human
naivethymus-derivedCD4-positivealpha-betaTcell
H3K27ac-human
naivethymus-derivedCD4-positivealpha-betaTcell
H3K36me3-human
naivethymus-derivedCD4-positivealpha-betaTcell
H3K9me3-human
fibroblast of breast
H3K36me3-human
fibroblastofbreast
H3K9me3-human


astrocyte
H3K4me3-human
astrocyte
H3K9ac-human
astrocyte
H3K27ac-human
astrocyte
H4K20me1-human
natural killer cell
H3K9me3-human
naturalkillercell
H3K27me3-human
naturalkillercell
H3K4me3-human
naturalkillercell
H3K36me3-human
naturalkillercell
H3K4me1-human
naturalkillercell
H3K27ac-human
kidney epithelial cell
H3K4me3-human
kidneyepithelialcell
H3K36me3-human
kidneyepithelialcell
H3K27me3-human
skeletal muscle myoblast
H3K79me2-human
skeletalmusclemyoblast
H3K9me3-human
skeletalmusclemyoblast
H3K4me3-human
skeletalmusclemyoblast
H3K9ac-human
skeletalmusclemyoblast
H3K4me1-human
skeletalmusclemyoblast
H3K36me3-human
skeletalmusclemyoblast
H3K27me3-human
skeletalmusclemyoblast
H4K20me1-human
skeletalmusclemyoblast
H3K27ac-human
skeletalmusclemyoblast
H3K4me2-human
skeletalmusclemyoblast
H2AFZ-human
fibroblast of dermis
H3K9ac-human
fibroblastofdermis
H3K27me3-human
fibroblastofdermis
H3K9me3-human
fibroblastofdermis
H3K79me2-human
fibroblastofdermis
H3K4me2-human
fibroblastofdermis
H3

## Get TF metadata

In [10]:
metadata=pd.read_csv("../encode-tf/metadata.tsv",delimiter="\t")
print(metadata.shape)

(1357, 55)


In [11]:
metadata["Output type"].value_counts()

peaks and background as input for IDR     621
optimal IDR thresholded peaks             291
peaks                                     180
pseudoreplicated IDR thresholded peaks    141
conservative IDR thresholded peaks        124
Name: Output type, dtype: int64

In [12]:
metadata_stable=metadata[metadata["Output type"].isin(["optimal IDR thresholded peaks","conservative IDR thresholded peaks"])]

In [13]:
metadata_stable_noaudits=metadata_stable[metadata_stable["Audit ERROR"].isnull()]

In [14]:
df_exp=metadata_stable_noaudits[["Biosample term name","Experiment target"]].groupby(["Biosample term name","Experiment target"]).size().reset_index()

In [15]:
len(df_exp["Biosample term name"].unique())

84

In [16]:
len(df_exp["Experiment target"].unique())

12

In [19]:
df_exp[0].sum()

323

In [45]:
def create_combine_script(metadata):
    new_script_file = "/cellar/users/mpagadal/Data/projects/germline-immune/chip-seq/scripts/combine-bed-no-gz.sh"
    with open(new_script_file, 'w') as out_file:
        # header 
        out_file.write('#! /bin/bash\n')
        out_file.write('#SBATCH --mem=10G\n')
        out_file.write('#SBATCH -o ./out/%A.%x.%a.out # STDOUT\n')
        out_file.write('#SBATCH -e ./err/%A.%x.%a.err # STDERR\n')
        out_file.write("\n")
        # list of genes
        out_file.write("date\n")
        out_file.write("\n")
        for x in metadata["Biosample term name"].unique():
            df=metadata[metadata["Biosample term name"]==x]
            for y in df["Experiment target"].unique():
                files=df[df["Experiment target"]==y]["File accession"].tolist()
                files=["../encode-tf/"+x+".bed" for x in files]
                print(x)
                print(y)
                x=x.replace(" ","")
                x=x.replace("'","")
                x=x.replace(",","")
                y=y.split("-")[0]
                y=y.replace(" ","")
                out_file.write("cat {} > ../combined-beds/tf/{}.{}.bed".format(" ".join(files),x,y))
                out_file.write("\n")
                out_file.write("\n")
        
        out_file.write("date\n")


In [46]:
create_combine_script(metadata_stable_noaudits)

gastrocnemius medialis
CTCF-human
RWPE1
CTCF-human
22Rv1
CTCF-human
MCF-7
MYC-human
MCF-7
CTCF-human
RWPE2
CTCF-human
SK-N-SH
CTCF-human
K562
MYC-human
K562
STAT1-human
K562
NR4A1-human
K562
ETS2-human
K562
GATA2-human
K562
CTCF-human
K562
JUNB-human
K562
JUND-human
K562
ETS1-human
K562
HDAC8-human
K562
RUNX1-human
K562
FOS-human
OCI-LY1
CTCF-human
foreskin keratinocyte
CTCF-human
VCaP
CTCF-human
spleen
CTCF-human
C4-2B
CTCF-human
LNCAP
CTCF-human
PC-9
CTCF-human
bipolar neuron
CTCF-human
T47D
CTCF-human
epithelial cell of prostate
CTCF-human
neutrophil
CTCF-human
transverse colon
CTCF-human
MCF 10A
MYC-human
HepG2
ETS1-human
HepG2
CTCF-human
prostate gland
CTCF-human
A549
CTCF-human
A549
MYC-human
A549
ETS1-human
HeLa-S3
STAT1-human
HeLa-S3
CTCF-human
HCT116
CTCF-human
LNCaP clone FGC
CTCF-human
A673
CTCF-human
NCI-H929
CTCF-human
liver
CTCF-human
MM.1S
CTCF-human
OCI-LY3
CTCF-human
SU-DHL-6
CTCF-human
KMS-11
CTCF-human
GM23338
CTCF-human
GM23338
ETS1-human
Ishikawa
CTCF-human
DOHH2
C