# ChIP AML Pipeline v2

In [None]:
import os
import pandas as pd
import sys
sys.path.insert(0, '../..')
import itertools
from scipy import stats
import numpy as np

from JKBio.epigenetics import ChIP_helper as chiphelper
from JKBio import Helper as helper
import igv
import SimpSOM as sps
from scipy import stats

import seaborn as sns
from matplotlib import cm
from matplotlib import pyplot as plt
from IPython.display import IFrame
import seaborn as sns
from bokeh.plotting import *
import igv

import numba
from numba import jit

from scipy.cluster.hierarchy import linkage, leaves_list
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from IPython.display import IFrame

from pybedtools import BedTool
import pyBigWig

output_notebook()
%load_ext autoreload
%autoreload 2

In [None]:
project="Cobinding_ChIP"
version="v2"

## adding the data bucket to path

In [None]:
! gcsfuse --only-dir Chip/fastqs amlproject ../data/seqs

## processing using Nextflow

In [None]:
singleend, pairedend = chiphelper.extractPairedSingleEndFrom('../data/seqs')

## Pipeline

![](images/gcpjup.png)


- Raw read QC (FastQC)
- Adapter trimming (Trim Galore!)
- Alignment (BWA)
- Mark duplicates (picard)
- Merge alignments from multiple libraries of the same sample (picard)
- Re-mark duplicates (picard)
- Filtering to remove: reads in blacklisted regions, duplicates, non-primary alignments, unmapped reads, reads mapping to multiple locations, reads containing > 4 mismatches, reads with insert size > 2kb, and read pairs mapping to different chromosomes
- Alignment-level QC and estimation of library complexity (picard, Preseq)
- Create normalised bigWig files scaled to 1 million mapped reads (BEDTools, bedGraphToBigWig)
- Generate gene-body meta-profile from bigWig files (deepTools)
- Calculate genome-wide IP enrichment relative to control (deepTools)
- Calculate strand cross-correlation peak and ChIP-seq quality measures including NSC and RSC (phantompeakqualtools)
- Call broad/narrow peaks (MACS2)
- Annotate peaks relative to gene features (HOMER)
- Create consensus peakset across all samples and create tabular file to aid in the filtering of the data (BEDTools)
- Count reads in consensus peaks (featureCounts)

![](images/nfcore.png)


In [None]:
! nextflow cloud create 'JKcluster' -c 4

In [None]:
# Create the cluster, run the nf-core/chipseq pipeline, then shut the same cluster down.
# Each continuation backslash must be the very last character of its line.
# NOTE(review): pipelines are normally started with `nextflow run`, and `-profile` / `-resume`
# are normally single-dash Nextflow options — confirm this invocation against the Nextflow CLI.
! nextflow cloud create jkcluster -c "../nextflow/nextflow.config" 40 && \
nextflow nf-core/chipseq -c "../nextflow/nextflow.config" \
--singleEnd \
--seq_center 'DFCI' \
--email 'jkobject@gmail.com' \
--bucket-dir 'gs://jkobject/Chip_AML/nextflow/CHIPprocess_2/' \
--keyfile '~/jkobject-b6f1adaffcb8.json' \
--projectname 'jkobject' \
--zone 'us-east1-b' \
--skipDiffAnalysis \
--narrowPeak \
--design "../nextflow/design.csv" \
--genome 'GRCh38' \
--profile gcp \
--resume \
--skipPreseq \
--max_cpus 8 && \
nextflow cloud shutdown jkcluster

## Gathering data

We are using a folder outside our repository as there are too many huge files.

In [None]:
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/macs/narrowPeak/ ../data/$project/

In [None]:
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/bigwig/ ../../data/$project/

In [None]:
!cp ../data/$project/narrowPeak/*MV411*.narrowPeak ../data/$project/MV4narrow

In [None]:
# Download the broad peaks into the project folder, then move the MV411 files into their own
# sub-folder. Everything runs in one shell chain, so no inner `!` is used, and `mkdir -p`
# keeps the cell re-runnable.
! gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/macs/BroadPeaks/ ../data/$project/ && mkdir -p ../data/$project/BroadPeaks/MV411 && mv ../data/$project/BroadPeaks/MV411_* ../data/$project/BroadPeaks/MV411/

In [None]:
# Load the narrow peaks from ../data/<project>/MV4narrow/ (the '/' after the project name
# was missing, which pointed to a non-existent "<project>MV4narrow" folder).
bindings = chiphelper.loadPeaks('../data/' + project + '/MV4narrow/', isMacs=False, skiprows=0)

In [None]:
# Load the broad peaks from ../data/<project>/BroadPeaks/MV411/ (the '/' after the project
# name was missing).
broadbindings = chiphelper.loadPeaks('../data/' + project + '/BroadPeaks/MV411/', isMacs=False, skiprows=0)

In [None]:
SEgenes = pd.read_csv('../data/superenhancer/SEgenes.csv')
CTF = pd.read_csv('../data/CTF.csv', header=None)[0].tolist()

In [None]:
CTF.extend(['GATA2','IKZF1','LYL1' ,'PU1','SMC1'])
CTF

In [None]:
CTF = list(set(CTF))

In [None]:
peaks = !ls ../data/MV4narrow/*.narrowPeak
broadpeaks = ! ls ../data/BroadPeaks/MV411/*.broadPeak
peaks = set([i.split('/')[-1].split('.')[0] for i in broadpeaks]) | set([i.split('/')[-1].split('.')[0] for i in peaks])

## preprocessing the data and renaming

In [None]:
peaks

In [None]:
set(bindings['name'])

In [None]:
bindings

In [None]:
broadbindings

In [None]:
bindings = bindings[~bindings.name.isin(set(broadbindings.name))]

In [None]:
# Stack narrow and broad peaks; pd.concat replaces DataFrame.append, removed in pandas 2.0.
bindings = pd.concat([bindings, broadbindings])

In [None]:
len(bindings)

In [None]:
# Derive the replicate id (last character of the last '-' field) and the TF
# (third '-' field) from each sample name.
replicate_ids = []
tf_names = []
for sample_name in bindings['name']:
    fields = sample_name.split('-')
    replicate_ids.append(fields[-1][-1])
    tf_names.append(fields[2])
bindings['replicate'] = replicate_ids
bindings['tf'] = tf_names

In [None]:
# Shorten each peak identifier to its 3rd and 6th '_'-separated tokens.
short_ids = []
for raw_id in bindings['peak_number']:
    tokens = raw_id.split('_')
    short_ids.append(tokens[2] + '_' + tokens[5])
bindings['peak_number'] = short_ids

In [None]:
bindings

In [None]:
bindings.to_csv('../results/'+project+'/all_bindings.bed',sep='\t',index=False)

In [None]:
# The file is written by to_csv with its header row, so consume that row (header=0) and
# replace it with the explicit column names instead of reading it back as a data row.
bindings = pd.read_csv('../results/'+project+'/all_bindings.bed', sep='\t', header=0, index_col=None,
                       names=["-log10pvalue","-log10qvalue", "chrom", "end", "foldchange", "name", "peak_number", "relative_summit_pos", "start", "replicate","tf"])

In [None]:
bindings

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()
gsheet

In [None]:
bw = ! ls ../../data/bigwig
bw

In [None]:
len(set(bindings.name))

In [None]:
len(bw)

In [None]:
# ONE off
# Rename every listed bigwig (the first two entries of `bw` are skipped) after the sample
# name that the Google sheet associates with its "mp<id>" identifier.
for i in bw[2:]:
    # the third '_'-separated token of the file name completes the "mp..." sheet id
    a = gsheet[gsheet.id=='mp'+i.split('_')[2]].name.values[0]
    # turn both the current and the new name into paths inside the bigwig folder
    i = '../../data/bigwig/'+i
    a = '../../data/bigwig/'+a+'.mLb.clN.bigWig'
    ! mv $i $a
    print(a)

In [None]:
set(bindings.name)

In [None]:
replicates = chiphelper.findReplicates(folder='../data/seqs/results/bwa/', sep='_', namings='_R([0-9])',namepos=0)

## Visual inspection of the features and look at QCs

### [igv tracks](https://igv.org/app/?sessionURL=blob:3Z3rU9rcFsb_lU6.vO.Zg0EIcvGbolinYB0vp5czHSeEGFIhscmOaJ3.790hwMbzsrZZ9rTJIx.YAfIk62HxY7Mfcnk0IvfajdzAcY3dR8MfGbvG2Kt1jIoR2FP5nPE2mdrBm7.7Z92x1aqmr_1Lvnhtx8K.POuniwtxG.9Wq7FljhJ7Ip93bswk3nLlIls1057a38PAnsWmE06rvndnDqPQHvlBLHyRCNcMI6_quUE4deNq7H6bb2J.Z843IjfmByP3_o9sTN77coPOgwiHdjD6jdtMN7EvN2GKe2H8qBiT0EliY_e_hj2ZGF8qhojkVtInHg3xcJs2QkqTeZ8qRhiN3MjYlRtvdVrNjlWzGu1m3arttNx_W9tt2bvw_NYPgnQhESXuj8qjkUQTuRIvNfL1Jhx.dR1R7Y7926u9Qb8auXEyEXF1OLOrUzfy3FHfH0Z29FAd.t7M96q9_nHt6qxmTvtD05mcmPLpD773pDdBMpmsSttefX5IpRNOQrmkEXnDv7cr25Vaq51.suyJ6G5.Zez63lgYu5ZcuZ2I8NyxJ3ILqcGKMZOFhLNeEjjCDwMpnrp2IEV3fuwP_YkvHj7Ml5CvbNXk85PQW.ivZUvlCv7nLZMfcX_iPuchDpPIcS.yBqWCFI0wmtqyTCN76.QziwZmD9LKY1U5qzWBHUXh7NS1b5YNubqVD2JTvUC3pLZqyV9T28mEf_1KH.pP.7B4H__RCCdMAqHpRI43fpNL7VufLZmq1NtvB0Eo7HlRFWNq35.FM8nXzrZ0IeXCjdJ1zRF0xlE4DWMJrFxSkusaX36doKMej6D6ql2kEogg0kNxBC1KYhBkARKkcYlG0PHJ6eUFB6HGql.0FIgh2sQfhmhjdwaHvfoBpzs7q.7QUqDu0CaK.4pb1sT4jmsCfsfpbKJ9yZ1esn4ltFbtooRACFEWigMoq4iBTzr3QsOHNokGz_kpC56amhJRSiB6KAvF0ZNVxJmiqkkPDj60SzR8Ph_u11n8qAkRKQUCiPRQHEGLkjgIqUkPDkIam2gMdQ_3T_dYEKl5EK0Foog2URxGy5o4HCFOhHQ.QUGq5wdJzYVoLR5IG0wUDlKdAxLilEjnEw6ki26PNSB1FEeUFAkjykOBFGUlMSBKP_J4ENE20Rg6GJy_v2qPr95a7.ot.f5z_sdTIUOetQCRlcdOcZD9szoOb4hBRD7HqOgN9j6ysFPZxHNrAESOslI8blllHNQQA4vn3UJi9pLhTcUXz64CDbQSD2wvG9UQM40cdtFYO.z3WDl7XSUbpBSILdJDcUwtSuKwhBhraGxiMpQ_HayrVIOUwjFUqmxwURJnf0rEVENjE46hi_80OeOQpaIMUorEEOWhQIaykjgMISYVGpuYDOUfhyyVS5BSOIbKNQ5lJXEYQowgNDYxGbLyM6QyB1IKx9AGD0UzZHEYQswWNDbhDi_bu9hjhQqWChVoLdIRZqSJAg8xW9TE4QgxV9D5BAWJ8YtOJQu0Fg.kUv2mW9bEAKmBGC7ofMKBdH7IOhqjodIFUoqEEeWhQIqykjgQIaYLGpuYDOUfixoqXSClcAyVayTKSuIwhJguaGyiMfSCnRoaKmB4JfszlHNXhhftxdBATBpe1w4Mx0dHLJxU1EApkU7DQVgoDqOsIg5CiCED7RIOn3efe7ypkYoYaC0SQqSJAiFa1MTAaAcxYtD5BAUp__xoR2UMtBYPpFLNkJY1cUBCjBl0PuFAOuvV909Z_x_tqKhBp0aCSWOjQJxWVXGAQswc9E5hkWKMTip20KkRkSrXCLWqioMUYuygdwqLVP5dhXbWogeNGhGpUu0wpKriIAUZQ2idwiLVyI_UWhihUSMitcFGCZBqcE7fChlJaJ0CItXmTKSaa5kEJcWCabOHQklKS.JgBBlI0DbRGOoP3rPCiKYKI0gpEEOkh.IYWpTEYQ
gxg9DYxGQof_rQVOkDKYVjqFS5w6IkDkOIoYPGJhxDn_qsP2qbKm4gpUgMUR4KZCgricMQYsqgsYnGEPPEXU0VL7yCE3aV70Rd7BN0tRAjhddzYq7B4QHvijAqTiClUJdVIjwUeVWleUkchBDjBI1NPIZ69S4LIpUn0Fqwi5NtNlHsxcnSmjgcIUYKOp.gIOUPFVpPr_K3WYsHUqlihWVNHJBQL_NH.QQFKf.ODC2VLNBaPJBKtRPDsiYOSIjhgs4nHkjH57z50Vq.QGqhQKJMFAlSVhPnwpmQKYPGJxxIn_Y5GLXXYgZCiQQRYaFAhOYVcQCCzBhIl5D45J8XtdcCBkKJhk.55kTzijj4QEYLpEs8fFj5XHstViCUUPiULpubV8TBBzJQIF1C4sMYfdbCBEKJhk_JRh9mHteGjBFIl2j4nL7vHx.zxh8VItBaIIRoE8VBtKyJgVEHMUTQ.UQD6ezy5CMrjeuoGIHWAoFEmygOpGVNHJAQwwSdT0SQWEdAdFSgQGvBQCrbMRDLmjggIcYKOp9oIJ0PurwBSSULpBQII9JDcRQtSuJAhBguaGyiMfT5cJ91JFFHxQukFIgh0kNxDC1K4jCEmDBobMIxNPh0csA6LryjMgaNGIkj2kWBJC2LYrBUSz80eDDpnKLilH9Qqm2rqEGjBuTpt41My.mAex27394cydbF6.uJ3Ot5OyuLXo6FuE3bGVvmKJHlC9u5MZN4y7VjsVUz7an9PQzsWWw64bTqe3dmGHmmXIP8ZMTVsVfryLZfp1sx4zAS7sgU98L0vj9p6e_aiCmG_sa.bMlPTeROwzt7uNYW9UWQ3vRtokxt6tH_Ba4vP34C)

### [multiQC](http://35.184.213.1:8888/view/data/results/multiqc/multiqc_report.html)

### process: 

Look at all tracks with a very low FRiP score, as noted by ENCODE.

look at all peaks tracks together and see for location of intense co binding. 

- if we can discern peaks and if, for some reasons, some good peaks are not called by macs. 
- if looks good and we can see a lot of peaks. 
- if a lot of noise but seems consistent with replicates. 
- if just seems to have very few peaks.

Validate still but flag as potentially bad.

Else remove.

### results:

In [None]:
#badquality samples:
# Sample identifiers flagged during the visual inspection; passed as `markedasbad`
# to chiphelper.mergeReplicatePeaks in the replicate-merging cell.
bad=[
"mp168",
"mp129",
"mp128",
"mp773",
"mp774",
"mp575",
"mp614",
"mp714",
"mp433",
"mp156",
"mp650",
"mp604",
"mp27",
"mp627",
"mp117",
"mp771",
"mp118",
"mp431",
"mp430",
"mp324",
"mp565",
"mp569",
"mp125",
"mp627",  # NOTE(review): duplicate of an earlier entry
"mp568",
"mp427",
"mp124",
"mp716",
"mp581",
"mp589",
"mp321",
"mp601",
"mp745",
"mp772",
"mp770",
"mp590",
"mp623",
"mp718"]

## merging duplicates

In [None]:
%matplotlib inline
mergedpeak, tomergebam, remove, ratiosofunique = chiphelper.mergeReplicatePeaks(bindings,'../../data/bigwig/',markedasbad=bad, window=150, mincov=4, doPlot=True, minKL=10, cov={}, use='poisson', MINOVERLAP=0.25,lookeverywhere=True, only='',saveloc='../results/'+project+'/plots/'+version+'/')

In [None]:
tomergebam

In [None]:
mergedpeak = mergedpeak[mergedpeak.columns[[2,9,3,5,6,4,0,1,7,10]]]

In [None]:
mergedpeak.to_csv('../results/'+project+'/large/merged_replicates_'+version+'.csv.gz')

In [None]:
mergedpeak = pd.read_csv('../results/'+project+'/large/merged_replicates_'+version+'.csv.gz', index_col=0)

## sorting and removing samples

In [None]:
# Keep only the bigwigs whose file name contains none of the excluded tags.
# NOTE(review): `toremove` is not defined in the visible part of the notebook — confirm it exists.
excluded_tags = remove + toremove + ['scale','POLII','IGG','CTCF','INPUT']
# Build a new list: calling .remove() on the list being iterated silently skips the
# element that follows every removed one.
bigwigs = [val for val in os.listdir('../../data/bigwig/')
           if not any(v in val for v in excluded_tags)]
bigwigs = ['data/bigwig/'+ i for i in bigwigs]

In [None]:
set(mergedpeak.tf)

In [None]:
mergedpeak.foldchange.min()

In [None]:
mergedpeak['name']=mergedpeak.tf

In [None]:
mergedpeak

In [None]:
## Removing bad ChIP protein
mergedpeak = mergedpeak[~mergedpeak['name'].isin(['CDK13','GSE1'])]

## Consensus set

In [None]:
window = 150

In [None]:
merging_version = "simpleMerge"

In [None]:
# Targets left out of the consensus set built below.
# NOTE(review): 'WRD5' is spelled 'WDR5' in the peak-annotation cell — confirm which
# spelling the `tf` column actually contains.
not_in_consensus = ['MED1','SMC1','CTCF','POLII','IRF2BP2_FLAG','IRF2BP2', 'H3K27ac', 'H3K27me3',
                    'H3K4me3', 'WRD5', 'H3K79me2','H3K36me3', 'H3K4me1','H3K18','H3K9ac','H3K36me2',
                    'PU1_FLAG']
consensus_input = mergedpeak[~mergedpeak.tf.isin(not_in_consensus)]
merged = chiphelper.simpleMergePeaks(consensus_input, window=window)

In [None]:
window=str(window)

In [None]:
len(merged)

In [None]:
len(mergedpeak)

In [None]:
merged

In [None]:
merged.to_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+window+'.bed.gz', sep='\t',index=None)

In [None]:
merged = pd.read_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+window+'.bed.gz', sep='\t')

## Plotting similarity kernels over TF binding profiles

In [None]:
# Pair plot of the first six signal columns of the consensus matrix.
fig = sns.pairplot(merged[merged.columns[8:14]], corner=True, diag_kind="kde", kind="reg", plot_kws ={"scatter_kws":{"alpha":.05}})


def col_nan_scatter(x, y, **kwargs):
    """Scatter x against y on the current axes, skipping rows whose two values sum to zero."""
    df = pd.DataFrame({'x':x[:],'y':y[:]})
    # row-wise sum: the mask must have one entry per row, not one per column
    df = df[df.sum(1)!=0]
    plt.scatter(df['x'], df['y'])


def col_nan_kde_histo(x, **kwargs):
    """Draw the KDE of the non-zero entries of x on the current axes."""
    df = pd.DataFrame({'x':x[:]})
    sns.kdeplot(df.loc[df['x']!=0, 'x'])


# NOTE(review): with corner=True the grid has no upper-triangle axes, so this call may
# draw nothing — confirm whether map_lower was intended.
fig = fig.map_upper(col_nan_scatter)
# col_nan_kde_histo takes a single variable, so it belongs on the diagonal.
fig = fig.map_diag(col_nan_kde_histo)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_pairplot_experiments.pdf')
plt.show()

# np.unique returns (distinct values, occurrences): `counts` holds the distinct numbers of
# bound factors per region and `val` how many regions show each of them.
counts,val = np.unique(merged[merged.columns[8:]].astype(bool).sum(1).values, return_counts=True)
fig = sns.barplot(data=pd.DataFrame(val, index=counts,columns=['counts']).T)
fig.set_yscale("log")
# sns.barplot returns an Axes, which has no savefig(); save through its parent figure.
fig.figure.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'pairplot_experiments.pdf')
plt.show()
i = merged[merged.columns[8:]].astype(bool).sum(1)
print(i.max(),i.mean(),i.min())

## Comparison to a random distribution 

### computation:

We are evaluating each event's probability (1 binding, 2 binding, ..., n binding) as a binomial with probability p_i and n trials, n corresponding to the size of the consensus peak set.
The probability p_i of this binomial is the sum of the probabilities of having TF a binding together with TF b, over all possible combinations of TFs.
The number of combinations is k amongst n, n being 33, k going from 1 to 29.
we compute 

$p(a \land b) = p(a) \cdot p(b) = p(ab)$

and 

$p\big((a \land b) \lor (a \land c)\big) = p(ab) + p(ac) - p(abc)$

for a,b,c,d:

$p(ab) + p(ac) + p(ad) + p(bc) + p(bd) + p(cd) - {3\choose 2} \cdot \big(p(abc) + p(abd) + p(bcd) + p(acd)\big) - {4\choose 2} \cdot p(abcd)$

In [None]:
#we are looking at the distribution of pseudo-enhancers per cobound regions
counts,val = np.unique(merged[merged.columns[8:]].astype(bool).sum(1).values, return_counts=True)
fig = sns.barplot(data=pd.DataFrame(val, index=counts,columns=['counts']).T)
fig.set_yscale("log")
fig.figure.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_cobinding_distribution.pdf")

In [None]:
len(merged)

now we will be making the expected distribution

In [None]:
proba = (merged[merged.columns[8:]].astype(bool).sum(0)/len(merged)).tolist()
sums = {i:0 for i in range(1,30)}

In [None]:
# sums[i] must hold the sum, over every i-subset of `proba`, of the product of its members.
# Enumerating the subsets explicitly is combinatorial, so the same numbers are obtained with
# the standard recurrence: adding one probability p turns e_k into e_k + p * e_(k-1).
max_order = max(sums)
esp = np.zeros(max_order + 1)
esp[0] = 1.0
for p in proba:
    # the right-hand side is evaluated into a temporary before the in-place add,
    # so every e_k is updated from the previous e_(k-1)
    esp[1:] += p * esp[:-1]

for i in range(max_order, 0, -1):
    if sums[i]> 0:
        # already filled by a previous run of this cell
        continue
    sums[i] = float(esp[i])
    print(i, sums[i])

In [None]:
# Walk the orders from the highest down; from each one subtract every higher order
# (already processed) weighted by the number of ways to pick `order` items among `higher`.
for order in reversed(range(1, 30)):
    for higher in range(order + 1, 30):
        sums[order] -= helper.combin(higher, order) * sums[higher]

In [None]:
sums[0] = 1-sum(list(sums.values()))

In [None]:
from scipy.stats import binom

# Replace each probability by the [mean, variance] of a binomial with len(merged) trials.
n_regions = len(merged)
for order in reversed(range(1, 30)):
    expected = binom.mean(n_regions, sums[order])
    spread = binom.var(n_regions, sums[order])
    print(order, expected, spread)
    sums[order] = [expected, spread]

In [None]:
helper.dictToFile(sums,'../results/' + project + '/' + version + '_' + merging_version +  '_' + window+'_sums.json')

In [None]:
version = "v2"
merging_version = "simpleMerge"
window = "150"

In [None]:
a = '../results/' + project + '/*_sums.json'
!ls $a

In [None]:
sums = helper.fileToDict('../results/' + project + '/' + version + '_' + merging_version +  '_' + window+'_sums.json')
sums

In [None]:
cobind = pd.DataFrame(sums).T.rename(columns={0:'mean',1:'var'})

In [None]:
merged[merged.columns[8:]].astype(bool).sum().sum()

In [None]:
(val*counts).sum()

In [None]:
int((cobind['mean'] * cobind.index.astype(int)).sum())

In [None]:
cobind.sum()

In [None]:
cobind

In [None]:
# Bar plot of the expected number of regions per co-binding level, with error bars.
cobind['cobinding']=cobind.index
# x / y are passed by keyword: recent seaborn releases no longer accept them positionally.
fig = sns.barplot(x="cobinding", y="mean", data=cobind, ci=None)
# NOTE(review): yerr is fed the variance; confirm whether the standard deviation was intended.
plt.errorbar(x=range(0,len(cobind)),y=cobind['mean'],
            yerr=cobind['var'], fmt='none', c= 'r')
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_expected_cobinding_distribution.pdf")

In [None]:
# Observed / expected number of regions for each co-binding level.
# Both sides are indexed by the co-binding level before dividing, so the ratio is matched
# by level rather than by position (the two are neither the same length nor in the same order).
observed = pd.Series(val, index=counts)
expected = cobind['mean'].copy()
expected.index = expected.index.astype(int)
res = pd.DataFrame({'change': observed / expected}).sort_index()
res['count']=list(res.index)

In [None]:
fig = sns.barplot(data=res.T).set_yscale("log")
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_cobinding_enrichment.pdf")

now plotting the enrichment

In [None]:
fig = plt.bar(res['count'],res['change'],log=True)
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window + "_cobinding_enrichment_matplotlib.pdf")
plt.show()

In [None]:
m =12
fig = plt.bar(res.iloc[:m]['count'],res.iloc[:m]['change'],log=True)
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_cobinding_enrichment_zoomed.pdf")
plt.show()

## The cobinding Matrix

In [None]:
merged.columns[8:]

In [None]:
merging_version = "remove_single"

In [None]:
#merged = merged.drop(columns='PU1_FLAG')
merged = merged[merged[merged.columns[8:]].astype(bool).sum(1)>1]

In [None]:
merged[merged.columns[:8]].to_csv('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_conscensus.bed.gz',sep='\t',index=None, columns=None)

### correlations over cobindings

In [None]:
#raw correlation over cobindings
fig = sns.clustermap(np.corrcoef(stats.zscore(merged[merged.columns[8:]].values.T, axis=1)), figsize=(20, 20), xticklabels=merged.columns[8:], yticklabels=merged.columns[8:])
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("raw correlation over cobindings")
fig.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_correlation_cobinding_regular.pdf")

In [None]:
# Random subset of consensus regions (drawn with replacement) used by the clustermaps.
rand = np.random.choice(merged.index,5000)
# matplotlib.cm.get_cmap was removed in matplotlib 3.9; pyplot.get_cmap takes the same arguments.
viridis = plt.get_cmap('viridis', 256)

In [None]:
#clustermap of cobindings
fig = sns.clustermap(merged[merged.columns[8:]].loc[rand].T, standard_scale=0, figsize=(30,20),xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("clustermap of cobindings")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_clustermap_cobinding_scaled.pdf')
plt.show()

## Peak annotations

In [None]:
additional = {}
additional['activation'] = chiphelper.simpleMergePeaks(mergedpeak[mergedpeak.tf.isin(["H3K27ac",'H3K79me2','H3K36me3','H3K4me3','H3K9ac','H3K4me1'])], window=10, mergedFold="max")
additional['repression'] = mergedpeak[mergedpeak.tf=='H3K27me3']
additional['IRF2BP2'] = mergedpeak[mergedpeak.tf=='IRF2BP2_FLAG']
additional['MED1'] = mergedpeak[mergedpeak.tf=='MED1']
additional['SMC1'] = mergedpeak[mergedpeak.tf=='SMC1']
additional['CTCF'] = mergedpeak[mergedpeak.tf=='CTCF']
additional['POLII'] = mergedpeak[mergedpeak.tf=='POLII']
additional['H3K18'] = mergedpeak[mergedpeak.tf=='H3K18']
additional['H3K36me2'] = mergedpeak[mergedpeak.tf=='H3K36me2']
additional['WDR5'] = mergedpeak[mergedpeak.tf=='WDR5']
additional["H3K27ac"] =mergedpeak[mergedpeak.tf=='H3K27ac']
additional["H3K79me2"] =mergedpeak[mergedpeak.tf=='H3K79me2']
additional["H3K36me3"] =mergedpeak[mergedpeak.tf=='H3K36me3']
additional["H3K4me3"] =mergedpeak[mergedpeak.tf=='H3K4me3']
additional["H3K9ac"] =mergedpeak[mergedpeak.tf=='H3K9ac']
additional["H3K4me1"] =mergedpeak[mergedpeak.tf=='H3K4me1']
additional["KMT2A"] =mergedpeak[mergedpeak.tf=='KMT2A']

In [None]:
# Add one column per annotation track holding more than one peak, and report how many
# consensus regions each of them covers.
for key, val in additional.items():
    if len(val)>1:
        merged[key] = chiphelper.putInConscensus(merged[merged.columns[:8]],val)
        merged = merged.replace(np.nan,0)
        # a bare expression inside a loop is never displayed, so print the count explicitly
        print(key, merged[key].astype(bool).sum())

In [None]:
ls = '../data/'+project+"/BroadPeaks"
!ls $ls

In [None]:
# adding ATACseq
ATAC= chiphelper.loadPeaks(peakFile='../data/'+project+'/BroadPeaks/ATAC_MV411.mRp.clN_peaks.broadPeak')

In [None]:
len(ATAC)

In [None]:
merged['ATAC'] = chiphelper.putInConscensus(merged[merged.columns[:8]],ATAC)
merged = merged.replace(np.nan,0)

In [None]:
merged['ATAC'].astype(bool).sum()

In [None]:
#compute enhancers at TSS in the matrix (promoters)
# Read the promoter BED file, name its columns and drop column 4.
# drop(columns=...) replaces the positional `axis` argument removed in pandas 2.0.
promoters = pd.read_csv('../data/'+project+'/compute_genes/human_epdnew_TeLy2.bed', sep='\t',header=None).rename(columns={0:'chrom',1:'start',2:'end',3:'name',5:'strand'}).drop(columns=4)

In [None]:
promoters['foldchange']=1

In [None]:
promoters['name']=[i[:-2] for i in promoters['name']]

In [None]:
merged['promoters'] = chiphelper.putInConscensus(merged[merged.columns[:8]],promoters)
merged = merged.replace(np.nan,0)

In [None]:
merged['promoters'].astype(bool).sum()

### adding super enhancers

In [None]:
set(bindings[bindings.tf=="H3K27ac"].name)

In [None]:
# Download the MV411 H3K27ac files into the project folder that the `peaks` paths of the
# next cell point to; `mkdir -p` keeps the cell re-runnable.
# NOTE(review): the bucket folders here ("NarrowPeaks") differ from the one used when
# gathering the data ("narrowPeak") — confirm the bucket layout.
! mkdir -p ../../data/$project/MV411_H3K27ac
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/*MV411*H3K27* ../../data/$project/MV411_H3K27ac/
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/macs/NarrowPeaks/*MV411*H3K27* ../../data/$project/MV411_H3K27ac/
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/macs/BroadPeaks/*MV411*H3K27* ../../data/$project/MV411_H3K27ac/

In [None]:
peaks = [
"../../data/"+project+"/MV411_H3K27ac/mp70-MV411-H3K27ac-r2.narrowPeak",
"../../data/"+project+"/MV411_H3K27ac/mp734-MV411_DMSO-H3K27ac-r1.narrowPeak",
"../../data/"+project+"/MV411_H3K27ac/mp88-MV411-H3K27ac-r3.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp702-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp183-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp136-MV411-H3K27ac-r1.broadPeak"
]

In [None]:
for val in peaks:
    valbed = val +".bed"
    ! mv $val $valbed

In [None]:
peaks[1:]

In [None]:
! mkdir ../results/$project/ROSE/MV411/
for peak in peaks[1:]:
    chiphelper.MakeSuperEnhancers(peak+'.bed',
                             bamFile='.'.join(peak.split('.')[:-1])+'.mLb.clN.sorted.bam',
                             baiFile='.'.join(peak.split('.')[:-1])+'.mLb.clN.sorted.bam.bai',
                             controlBam= '../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam',
                             controlBai= '../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam.bai',
                             outdir ='../results/'+project+'/ROSE/MV411/',
                             rosePath="../src/ROSE/")

In [None]:
# Remove the BAM / BAI files from the folder the `peaks` paths (and thus the BAMs used for
# ROSE) live in, i.e. ../../data/<project>/MV411_H3K27ac/.
! rm ../../data/$project/MV411_H3K27ac/*.bam*

In [None]:
rose = chiphelper.ReadRoseSuperEnhancers("../results/"+project+"/ROSE/MV411/")

In [None]:
rose = chiphelper.simpleMergePeaks(rose,window=1000).drop(columns=["relative_summit_pos","-log10pvalue","-log10qvalue"])

In [None]:
rose = rose[rose[rose.columns[5:]].astype(bool).sum(1)>1]
rose = rose.sort_values(by=['chrom','start','end']).reset_index(drop=True)

In [None]:
merged['super_enhancer'] = chiphelper.putInConscensus(merged[merged.columns[:8]],rose)
merged = merged.replace(np.nan,0)

In [None]:
merged['super_enhancer'].astype(bool).sum()

In [None]:
## making regular enhancers: merged['regular_enhancer']
# a region is flagged when it carries an activation signal and is neither a super
# enhancer nor a promoter
is_active = merged['activation'].astype(bool)
in_se_or_promoter = merged[['super_enhancer','promoters']].astype(bool).sum(1).astype(bool)
merged['regular_enhancer'] = (is_active & ~in_se_or_promoter).astype(float)

In [None]:
set(mergedpeak.tf)

### adding ATACseq data

In [None]:
! cp ../temp/MV411Merged/ATAC.bed ../../data/ATACseq/ATAC_MV411.mRp.clN_peaks.broadPeak 

In [None]:
# Write one BED file per factor into <version>_<merging_version>_<window>_MV411Merged/.
# The folder name previously contained a stray space ("_ MV411Merged"), so these files did
# not land in the folder used for ATAC.bed and for the ChromHMM inputs.
out_dir = "../results/"+project+"/"+version+'_'+merging_version+'_'+window+"_MV411Merged/"
os.makedirs(out_dir, exist_ok=True)
for i in set(mergedpeak.tf):
    # work on a copy so that adding the strand column does not write into a slice of mergedpeak
    a = mergedpeak[mergedpeak.tf==i][['chrom','start',"end",'peak_number',"foldchange"]].copy()
    a['strand']='+'
    a.to_csv(out_dir+i+'.bed', sep='\t', index=False)

In [None]:
# Export the ATAC peaks as a BED file next to the per-factor ones.
# .copy() makes sure the strand column is added to an independent frame, not to a slice of ATAC.
a = ATAC[['chrom','start',"end","peak_number",'foldchange']].copy()
a['strand'] = '+'
a.to_csv('../results/'+project+"/"+version+'_'+merging_version+'_'+window+'_MV411Merged/ATAC.bed',index=False,sep='\t')

### Running and adding ChromHMM

In [None]:
l = ['H3K27ac', 
     'H3K27me3', 
     "H3K79me2",
     "H3K18",
     "H3K4me1",
     "H3K36me3",
     "H3K4me3", 
     "H3K9ac",
     "H3K36me2",
     "CTCF", 
     'SMC1', 
     "POLII", 
     "MED1", 
     'ATAC', 
     'CEBPA',
     "CEBPB",
     "PU1_FLAG",
     'KMT2A',
     'WRD5']

In [None]:
## computing CHROMHMM
#!mkdir ../results/chromHMM/
# One row per mark: (cell line, mark, BED file of that mark).
chromhmm_inputs = pd.DataFrame([
    ['MV411'] * len(l),
    l,
    ["AMLproject/results/" + project + "/" + version + '_' + merging_version + '_'+window+'_MV411Merged/'+i+'.bed' for i in l],
]).T
# The output folder now matches the one the later cells read from
# (../results/<project>/chromHMM/<version>_<merging_version>_<window>/), and the stray
# space in the ChromHMM install path ("Chro mHMM") is removed.
chrombed = chiphelper.runChromHMM(
    numstates=8,
    outdir='../results/'+project+'/chromHMM/'+version+'_'+merging_version+'_'+window+'/',
    data=chromhmm_inputs,
    datatype='bed',
    folderPath="",
    chromHMMFolderpath="../src/ChromHMM/",
    control_bam_dir=None)['MV411']

In [None]:
from IPython.display import Image, display
im = ! ls ../results/$project/chromHMM/v2_all_$window/*.png
for val in im:
    display(Image(filename=val))

In [None]:
version = "v2"
merging_version = "all"
window = "150"

In [None]:
chrombed = pd.read_csv('../results/'+project+'/chromHMM/'+version+'_'+merging_version+'_'+window+'/MV411_8_dense.bed',sep='\t',header=None, skiprows=1).drop(columns=[4,5,6,7]).rename(columns={0:'chrom',1:'start',2:'end',3:'state',8:"color"})

In [None]:
chrombed['foldchange']= chrombed['state']

In [None]:
merged['HMM_states'] = chiphelper.putInConscensus(merged[merged.columns[:8]],chrombed,window=1,mergetype='first')
merged = merged.replace(np.nan,0)

In [None]:
merged['regular_enhancer'] = merged['regular_enhancer'].astype(float)

In [None]:
merged = merged.rename(columns={'SMC1':'cohesin','MED1':'mediator','ATAC':'open regions'})

# `states` was never defined; 8 is the number of states ChromHMM is run with (numstates=8)
# and the one encoded in the MV411_8_dense.bed file name.
states = 8
# one indicator column per ChromHMM state, replacing the single HMM_states column
for i in range(1,1+states):
    merged['state_'+str(i)] = (merged.HMM_states==i).astype(float)
merged = merged.drop(columns=['HMM_states'])

# reorder the trailing annotation columns (positions depend on the columns added so far)
merged = merged[list(merged.columns[:-22]) + list(merged.columns[-20:-8]) + list(merged.columns[-22:-20]) + list(merged.columns[-8:])]

In [None]:
merging_version="remove_single"
window="150"
version

In [None]:
merged.to_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+window+'_with_annotations.bed.gz', sep='\t',index=None)

In [None]:
merged = pd.read_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+window+'_with_annotations.bed.gz', sep='\t')

## In depth look at the cobinding matrix

Look at AUC for all ChIPs over all peaks of all ChIPs

In [None]:
# Basic Information
# Per column: number of non-zero regions, maximum, and mean / standard deviation of the
# non-zero values.
signal = merged[merged.columns[8:]]
# np.nan replaces the np.NaN alias removed in NumPy 2.0
nonzero = signal.replace(0, np.nan)
info = pd.concat([signal.astype(bool).sum(0),
                  signal.max(),
                  nonzero.mean(),
                  # the column is labelled 'std', so report the standard deviation (was .var())
                  nonzero.std()],axis=1).rename(columns={0:'sum',1:'max',2:'mean',3:'std'})
info.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'info.tsv')
info

In [None]:
endata = 21
merged.columns[-endata:]

In [None]:
# Build the per-region colour annotations used as `col_colors` by the clustermaps.
rand = np.random.choice(merged.index,5000)
# matplotlib.cm.get_cmap was removed in matplotlib 3.9; pyplot.get_cmap takes the same arguments.
viridis = plt.get_cmap('viridis', 256)
# .copy(): the columns are rewritten below and must not be written into a slice of `merged`
data = merged[merged.columns[-endata:-states]].copy()
for val in data.columns[:-5].tolist()+data.columns[-2:].tolist():
    #data[val] =stats.zscore(np.log2(1+data[val]))
    data[val] = (((data[val] -data[val].min())/ (data[val].max()))*256).astype(int)
#print(data['HMM_states'])
m = data.max()
data = data.loc[rand]
data = data/m
for val in data.columns:
    # the values are fractions of the column maximum: a colormap called with a float expects
    # a number in [0, 1], so it is passed as is (v*256 mapped almost every value to the last
    # colour). Assigning directly also avoids overwriting the global `a`, which is saved
    # with %store at the end of the notebook.
    data[val] = [viridis(v) for v in data[val]]

In [None]:
# Map every ChromHMM state to the colour of its first row in `chrombed`, plus black for 0.
statetocol = {}
for state in set(chrombed['state']):
    statetocol[state] = chrombed[chrombed['state']==state].iloc[0]['color']
statetocol.update({0:'0,0,0'})
# turn each "r,g,b" string into a tuple of channel values divided by 256
for state, rgb in statetocol.items():
    statetocol[state] = tuple([int(channel)/256 for channel in rgb.split(',')])
# NOTE(review): 'HMM_states' is dropped from `merged` in an earlier cell — confirm that
# `data` still carries this column when this cell runs.
data['HMM_states'] = [statetocol[state] for state in data['HMM_states']]

In [None]:
#clustermap of cobindings with annotation
fig = sns.clustermap(merged[merged.columns[8:-endata]].loc[rand].T, standard_scale=0, colors_ratio=0.018, col_colors = data, figsize=(20,15),xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("clustermap of cobindings with annotation")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_clustermap_cobinding_scaled_full_annotations.pdf')
plt.show()

In [None]:
#sorted clustermap of cobindings with annotation
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[8:-endata]].loc[rand].T), vmin=0,vmax=1,figsize=(20,15),colors_ratio=0.018, standard_scale=0,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings with annotation")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_clustermap_cobinding_scaled_full_annotation_sorted.pdf')
plt.show()

In [None]:
#scatter plot of distance of TF in cobinding space
# TSNE / PCA parameters are passed by keyword: current scikit-learn only accepts
# n_components positionally (the second positional TSNE argument used to be perplexity).
# NOTE(review): `data` holds colour tuples after the annotation cells, while the labels are
# the TF columns of `merged` — confirm which matrix is meant to be embedded.
embedding = TSNE(n_components=2, perplexity=3).fit_transform(PCA(n_components=30).fit_transform(stats.zscore(data).T))
helper.scatter(embedding, radi=10, labels=merged.columns[8:-endata], showlabels=True, title="scatter plot of distance of TF in cobinding space", folder="../results/"+project+'/plots/'+version+"_"+window+"_"+merging_version+"_")

## Saving

In [None]:
%store merged
%store promoters
#ATAC
%store a 
%store chrombed
%store mergedpeak
%store endata
%store l