# ChIP AML Pipeline v2

In [None]:
import os
import pandas as pd
import sys
import numpy as np
import itertools

sys.path.insert(0, '../..')

from JKBio.epigenetics import ChIP_helper as chiphelper
from JKBio import Helper as helper
import igv
import SimpSOM as sps
from scipy import stats

import seaborn as sns
from matplotlib import cm
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter
from bokeh.plotting import *

from scipy.cluster.hierarchy import linkage, leaves_list
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from IPython.display import IFrame

from pybedtools import BedTool
import pyBigWig

output_notebook()
%load_ext autoreload
%autoreload 2

In [None]:
project="cobinding"
version="v2"

## adding the data bucket to path

In [None]:
! gcsfuse --only-dir Chip_AML jkobject ../data/seqs

## processing using Nextflow

In [None]:
singleend, pairedend = chiphelper.extractPairedSingleEndFrom('../data/seqs')

## Pipeline

![](images/gcpjup.png)


- Raw read QC (FastQC)
- Adapter trimming (Trim Galore!)
- Alignment (BWA)
- Mark duplicates (picard)
- Merge alignments from multiple libraries of the same sample (picard)
- Re-mark duplicates (picard)
- Filtering to remove: reads in blacklisted regions, duplicates, non-primary alignments, unmapped reads, reads mapping to multiple locations, reads containing > 4 mismatches, reads with an insert size > 2kb, and read pairs mapping to different chromosomes
- Alignment-level QC and estimation of library complexity (picard, Preseq)
- Create normalised bigWig files scaled to 1 million mapped reads (BEDTools, bedGraphToBigWig)
- Generate gene-body meta-profile from bigWig files (deepTools)
- Calculate genome-wide IP enrichment relative to control (deepTools)
- Calculate strand cross-correlation peak and ChIP-seq quality measures including NSC and RSC (phantompeakqualtools)
- Call broad/narrow peaks (MACS2)
- Annotate peaks relative to gene features (HOMER)
- Create consensus peakset across all samples and create tabular file to aid in the filtering of the data (BEDTools)
- Count reads in consensus peaks (featureCounts)

![](images/nfcore.png)


In [None]:
! nextflow cloud create 'JKcluster' -c 4

In [None]:
# Create the `jkcluster` cloud cluster, run the nf-core ChIP-seq pipeline with the
# project configuration, then shut the same cluster down.
# Every continuation backslash must be the last character of its line (a trailing
# space after it ends the command), and the pipeline is launched with `nextflow run`.
# NOTE(review): `--profile`, `--resume` and `--bucket-dir` are written with two dashes;
# Nextflow's own options normally take a single dash — confirm against the CLI docs.
! nextflow cloud create jkcluster -c "../nextflow/nextflow.config" 40 && \
nextflow run nf-core/chipseq -c "../nextflow/nextflow.config" \
--singleEnd \
--seq_center 'DFCI' \
--email 'jkobject@gmail.com' \
--bucket-dir 'gs://jkobject/Chip_AML/nextflow/CHIPprocess_2/' \
--keyfile '~/jkobject-b6f1adaffcb8.json' \
--projectname 'jkobject' \
--zone 'us-east1-b' \
--skipDiffAnalysis \
--narrowPeak \
--design "../nextflow/design.csv" \
--genome 'GRCh38' \
--profile gcp \
--resume \
--skipPreseq \
--max_cpus 8 && \
nextflow cloud shutdown jkcluster

## Gathering data

We are using a folder outside our repository because there are too many huge files.

In [None]:
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/macs/narrowPeak/ ../data/$project/

In [None]:
! mkdir ../../data
!gsutil -m cp -r gs://amlproject/Chip/results/bwa/mergedLibrary/bigwig/ ../../data

In [None]:
!cp ../data/$project/narrowPeak/*MV411*.narrowPeak ../data/$project/MV4narrow

In [None]:
! mkdir ../data/$project/BroadPeaks/MV411 && mv ../data/$project/BroadPeaks/MV411_* ../data/$project/BroadPeaks/MV411/

In [None]:
# Load the MV411 narrowPeak files copied to ../data/<project>/MV4narrow by the cell above.
# The '/' after `project` is required: without it the path reads '../data/cobindingMV4narrow/'.
bindings = chiphelper.loadPeaks('../data/'+project+'/MV4narrow/', isMacs=False,skiprows=0)

In [None]:
# Load the MV411 broadPeak files moved to ../data/<project>/BroadPeaks/MV411 by the cell above.
# The '/' after `project` is required: without it the path reads '../data/cobindingBroadPeaks/...'.
broadbindings = chiphelper.loadPeaks('../data/'+project+'/BroadPeaks/MV411/', isMacs=False,skiprows=0)

In [None]:
SEgenes = pd.read_csv('../data/superenhancer/SEgenes.csv')
CTF = pd.read_csv('../data/CTF.csv', header=None)[0].tolist()

In [None]:
CTF.extend(['GATA2','IKZF1','LYL1' ,'PU1','SMC1'])
CTF

In [None]:
CTF = list(set(CTF))

In [None]:
peaks = !ls ../data/MV4narrow/*.narrowPeak
broadpeaks = ! ls ../data/BroadPeaks/MV411/*.broadPeak
peaks = set([i.split('/')[-1].split('.')[0] for i in broadpeaks]) | set([i.split('/')[-1].split('.')[0] for i in peaks])

## looking at the data and renaming

In [None]:
peaks

In [None]:
set(bindings['name'])

In [None]:
bindings

In [None]:
broadbindings

In [None]:
bindings = bindings[~bindings.name.isin(set(broadbindings.name))]

In [None]:
# Stack the broad peaks under the remaining narrow peaks.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
bindings = pd.concat([bindings, broadbindings])

In [None]:
len(bindings)

In [None]:
# Derive two columns from the '-'-separated sample name:
# the replicate is the last character of the last field, the TF is the third field.
sample_names = bindings['name'].tolist()
bindings['replicate'] = [sample.rsplit('-', 1)[-1][-1] for sample in sample_names]
bindings['tf'] = [sample.split('-')[2] for sample in sample_names]

In [None]:
bindings['peak_number'] = ['_'.join([i.split('_')[2],i.split('_')[5]]) for i in bindings['peak_number']]

In [None]:
bindings

In [None]:
bindings.to_csv('../results/'+project+'/all_bindings.bed',sep='\t',index=False)

In [None]:
# Reload the table written by the previous cell. That cell writes a header row
# (to_csv default), so the column names are read from the file itself; reading with
# header=None would turn the header line into a data row.
bindings = pd.read_csv('../results/'+project+'/all_bindings.bed', sep='\t', index_col=None)

In [None]:
bindings

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()
gsheet

In [None]:
bw = ! ls ../../data/bigwig
bw

In [None]:
len(set(bindings.name))

In [None]:
len(bw)

In [None]:
# ONE off
for i in bw[2:]:
    a = gsheet[gsheet.id=='mp'+i.split('_')[2]].name.values[0]
    i = '../../data/bigwig/'+i
    a = '../../data/bigwig/'+a+'.mLb.clN.bigWig'
    ! mv $i $a
    print(a)

In [None]:
set(bindings.name)

In [None]:
replicates = chiphelper.findReplicates(folder='../data/seqs/results/bwa/', sep='_', namings='_R([0-9])',namepos=0)

## Visual inspection of the features and look at QCs

### [igv tracks](https://igv.org/app/?sessionURL=blob:3Z3rU9rcFsb_lU6.vO.Zg0EIcvGbolinYB0vp5czHSeEGFIhscmOaJ3.790hwMbzsrZZ9rTJIx.YAfIk62HxY7Mfcnk0IvfajdzAcY3dR8MfGbvG2Kt1jIoR2FP5nPE2mdrBm7.7Z92x1aqmr_1Lvnhtx8K.POuniwtxG.9Wq7FljhJ7Ip93bswk3nLlIls1057a38PAnsWmE06rvndnDqPQHvlBLHyRCNcMI6_quUE4deNq7H6bb2J.Z843IjfmByP3_o9sTN77coPOgwiHdjD6jdtMN7EvN2GKe2H8qBiT0EliY_e_hj2ZGF8qhojkVtInHg3xcJs2QkqTeZ8qRhiN3MjYlRtvdVrNjlWzGu1m3arttNx_W9tt2bvw_NYPgnQhESXuj8qjkUQTuRIvNfL1Jhx.dR1R7Y7926u9Qb8auXEyEXF1OLOrUzfy3FHfH0Z29FAd.t7M96q9_nHt6qxmTvtD05mcmPLpD773pDdBMpmsSttefX5IpRNOQrmkEXnDv7cr25Vaq51.suyJ6G5.Zez63lgYu5ZcuZ2I8NyxJ3ILqcGKMZOFhLNeEjjCDwMpnrp2IEV3fuwP_YkvHj7Ml5CvbNXk85PQW.ivZUvlCv7nLZMfcX_iPuchDpPIcS.yBqWCFI0wmtqyTCN76.QziwZmD9LKY1U5qzWBHUXh7NS1b5YNubqVD2JTvUC3pLZqyV9T28mEf_1KH.pP.7B4H__RCCdMAqHpRI43fpNL7VufLZmq1NtvB0Eo7HlRFWNq35.FM8nXzrZ0IeXCjdJ1zRF0xlE4DWMJrFxSkusaX36doKMej6D6ql2kEogg0kNxBC1KYhBkARKkcYlG0PHJ6eUFB6HGql.0FIgh2sQfhmhjdwaHvfoBpzs7q.7QUqDu0CaK.4pb1sT4jmsCfsfpbKJ9yZ1esn4ltFbtooRACFEWigMoq4iBTzr3QsOHNokGz_kpC56amhJRSiB6KAvF0ZNVxJmiqkkPDj60SzR8Ph_u11n8qAkRKQUCiPRQHEGLkjgIqUkPDkIam2gMdQ_3T_dYEKl5EK0Foog2URxGy5o4HCFOhHQ.QUGq5wdJzYVoLR5IG0wUDlKdAxLilEjnEw6ki26PNSB1FEeUFAkjykOBFGUlMSBKP_J4ENE20Rg6GJy_v2qPr95a7.ot.f5z_sdTIUOetQCRlcdOcZD9szoOb4hBRD7HqOgN9j6ysFPZxHNrAESOslI8blllHNQQA4vn3UJi9pLhTcUXz64CDbQSD2wvG9UQM40cdtFYO.z3WDl7XSUbpBSILdJDcUwtSuKwhBhraGxiMpQ_HayrVIOUwjFUqmxwURJnf0rEVENjE46hi_80OeOQpaIMUorEEOWhQIaykjgMISYVGpuYDOUfhyyVS5BSOIbKNQ5lJXEYQowgNDYxGbLyM6QyB1IKx9AGD0UzZHEYQswWNDbhDi_bu9hjhQqWChVoLdIRZqSJAg8xW9TE4QgxV9D5BAWJ8YtOJQu0Fg.kUv2mW9bEAKmBGC7ofMKBdH7IOhqjodIFUoqEEeWhQIqykjgQIaYLGpuYDOUfixoqXSClcAyVayTKSuIwhJguaGyiMfSCnRoaKmB4JfszlHNXhhftxdBATBpe1w4Mx0dHLJxU1EApkU7DQVgoDqOsIg5CiCED7RIOn3efe7ypkYoYaC0SQqSJAiFa1MTAaAcxYtD5BAUp__xoR2UMtBYPpFLNkJY1cUBCjBl0PuFAOuvV909Z_x_tqKhBp0aCSWOjQJxWVXGAQswc9E5hkWKMTip20KkRkSrXCLWqioMUYuygdwqLVP5dhXbWogeNGhGpUu0wpKriIAUZQ2idwiLVyI_UWhihUSMitcFGCZBqcE7fChlJaJ0CItXmTKSaa5kEJcWCabOHQklKS.JgBBlI0DbRGOoP3rPCiKYKI0gpEEOkh.IYWpTEYQ
gxg9DYxGQof_rQVOkDKYVjqFS5w6IkDkOIoYPGJhxDn_qsP2qbKm4gpUgMUR4KZCgricMQYsqgsYnGEPPEXU0VL7yCE3aV70Rd7BN0tRAjhddzYq7B4QHvijAqTiClUJdVIjwUeVWleUkchBDjBI1NPIZ69S4LIpUn0Fqwi5NtNlHsxcnSmjgcIUYKOp.gIOUPFVpPr_K3WYsHUqlihWVNHJBQL_NH.QQFKf.ODC2VLNBaPJBKtRPDsiYOSIjhgs4nHkjH57z50Vq.QGqhQKJMFAlSVhPnwpmQKYPGJxxIn_Y5GLXXYgZCiQQRYaFAhOYVcQCCzBhIl5D45J8XtdcCBkKJhk.55kTzijj4QEYLpEs8fFj5XHstViCUUPiULpubV8TBBzJQIF1C4sMYfdbCBEKJhk_JRh9mHteGjBFIl2j4nL7vHx.zxh8VItBaIIRoE8VBtKyJgVEHMUTQ.UQD6ezy5CMrjeuoGIHWAoFEmygOpGVNHJAQwwSdT0SQWEdAdFSgQGvBQCrbMRDLmjggIcYKOp9oIJ0PurwBSSULpBQII9JDcRQtSuJAhBguaGyiMfT5cJ91JFFHxQukFIgh0kNxDC1K4jCEmDBobMIxNPh0csA6LryjMgaNGIkj2kWBJC2LYrBUSz80eDDpnKLilH9Qqm2rqEGjBuTpt41My.mAex27394cydbF6.uJ3Ot5OyuLXo6FuE3bGVvmKJHlC9u5MZN4y7VjsVUz7an9PQzsWWw64bTqe3dmGHmmXIP8ZMTVsVfryLZfp1sx4zAS7sgU98L0vj9p6e_aiCmG_sa.bMlPTeROwzt7uNYW9UWQ3vRtokxt6tH_Ba4vP34C)

### [multiQC](http://35.184.213.1:8888/view/data/results/multiqc/multiqc_report.html)

### process: 

look at all t with a very low frip score as noted by encode. 

look at all peaks tracks together and see for location of intense co binding. 

- if we can discern peaks and if, for some reasons, some good peaks are not called by macs. 
- if looks good and we can see a lot of peaks. 
- if a lot of noise but seems consistent with replicates. 
- if just seems to have very few peaks.

Validate still but flag as potentially bad.

Else remove.

### results:

In [None]:
# Sample identifiers flagged as bad; the list is passed as `markedasbad` to
# chiphelper.mergeReplicatePeaks in the merging cell below.
# NOTE(review): "mp627" is listed twice — presumably harmless, confirm.
bad=[
"mp168",
"mp129",
"mp128",
"mp773",
"mp774",
"mp575",
"mp614",
"mp714",
"mp433",
"mp156",
"mp650",
"mp604",
"mp27",
"mp627",
"mp117",
"mp771",
"mp118",
"mp431",
"mp430",
"mp324",
"mp565",
"mp569",
"mp125",
"mp627",
"mp568",
"mp427",
"mp124",
"mp716",
"mp581",
"mp589",
"mp321",
"mp601",
"mp745",
"mp772",
"mp770",
"mp590",
"mp623",
"mp718"]

## merging duplicates

In [None]:
merging_version = "simpleMerge"

In [None]:
%matplotlib inline
mergedpeak, tomergebam, remove, ratiosofunique = chiphelper.mergeReplicatePeaks(bindings,'../../data/bigwig/',markedasbad=bad, window=150, mincov=4, doPlot=True, minKL=10, cov={}, use='poisson', MINOVERLAP=0.25,lookeverywhere=True, only='',saveloc='../results/'+project+'/plots/'+version+'/')

In [None]:
tomergebam

In [None]:
mergedpeak = mergedpeak[mergedpeak.columns[[2,9,3,5,6,4,0,1,7,10]]]

In [None]:
mergedpeak.to_csv('../results/'+project+'/merged_replicates_'+version+'_'+merging_version+'.csv')

In [None]:
mergedpeak = pd.read_csv('../results/'+project+'/merged_replicates_'+version+'_'+merging_version+'.csv', index_col=0)

## show replicates overlap

## sorting and removing samples

In [None]:
# Keep the bigwig files whose name contains none of the excluded tags.
# A new list is built instead of calling bigwigs.remove() inside the loop: removing
# from a list while iterating over it skips the element after each removal.
# NOTE(review): `toremove` is not defined in the visible cells — confirm it exists.
excluded_tags = remove + toremove + ['scale','POLII','IGG','CTCF','INPUT']
bigwigs = [
    'data/bigwig/' + fname
    for fname in os.listdir('../../data/bigwig/')
    if not any(tag in fname for tag in excluded_tags)
]

In [None]:
set(mergedpeak.tf)

In [None]:
mergedpeak.foldchange.min()

In [None]:
mergedpeak['name']=mergedpeak.tf

In [None]:
## Removing bad ChIP protein
mergedpeak = mergedpeak[~mergedpeak['name'].isin(['CDK13','GSE1'])]

## Create a consensus set

In [None]:
window = 150
merged = chiphelper.simpleMergePeaks(mergedpeak[~mergedpeak.tf.isin(['MED1','SMC1','CTCF','POLII','IRF2BP2_FLAG','IRF2BP2', 'H3K27ac', 'H3K27me3', 'H3K4me3', 'H3K79me2',])], window=window)

In [None]:
len(merged)

In [None]:
len(mergedpeak)

In [None]:
merged

In [None]:
# Save the consensus peak set. `window` is an int, so it is converted before concatenation.
merged.to_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+str(window)+'.bed', sep='\t',index=None)

In [None]:
# Reload the consensus peak set. `window` is an int, so it is converted before concatenation.
merged = pd.read_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+str(window)+'.bed', sep='\t')

In [None]:
# Pairwise plots of columns 8 to 13 of the consensus matrix, followed by a bar plot of
# how many peaks have a given number of non-zero columns.
fig = sns.pairplot(merged[merged.columns[8:14]], corner=True, diag_kind="kde", kind="reg", plot_kws ={"scatter_kws":{"alpha":.05}})


def col_nan_scatter(x, y, **kwargs):
    """Scatter x against y on the current axes, dropping rows where x + y is zero."""
    df = pd.DataFrame({'x':x[:],'y':y[:]})
    # row-wise sum: a per-column sum (axis 0) cannot be used as a row mask
    df = df[df.sum(1)!=0]
    plt.scatter(df['x'], df['y'])


def col_nan_kde_histo(x, **kwargs):
    """Draw a KDE of the non-zero values of x on the current axes."""
    df = pd.DataFrame({'x':x[:]})
    df = df[df['x']!=0]
    sns.kdeplot(df['x'])


# NOTE(review): with corner=True the upper triangle presumably has no axes, so these
# two calls may draw nothing; col_nan_kde_histo takes a single vector and looks like it
# was meant for map_diag — confirm before changing.
fig = fig.map_upper(col_nan_scatter)
fig = fig.map_upper(col_nan_kde_histo)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_pairplot_experiments.pdf')
plt.show()
# np.unique returns (unique values, occurrences): `counts` holds the number of
# non-zero columns per peak and `val` how many peaks show that number.
counts,val = np.unique(merged[merged.columns[8:]].astype(bool).sum(1).values, return_counts=True)
# set_yscale returns None, so the Axes is kept and the figure is saved through it
ax = sns.barplot(data=pd.DataFrame(val, index=counts,columns=['counts']).T)
ax.set_yscale("log")
ax.figure.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'pairplot_experiments.pdf')
plt.show()
i = merged[merged.columns[8:]].astype(bool).sum(1)
print(i.max(),i.mean(),i.min())

In [None]:
# Distribution of the number of non-zero columns per consensus peak (log-scaled bar plot).
# `counts` holds the distinct numbers of non-zero columns, `val` their occurrences.
counts,val = np.unique(merged[merged.columns[8:]].astype(bool).sum(1).values, return_counts=True)
# set_yscale returns None, so the Axes is kept and the figure is saved through it
ax = sns.barplot(data=pd.DataFrame(val, index=counts,columns=['counts']).T)
ax.set_yscale("log")
# "../results/" needs its trailing slash, and `window` (an int) must be converted
ax.figure.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+"_cobinding_distribution.pdf")

In [None]:
len(merged)

## Random distribution compare

### computation:

We evaluate the probability of each event (1 TF binding, 2 TFs binding, ..., n TFs binding) as a binomial with probability p_i and n trials, n being the size of the consensus peak set.
The probability p_i of this binomial is the sum, over all possible combinations of TFs, of the probability that the TFs of the combination bind together.
The number of combinations is k among n, with n being 33 and k going from 1 to 29.
We compute

$p(a \cap b) = p(a)\,p(b) = p(ab)$

and 

$p\big((a \cap b) \cup (a \cap c)\big) = p(ab) + p(ac) - p(abc)$

for a,b,c,d:

$p(ab) + p(ac) + p(ad) + p(bc) + p(bd) + p(cd) - {3\choose 2}\,\big(p(abc) + p(abd) + p(bcd) + p(acd)\big) - {4\choose 2}\,p(abcd)$

In [None]:
proba = merged[merged.columns[8:]].astype(bool).sum(0)/len(merged)
sums = {i:0 for i in range(1,30)}

In [None]:
#sums = {i:0 for i in range(1,30)}
# For every combination size (29 down to 1), accumulate the product of the per-column
# probabilities over all combinations of that size; sizes already filled are skipped.
for size in range(29, 0, -1):
    print(size)
    if sums[size] > 0:
        continue
    print(helper.combin(33, size))
    sums[size] += sum(np.prod(combo) for combo in itertools.combinations(proba, size))

In [None]:
# Reload the sums stored on disk (the following cells index them with string keys).
# The original line lacked the '+' after `window` and concatenated an int to a str.
sums = helper.fileToDict('../results/' + project + '/' + version + '_' + merging_version + '_' + str(window) + '_sums.json')
sums

In [None]:
for i in range(29,0,-1):
    for j in range(i+1,30):
        icomb = helper.combin(j,i)
        sums[str(i)] -= icomb*sums[str(j)]

In [None]:
sums

In [None]:
from scipy.stats import binom
# Replace each probability by the [mean, variance] of a binomial with
# len(merged) trials and that probability.
n_trials = len(merged)
for size in range(29, 0, -1):
    key = str(size)
    print(size, binom.mean(n_trials, sums[key]), binom.var(n_trials, sums[key]))
    sums[key] = [binom.mean(n_trials, sums[key]), binom.var(n_trials, sums[key])]

In [None]:
data = pd.DataFrame(sums).T.rename(columns={0:'mean',1:'var'})

In [None]:
merged[merged.columns[8:]].astype(bool).sum().sum()

In [None]:
(val*counts).sum()

In [None]:
int((data['mean'] * data.index.astype(int)).sum())

In [None]:
data.sum()

In [None]:
# Bar plot (log scale) of the expected co-binding distribution.
# set_yscale returns None, so the Axes is kept and the figure is saved through it.
ax = sns.barplot(data=data.T)
ax.set_yscale("log")
ax.figure.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+"_expected_cobinding_distribution.pdf")

In [None]:
res = pd.DataFrame()
res['change']=val/data['mean']
res['count']=list(res.index)

In [None]:
res

In [None]:
# Bar plot (log scale) of the observed / expected ratio stored in `res`.
# set_yscale returns None, so the Axes is kept and the figure is saved through it.
ax = sns.barplot(data=res.T)
ax.set_yscale("log")
ax.figure.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+"_cobinding_enrichment.pdf")

In [None]:
res.T

In [None]:
# Same enrichment plot drawn with matplotlib.
# plt.bar returns a bar container with no savefig method, so the current figure is
# saved with plt.savefig; `window` (an int) is converted before concatenation.
plt.bar(res['count'],res['change'],log=True)
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window) + "_cobinding_enrichment_matplotlib.pdf")
plt.show()

In [None]:
# Zoomed version of the enrichment plot.
# NOTE(review): `ares` is not defined in the visible cells — presumably a subset of `res`; confirm.
# plt.bar returns a bar container with no savefig method, so the current figure is
# saved with plt.savefig; `window` (an int) is converted before concatenation.
plt.bar(ares['count'],ares['change'],log=True)
plt.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+"_cobinding_enrichment_zoomed.pdf")
plt.show()

## Correlation over consensus set

In [None]:
# Clustered heatmap of the correlation between the z-scored signal columns.
# The clustermap is bound to `fig` so that savefig writes this plot rather than
# whatever `fig` referred to in an earlier cell.
fig = sns.clustermap(np.corrcoef(stats.zscore(merged[merged.columns[8:]].values.T, axis=1)), figsize=(20, 20), xticklabels=merged.columns[8:], yticklabels=merged.columns[8:])
fig.ax_col_dendrogram.set_visible(False)
fig.savefig("../results/"+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+"_correlation_cobinding_regular.pdf")

In [None]:
# Save the first eight columns of the consensus set as a tab-separated file.
# `window` is an int, so it is converted before concatenation.
merged[merged.columns[:8]].to_csv('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_conscensus.bed',sep='\t',index=None, columns=None)

## annotatePeaks

In [None]:
additional = {}
additional['activation'] = chiphelper.simpleMergePeaks(mergedpeak[mergedpeak.tf.isin(["H3K27ac",'H3K79me2','H3K36me3','H3K4me3'])], window=10, mergedFold="max")
additional['repression'] = mergedpeak[mergedpeak.tf=='H3K27me3']
additional['IRF2BP2'] = mergedpeak[mergedpeak.tf=='IRF2BP2_FLAG']
additional['MED1'] = mergedpeak[mergedpeak.tf=='MED1']
additional['SMC1'] = mergedpeak[mergedpeak.tf=='SMC1']
additional['CTCF'] = mergedpeak[mergedpeak.tf=='CTCF']
additional['POLII'] = mergedpeak[mergedpeak.tf=='POLII']

In [None]:
# Project each additional peak set onto the consensus peaks, one new column per set,
# replacing missing values by 0.
for key, val in additional.items():
    merged[key] = chiphelper.putInConscensus(merged[merged.columns[:8]],val)
    merged = merged.replace(np.nan,0)
    # a bare expression inside a loop is discarded; print the count of non-zero peaks instead
    print(key, merged[key].astype(bool).sum())

In [None]:
# adding ATACseq
ATAC= chiphelper.loadPeaks(peakFile='../data/'+project+'/ATAC_MV411.mRp.clN_peaks.broadPeak')

In [None]:
len(ATAC)

In [None]:
merged['ATAC'] = chiphelper.putInConscensus(merged[merged.columns[:8]],ATAC)
merged = merged.replace(np.nan,0)

In [None]:
merged['ATAC'].astype(bool).sum()

In [None]:
#compute enhancers at TSS in the matrix (promoters)
# Load the promoter BED file, name its columns and drop column 4.
# drop(columns=4) replaces the positional drop(4, 1), which pandas 2 no longer accepts.
promoters = pd.read_csv('../data/'+project+'/compute_genes/human_epdnew_TeLy2.bed', sep='\t',header=None).rename(columns={0:'chrom',1:'start',2:'end',3:'name',5:'strand'}).drop(columns=4)

In [None]:
promoters['foldchange']=1

In [None]:
promoters['name']=[i[:-2] for i in promoters['name']]

In [None]:
merged['promoters'] = chiphelper.putInConscensus(merged[merged.columns[:8]],promoters)
merged = merged.replace(np.nan,0)

In [None]:
merged['promoters'].astype(bool).sum()

## add super enhancers and compute other enhancer

In [None]:
set(bindings[bindings.tf=="H3K27ac"].name)

In [None]:
! mkdir ../../data/MV411_H3K27ac
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/*MV411*H3K27* ../../data/MV411_H3K27ac/
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/macs/NarrowPeaks/*MV411*H3K27* ../../data/MV411_H3K27ac/
! gsutil cp gs://amlproject/Chip/results/bwa/mergedLibrary/macs/BroadPeaks/*MV411*H3K27* ../../data/MV411_H3K27ac/

In [None]:
peaks = [
"../../data/"+project+"/MV411_H3K27ac/mp70-MV411-H3K27ac-r2.narrowPeak",
"../../data/"+project+"/MV411_H3K27ac/mp734-MV411_DMSO-H3K27ac-r1.narrowPeak",
"../../data/"+project+"/MV411_H3K27ac/mp88-MV411-H3K27ac-r3.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp702-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp183-MV411_DMSO-H3K27ac-r1.broadPeak",
"../../data/"+project+"/MV411_H3K27ac/mp136-MV411-H3K27ac-r1.broadPeak"
]

In [None]:
for val in peaks:
    valbed = val +".bed"
    ! mv $val $valbed

In [None]:
peaks[1:]

In [None]:
! mkdir ../results/$project/ROSE/MV411/
for peak in peaks[1:]:
    chiphelper.MakeSuperEnhancers(peak+'.bed',
                             bamFile='.'.join(peak.split('.')[:-1])+'.mLb.clN.sorted.bam',
                             baiFile='.'.join(peak.split('.')[:-1])+'.mLb.clN.sorted.bam.bai',
                             controlBam= '../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam',
                             controlBai= '../../data/diffBinding_hist/INPUT_R1.mLb.clN.sorted.bam.bai',
                             outdir ='../results/'+project+'/ROSE/MV411/',
                             rosePath="../src/ROSE/")

In [None]:
! rm ../data/$project/MV411_H3K27ac/*.bam*

In [None]:
rose = chiphelper.ReadRoseSuperEnhancers("../results/"+project+"/ROSE/MV411/")

In [None]:
rose = chiphelper.simpleMergePeaks(rose,window=1000).drop(columns=["relative_summit_pos","-log10pvalue","-log10qvalue"])

In [None]:
rose = rose[rose[rose.columns[5:]].astype(bool).sum(1)>1]
rose = rose.sort_values(by=['chrom','start','end']).reset_index(drop=True)

In [None]:
merged['super_enhancer'] = chiphelper.putInConscensus(merged[merged.columns[:8]],rose)
merged = merged.replace(np.nan,0)

In [None]:
merged['super_enhancer'].astype(bool).sum()

In [None]:
## loading super enhancer from max's files

In [None]:
## comparing

In [None]:
## making regular enhancers: merged["regular_enhancer"]
# A consensus peak is a regular enhancer (1.0) when its 'activation' value is non-zero
# and both 'super_enhancer' and 'promoters' are zero; 0.0 otherwise.
merged['regular_enhancer'] = (merged['activation'].astype(bool) & ~merged[['super_enhancer','promoters']].astype(bool).sum(1).astype(bool)).astype(float)

In [None]:
set(mergedpeak.tf)

In [None]:
l = ['H3K27ac','ATAC','H3K27me3','SMC1',"POLII","MED1","H3K79me2","H3K4me3","CTCF","H3K36me3","H3K4me1"]

### saving all merged peak files

In [None]:
# Write one tab-separated bed file per TF into <version>_<merging>_<window>_MV411Merged/.
# The folder name has no space (matching the cells that read these files), `window`
# is converted to str, and the folder is created if it is missing.
merged_dir = "../results/"+project+"/"+version+'_'+merging_version+'_'+str(window)+"_MV411Merged/"
os.makedirs(merged_dir, exist_ok=True)
for i in set(mergedpeak.tf):
    # copy so that adding the strand column does not write into a slice of mergedpeak
    a = mergedpeak[mergedpeak.tf==i][['chrom','start',"end",'peak_number',"foldchange"]].copy()
    a['strand']='+'
    a.to_csv(merged_dir+i+'.bed', sep='\t', index=False)

In [None]:
# Write the ATAC peaks next to the per-TF bed files.
# copy so that adding the strand column does not write into a slice of ATAC
a = ATAC[['chrom','start',"end","peak_number",'foldchange']].copy()
a['strand'] = '+'
# `window` is an int, so it is converted before concatenation; create the folder if missing
atac_dir = '../results/'+project+"/"+version+'_'+merging_version+'_'+str(window)+'_MV411Merged/'
os.makedirs(atac_dir, exist_ok=True)
a.to_csv(atac_dir+'ATAC.bed',index=False,sep='\t')

In [None]:
## computing CHROMHMM
#!mkdir ../results/chromHMM/
# Run ChromHMM with 8 states on the bed files of the marks listed in `l` and keep the MV411 result.
# `window` (an int) is converted before concatenation and the ChromHMM folder path has
# no space ("../src/ChromHMM/").
# NOTE(review): the input paths start with "AMLproject/results/" while folderPath is "" — confirm
# they resolve from the notebook's working directory.
chrom_tag = version+'_'+merging_version+'_'+str(window)
chrombed = chiphelper.runChromHMM(numstates=8, outdir='../results/chromHMM/'+chrom_tag+'_/', data=pd.DataFrame([['MV411']*len(l),l,["AMLproject/results/"+project+"/"+chrom_tag+'_MV411Merged/'+i+'.bed' for i in l]]).T, datatype='bed', folderPath="", chromHMMFolderpath="../src/ChromHMM/", control_bam_dir=None)['MV411']

In [None]:
# Load the dense ChromHMM segmentation (first line skipped), keep chrom/start/end/state/color.
# `window` is an int, so it is converted before concatenation.
chrombed = pd.read_csv('../results/chromHMM/'+version+'_'+merging_version+'_'+str(window)+'_/MV411_8_dense.bed',sep='\t',header=None, skiprows=1).drop(columns=[4,5,6,7]).rename(columns={0:'chrom',1:'start',2:'end',3:'state',8:"color"})

In [None]:
statetocol={i: chrombed[chrombed['state']==i].iloc[0]['color'] for i in set(chrombed['state'])}

In [None]:
chrombed['foldchange']= chrombed['state']

In [None]:
merged['HMM_states'] = chiphelper.putInConscensus(merged[merged.columns[:8]],chrombed,window=1,mergetype='first')
merged = merged.replace(np.nan,0)

In [None]:
merged['regular_enhancer'] = merged['regular_enhancer'].astype(float)

In [None]:
# Save the annotated consensus set. `window` is an int, so it is converted before concatenation.
merged.to_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+str(window)+'.bed', sep='\t',index=None)

In [None]:
# Reload the annotated consensus set. The original line had an unterminated '.bed
# string literal and concatenated the int `window` to a str.
merged = pd.read_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+str(window)+'.bed', sep='\t')

# Co Binding Matrix

Look at AUC for all ChIPs over all peaks of all ChIPs

In [None]:
statetocol.update({0:'0,0,0'})

In [None]:
# Turn each "r,g,b" string of statetocol into a tuple of floats (each channel / 256).
for state in list(statetocol):
    channels = statetocol[state].split(',')
    statetocol[state] = tuple(int(channel)/256 for channel in channels)

In [None]:
# Build the colour annotations of the clustermaps below for 5000 randomly drawn peaks.
rand = np.random.choice(merged.index,5000)
viridis = cm.get_cmap('viridis', 256)
# copy: the columns are overwritten below and must not write into `merged`
data = merged[merged.columns[-12:]].copy()
for val in data.columns[:-4]:
    scaled = stats.zscore(np.log2(1+data[val]))
    low = scaled.min()
    span = scaled.max() - low
    if not span > 0:
        # constant (or all-NaN) column: no range to scale, use the first colour
        data[val] = 0
        continue
    # min-max scale to the colormap indices 0..255; dividing by the max instead of
    # the range (as before) does not map the values onto that interval
    data[val] = ((scaled - low) / span * 255).astype(int)
#print(data['HMM_states'])
data = data.loc[rand]
# every column but the last is mapped through the colormap
for val in data.columns[:-1]:
    data[val] = [viridis(v) for v in data[val]]
# the HMM state uses its own colour table
data['HMM_states'] = [statetocol[i] for i in data['HMM_states']]
data = data.rename(columns={'SMC1':'cohesin','MED1':'mediator','ATAC':'open regions'})

In [None]:
# Clustermap of the log2 signal of the sampled peaks (rows clustered, columns kept in
# order) with the annotation colours on top.
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[8:-12]].loc[rand].T),col_cluster=False, z_score=0, vmin=0,vmax=3, col_colors = data, figsize=(30,20),xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_clustermap_cobinding_scaled_full_annotations.pdf')
plt.show()

In [None]:
# Same clustermap with the columns clustered as well.
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[8:-12]].loc[rand].T), vmin=0,vmax=3,figsize=(20,15),z_score=0,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_clustermap_cobinding_scaled_full_annotation_sorted.pdf')
plt.show()

## clustering

I have tried gaussian mixtures and Agglomerative clustering algorithm. Only the second can create a hierarchical clustering.

It seems that a Gaussian mixture makes more sense given that the data we have is, for now, more "homogeneous".

**I am still not so happy with the clustering.** This may be due to how much weight the outlier values carry and to the high number of noisy values from locations with no peaks.

We can use similar methods to RNAseq to improve this (clamping values, log transform, first round of PCA..)


In [None]:
# Cluster the rows of `subcor` into two groups with a diagonal-covariance Gaussian mixture.
# n_components / covariance_type are GaussianMixture parameters; DBSCAN does not accept them.
labels = GaussianMixture(n_components=2, covariance_type='diag').fit_predict(subcor)

In [None]:
# Plot the correlation matrix with rows ordered by cluster label.
# The sample name of each bigwig path is the 4th dot-separated field from the end,
# stripped of its directory part.
names = np.array([i.split('.')[-4].split('/')[-1] for i in bigwigs])
sort = labels.argsort()
# the `folder` argument needs its trailing comma, and `window` (an int) is converted
p = helper.plotCorrelationMatrix(data=cor[sort],
                            names=names[sort],
                            colors=labels[sort],
                            folder='../results/' + project +'/plots/' + version + '_' + merging_version + '_' + str(window) + '/',
                            title="correlation between TF occupancy",
                            interactive=True)

In [None]:
show(p)

In [None]:
# 2-D t-SNE embedding of `subcor`, coloured by cluster label.
# Keyword arguments are used because current scikit-learn rejects positional ones here;
# the two positional values were n_components and perplexity.
p = helper.scatter(TSNE(n_components=2, perplexity=5).fit_transform(subcor),labels=names, colors=labels)

In [None]:
show(p)

In [None]:
sns.clustermap(subcor)

## Looking at peak overlap 

How many of the peaks in A (columns) overlap with a peak in B (rows)?

In other words:

what is the percentage of B's peaks that are overlapped by A's peaks?

In [None]:
merged[merged.columns[8:]].sum()

In [None]:
# Correlation between the z-scored signal columns (all but the last 11 rows of `data`),
# ordered by hierarchical clustering, with the last 11 rows used as colour annotations.
data = pd.DataFrame(stats.zscore(0.01+merged[merged.columns[8:]]).T, columns=merged.index, index=merged.columns[8:])
link = linkage(data[:-11])
col = data[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    col[val] = [viridis(v) for v in col[val]]
# NOTE(review): the positions 26..36 are hard-coded and the ndarray returned by
# np.corrcoef is indexed with labels taken from data.columns — confirm this selects
# what is intended.
fig = sns.clustermap(np.corrcoef(data.iloc[:-11])[data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_correlation_with_annotation.pdf')
plt.show()

In [None]:
overlap, correlation,_ = chiphelper.computePairwiseOverlap(merged, norm=True,enrichment=False)

In [None]:
# Clustermap of the pairwise overlap matrix: the last 11 rows are shown as colour
# annotations, the others are clustered hierarchically.
data = pd.DataFrame(data=overlap,index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    a = [viridis(v) for v in col[val]]
    col[val] = a
# NOTE(review): the column positions 26..36 are hard-coded — confirm they still match the annotation columns
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False,figsize=(12,12))
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'pairwise_overlap_clustermap.pdf')
plt.show()

## data on the experiments

In [None]:
# Per-column summary of the consensus matrix: number of non-zero peaks, maximum,
# and mean / standard deviation of the non-zero values.
signal = merged[merged.columns[8:]]
nonzero = signal.replace(0, np.nan)  # np.nan: the np.NaN alias is gone in NumPy 2
info = pd.concat([signal.astype(bool).sum(0),
                  signal.max(),
                  nonzero.mean(),
                  # the column is labelled 'std', so the standard deviation is computed
                  # (the variance was computed before)
                  nonzero.std()], axis=1).rename(columns={0:'sum',1:'max',2:'mean',3:'std'})
# to_csv (not tocsv); `window` is converted to str; the .tsv file is written tab-separated
info.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+str(window)+'info.tsv', sep='\t')

## Correlation only on overlaps 

on the overlaps given above

In [None]:
# Clustermap of the correlation computed on overlapping peaks only: the last 11 rows
# are shown as colour annotations, the others are clustered hierarchically.
data = pd.DataFrame(data=correlation,index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data.iloc[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    a = [viridis(v) for v in col[val]]
    col[val] = a
# NOTE(review): the column positions 26..36 are hard-coded — confirm they still match the annotation columns
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_correlation_onoverlap.pdf')
plt.show()

In [None]:
# Clustermap of the correlation between the z-scored signal columns: the last 11 rows
# are shown as colour annotations, the others are clustered hierarchically.
data = pd.DataFrame(data=np.corrcoef(stats.zscore(merged[merged.columns[8:]]).T), index=merged.columns[8:], columns=merged.columns[8:])
link = linkage(data.iloc[:-11]) # D being the measurement
col = data.iloc[-11:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
for val in col.columns:
    a = [viridis(v) for v in col[val]]
    col[val] = a
# NOTE(review): the column positions 26..36 are hard-coded — confirm they still match the annotation columns
fig = sns.clustermap(data.iloc[:-11][data.columns[np.concatenate((leaves_list(link),[26,27,28,29,30,31,32,33,34,35,36]))]], row_linkage=link, col_colors=col.T, col_cluster=False,figsize=(12,12))
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'correlation_withannotation.pdf')
plt.show()

## get percentage data between TF and other:
        superenhancer, regular, promoter, HMM states*8, cohesin, mediator
promoter
superenhancer
cohesin
mediator
TFs

In [None]:
# For every track, compute over the peaks where the track is non-zero the fraction
# falling in each HMM state (1..8) and the fraction where each annotation is non-zero.
annotation_cols = ['regular_enhancer','MED1','SMC1','CTCF','promoters','super_enhancer']
row_labels = ['state_'+ str(i) for i in range(1,9)] + ['regular_enhancer','mediator','cohesin','CTCF','promoters','superenhancer']
tracks = merged.columns[8:-12].tolist() + ['activation', 'repression', 'IRF2BP2',
    'ATAC', 'MED1', 'SMC1', 'CTCF', 'promoters', 'super_enhancer']
dat = {}
for track in tracks:
    bound = merged[merged[track]!=0]
    state_fractions = [len(bound[bound['HMM_states']==state])/len(bound) for state in range(1,9)]
    annotation_fractions = [len(bound[bound[name]!=0])/len(bound) for name in annotation_cols]
    dat[track] = state_fractions + annotation_fractions
# one row per track, one column per state / annotation
dat = pd.DataFrame(data=dat,index=row_labels).T

In [None]:
# Heatmap of the percentage table, with the x labels on top and a colour bar in percent.
# `fig` (not `ig`) is bound so that savefig writes this figure rather than a stale one.
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(dat,ax=ax)
ax.xaxis.set_ticks_position('top')
cbar = ax.collections[0].colorbar
cbar.set_ticks([0, .2, .4, .6,.8,1.0])
cbar.set_ticklabels(['0%', '20%', '40%', '60%', '80%', '100%'])
plt.xticks(rotation=40,ha='left')
# matching quotes on the file name, and `window` (an int) converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_percentage_overlap_toannotations.pdf')

## Enrichment

In [None]:
for i in range(1,9):
    merged['state_'+str(i)] = (merged.HMM_states==i).astype(float)
merged = merged.drop(columns=['HMM_states'])

In [None]:
overlap, _, enrichment  = chiphelper.computePairwiseOverlap(merged, norm=False,docorrelation=False)
enrichment = enrichment.replace(-np.inf,-100)
enrichment

In [None]:
# Clustermap of the all-to-all enrichment matrix.
fig = sns.clustermap(enrichment,figsize=(12,12), vmin=-7, cmap='RdBu_r')
fig.ax_col_dendrogram.set_visible(False)
# `window` is an int, so it is converted before concatenation
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+str(window)+'_enrichment_clustermap_all_to_all.pdf')
plt.show()

### improving merge

- analyse only when peak is supported by two replicates
- remove GSE1 and CDK13 from cobinding matrix
- do quantile normalization over the signals (after setting up the zeroes)
- removing bad chips:
    - look, when badchip only does not correlate with anything else..
- rerun the ChIP-seq pipeline with better data to create merges between bam files (see if we can run it only on the merged files and use the merged bam enhancer calls when in `tomerge`)
- remove all peaks that have less than 2 cobound TFs 

## clustering cobinding signal

### using OPTICS

In [None]:
cols = list(merged.columns[8:-12])
cols.remove('PU1')
data = stats.zscore(merged[cols].values)
cols

In [None]:
rand = np.random.choice(merged.index,20000)

In [None]:
# Cluster the z-scored signal with OPTICS and cache the labels on disk.
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.OPTICS.html#sklearn.cluster.OPTICS
#print(merged.columns[8:-4])
min_samples = 200
optics_model = OPTICS(min_samples=min_samples, n_jobs=8)
groups = optics_model.fit_predict(data)
groups_path = '../data/'+project+'/'+version+'_'+merging_version+'_'+window+'_groups.npy'
np.save(groups_path, groups)

In [None]:
# Reload the cached cluster labels (avoids re-running the clustering cell).
groups = np.load('../data/'+project+'/'+version+'_'+merging_version+'_'+window+'_groups.npy')

In [None]:
# Build the colour annotations for the sampled regions, ordered by cluster label.
subgroups = groups[rand]
print(subgroups.max())
sorting = np.argsort(subgroups)
# discrete palette sized by the number of distinct labels in `groups`
viridis = cm.get_cmap('viridis', len(set(groups)))
colors = [viridis(i) for i in subgroups[sorting]]

viridis = cm.get_cmap('viridis', 256)
# Explicit copies: the column assignments below would otherwise be made on a
# selection of `merged` and trigger pandas' SettingWithCopyWarning.
data = merged[merged.columns[-12:]].copy()
# every annotation column except the last four is log2-transformed and z-scored
for val in data.columns[:-4]:
    data[val] = stats.zscore(np.log2(1+data[val]))
data = data.iloc[rand].iloc[sorting].copy()
# replace every value by its colour so the frame can be used as `col_colors`
for val in data.columns:
    data[val] = [viridis(v) for v in data[val]]
data = data.rename(columns={'SMC1':'cohesin','MED1':'mediator','ATAC':'open regions'})
data["clusters"] = colors

In [None]:
# Clustermap of the sampled regions (columns, kept in cluster order) against the
# selected signal columns (rows, clustered), annotated with the colour frame `data`.
log_signal = np.log2(1.01 + merged[cols].iloc[rand].iloc[sorting].T)
fig = sns.clustermap(
    log_signal,
    vmin=0,
    vmax=3,
    figsize=(20, 25),
    z_score=0,
    col_cluster=False,
    col_colors=data,
    xticklabels=False,
)
fig.ax_col_dendrogram.set_visible(False)
out_pdf = '../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'clustermap_cobinding_optics_minsamp_'+str(min_samples)+'_clustered.pdf'
fig.savefig(out_pdf)
plt.show()

### using KMeans

In [None]:
# Same column selection as for the density-based clustering: positions 8 up to
# (excluding) the last 12 columns of `merged`, without 'PU1'.
cols = list(merged.columns[8:-12])
cols.remove('PU1')
# z-score each column (scipy's default axis=0)
data = stats.zscore(merged[cols].values)
cols

In [None]:
# KMeans clustering of the z-scored signal into `n_clust` clusters.
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
n_clust=50
# `n_jobs` is no longer accepted by KMeans in current scikit-learn releases,
# so it is not passed; the estimator handles its own parallelism.
kmean = KMeans(n_clusters=n_clust)
groups = kmean.fit_predict(data)
centroid = kmean.cluster_centers_

In [None]:
# Heatmap of the KMeans centroids: one row per signal column, one column per cluster.
centroid_frame = pd.DataFrame(centroid, columns=cols).T
fig, ax = plt.subplots(figsize=(10, 10))
viridis = cm.get_cmap('viridis', n_clust)
sns.heatmap(centroid_frame, vmax=10, ax=ax)  # col_color=[viridis(i) for i in range(n_clust)])
centroid_pdf = '../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust)+'_centroids.pdf'
plt.savefig(centroid_pdf)

In [None]:
# Build the colour annotations for the sampled regions, ordered by KMeans label.
subgroups = groups[rand]
sorting = np.argsort(subgroups)
# discrete palette sized by the number of distinct labels in `groups`
viridis = cm.get_cmap('viridis', len(set(groups)))
colors = [viridis(i) for i in subgroups[sorting]]

viridis = cm.get_cmap('viridis', 256)
# Explicit copies: the column assignments below would otherwise be made on a
# selection of `merged` and trigger pandas' SettingWithCopyWarning.
data = merged[merged.columns[-12:]].copy()
# every annotation column except the last four is log2-transformed and z-scored
for val in data.columns[:-4]:
    data[val] = stats.zscore(np.log2(1+data[val]))
data = data.iloc[rand].iloc[sorting].copy()
# replace every value by its colour so the frame can be used as `col_colors`
for val in data.columns:
    data[val] = [viridis(v) for v in data[val]]
data = data.rename(columns={'SMC1':'cohesin','MED1':'mediator','ATAC':'open regions'})
data["clusters"] = colors

In [None]:
# Clustermap of the sampled regions (columns, kept in KMeans-label order) against
# the selected signal columns (rows, clustered); columns are coloured by cluster only.
log_signal = np.log2(1.01 + merged[cols].iloc[rand].iloc[sorting].T)
fig = sns.clustermap(
    log_signal,
    vmin=0,
    vmax=3,
    figsize=(20, 15),
    z_score=0,
    col_cluster=False,
    col_colors=colors,
    xticklabels=False,
)
fig.ax_col_dendrogram.set_visible(False)
out_pdf = '../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust)+'_clustermap_cobinding.pdf'
fig.savefig(out_pdf)
plt.show()

### Enrichment

In [None]:
# Enrichment of `merged` against the cluster labels in `groups`; the second returned value is discarded.
enr, _ = chiphelper.enrichment(merged,groups=groups)

In [None]:
# Clamp -inf entries to a finite floor so the matrix can be clustered and plotted.
enr = enr.replace(-np.inf, -13.7)

In [None]:
# Clustered heatmap of the per-cluster enrichment (colour scale clipped to [-7, 7]), saved as a PDF.
fig = sns.clustermap(enr.T, figsize=(16,16), vmax=7, vmin=-7, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust)+'enrichment_on_cluster.pdf')
plt.show()

### Plot TSNE density map

In [None]:
# Signal columns for the t-SNE embedding: positions 8 up to (excluding) the last 19
# columns of `merged`, without 'PU1'.
# NOTE(review): the clustering cells above exclude the last 12 columns, not 19 —
# presumably because extra columns were appended to `merged` since; confirm.
cols = list(merged.columns[8:-19])
cols.remove('PU1')
# z-score each column (scipy's default axis=0)
data = stats.zscore(merged[cols].values)
cols

In [None]:
# Min-max scale every column to [0, 1]. The column range (max - min) is the
# correct denominator: dividing by the maximum alone does not bound the result
# once the minimum has been subtracted.
col_min = data.min(0)
scaled_data = (data - col_min) / (data.max(0) - col_min)

In [None]:
# Random sample of 30000 row *positions* (drawn with replacement, numpy's default).
# Positions rather than index labels are drawn, because `rand` is used for
# positional indexing elsewhere in the notebook.
rand = np.random.choice(len(merged), 30000)

In [None]:
# 2-D t-SNE embedding (perplexity 10) of the scaled signal, cached to disk.
# Hyper-parameters are passed by keyword: recent scikit-learn releases reject
# positional estimator arguments.
red_data = TSNE(n_components=2, perplexity=10, verbose=10, n_iter=1500).fit_transform(scaled_data)
np.save('red_data.npy', red_data)

In [None]:
# Shaded 2-D kernel density estimate of the two t-SNE dimensions, saved as a PDF.
sns.kdeplot(red_data[:,0], red_data[:,1], shade=True)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_density_TSNE.pdf')

In [None]:
# Density scatter of the t-SNE embedding via the project helper; `folder` is passed as a
# path prefix (presumably the output file name is appended to it — confirm in helper.bigScatter).
helper.bigScatter(red_data,binsize=0.2,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")

### Using SOMs

In [None]:
# Display the columns currently selected (these are the features the SOM is trained on).
cols

In [None]:
# Side length of the square SOM grid
size = 20
# Build a size x size network initialised from `data`, with Periodic Boundary Conditions enabled.
net = sps.somNet(size,size, data, PBC=True)

#Train the network for 10000 epochs and with initial learning rate of 0.01. 
net.train(0.01, 10000)

#Save the weights to file
net.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size))

In [None]:
# Rebuild the network from the saved weights file instead of retraining; the grid
# dimensions are passed as 0,0 here (presumably taken from the file — confirm in SimpSOM).
net = sps.somNet(0,0, data, loadFile='../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size), PBC=True)

In [None]:
# Display the column list to pick a feature index for the node map below.
cols

In [None]:
# Index (into `cols`) of the feature used to colour the node map
col=9
# Print the name of the chosen feature, then draw the map of the network nodes
# coloured according to that feature.
print(cols[col])
net.nodes_graph(colnum=col)

In [None]:
# Node-to-neighbour difference map: keep the returned values in `diffs` and
# save the current matplotlib figure as a PDF.
diffs = net.diff_graph(show=False, returns=True)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_cobinding_SOM_'+str(size)+'.pdf')

In [None]:
# One row per SOM node: shifted plotting coordinates (q, r), the neighbour
# difference value (c), and the columns whose absolute weight exceeds 0.4,
# listed in increasing weight order.
q_coords, r_coords, node_features = [], [], []
for idx, node in enumerate(net.nodeList):
    in_row, row_no = idx % size, idx // size
    q_coords.append(node.pos[0] + in_row * 0.535 + row_no * 0.055)
    r_coords.append(-node.pos[1] - in_row * 0.2)
    weight_order = np.argsort(node.weights)
    node_features.append([cols[k] for k in weight_order if abs(node.weights[k]) > 0.4])
somnodes = pd.DataFrame({'r': r_coords, 'q': q_coords, 'c': diffs, 'features': node_features})

In [None]:
# Collapse each node's feature list into one space-separated label, inserting a
# line break before every fifth entry.
for row_label, row in somnodes.iterrows():
    pieces = []
    for pos, feat in enumerate(row.features):
        lead = '\n ' if pos % 5 == 4 else ' '
        pieces.append(lead + str(feat))
    somnodes.loc[row_label, 'features'] = ''.join(pieces)

In [None]:
# Plot the SOM nodes from the precomputed `somnodes` table with the project helper.
# NOTE(review): unlike the t-SNE call, `folder` has no trailing "_" separator here — confirm the resulting file name.
helper.bigScatter(somnodes,precomputed=True, features=True,binsize=1, title='Cobinding SOM cluster of '+str(size), folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window)

In [None]:
# Cluster the datapoints on the trained SOM with the Quality Threshold algorithm.
clusts = net.cluster(data, type='qthresh')

## Doing Motif analysis

In [None]:
! mkdir -p ../../data/MEME

In [None]:
## extracting the sequences of the open regions of the MV411 genome (ATAC-seq peaks) for motif scanning with MEME (the mast / mcast / fimo runs below are currently commented out)
! bedtools getfasta -fi ../../data/ref/Homo_sapiens_assembly38.fasta -bed ../data/$project/ATACseq/ATAC_MV411.mRp.clN_peaks.broadPeak | fold -w 500 > ../../data/MEME/ATAC.fasta
! mkdir -p ../../data/MEME/ATAC/
#! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && mast --oc ../../data/MEME/ATAC/ --remcorr ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/ATAC.fasta
#! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && mcast --oc ../../data/MEME/ATAC/ --max-gap 80 ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/ATAC.fasta
#! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && fimo --oc ../../data/MEME/ATAC/ ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/ATAC.fasta
# ! gff2bed < ../../data/MEME/ATAC/fimo.gff > ../../data/MEME/ATAC/fimo.gff.bed

In [None]:
merged['strand'] = '.'
merged[merged.columns[[0,1,2,3,4,-1]]].to_csv('../results/'+project+"/"+version+'_'+merging_version+'_'+window+'_ merged_true.bed',index=False, header=False sep='\t')

In [None]:
## What are the motifs enriched for each cluster groups in the conscensus peak set? 
! bedtools getfasta -fi ../../data/ref/Homo_sapiens_assembly38.fasta -bed ../results/$project/$version_$merging_version_$window_merged_true.bed | fold -w 500 > ../../data/MEME/merged$version_$merging_version_$window.fasta
#! mkdir ../../data/MEME/merged/
! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && mast --oc ../../data/MEME/merged/ --remcorr ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/merged.fasta
! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && fimo --oc ../../data/MEME/merged/ ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/merged.fasta
! gff2bed < ../../data/MEME/merged/fimo.gff > ../../data/MEME/merged/fimo.gff.bed

In [None]:
## What are the motifs of our CRC members in ATACseq but not in our matrix
# TODO: placeholder — the path is empty, so this cell cannot run until the input file is filled in.
pd.read_csv('')

In [None]:
## do we have better correlation if we remove cobinding event that don't have a related motif?

In [None]:
## what enrichment do we have in each group? what enrichment do we have for each ChipSeq?

In [None]:
# Display the columns that the MEME-ChIP loop below iterates over.
cols

In [None]:
## computing predicted motif for each TF from ChIP data with MEME-ChIP
import shlex
import subprocess


def _run_shell(cmd):
    """Run `cmd` through the shell and print its decoded stdout.

    Raises:
        ValueError: if the command exits with a non-zero status (stderr is
            included in the message).
    """
    # NOTE(review): for a pipeline the exit status is the last command's
    # (here `fold`), so a failing `bedtools` may go unnoticed.
    res = subprocess.run(cmd, capture_output=True, shell=True)
    if res.returncode != 0:
        raise ValueError('issue with the command: ' + str(res.stderr))
    print(res.stdout.decode("utf-8"))


def _meme_chip(bed, name):
    """Write the sequences of `bed` to ../../data/MEME/<name>.fasta and run meme-chip into ../../data/MEME/<name>/."""
    fasta = '../../data/MEME/' + name + '.fasta'
    outdir = '../../data/MEME/' + name
    # paths are quoted so that a space in a path is not split by the shell
    _run_shell('bedtools getfasta -fi ../../data/ref/Homo_sapiens_assembly38.fasta -bed ' + shlex.quote(bed) + ' | fold -w 500 > ' + shlex.quote(fasta))
    os.makedirs(outdir, exist_ok=True)
    _run_shell('export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && meme-chip -meme-nmotifs 6 --meme-p 8 --oc ' + shlex.quote(outdir + '/') + ' -db ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ' + shlex.quote(fasta))


files = os.listdir("../data/single_bed/")
# NOTE(review): file names are listed from ../data/single_bed/ but read from this
# results directory (whose name contains a space) — confirm both hold the same files.
bed_dir = '../results/' + project + "/" + version + '_' + merging_version + '_' + window + '_ MV411Merged/'
for val in cols:
    # 1) every listed file whose name contains the part of `val` before the first '_'
    prefix = val.split('_')[0]
    for v in (f for f in files if prefix in f):
        _meme_chip(bed_dir + v, v.split('.')[0])
    # 2) the consensus peaks with a non-zero value for this column
    #    (assumes `strand` is the last column of `merged` — TODO confirm)
    cobinding_bed = '../temp/' + val + '_cobinding.bed'
    merged[merged[val]>0][merged.columns[[0,1,2,3,4,-1]]].to_csv(cobinding_bed, index=False, header=False, sep='\t')
    _meme_chip(cobinding_bed, val)
    #! export PATH=$HOME/meme/bin:$HOME/meme/libexec/meme-5.1.1:$PATH && fimo --oc ../../data/MEME/merged/ ../../motif_databases/HUMAN/HOCOMOCOv11_full_HUMAN_mono_meme_format.meme ../../data/MEME/merged.fasta
    #! gff2bed < ../../data/MEME/merged/fimo.gff > ../../data/MEME/merged/fimo.gff.bed

In [None]:
## using DeepBind

## Assigning genes

### based on closest expressed gene

In [None]:
# NOTE(review): called without arguments and the result is not stored — looks like a
# placeholder; confirm the expected inputs of chiphelper.AssignToClosestExpressed.
chiphelper.AssignToClosestExpressed()

In [None]:
### recompute cobinding based on this.

In [None]:
## redo the plots. do we get better plots?/correlations?...

### based on the ABC model

![](images/ABCtitle.png)

They propose a new model, built on and validated with CRISPRi-FlowFISH data, that maps enhancers to their target genes.
Using this data to measure accuracy, they report 70% accuracy for their model, compared with less than 50% for the closest-expressed-gene approach.

Way to integrate our HiC data (need ATAC-seq like data as well, but openly available) 


![](images/ABCmodel.png)

In [None]:
# 2-D t-SNE (perplexity 5) of the transposed matrix, i.e. one point per column of `data`.
# The module is imported as `helper`; `Helper` is not a defined name.
# NOTE(review): `zones` and `labels` are not defined in the visible part of the notebook — confirm they exist upstream.
helper.scatter(TSNE(n_components=2, perplexity=5).fit_transform(data.T), labels=zones.columns[11:], colors=labels)

In [None]:
### recompute cobinding based on this.

In [None]:
### redo the plots. do we get better looking plots?

In [None]:
### compare presence of CTCF and transcription of linked RNA

## Predict

In [None]:
## enrichment at the gene set level

In [None]:
## are our cobinding clusters enriched in some sets of genes / CRC? 

In [None]:
## make a linear model with marks and cobinding data + motifs and TFs + cobinding motifs + expression of cobound proteins + expression: can we predict expression/dependency?

In [None]:
## can we predict better for some set of genes / CRCs?

In [None]:
## do the same on closest expressed gene

In [None]:
## do this prediction on each enhancer.what is the best predicting enhancer? do that correlate with ABC model data?

In [None]:
## do we get, for some gene, better single enhancer prediction?

In [None]:
## what are the most explanatory regressors?

## RNP data

In [None]:
## can we predict RNP data (setting this TF to zero and looking at the expected RNA change)?
## do we see genes with opposite effects? 

In [None]:
## if we add RNP data, can we increase our model's prediction? (we have expression change and we set all RNPed-TF values to 0) 

In [None]:
### to predict remaining X% RNA expression
### to predict remaining X TFs RNP
### to predict regular RNA expression

In [None]:
## same thing with filtering based on motif presence (actual DNA binding)
## same thing with closest expressed gene

In [None]:
## Do on all cell lines

In [None]:
## repeat the process across all samples with H3K27ac+RNAseq data we have.

In [None]:
## call mutations from H3k27ac data

In [None]:
## MEME analysis of likely bound TFs, except if TF is not expressed

In [None]:
## compute enrichments

In [None]:
## from the same gene assignments as found on the general consensus peak set, can we find good dependency/expression prediction?

### Compare data with other labs (H3K27, HiC..)

We need to redo everything for a similar normal cell type, getting the TFs based on the CRC (found with CRCmapper or in the literature).