In [1]:
import os
import pandas as pd
import sys
import numpy as np
import sys
sys.path.insert(0, '../..')
import itertools

from genepy.utils import helper as h
from genepy.epigenetics import chipseq as chip

import dalmatian as dm
import pyBigWig

from bokeh.plotting import *
from IPython.display import IFrame
import igv
from biomart import BiomartServer
import io

from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture

output_notebook()
%load_ext autoreload
%autoreload 2

# ChIP

In [None]:
project="IRF2BP2_degron_ChIP"

In [None]:
!gsutil ls gs://amlproject/Chip/

In [None]:
# install bwa
! mkdir ../../ref
! curl ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/001/215/GCF_000001215.4_Release_6_plus_ISO1_MT/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna.gz -o ../../ref/reference_droso.fna.gz
! gunzip ref/reference_droso.fna.gz
! bwa index -a bwtsw ../../ref/reference_droso.fna

## V1

### analysis

In [None]:
rename1 = {
 'gs://transfer-amlproject/20191211_10_MP7613_S8_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp779-MV411_IRF2BP2_DMSO_6h-FLAG_IRF2BP2-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_10_MP7613_S8_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp779-MV411_IRF2BP2_DMSO_6h-FLAG_IRF2BP2-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_11_MP7613_S9_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp780-MV411_IRF2BP2_DMSO_6h-MED1-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_11_MP7613_S9_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp780-MV411_IRF2BP2_DMSO_6h-MED1-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_12_MP7613_S10_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp781-MV411_IRF2BP2_DMSO_6h-POLII_total-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_12_MP7613_S10_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp781-MV411_IRF2BP2_DMSO_6h-POLII_total-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_13_MP7613_S11_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp782-MV411_IRF2BP2_DMSO_6h-POLII_S2-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_13_MP7613_S11_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp782-MV411_IRF2BP2_DMSO_6h-POLII_S2-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_14_MP7613_S12_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp783-MV411_IRF2BP2_DMSO_6h-POLII_S5-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_14_MP7613_S12_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp783-MV411_IRF2BP2_DMSO_6h-POLII_S5-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_15_MP7613_S13_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp784-MV411_IRF2BP2_DMSO_6h-MYC-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_15_MP7613_S13_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp784-MV411_IRF2BP2_DMSO_6h-MYC-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_16_MP7613_S14_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp785-MV411_IRF2BP2_DMSO_6h-MYB-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_16_MP7613_S14_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp785-MV411_IRF2BP2_DMSO_6h-MYB-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_1_MP7613_S1_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp786-MV411_IRF2BP2_DMSO_6h-SPI1-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_1_MP7613_S1_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp786-MV411_IRF2BP2_DMSO_6h-SPI1-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_2_MP7613_S2_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp787-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_2_MP7613_S2_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp787-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_3_MP7613_S3_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp788-MV411_IRF2BP_VHL_6h-MED1-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_3_MP7613_S3_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp788-MV411_IRF2BP_VHL_6h-MED1-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_4_MP7613_S4_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp789-MV411_IRF2BP_VHL_6h-POLII_total-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_4_MP7613_S4_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp789-MV411_IRF2BP_VHL_6h-POLII_total-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_5R_MP7613_S15_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp790-MV411_IRF2BP_VHL_6h-POLII_S2-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_5R_MP7613_S15_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp790-MV411_IRF2BP_VHL_6h-POLII_S2-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_6R_MP7613_S16_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp791-MV411_IRF2BP_VHL_6h-POLII_S5-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_6R_MP7613_S16_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp791-MV411_IRF2BP_VHL_6h-POLII_S5-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_7_MP7613_S5_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp792-MV411_IRF2BP_VHL_6h-MYC-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_7_MP7613_S5_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp792-MV411_IRF2BP_VHL_6h-MYC-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_8_MP7613_S6_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp793-MV411_IRF2BP_VHL_6h-MYB-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_8_MP7613_S6_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp793-MV411_IRF2BP_VHL_6h-MYB-r1_2.fastq.gz",
 'gs://transfer-amlproject/20191211_9_MP7613_S7_R1_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp794-MV411_IRF2BP_VHL_6h-SPI1-r1_1.fastq.gz",
 'gs://transfer-amlproject/20191211_9_MP7613_S7_R2_001.fastq.gz':"gs://amlproject/Chip/IRF2BP2_degraded_rep1/fastqs/mp794-MV411_IRF2BP_VHL_6h-SPI1-r1_2.fastq.gz"}

In [None]:
# Move every raw sequencer fastq (key) to its curated name in the project
# bucket (value). os.system returns the command's exit status; a non-zero
# status is recorded so a failed move is reported instead of silently ignored.
failed_moves = []
for k, val in rename1.items():
    if os.system('gsutil mv '+k+' '+val) != 0:
        failed_moves.append(k)
if failed_moves:
    print("gsutil mv failed for:", failed_moves)

In [None]:
! gsutil -m cp -r gs://amlproject/Chip/IRF2BP2_degraded_rep1 ../../data/
inputfastq="gs://amlproject/Chip/fastqs/mp99-MV411-INPUT-r1.fastq.gz"
! gsutil cp $inputfastq ../../data/IRF2BP2_degraded_rep1/fastqs/
a = ! ls ../../data/IRF2BP2_degraded_rep1/fastqs/

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
gsheet

In [None]:
# Build the nf-core/chipseq design table for replicate 1: one row per sample
# plus a final row for the shared INPUT control.
# Column names follow the design built in the v3 section of this notebook
# (fastq_1 / fastq_2).
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# `a` is the fastq listing: its last entry is the INPUT fastq, everything
# before it is consumed two files at a time (the two reads of one sample).
# The helper module is imported as `h` at the top of the notebook.
for val in h.grouped(a[:-1],2):
    # Look the sample up in the sheet by the leading part of the file name.
    # NOTE(review): this keeps everything before the first '_' while the v3
    # section splits on '-' — confirm which form matches gsheet.id.
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('_')[0]]
    df['group'].append((row.id.values[0] + '_'+ row.name_replicate.values[0] +"_"+row.protein.values[0]))
    df['replicate'].append(1)
    df['fastq_1'].append(val[0])
    # The pipeline below is launched with --single_end, so read 2 is unused;
    # the column still needs one entry per row or pd.DataFrame raises on
    # unequal column lengths.
    df['fastq_2'].append('')
    df['antibody'].append(row.protein.values[0])
    df['control'].append("INPUT")
# Final row: the INPUT control itself.
# NOTE(review): the v3 design leaves antibody/control empty on the INPUT row;
# confirm whether "INPUT" is accepted here.
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append(a[-1])
df['fastq_2'].append('')
df['antibody'].append("INPUT")
df['control'].append("INPUT")
df = pd.DataFrame(df)

In [None]:
# Write the design sheet with the columns reordered by position to
# group, replicate, fastq (read 1), fastq (read 2), antibody, control.
# index=False keeps pandas from prepending an unnamed index column, matching
# how the v3 design is written.
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/design_IRF2BP2_degraded_rep1.csv', index=False)

In [None]:
#process chips
! sudo ./nextflow run nf-core/chipseq --single_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --narrow_peak --input ../nextflow/design_IRF2BP2_degraded_rep1.csv --genome GRCh38 --skip_preseq --max_cpus 24 -profile docker -w work

In [None]:
#!gsutil cp results/* ../../data/IRF2BP2_degraded_rep1/ && sudo rm -r results && sudo rm -r work

In [None]:
# get scaling values
mappedreads, umappedreads_norm, mapped = chip.getSpikeInControlScales(refgenome="../../ref/reference_droso.fna",FastQfolder='../../data/IRF2BP2_degraded_rep1/fastqs/',pairedEnd=True, cores=8)
mappedreads, umappedreads_norm, mapped

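Computing the scales from the Excel sheet: the first cell below holds the raw pair of counts for each antibody, and the second cell holds the corresponding scaling factors, obtained by dividing the smaller count of each pair by each count (so the smaller sample gets 1.0). Note that the second list is ordered alphabetically by antibody, not in the order of the raw counts.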

In [None]:
scales = [[562285,1496707],
[1686168,7198567],
[3642441,2612624],
[3992589,3474812],
[3347901,3829477],
[6181136,989703],
[7523840,4173047],
[922482,1195857]]

In [None]:
scales = [[1.0, 0.3756814126], #IRF2BP2
[1.0, 0.2342366196], #MED1
[0.5546432407, 1.0], #MYB
[0.1601166841, 1.0], #MYC
[0.870315477, 1.0], #POL2S2
[1.0, 0.8742449687], #POL2S5
[0.7172728398, 1.0], #POL2TOT
[1.0, 0.7713982525]] #SPI1

### on scaled data


In [None]:
bams = !ls ../../data/IRF2BP2_degraded_rep1/diffData/*treat_pileup.bdg
bams = ['_'.join(i.split('/')[-1].split('_')[:-2]) for i in bams]
bams

In [None]:
# diffPeak on scaled data
# One `size` value per DMSO/VHL pair, passed to chip.diffPeak.
# NOTE(review): presumably fragment sizes, listed in the same order as
# `scales` — confirm.
sizes = [220, 191, 211, 204, 285, 222, 228, 194]
for i in range(int(len(bams)/2)):
    # The first half of `bams` is paired with the entry 8 positions later.
    name1 = bams[i]
    name2 = bams[i+8]
    print(name1,name2)
    # The original read `size[i]`, a name not defined before this cell;
    # the list defined just above is `sizes`.
    print(chip.diffPeak(name1,name2, directory= "../../data/IRF2BP2_degraded_rep1/diffData/", res_directory='../../data/IRF2BP2_degraded_rep1/diffPeaks/', scaling1=scales[i][1], scaling2=scales[i][0], size=sizes[i]))

In [None]:
# Full differential peak calling on scaled data: each pair (bams[1+i],
# bams[9+i]) is compared with bams[0] passed as the third argument and
# scales[i] as the scaling.
# NOTE(review): at this point `bams` holds the basenames derived from the
# *treat_pileup.bdg listing in the cell above — confirm that bams[0] really is
# the control and that the 1/9 offsets match that listing.
# NOTE(review): the trailing -1 in the loop bound stops one pair short of half
# the list — confirm this is intended.
for i in range(int(len(bams[1:])/2)-1):
    name1 = bams[1+i]
    name2 = bams[9+i]
    chip.fullDiffPeak(name1,name2, bams[0], scaling = scales[i],compute_size=True)

In [None]:
scales = [1.0,
1.0,
0.5546432407,
0.1601166841,
0.870315477,
1.0,
0.7172728398,
1.0,
0.3756814126,
0.2342366196,
1.0,
1.0,
1.0,
0.8742449687,
1.0,
0.7713982525]

In [None]:
chip.bigWigFrom(bams[1:], genome='GRCh38',scaling=scales)

In [None]:
! mv ../../data/recalib_bigwig_rep1/* ../../data/IRF2BP2_degraded_rep1/recalib_bigwig/
bw = ! ls ../../data/IRF2BP2_degraded_rep1/recalib_bigwig/*.bw
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks/*common.bed
commonpeak

In [None]:
names = ["FLAG_IRF2BP2","MED1","MYB","MYC","POLII_S2","POLII_S5","POLII_total","SPI1"]

In [None]:
# For each antibody, plot the two rescaled bigwig tracks (labelled DMSO and
# VHL) over the three peak sets cond1 / common / cond2 (labelled DMSO_peaks /
# common / VHL_peaks) and save the figure as <antibody>_mat.pdf.
# The unreachable `if i<0: continue` guard of the original was dropped.
for i in range(len(bw)//2):
    # The first half of `bw` is paired with the entry 8 positions later.
    name1 = bw[i]
    name2 = bw[8+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    for val in peak:
        chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, refpoint='center', folder="", title=name, numthreads=7, torecompute=True, name='../../data/IRF2BP2_degraded_rep1/diffPeaks/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
peaks = ! ls ../../data/IRF2BP2_degraded_rep1/bwa/mergedLibrary/macs/narrowPeak/*.narrowPeak
peaks

In [None]:
names = ["FLAG_IRF2BP2","MED1","MYB","MYC","POLII_S2","POLII_S5","POLII_total","SPI1"]
names.extend([i+'_VHL' for i in names])
names

In [None]:
# Profile-only plots of every bigwig over its own narrowPeak file:
# once as-is and once with cluster=3.
# Options common to both calls are gathered once.
profile_opts = dict(window=3000, folder="", numthreads=7, torecompute=True,
                    onlyProfile=True, refpoint='center', withDeeptools=True)
out_prefix = '../../data/IRF2BP2_degraded_rep1/diffPeaks/'
for i in range(len(bw)):
    chip.getPeaksAt(peaks[i], bw[i], title=names[i],
                    name=out_prefix+names[i]+'_mat_profile.pdf',
                    **profile_opts)
    chip.getPeaksAt(peaks[i], bw[i], title=names[i],
                    name=out_prefix+names[i]+'_mat_profile_clust3.pdf',
                    cluster=3, **profile_opts)

### on unscaled data

In [None]:
bams = !ls ../../data/IRF2BP2_degraded_rep1/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled

In [None]:
bams= list(bams)

In [None]:
bams

In [None]:
size

In [None]:
# Full differential peak calling on unscaled data: pairs (bams[1+i], bams[9+i])
# with bams[0] as control; results go to diffData_unscaled / diffPeaks_unscaled.
# NOTE(review): `size` is not assigned anywhere above this cell (only `sizes`
# is; `size` first appears in the v3 section), so on a fresh kernel this
# raises NameError — decide which list and which ordering is meant here.
# NOTE(review): `bams` was listed with the mp*.bam pattern; confirm that
# bams[0] is really the control sample and that the 1/9 offsets pair the
# right files.
# NOTE(review): the trailing -1 in the loop bound stops short of half the
# list — confirm this is intended.
for i in range(int((len(bams)-1)/2)-1):
    # `i` is never negative, so this guard never triggers.
    if i < 0:
        continue
    name1 = bams[1+i]
    name2 = bams[9+i]
    chip.fullDiffPeak(name1,name2, control1=bams[0], size=size[i], compute_size=False, directory = "../../data/IRF2BP2_degraded_rep1/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/IRF2BP2_degraded_rep1/bwa/mergedLibrary/bigwig/*.bigWig

In [None]:
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled/*common.bed
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled/*cond2.bed
cond2peak

In [None]:
names = ["FLAG_IRF2BP2","MED1","POLII_total","POLII_S2","POLII_S5","MYC","MYB","SPI1"]

In [None]:
# For each antibody, plot the unscaled bigwig pair (bw[1+i], bw[9+i]) over the
# cond1 / common / cond2 peak sets from diffPeaks_unscaled and save the figure
# as <antibody>_mat.pdf.
# NOTE(review): the equivalent rep2 loop uses int(len(bw[1:])/2) without the
# trailing -1; with the -1 the last pair is not plotted — confirm.
# NOTE(review): `bw` already holds paths under .../mergedLibrary/bigwig/ and
# the same folder is passed as `folder=`; confirm getPeaksAt does not prepend
# it a second time.
for i in range(int(len(bw[1:])/2)-1):
    name1 = bw[1+i]
    name2 = bw[9+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    for val in peak:
        chip.dropWeirdChromosomes(val)
    name = names[i]
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, title=name, numthreads=7, refpoint='center', folder="../../data/IRF2BP2_degraded_rep1/bwa/mergedLibrary/bigwig/", torecompute=True, name='../../data/IRF2BP2_degraded_rep1/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
! gsutil -m cp -r ../../data/IRF2BP2_degraded_rep1 gs://amlproject/Chip/
# rm -r ../../data/IRF2BP2_degraded_rep1

## v2

In [None]:
# mkdir ../../data/IRF2BP2_degraded_rep2/ && mkdir ../../data/IRF2BP2_degraded_rep2/fastqs && gsutil -m cp "gs://transfer-amlproject/20191219_MP7659*" ../../data/IRF2BP2_degraded_rep2/fastqs/

### analysis

In [None]:
rename = {
"20191219_MP7659_1_S1_R1_001.fastq.gz":"mp795-MV411_IRF2BP2_DMSO_6h-FLAG_IRF2BP2-r2_R1.fastq.gz",
"20191219_MP7659_1_S1_R2_001.fastq.gz":"mp795-MV411_IRF2BP2_DMSO_6h-FLAG_IRF2BP2-r2_R2.fastq.gz",
"20191219_MP7659_2_S2_R1_001.fastq.gz":"mp796-MV411_IRF2BP2_DMSO_6h-MED1-r2_R1.fastq.gz",
"20191219_MP7659_2_S2_R2_001.fastq.gz":"mp796-MV411_IRF2BP2_DMSO_6h-MED1-r2_R2.fastq.gz",
"20191219_MP7659_3_S3_R1_001.fastq.gz":"mp797-MV411_IRF2BP2_DMSO_6h-POLII_total-r2_R1.fastq.gz",
"20191219_MP7659_3_S3_R2_001.fastq.gz":"mp797-MV411_IRF2BP2_DMSO_6h-POLII_total-r2_R2.fastq.gz",
"20191219_MP7659_4_S4_R1_001.fastq.gz":"mp798-MV411_IRF2BP2_DMSO_6h-POLII_S2-r2_R1.fastq.gz",
"20191219_MP7659_4_S4_R2_001.fastq.gz":"mp798-MV411_IRF2BP2_DMSO_6h-POLII_S2-r2_R2.fastq.gz",
"20191219_MP7659_5_S5_R1_001.fastq.gz":"mp799-MV411_IRF2BP2_DMSO_6h-POLII_S5-r2_R1.fastq.gz",
"20191219_MP7659_5_S5_R2_001.fastq.gz":"mp799-MV411_IRF2BP2_DMSO_6h-POLII_S5-r2_R2.fastq.gz",
"20191219_MP7659_6_S6_R1_001.fastq.gz":"mp800-MV411_IRF2BP2_DMSO_6h-MYC-r2_R1.fastq.gz",
"20191219_MP7659_6_S6_R2_001.fastq.gz":"mp800-MV411_IRF2BP2_DMSO_6h-MYC-r2_R2.fastq.gz",
"20191219_MP7659_7_S7_R1_001.fastq.gz":"mp801-MV411_IRF2BP2_DMSO_6h-MYB-r2_R1.fastq.gz",
"20191219_MP7659_7_S7_R2_001.fastq.gz":"mp801-MV411_IRF2BP2_DMSO_6h-MYB-r2_R2.fastq.gz",
"20191219_MP7659_8_S8_R1_001.fastq.gz":"mp802-MV411_IRF2BP2_DMSO_6h-SPI1-r2_R1.fastq.gz",
"20191219_MP7659_8_S8_R2_001.fastq.gz":"mp802-MV411_IRF2BP2_DMSO_6h-SPI1-r2_R2.fastq.gz",
"20191219_MP7659_9_S9_R1_001.fastq.gz":"mp803-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r2_R1.fastq.gz",
"20191219_MP7659_9_S9_R2_001.fastq.gz":"mp803-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r2_R2.fastq.gz",
"20191219_MP7659_10_S10_R1_001.fastq.gz":"mp804-MV411_IRF2BP_VHL_6h-MED1-r2_R1.fastq.gz",
"20191219_MP7659_10_S10_R2_001.fastq.gz":"mp804-MV411_IRF2BP_VHL_6h-MED1-r2_R2.fastq.gz",
"20191219_MP7659_11_S11_R1_001.fastq.gz":"mp805-MV411_IRF2BP_VHL_6h-POLII_total-r2_R1.fastq.gz",
"20191219_MP7659_11_S11_R2_001.fastq.gz":"mp805-MV411_IRF2BP_VHL_6h-POLII_total-r2_R2.fastq.gz",
"20191219_MP7659_12_S12_R1_001.fastq.gz":"mp806-MV411_IRF2BP_VHL_6h-POLII_S2-r2_R1.fastq.gz",
"20191219_MP7659_12_S12_R2_001.fastq.gz":"mp806-MV411_IRF2BP_VHL_6h-POLII_S2-r2_R2.fastq.gz",
"20191219_MP7659_13_S13_R1_001.fastq.gz":"mp807-MV411_IRF2BP_VHL_6h-POLII_S5-r2_R1.fastq.gz",
"20191219_MP7659_13_S13_R2_001.fastq.gz":"mp807-MV411_IRF2BP_VHL_6h-POLII_S5-r2_R2.fastq.gz",
"20191219_MP7659_14_S14_R1_001.fastq.gz":"mp808-MV411_IRF2BP_VHL_6h-MYC-r2_R1.fastq.gz",
"20191219_MP7659_14_S14_R2_001.fastq.gz":"mp808-MV411_IRF2BP_VHL_6h-MYC-r2_R2.fastq.gz",
"20191219_MP7659_15_S15_R1_001.fastq.gz":"mp809-MV411_IRF2BP_VHL_6h-MYB-r2_R1.fastq.gz",
"20191219_MP7659_15_S15_R2_001.fastq.gz":"mp809-MV411_IRF2BP_VHL_6h-MYB-r2_R2.fastq.gz",
"20191219_MP7659_16_S16_R1_001.fastq.gz":"mp810-MV411_IRF2BP_VHL_6h-SPI1-r2_R1.fastq.gz",
"20191219_MP7659_16_S16_R2_001.fastq.gz":"mp810-MV411_IRF2BP_VHL_6h-SPI1-r2_R2.fastq.gz"
}

In [None]:
for k,v in rename.items():
    ! mv ../../data/IRF2BP2_degraded_rep2/fastqs/$k ../../data/IRF2BP2_degraded_rep2/fastqs/$v


In [None]:
inputfastq="gs://amlproject/Chip/fastqs/mp99-MV411-INPUT-r1.fastq.gz"
! gsutil cp $inputfastq ../../data/IRF2BP2_degraded_rep2/fastqs
a = ! ls ../../data/IRF2BP2_degraded_rep2/fastqs

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
# Build the nf-core/chipseq design table for replicate 2: one row per sample
# plus a final row for the shared INPUT control.
# Column names follow the design built in the v3 section of this notebook
# (fastq_1 / fastq_2).
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# `a` is the fastq listing: its last entry is the INPUT fastq, everything
# before it is consumed two files at a time (the two reads of one sample).
# The helper module is imported as `h` at the top of the notebook.
for val in h.grouped(a[:-1],2):
    # Look the sample up in the sheet by the leading part of the file name.
    # NOTE(review): this keeps everything before the first '_' while the v3
    # section splits on '-' — confirm which form matches gsheet.id.
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('_')[0]]
    df['group'].append((row.id.values[0] + '_'+ row.name_replicate.values[0] +"_"+row.protein.values[0]))
    df['replicate'].append(1)
    df['fastq_1'].append(val[0])
    # The pipeline below is launched with --single_end, so read 2 is unused;
    # the column still needs one entry per row or pd.DataFrame raises on
    # unequal column lengths.
    df['fastq_2'].append('')
    df['antibody'].append(row.protein.values[0])
    df['control'].append("INPUT")
# Final row: the INPUT control itself.
# NOTE(review): the v3 design leaves antibody/control empty on the INPUT row;
# confirm whether "INPUT" is accepted here.
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append(a[-1])
df['fastq_2'].append('')
df['antibody'].append("INPUT")
df['control'].append("INPUT")
df = pd.DataFrame(df)

In [None]:
# Write the design sheet with the columns reordered by position to
# group, replicate, fastq (read 1), fastq (read 2), antibody, control.
# index=False keeps pandas from prepending an unnamed index column, matching
# how the v3 design is written.
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/IRF2BP2_degraded_rep2_design.csv', index=False)

In [None]:
#process chips
! sudo ./nextflow run nf-core/chipseq --single_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --narrow_peak --input ../nextflow/IRF2BP2_degraded_rep2_design.csv --genome GRCh38 --skip_preseq --max_cpus 24 -profile docker -w work

In [None]:
#!gsutil cp results/* ../../data/IRF2BP2_degraded_rep2/ && sudo rm -r work && sudo rm -r results

In [None]:
# get scaling values
mappedreads, umappedreads_norm, mapped = chip.getSpikeInControlScales(refgenome="../../ref/reference_droso.fna",FastQfolder='../../data/IRF2BP2_degraded_rep2/fastqs/',pairedEnd=True, cores=8)
mappedreads, umappedreads_norm, mapped

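Computing the scales from the Excel sheet: the first cell below holds the raw pair of counts for each antibody, and the second cell holds the corresponding scaling factors, obtained by dividing the smaller count of each pair by each count (so the smaller sample gets 1.0).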

In [None]:
scales = [[508878,637972],
[1929129,11595],
[669536,429562],
[1272730,810802],
[743859,893304],
[312888,1154119],
[1086031,880901],
[850181,1019640]]

In [None]:
scales = [[1.0, 0.7976494266],
[0.006010484524, 1.0],
[0.6415816327, 1.0],
[0.6370573492, 1.0],
[1.0, 0.8327053276],
[1.0, 0.2711054926],
[0.8111195721, 1.0],
[1.0, 0.8338050685]]

### on scaled data

In [None]:
# on scaled data
bams = ! ls ../../IRF2BP2_degraded_rep2/bwa/mergedLibrary/*.bam
for i in range(int(len(bams[1:])/2)-1):
    name1 = bams[1+val]
    name2 = bams[9+val]
    chip.fullDiffPeak(name1,name2,control1='../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep2/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/", scaling = scales[val][::-1])

In [None]:
bams = ! ls ../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/*.bam
bams = [i.split('/')[-1].split('.')[0] for i in bams]
bams

In [None]:
# diffPeak on scaled data
# One `size` value per DMSO/VHL pair, passed to chip.diffPeak.
# NOTE(review): presumably fragment sizes in the same order as `scales` — confirm.
sizes = [220, 191, 228, 285, 222, 204, 211, 194]
# Index 0 is skipped and the remaining names are paired i+1 with i+9.
# The original bound had a trailing -1 which left the last pair unprocessed,
# although `sizes` and `scales` both have one entry per pair.
# NOTE(review): assumes the INPUT sample sorts first in `bams` — confirm.
for i in range(int(len(bams[1:])/2)):
    name1 = bams[1+i]
    name2 = bams[9+i]
    # The original read `size[i]`, a name not defined before this cell;
    # the list defined just above is `sizes`.
    chip.diffPeak(name1,name2, directory= "../../data/IRF2BP2_degraded_rep2/diffData/", res_directory='../../data/IRF2BP2_degraded_rep2/diffPeaks/', scaling1=scales[i][1], scaling2=scales[i][0], size=sizes[i])

In [None]:
scales = [1.0,
0.006010484524,
0.6415816327,
0.6370573492,
1.0,
1.0,
0.8111195721,
1.0,
0.7976494266,
1.0,
1.0,
1.0,
0.8327053276,
0.2711054926,
1.0,
0.8338050685]

In [None]:
chip.bigWigFrom(bams[1:],genome='GRCh38',scaling=scales)

In [None]:
! mv ../../data/recalib_bigwig_rep2/* ../../data/IRF2BP2_degraded_rep2/recalib_bigwig/
bw = ! ls ../../data/IRF2BP2_degraded_rep2/recalib_bigwig/*.bw
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks/*common.bed
commonpeak

In [None]:
names = ["FLAG_IRF2BP2","MED1","POLII_total","POLII_S2","POLII_S5","MYC","MYB","SPI1"]

In [None]:
for i in range(int(len(bw)/2)):
    name1 = bw[i]
    name2 = bw[8+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    for val in peak:
        chip.dropWeirdChromosomes(val)
    name = names[i]
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=20, name='../../data/IRF2BP2_degraded_rep2/diffPeaks/'+name+'_mat.pdf', refpoint='center', withDeeptools=True, torecompute= True,)

In [None]:
peaks = ! ls ../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/macs/narrowPeak/*.narrowPeak
peaks

In [None]:
names = ["FLAG_IRF2BP2","MED1","POLII_total","POLII_S2","POLII_S5","MYC","MYB","SPI1"]
names.extend([i+'_VHL' for i in names])
names

In [None]:
bw

In [None]:
# Profile-only plots of each bigwig over a narrowPeak file: once as-is and
# once with cluster=3.
# NOTE(review): peaks[i] is paired with bw[1+i] here, whereas the rep1 and rep3
# versions of this loop pair peaks[i] with bw[i] — confirm that the first
# entry of `bw` is meant to be skipped.
for i in range(len(bw)-1):
    chip.getPeaksAt(peaks[i], bw[1+i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, onlyProfile=True, name='../../data/IRF2BP2_degraded_rep2/diffPeaks/'+names[i]+'_mat_profile.pdf', refpoint='center', withDeeptools=True)
    chip.getPeaksAt(peaks[i], bw[1+i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, refpoint='center', onlyProfile=True, name='../../data/IRF2BP2_degraded_rep2/diffPeaks/'+names[i]+'_mat_profile_clust3.pdf', withDeeptools=True, cluster=3)

### on unscaled data

In [None]:
bams = !ls ../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/mp*.bam
bams

In [None]:
!mkdir  ../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled
bw = ! ../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/bigwig/*.bigWig

In [None]:
# Full differential peak calling on unscaled data for replicate 2: the first
# half of `bams` is paired with the entry 8 positions later.
# The unreachable `if i <0: continue` guard of the original was dropped.
# NOTE(review): the control bam is read from ../../data/results3/, while the
# scaled rep2 cell uses
# ../../data/IRF2BP2_degraded_rep2/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam
# — confirm which INPUT file is intended.
for i in range(len(bams)//2):
    name1 = bams[i]
    name2 = bams[8+i]
    chip.fullDiffPeak(name1,name2, control1='../../data/results3/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep2/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/", pairedend=False)

In [None]:
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/*common.bed
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/*cond2.bed
cond2peak

In [None]:
names = ["FLAG_IRF2BP2","MED1","POLII_total","POLII_S2","POLII_S5","MYC","MYB","SPI1"]

In [None]:
for i in range(int(len(bw[1:])/2)):
    name1 = bw[1+i]
    name2 = bw[9+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    for val in peak:
        chip.dropWeirdChromosomes(val)
    name = names[i]
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, title=name, numthreads=7, torecompute=True, refpoint='center', folder="", name='../../data/IRF2BP2_degraded_rep2/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
! gsutil -m cp -r "../../data/IRF2BP2_degraded_rep2" gs://amlproject/Chip/

## v3

In [None]:
! gsutil mv gs://transfer-amlproject/*MP7781*  gs://transfer-amlproject/IRF2BP2_v3/

### analysis

In [None]:
! mkdir ../../data/IRF2BP2_degraded_rep3 && mkdir ../../data/IRF2BP2_degraded_rep3/fastqs && gsutil -m cp gs://transfer-amlproject/IRF2BP2_v3/* ../../data/IRF2BP2_degraded_rep3/fastqs/

In [None]:
rename = {
"20200203_1_MP7781_S67_R1_001.fastq.gz":"mp831-MV411_IRF2BP_DMSO_6h-CDK8-r1_R1.fastq.gz",
"20200203_1_MP7781_S67_R2_001.fastq.gz":"mp831-MV411_IRF2BP_DMSO_6h-CDK8-r1_R2.fastq.gz",
"20200203_2_MP7781_S68_R1_001.fastq.gz":"mp832-MV411_IRF2BP_DMSO_6h-BRD4-r1_R1.fastq.gz",
"20200203_2_MP7781_S68_R2_001.fastq.gz":"mp832-MV411_IRF2BP_DMSO_6h-BRD4-r1_R2.fastq.gz",
"20200203_3_MP7781_S69_R1_001.fastq.gz":"mp833-MV411_IRF2BP_DMSO_6h-IRF8-r1_R1.fastq.gz",
"20200203_3_MP7781_S69_R2_001.fastq.gz":"mp833-MV411_IRF2BP_DMSO_6h-IRF8-r1_R2.fastq.gz",
"20200203_4_MP7781_S70_R1_001.fastq.gz":"mp834-MV411_IRF2BP_DMSO_6h-SMC1-r1_R1.fastq.gz",
"20200203_4_MP7781_S70_R2_001.fastq.gz":"mp834-MV411_IRF2BP_DMSO_6h-SMC1-r1_R2.fastq.gz",
"20200203_5_MP7781_S71_R2_001.fastq.gz":"mp835-MV411_IRF2BP_DMSO_6h-MED1-r3_R2.fastq.gz",
"20200203_5_MP7781_S71_R1_001.fastq.gz":"mp835-MV411_IRF2BP_DMSO_6h-MED1-r3_R1.fastq.gz",
"20200203_6_MP7781_S72_R1_001.fastq.gz":"mp836-MV411_IRF2BP_DMSO_6h-ZEB2-r1_R1.fastq.gz",
"20200203_6_MP7781_S72_R2_001.fastq.gz":"mp836-MV411_IRF2BP_DMSO_6h-ZEB2-r1_R2.fastq.gz",
"20200203_7_MP7781_S73_R1_001.fastq.gz":"mp837-MV411_IRF2BP_DMSO_6h-CEBPA-r1_R1.fastq.gz",
"20200203_7_MP7781_S73_R2_001.fastq.gz":"mp837-MV411_IRF2BP_DMSO_6h-CEBPA-r1_R2.fastq.gz",
"20200203_8_MP7781_S74_R1_001.fastq.gz":"mp838-MV411_IRF2BP_VHL_6h-CDK8-r1_R1.fastq.gz",
"20200203_8_MP7781_S74_R2_001.fastq.gz":"mp838-MV411_IRF2BP_VHL_6h-CDK8-r1_R2.fastq.gz",
"20200203_9_MP7781_S75_R1_001.fastq.gz":"mp839-MV411_IRF2BP_VHL_6h-BRD4-r1_R1.fastq.gz",
"20200203_9_MP7781_S75_R2_001.fastq.gz":"mp839-MV411_IRF2BP_VHL_6h-BRD4-r1_R2.fastq.gz",
"20200203_10_MP7781_S76_R2_001.fastq.gz":"mp840-MV411_IRF2BP_VHL_6h-IRF8-r1_R2.fastq.gz",
"20200203_10_MP7781_S76_R1_001.fastq.gz":"mp840-MV411_IRF2BP_VHL_6h-IRF8-r1_R1.fastq.gz",
"20200203_11_MP7781_S77_R1_001.fastq.gz":"mp841-MV411_IRF2BP_VHL_6h-SMC1-r1_R1.fastq.gz",
"20200203_11_MP7781_S77_R2_001.fastq.gz":"mp841-MV411_IRF2BP_VHL_6h-SMC1-r1_R2.fastq.gz",
"20200203_12_MP7781_S78_R1_001.fastq.gz":"mp842-MV411_IRF2BP_VHL_6h-MED1-r3_R1.fastq.gz",
"20200203_12_MP7781_S78_R2_001.fastq.gz":"mp842-MV411_IRF2BP_VHL_6h-MED1-r3_R2.fastq.gz",
"20200203_13_MP7781_S79_R1_001.fastq.gz":"mp843-MV411_IRF2BP_VHL_6h-ZEB2-r1_R1.fastq.gz",
"20200203_13_MP7781_S79_R2_001.fastq.gz":"mp843-MV411_IRF2BP_VHL_6h-ZEB2-r1_R2.fastq.gz",
"20200203_14_MP7781_S80_R2_001.fastq.gz":"mp844-MV411_IRF2BP_VHL_6h-CEBPA-r1_R2.fastq.gz",
"20200203_14_MP7781_S80_R1_001.fastq.gz":"mp844-MV411_IRF2BP_VHL_6h-CEBPA-r1_R1.fastq.gz",
"20200203_Input_MP7781_S81_R1_001.fastq.gz":"mp845-MV411_IRF2BP2_-INPUT-r1_R1.fastq.gz",
"20200203_Input_MP7781_S81_R2_001.fastq.gz":"mp845-MV411_IRF2BP2_-INPUT-r1_R2.fastq.gz"
}

In [None]:
for k,v in rename.items():
    ! mv ../../data/IRF2BP2_degraded_rep3/fastqs/$k ../../data/IRF2BP2_degraded_rep3/fastqs/$v

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
a = ! ls ../../data/IRF2BP2_degraded_rep3/fastqs

In [None]:
gsheet

In [None]:
# Build the paired-end nf-core/chipseq design table for replicate 3: one row
# per sample plus a final row for the INPUT control.
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# `a` is the fastq listing: its last two entries are the INPUT read files,
# everything before them is consumed two files at a time (one sample each).
# The helper module is imported as `h` at the top of the notebook; the
# original called it through the undefined name `helper`.
for val in h.grouped(a[:-2],2):
    # Look the sample up in the sheet by the part of the file name before
    # the first '-'.
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    df['group'].append(row['name'].values[0])
    df['replicate'].append(1)
    df['fastq_1'].append(val[0])
    df['fastq_2'].append(val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
# Final row: the INPUT control itself, with antibody and control left empty.
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append(a[-2])
df['fastq_2'].append(a[-1])
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [None]:
df

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/IRF2BP2_degraded_rep3_design.csv',index=False)

In [None]:
#process chips
! sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email \
'jkobject@gmail.com' --narrow_peak --input ../nextflow/../../data/IRF2BP2_degraded_rep3_design.csv --genome GRCh38 --skip_preseq \
--max_cpus 24 -profile docker -w work

In [None]:
ls

In [None]:
!cp -r results/* ../../data/IRF2BP2_degraded_rep3/ && sudo rm -r work && sudo rm -r results

In [None]:
ls ../TrimGalore-0.6.5/trim_galore

In [None]:
# get scaling values
mappedreads, umappedreads_norm = chip.getSpikeInControlScales(refgenome="../../ref/reference_droso.fna",
                                                                      fastQfolder='../../data/IRF2BP2_degraded_rep3/fastqs',
                                                                      pairedEnd=True, cores=8,
                                                                      tofilter=False,
                                                                      totrim=False,
                                                                      tomap=False,
                                                                      pathtotrim_galore="../TrimGalore-0.6.5/trim_galore")
mappedreads, umappedreads_norm

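Computing the scales from the Excel sheet: the first cell below holds the raw pair of counts for each antibody, and the second cell holds the corresponding scaling factors, obtained by dividing the smaller count of each pair by each count (so the smaller sample gets 1.0).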

In [None]:
scales = [[536923,632558],
[601370,681405],
[2402198,1676203],
[417892,216192],
[1544590,1350802],
[1174994,1376726],
[289635,240366]]

In [None]:
scales= [[1.0, 0.8488122828],
[1.0, 0.8825441551],
[0.6977788675, 1.0],
[0.5173394083, 1.0],
[0.8745375796, 1.0],
[1.0, 0.8534697536],
[0.8298927961, 1.0]]

In [None]:
bams = ! ls ../../data/IRF2BP2_degraded_rep3/bwa/mergedLibrary/*.bam #../../data/results3/bwa/mergedLibrary/*.bam
bams

In [None]:
bams[7]

In [None]:
bams = [bam1.split('/')[-1].split('.')[0] for bam1 in bams]

### on scaled data

In [None]:
# diffPeak on scaled data
# One `size` value per DMSO/VHL pair, passed to chip.diffPeak.
# NOTE(review): presumably fragment sizes in the same order as `scales` — confirm.
size=[206, 218, 189, 194, 217, 217, 176]
# The first half of `bams` is paired with the entry 7 positions later.
# The unreachable `if i<0: continue` guard of the original was dropped.
# NOTE(review): `bams` comes from a *.bam listing that can also match the
# INPUT bam; if that file sorts first, bams[i]/bams[7+i] are shifted by one
# relative to `scales` and `size` — confirm the listing before running.
for i in range(len(bams)//2):
    name1 = bams[i]
    name2 = bams[7+i]
    print(name1,name2)
    chip.diffPeak(name1, name2, directory= "../../data/IRF2BP2_degraded_rep3/diffData/", res_directory='../../data/IRF2BP2_degraded_rep3/diffPeaks/', scaling1=scales[i][1], scaling2=scales[i][0], size=size[i])

In [None]:
os.popen('for i in $(ls ../../data/IRF2BP2_degraded_rep3/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
# diffPeak on scaled data
for i in range(int(len(bams)/2)):
    if i <0:
        continue
    name1 = bams[i]
    name2 = bams[7+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/IRF2BP2_degraded_rep3/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep3/diffData/", res_directory = "../../data/IRF2BP2_degraded_rep3/diffPeaks/", isTF=True, compute_size=True, pairedend=False)

In [None]:
scales = [1.0,
1.0,
0.6977788675,
0.5173394083,
0.8745375796,
1.0,
0.8298927961,
0.8488122828,
0.8825441551,
1.0,
1.0,
1.0,
0.8534697536,
1.0]

In [None]:
bams

In [None]:
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=scales,
               numthreads=8)

In [None]:
! mv diffPeaks ../../data/IRF2BP2_degraded_rep3
! mv diffData ../../data/IRF2BP2_degraded_rep3

In [None]:
!mv bigwig ../../data/recalib_bigwig_3 

In [None]:
os.popen('for i in $(ls ../../data/IRF2BP2_degraded_rep3/diffPeaks); \
            do echo $(wc -l "../../data/IRF2BP2_degraded_rep3/diffPeaks/"$i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/recalib_bigwig_3/*.bw
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks/*common.bed
cond1peak

In [None]:
names = ["CDK8","BRD4","IRF8","SMC1","MED1","ZEB2","CEBPA"]

In [None]:
for i in range(int(len(bw)/2)):
    if i<0:
        continue
    name1 = bw[i]
    name2 = bw[7+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    #for val in peak:
        #chip.dropWeirdChromosomes(val)
    name = names[i]
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], refpoint='center', peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=14, name='../../data/IRF2BP2_degraded_rep3/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=True)

In [None]:
names = ["CDK8","BRD4","IRF8","SMC1","MED1","ZEB2","CEBPA"]
names.extend([i+'_VHL' for i in names])
names

In [None]:
peaks = ! ls ../../data/IRF2BP2_degraded_rep3/bwa/mergedLibrary/macs/narrowPeak/*.narrowPeak
peaks

In [None]:
for i in range(len(bw)):
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, onlyProfile=True, refpoint='center', name='../../data/IRF2BP2_degraded_rep3/diffPeaks/'+names[i]+'_mat_profile.pdf', withDeeptools=True)
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, refpoint='center', onlyProfile=True,name='../../data/IRF2BP2_degraded_rep3/diffPeaks/'+names[i]+'_mat_profile_clust3.pdf', withDeeptools=True, cluster=3)

### on unscaled data

In [None]:
bams = !ls ../../data/IRF2BP2_degraded_rep3/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled

In [None]:
# Differential peak calling on unscaled data: each DMSO BAM against its VHL counterpart.
# Assumes `bams` lists the DMSO samples first, followed by the VHL samples in
# the same order — TODO confirm against the `bams` listing above.
n_pairs = len(bams) // 2
for i in range(n_pairs):
    name1 = bams[i]
    name2 = bams[n_pairs + i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/results3/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep3/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/IRF2BP2_degraded_rep3/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled/*common.bed
cond1peak

In [None]:
names = ["CDK8","BRD4","IRF8","SMC1","MED1","ZEB2","CEBPA"]

In [None]:
# Heatmaps of the unscaled DMSO vs VHL tracks over the unscaled differential peaks.
# bw[0] is skipped (presumably the INPUT track — verify against the `bw` listing);
# the remaining tracks are assumed to be DMSO first, then VHL in the same order.
n_pairs = (len(bw) - 1) // 2
for i in range(n_pairs):
    name = names[i]
    name1 = bw[1 + i]
    name2 = bw[1 + n_pairs + i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=14, torecompute=True, refpoint="center", name='../../data/IRF2BP2_degraded_rep3/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
! gsutil -m cp -r "../../data/IRF2BP2_degraded_rep3/" gs://amlproject/Chip/

## histones V1&2

In [None]:
bamfolder="../../data/IRF2BP2_degraded_v3/fastqs/"

In [None]:
! mkdir ../../data/IRF2BP2_degraded_v3 && mkdir ../../data/IRF2BP2_degraded_v3/fastqs && gsutil cp "gs://transfer-amlproject/*.fastq.gz" $bamfolder

In [None]:
! gsutil -m cp gs://transfer-amlproject/*MP7693* gs://transfer-amlproject/IRF2BP2_degraded_v3/ && gsutil -m rm gs://transfer-amlproject/*MP7693*


### analysis

In [None]:
rename = {
    "20200103_1_MP7693_S6_R1_001.fastq.gz":"mp811-MV411_IRF2BP2_DMSO_6h-H3K27ac-r1_R1.fastq.gz",
    "20200103_1_MP7693_S6_R2_001.fastq.gz":"mp811-MV411_IRF2BP2_DMSO_6h-H3K27ac-r1.fastq.gz",
    "20200103_2_MP7693_S7_R1_001.fastq.gz":"mp812-MV411_IRF2BP2_DMSO_6h-H3K27ac-r2_R1.fastq.gz",
    "20200103_2_MP7693_S7_R2_001.fastq.gz":"mp812-MV411_IRF2BP2_DMSO_6h-H3K27ac-r2.fastq.gz",
    "20200103_3_MP7693_S8_R1_001.fastq.gz":"mp813-MV411_IRF2BP2_DMSO_6h-H3K27me3-r1_R1.fastq.gz",
    "20200103_3_MP7693_S8_R2_001.fastq.gz":"mp813-MV411_IRF2BP2_DMSO_6h-H3K27me3-r1.fastq.gz",
    "20200103_4_MP7693_S9_R1_001.fastq.gz":"mp814-MV411_IRF2BP2_DMSO_6h-H3K27me3-r2_R1.fastq.gz",
    "20200103_4_MP7693_S9_R2_001.fastq.gz":"mp814-MV411_IRF2BP2_DMSO_6h-H3K27me3-r2.fastq.gz",
    "20200103_5_MP7693_S10_R1_001.fastq.gz":"mp815-MV411_IRF2BP2_DMSO_6h-H3K4me1-r1_R1.fastq.gz",
    "20200103_5_MP7693_S10_R2_001.fastq.gz":"mp815-MV411_IRF2BP2_DMSO_6h-H3K4me1-r1.fastq.gz",
    "20200103_6_MP7693_S11_R1_001.fastq.gz":"mp816-MV411_IRF2BP2_DMSO_6h-H3K4me1-r2_R1.fastq.gz",
    "20200103_6_MP7693_S11_R2_001.fastq.gz":"mp816-MV411_IRF2BP2_DMSO_6h-H3K4me1-r2.fastq.gz",
    "20200103_7_MP7693_S12_R1_001.fastq.gz":"mp817-MV411_IRF2BP2_DMSO_6h-H3K4me3-r1_R1.fastq.gz",
    "20200103_7_MP7693_S12_R2_001.fastq.gz":"mp817-MV411_IRF2BP2_DMSO_6h-H3K4me3-r1.fastq.gz",
    "20200103_8_MP7693_S13_R1_001.fastq.gz":"mp818-MV411_IRF2BP2_DMSO_6h-H3K4me3-r2_R1.fastq.gz",
    "20200103_8_MP7693_S13_R2_001.fastq.gz":"mp818-MV411_IRF2BP2_DMSO_6h-H3K4me3-r2.fastq.gz",
    "20200103_9_MP7693_S14_R1_001.fastq.gz":"mp819-MV411_IRF2BP2_DMSO_6h-H3K79me2-r1_R1.fastq.gz",
    "20200103_9_MP7693_S14_R2_001.fastq.gz":"mp819-MV411_IRF2BP2_DMSO_6h-H3K79me2-r1.fastq.gz",
    "20200103_10_MP7693_S15_R1_001.fastq.gz":"mp820-MV411_IRF2BP2_DMSO_6h-H3K79me2-r2_R1.fastq.gz",
    "20200103_10_MP7693_S15_R2_001.fastq.gz":"mp820-MV411_IRF2BP2_DMSO_6h-H3K79me2-r2.fastq.gz",
    "20200103_11_MP7693_S16_R1_001.fastq.gz":"mp821-MV411_IRF2BP2_VHL_6h-H3K27ac-r1_R1.fastq.gz",
    "20200103_11_MP7693_S16_R2_001.fastq.gz":"mp821-MV411_IRF2BP2_VHL_6h-H3K27ac-r1.fastq.gz",
    "20200103_12_MP7693_S17_R1_001.fastq.gz":"mp822-MV411_IRF2BP2_VHL_6h-H3K27ac-r2_R1.fastq.gz",
    "20200103_12_MP7693_S17_R2_001.fastq.gz":"mp822-MV411_IRF2BP2_VHL_6h-H3K27ac-r2.fastq.gz",
    "20200103_13_MP7693_S18_R1_001.fastq.gz":"mp823-MV411_IRF2BP2_VHL_6h-H3K27me3-r1_R1.fastq.gz",
    "20200103_13_MP7693_S18_R2_001.fastq.gz":"mp823-MV411_IRF2BP2_VHL_6h-H3K27me3-r1.fastq.gz",
    "20200103_14_MP7693_S19_R1_001.fastq.gz":"mp824-MV411_IRF2BP2_VHL_6h-H3K27me3-r2_R1.fastq.gz",
    "20200103_14_MP7693_S19_R2_001.fastq.gz":"mp824-MV411_IRF2BP2_VHL_6h-H3K27me3-r2.fastq.gz",
    "20200103_15_MP7693_S20_R1_001.fastq.gz":"mp825-MV411_IRF2BP2_VHL_6h-H3K4me1-r1_R1.fastq.gz",
    "20200103_15_MP7693_S20_R2_001.fastq.gz":"mp825-MV411_IRF2BP2_VHL_6h-H3K4me1-r1.fastq.gz",
    "20200103_16_MP7693_S21_R1_001.fastq.gz":"mp826-MV411_IRF2BP2_VHL_6h-H3K4me1-r2_R1.fastq.gz",
    "20200103_16_MP7693_S21_R2_001.fastq.gz":"mp826-MV411_IRF2BP2_VHL_6h-H3K4me1-r2.fastq.gz",
    "20200103_17_MP7693_S22_R1_001.fastq.gz":"mp827-MV411_IRF2BP2_VHL_6h-H3K4me3-r1_R1.fastq.gz",
    "20200103_17_MP7693_S22_R2_001.fastq.gz":"mp827-MV411_IRF2BP2_VHL_6h-H3K4me3-r1.fastq.gz",
    "20200103_18_MP7693_S23_R1_001.fastq.gz":"mp828-MV411_IRF2BP2_VHL_6h-H3K4me3-r2_R1.fastq.gz",
    "20200103_18_MP7693_S23_R2_001.fastq.gz":"mp828-MV411_IRF2BP2_VHL_6h-H3K4me3-r2.fastq.gz",
    "20200103_19_MP7693_S24_R1_001.fastq.gz":"mp829-MV411_IRF2BP2_VHL_6h-H3K79me2-r1_R1.fastq.gz",
    "20200103_19_MP7693_S24_R2_001.fastq.gz":"mp829-MV411_IRF2BP2_VHL_6h-H3K79me2-r1.fastq.gz",
    "20200103_20_MP7693_S25_R1_001.fastq.gz":"mp830-MV411_IRF2BP2_VHL_6h-H3K79me2-r2_R1.fastq.gz",
    "20200103_20_MP7693_S25_R2_001.fastq.gz":"mp830-MV411_IRF2BP2_VHL_6h-H3K79me2-r2.fastq.gz"
}

In [None]:
! gsutil -m cp gs://transfer-amlproject/IRF2BP2_degraded_v3/* ../../data/IRF2BP2_degraded_v3/fastqs/

In [None]:
for k,v in rename.items():
    ! mv $bamfolder$k $bamfolder$v

In [None]:
inputfastq="gs://amlproject/Chip/fastqs/mp99-MV411-INPUT-r1.fastq.gz"
! gsutil cp $inputfastq $bamfolder

In [None]:
a = ! ls $bamfolder

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
# Design table for nf-core/chipseq. The pipeline is run single-end on these
# samples, so `fastq_2` is left empty on every row.
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# `a` holds the fastq names with the INPUT file last; the other entries are
# consumed two at a time and only the first file of each couple is used.
for val in h.grouped(a[:-1],2):
    # sample id = filename prefix before the first '-' (e.g. "mp811")
    sample_id = val[0].split('/')[-1].split('-')[0]
    row = gsheet[gsheet.id==sample_id]
    if row.empty:
        raise ValueError("no entry in the sample sheet for id '"+sample_id+"'")
    df['group'].append((row.id.values[0] + '_'+ row.name_replicate.values[0] +"_"+row.protein.values[0]))
    df['replicate'].append(1)
    df['fastq_1'].append(val[0])
    df['fastq_2'].append("")
    df['antibody'].append(row.protein.values[0])
    df['control'].append("INPUT")
# the INPUT sample itself: no antibody and no control
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append(a[-1])
df['fastq_2'].append("")
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [None]:
# reorder to group, replicate, fastq 1, fastq 2, antibody, control;
# index=False keeps the pandas row index out of the design file
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/IRF2BP2_degraded_histones_design.csv', index=False)

In [None]:
#process chips
 ! sudo ./nextflow run nf-core/chipseq --single_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --narrow_peak --input ../nextflow/IRF2BP2_degraded_histones_design.csv --genome GRCh38 --skip_preseq --max_cpus 24 -profile docker -w work

In [None]:
! mv results/* ../../data/IRF2BP2_degraded_v3/

In [None]:
bams = ! ls ../../data/IRF2BP2_degraded_v3/bwa/mergedLibrary/*.bam
bams = [i.split('/')[-1].split('.')[0] for i in bams]
bams

In [None]:
scales = [[191079,278272],
[274625,494562],
[1094016,2067804],
[1328914,969565],
[237779,132422],
[162971,174092],
[115788,187078],
[134269,495924],
[171890,225315],
[188370,199911]]

In [None]:
scales = [[1.0,0.6866626897],
[1.0,0.5552893267],
[1.0,0.5290714207],
[0.7295919826,1.0],
[0.5569120906,1.0],
[1.0,0.9361199825],
[1.0,0.6189290029],
[1.0,0.2707451142],
[1.0,0.762887513],
[1.0,0.9422693098]]

### on scaled data

In [None]:
# diffPeak on scaled data
size=[206, 213, 47, 272, 229, 235, 190, 196, 287, 288]
for i in range(int(len(bams)/2)):
    name1 = bams[1+i]
    name2 = bams[11+i]
    print(name1,name2)
    chip.diffPeak(name1, name2, directory= "../../data/IRF2BP2_degraded_v3/diffData/", res_directory='../../data/IRF2BP2_degraded_v3/diffPeaks/', scaling1=scales[i][1], scaling2=scales[i][0], size=size[i])

In [None]:
# diffPeak on scaled data
for i in range(int(len(bams)/2)):
    name1 = bams[1+i]
    name2 = bams[11+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, bams[0], scaling = scales[i], directory='../../data/IRF2BP2_degraded_v3/diffData/',
res_directory="../../data/IRF2BP2_degraded_v3/diffPeaks/", isTF=False, compute_size=True, pairedend=False)

In [None]:
scales = [1.0,
1.0,
1.0,
0.7295919826,
0.5569120906,
1.0,
1.0,
1.0,
1.0,
1.0,
0.6866626897,
0.5552893267,
0.5290714207,
1.0,
1.0,
0.9361199825,
0.6189290029,
0.2707451142,
0.762887513,
0.9422693098,]

In [None]:
chip.bigWigFrom(bams[1:],genome='GRCh38',scaling=scales)

In [None]:
! mv ../../recalib_bigwig_hist/* ../../data/IRF2BP2_degraded_v3/recalib_bigwig/
bw = ! ls ../../data/IRF2BP2_degraded_v3/recalib_bigwig/*.bw
bw

In [None]:
! rm ../../data/IRF2BP2_degraded_v3/diffData/mp*_R1*

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks/*common.bed
cond1peak

In [None]:
names = ["H3K27ac", "H3K27ac_v2","H3K27me3","H3K27me3_v2","H3K4me1","H3K4me1_v2", "H3K4me3", "H3K4me3_v2", "H3K79me2", "H3K79me2_v2"]

In [None]:
# Heatmaps of the rescaled DMSO vs VHL histone tracks over each mark's differential peaks.
# Assumes `bw` lists the DMSO tracks first, followed by the VHL tracks in the
# same order as `names` — TODO confirm against the `bw` listing above.
n_pairs = len(bw) // 2
for i in range(n_pairs):
    print(i)
    name = names[i]
    name1 = bw[i]
    name2 = bw[n_pairs + i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], torecompute= True, bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, refpoint='center', name='../../data/IRF2BP2_degraded_v3/diffPeaks/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
names = ["H3K27ac", "H3K27ac_v2","H3K27me3","H3K27me3_v2","H3K4me1","H3K4me1_v2", "H3K4me3", "H3K4me3_v2", "H3K79me2", "H3K79me2_v2"]
names.extend([i+'_VHL' for i in names])
names

In [None]:
peaks = ! ls ../../data/IRF2BP2_degraded_v3/bwa/mergedLibrary/macs/broadPeak/*.broadPeak 
peaks

In [None]:
for i in range(len(bw)):
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, onlyProfile=True, refpoint='center', name='../../data/IRF2BP2_degraded_v3/diffPeaks/'+names[i]+'_mat_profile.pdf', withDeeptools=True)
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, refpoint='center', onlyProfile=True,name='../../data/IRF2BP2_degraded_v3/diffPeaks/'+names[i]+'_mat_profile_clust3.pdf', withDeeptools=True, cluster=3)

### on unscaled data

In [None]:
bams = !ls ../../data/IRF2BP2_degraded_v3/bwa/mergedLibrary/mp*.bam
bams

In [None]:
bw = ! ls ../../data/IRF2BP2_degraded_v3/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
# Differential peak calling on unscaled data.
# `bams` comes from the `mp*.bam` glob, so it holds only the ChIP samples (the
# INPUT bam is passed separately as control1) and pairing starts at index 0.
# Assumes the DMSO samples come first, followed by the VHL samples in the same order.
n_pairs = len(bams) // 2
for i in range(n_pairs):
    name1 = bams[i]
    name2 = bams[n_pairs + i]
    chip.fullDiffPeak(name1,name2, control1='../../data/IRF2BP2_degraded_v3/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_v3/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_v3/diffPeaks_unscaled/",isTF=False, compute_size=True, pairedend=False)

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_v3/diffPeaks_unscaled/*common.bed
cond2peak

In [None]:
names = ["H3K27ac", "H3K27ac_v2","H3K27me3","H3K27me3_v2","H3K4me1","H3K4me1_v2", "H3K4me3", "H3K4me3_v2", "H3K79me2", "H3K79me2_v2"]

In [None]:
# Heatmaps of the unscaled DMSO vs VHL histone tracks over the unscaled differential peaks.
# bw[0] is skipped (presumably the INPUT track — verify against the `bw` listing);
# the remaining tracks are assumed to be DMSO first, then VHL in the same order.
n_pairs = (len(bw) - 1) // 2
for i in range(n_pairs):
    name = names[i]
    name1 = bw[1 + i]
    name2 = bw[1 + n_pairs + i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=10, refpoint='center', name='../../data/IRF2BP2_degraded_v3/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True, torecompute=True)

In [None]:
! gsutil -m cp -r "../../data/IRF2BP2_degraded_v3/" gs://amlproject/Chip/

## v4

In [None]:
! gsutil -m mv gs://transfer-amlproject/*MP7868*  gs://transfer-amlproject/IRF2BP2_v4/

### analysis

In [None]:
! mkdir ../../data/IRF2BP2_degraded_rep4 && mkdir ../../data/IRF2BP2_degraded_rep4/fastqs && gsutil -m cp gs://transfer-amlproject/IRF2BP2_v4/* ../../data/IRF2BP2_degraded_rep4/fastqs

In [None]:
a = ! ls ../../data/IRF2BP2_degraded_rep4/fastqs
a

In [None]:
rename = {
"20200302_1_MP7868_S51_R1_001.fastq.gz":"mp846-MV411_IRF2BP_DMSO_6h-MED1-r4_R1.fastq.gz", 
"20200302_1_MP7868_S51_R2_001.fastq.gz":"mp846-MV411_IRF2BP_DMSO_6h-MED1-r4_R2.fastq.gz", 
"20200302_2_MP7868_S52_R1_001.fastq.gz":"mp847-MV411_IRF2BP_DMSO_6h-MED1-r5_R1.fastq.gz", 
"20200302_2_MP7868_S52_R2_001.fastq.gz":"mp847-MV411_IRF2BP_DMSO_6h-MED1-r5_R2.fastq.gz", 
"20200302_3_MP7868_S53_R1_001.fastq.gz":"mp848-MV411_IRF2BP_DMSO_6h-FLAG_IRF2BP2-r3_R1.fastq.gz", 
"20200302_3_MP7868_S53_R2_001.fastq.gz":"mp848-MV411_IRF2BP_DMSO_6h-FLAG_IRF2BP2-r3_R2.fastq.gz",
"20200302_4_MP7868_S54_R1_001.fastq.gz":"mp849-MV411_IRF2BP_DMSO_6h-POLII_total-r3_R1.fastq.gz",
"20200302_4_MP7868_S54_R2_001.fastq.gz":"mp849-MV411_IRF2BP_DMSO_6h-POLII_total-r3_R2.fastq.gz",
"20200302_5_MP7868_S55_R1_001.fastq.gz":"mp850-MV411_IRF2BP_DMSO_6h-POLII_S2-r3_R1.fastq.gz",
"20200302_5_MP7868_S55_R2_001.fastq.gz":"mp850-MV411_IRF2BP_DMSO_6h-POLII_S2-r3_R2.fastq.gz",
"20200302_6_MP7868_S56_R1_001.fastq.gz":"mp851-MV411_IRF2BP_DMSO_6h-POLII_S5-r3_R1.fastq.gz",
"20200302_6_MP7868_S56_R2_001.fastq.gz":"mp851-MV411_IRF2BP_DMSO_6h-POLII_S5-r3_R2.fastq.gz",
"20200302_7_MP7868_S57_R1_001.fastq.gz":"mp852-MV411_IRF2BP_VHL_6h-MED1-r4_R1.fastq.gz",
"20200302_7_MP7868_S57_R2_001.fastq.gz":"mp852-MV411_IRF2BP_VHL_6h-MED1-r4_R2.fastq.gz",
"20200302_8_MP7868_S58_R1_001.fastq.gz":"mp853-MV411_IRF2BP_VHL_6h-MED1-r5_R1.fastq.gz",
"20200302_8_MP7868_S58_R2_001.fastq.gz":"mp853-MV411_IRF2BP_VHL_6h-MED1-r5_R2.fastq.gz",
"20200302_9_MP7868_S59_R1_001.fastq.gz":"mp854-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r3_R1.fastq.gz",
"20200302_9_MP7868_S59_R2_001.fastq.gz":"mp854-MV411_IRF2BP_VHL_6h-FLAG_IRF2BP2-r3_R2.fastq.gz", 
"20200302_10_MP7868_S60_R1_001.fastq.gz":"mp855-MV411_IRF2BP_VHL_6h-POLII_total-r3_R1.fastq.gz",
"20200302_10_MP7868_S60_R2_001.fastq.gz":"mp855-MV411_IRF2BP_VHL_6h-POLII_total-r3_R2.fastq.gz",
"20200302_11_MP7868_S61_R1_001.fastq.gz":"mp856-MV411_IRF2BP_VHL_6h-POLII_S2-r3_R1.fastq.gz",
"20200302_11_MP7868_S61_R2_001.fastq.gz":"mp856-MV411_IRF2BP_VHL_6h-POLII_S2-r3_R2.fastq.gz",
"20200302_12_MP7868_S62_R1_001.fastq.gz":"mp857-MV411_IRF2BP_VHL_6h-POLII_S5-r3_R1.fastq.gz",
"20200302_12_MP7868_S62_R2_001.fastq.gz":"mp857-MV411_IRF2BP_VHL_6h-POLII_S5-r3_R2.fastq.gz"
}

In [None]:
for k,v in rename.items():
    ! mv ../../data/IRF2BP2_degraded_rep4/fastqs/$k ../../data/IRF2BP2_degraded_rep4/fastqs/$v

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
gsheet

In [None]:
# Paired-end design table for nf-core/chipseq: one record per sample
# (consecutive entries of `a` are taken as the two read files of a sample),
# followed by the INPUT record, which has neither antibody nor control.
design_columns = ["fastq_1", "fastq_2", "antibody", "group", "replicate", "control"]
records = []
for val in h.grouped(a,2):
    # the sample id is the filename prefix before the first '-'
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    records.append({
        "group": row['name'].values[0],
        "replicate": 1,
        "fastq_1": val[0],
        "fastq_2": val[1],
        "antibody": row['protein'].values[0],
        "control": "INPUT",
    })
records.append({
    "group": 'INPUT',
    "replicate": 1,
    "fastq_1": 'IRF2BP2_degraded_rep3/mp845-MV411_IRF2BP2_-INPUT-r1_R1.fastq.gz',
    "fastq_2": 'IRF2BP2_degraded_rep3/mp845-MV411_IRF2BP2_-INPUT-r1_R2.fastq.gz',
    "antibody": "",
    "control": "",
})
# the column order matters: the export cell selects columns by position
df = pd.DataFrame(records, columns=design_columns)

In [None]:
df

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/IRF2BP2_degraded_rep4_design.csv',index=False)

In [None]:
#process chips
! sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email \
'jkobject@gmail.com' --narrow_peak --input ../nextflow/IRF2BP2_degraded_rep4_design.csv --genome GRCh38 --skip_preseq \
--max_cpus 24 -profile docker -w work

In [None]:
!cp -r results/* ../../data/IRF2BP2_degraded_rep4/ && sudo rm -r work && sudo rm -r results

In [None]:
ls ../../TrimGalore-0.6.5/trim_galore

In [None]:
# get scaling values
mappedreads, umappedreads_norm = chip.getSpikeInControlScales(refgenome="../../data/ref/reference_droso.fna",
fastQfolder='../../data/IRF2BP2_degraded_rep4/fastqs/',
pairedEnd=True, cores=12,
tofilter=True,
totrim=False,
tomap=True,
results="../../data/IRF2BP2_degraded_rep4/",
pathtotrim_galore="../../TrimGalore-0.6.5/trim_galore")
mappedreads, umappedreads_norm

Computing the scales from the Excel sheet.

In [None]:
bams = ! ls ../../data/IRF2BP2_degraded_rep4/bwa/mergedLibrary/*.bam
bams

In [None]:
bams[6]

### on scaled data

In [None]:
bams = [bam1.split('/')[-1].split('.')[0] for bam1 in bams]

In [None]:
# diffPeak on scaled data
size=[ 208, 214, 207, 234, 296, 231]
# Per-pair scaling factors as [DMSO, VHL]. At this point of the notebook `scales`
# is a flat list of floats, which cannot be indexed as scales[i][1], so the pairs
# are spelled out here from this experiment's flat scale list (entry i and entry i+6).
# NOTE(review): assumes the flat list follows the order of bams[1:] — TODO confirm.
pair_scales = [[1.0, 0.2628507876],
[1.0, 0.9021192519],
[1.0, 0.1869653476],
[1.0, 0.5500321887],
[1.0, 0.6046056203],
[0.626304048, 1.0]]
# bams[0] is skipped (the INPUT control); the rest is DMSO samples then VHL samples
n_pairs = (len(bams) - 1) // 2
for i in range(n_pairs):
    name1 = bams[1+i]
    name2 = bams[1+n_pairs+i]
    print(name1,name2)
    chip.diffPeak(name1, name2, directory= "../../data/IRF2BP2_degraded_rep4/diffData/", res_directory='../../data/IRF2BP2_degraded_rep4/diffPeaks/', scaling1=pair_scales[i][1], scaling2=pair_scales[i][0], size=size[i])

In [None]:
# diffPeak on scaled data
for i in range(int(len(bams[1:])/2)):
    name1 = bams[1+i]
    name2 = bams[7+i]
    chip.fullDiffPeak(name1,name2, control1='../../data/IRF2BP2_degraded_rep3/bwa/mergedLibrary/mp845-MV411_IRF2BP2-INPUT-r1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep4/diffData/", res_directory = "../../data/IRF2BP2_degraded_rep4/diffPeaks/",pairedend=False)

In [None]:
scales = [1.0,
1.0,
1.0,
1.0,
1.0,
0.626304048,
0.2628507876,
0.9021192519,
0.1869653476,
0.5500321887,
0.6046056203,
1.0]

In [None]:
bams

In [None]:
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=scales,
               numthreads=8)

In [None]:
!mv bigwig ../../data/IRF2BP2_degraded_rep4/recalib_bigwig/

In [None]:
os.popen('for i in $(ls ../../data/IRF2BP2_degraded_rep4/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/IRF2BP2_degraded_rep4/recalib_bigwig/*.bw
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks/*common.bed
cond1peak

In [None]:
names = ["MED1","MED1_v2","FLAG_IRF2BP2","POLII_total","POLII_S2","POLII_S5"]

In [None]:
# Heatmaps of the rescaled DMSO vs VHL tracks over each factor's differential peaks.
# Assumes `bw` lists the DMSO tracks first, followed by the VHL tracks in the
# same order as `names` — TODO confirm against the `bw` listing above.
n_pairs = len(bw) // 2
for i in range(n_pairs):
    name = names[i]
    name1 = bw[i]
    name2 = bw[n_pairs + i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, refpoint="center", name='../../data/IRF2BP2_degraded_rep4/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=True)

In [None]:
peaks = ! ls ../../data/results4/bwa/mergedLibrary/macs/narrowPeak/*.narrowPeak
peaks

In [None]:
names = ["MED1","MED1_v2","FLAG_IRF2BP2","POLII_total","POLII_S2","POLII_S5"]
names.extend([i+'_VHL' for i in names])
names

In [None]:
for i in range(len(bw)):
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, onlyProfile=True, name='../../data/IRF2BP2_degraded_rep4/diffPeaks/'+names[i]+'_mat_profile.pdf', withDeeptools=True, refpoint="center")
    chip.getPeaksAt(peaks[i], bw[i], window=3000, folder="", title=names[i], numthreads=7, torecompute=True, onlyProfile=True,name='../../data/IRF2BP2_degraded_rep4/diffPeaks/'+names[i]+'_mat_profile_clust3.pdf', withDeeptools=True, cluster=3, refpoint="center")

### on unscaled data

In [None]:
bams = !ls ../../data/results4/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled

In [None]:
# Differential peak calling on unscaled data, for every DMSO/VHL pair.
# (A previous `if i < 5: continue` guard restricted the run to the last pair only.)
# Assumes `bams` lists the DMSO samples first, followed by the VHL samples in
# the same order — TODO confirm against the `bams` listing above.
n_pairs = len(bams) // 2
for i in range(n_pairs):
    name1 = bams[i]
    name2 = bams[n_pairs + i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/results4/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/IRF2BP2_degraded_rep4/diffData_unscaled/", res_directory = "../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/results4/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
cond1peak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
names = ["MED1","MED1_v2","FLAG_IRF2BP2","POLII_total","POLII_S2","POLII_S5"]

In [None]:
for i in range(int((len(bw)-1)/2)):
    name1 = bw[1+i]
    name2 = bw[7+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    #for val in peak:
     #   chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, torecompute=True, refpoint='center', name='../../data/IRF2BP2_degraded_rep4/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
! gsutil -m cp -r "../../data/IRF2BP2_degraded_rep4" gs://amlproject/Chip/

## Copying data

In [None]:
mkdir ../results/$project/diffPeaks_unscaled/

In [None]:
! cp ../../data/*/diffPeaks_unscaled/*.bed ../results/$project/diffPeaks_unscaled/

In [None]:
mkdir ../results/$project/diffPeaks_scaled/

In [None]:
! cp ../../data/*/diffPeaks/*.bed ../results/$project/diffPeaks_scaled/

## knockouts_v1

In [None]:
project="knockouts_v1"

In [None]:
! gsutil ls gs://transfer-amlproject/200723_MP8095_fastq/

In [None]:
mkdir ../data/$project/qc/

In [None]:
!gsutil -m cp gs://transfer-amlproject/200723_MP8095_fastq/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/200723_MP8095_fastq/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/200723_MP8095_fastq/multiqc_data/ ../data/$project/qc/

In [None]:
! gsutil -m cp gs://transfer-amlproject/200723_MP8095_fastq/*  gs://transfer-amlproject/$project/

### analysis

In [None]:
! mkdir ../../data/$project && mkdir ../../data/$project/fastqs && gsutil -m cp gs://transfer-amlproject/$project/* ../../data/$project/fastqs

In [None]:
! rm ../../data/$project/fastqs/multiqc_report.html

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
rename = {
"20200723_1_MP8095_S211": "mp858-MV411_RNP_AAVS1-H3K27AC-r1",
"20200723_2_MP8095_S212": "mp859-MV411_RNP_AAVS1-H3K27AC-r2",
"20200723_3_MP8095_S213": "mp860-MV411_RNP_RUNX1-H3K27AC-r1",
"20200723_4_MP8095_S214": "mp861-MV411_RNP_RUNX1-H3K27AC-r2",
"20200723_5_MP8095_S215": "mp862-MV411_RNP_RUNX2-H3K27AC-r1",
"20200723_6_MP8095_S216": "mp863-MV411_RNP_RUNX2-H3K27AC-r2",
"20200723_7_MP8095_S217": "mp864-MV411_RNP_RUNX1_RUNX2-H3K27AC-r1",
"20200723_8_MP8095_S218": "mp865-MV411_RNP_RUNX1_RUNX2-H3K27AC-r2",
"20200723_9_MP8095_S219": "mp866-MV411_RNP_MEF2D-H3K27AC-r1",
"20200723_10_MP8095_S220": "mp867-MV411_RNP_MEF2D-H3K27AC-r2",
"20200723_11_MP8095_S221": "mp868-MV411_RNP_IRF8-H3K27AC-r1",
"20200723_12_MP8095_S222": "mp869-MV411_RNP_IRF8-H3K27AC-r2",
"20200723_13_MP8095_S223": "mp870-MV411_RNP_MYB-H3K27AC-r1",
"20200723_14_MP8095_S224": "mp871-MV411_RNP_MYB-H3K27AC-r2",
"20200723_15_MP8095_S225": "mp872-MV411_RNP_SPI1-H3K27AC-r1",
"20200723_16_MP8095_S226": "mp873-MV411_RNP_SPI1-H3K27AC-r2",
"20200723_1S_MP8095_S209": "mp874-MV411_MEF2D_NT_SC_63-FLAG_MEF2D-r2",
"20200723_2S_MP8095_S210": "mp875-MV411_MEF2C_NT-FLAG_MEF2C-r1"}

In [None]:
for val in a:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    !mv ../../data/$project/fastqs/$val ../../data/$project/fastqs/$rep

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
gsheet

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
for val in a[-4:]:
    !gsutil cp ../../data/$project/fastqs/$val gs://amlproject/Chip/fastqs/
    !rm ../../data/$project/fastqs/$val

In [None]:
gcp.patternRN({'mp845-MV411_IRF2BP2_-INPUT-r1':'mp845-MV411-INPUT-r2'},'gs://amlproject/Chip/',wildcards=['**','.*'], test=False)

In [None]:
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
for val in h.grouped(a[:-4],2):
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    df['replicate'].append(row['name'].values[0].split('-r')[1])
    df['fastq_1'].append(project+"/fastqs/"+val[0])
    df['fastq_2'].append(project+"/fastqs/"+val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('ref/mp845-MV411-INPUT-r2_R1.fastq.gz')
df['fastq_2'].append('ref/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [None]:
df

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/additional_degraded_v1_design.csv',index=False)

In [None]:
! cd ../../data/ && sudo ../nextflow log ## to get access to the previous runs

In [None]:
#process chips
! cd ../../data/ && sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input ../AMLproject/nextflow/additional_degraded_v1_design.csv --genome GRCh38 --skip_preseq --max_cpus 16 -profile docker -w work -resume exotic_bartik

In [None]:
!cp -r ../../data/results/* ../../data/$project/ && sudo rm -r ../data/results && sudo rm -r ..data/work

In [None]:
# get scaling values from the reads mapping to the Drosophila spike-in genome
# NOTE(review): called on `chip`, as in the earlier spike-in cell of this notebook;
# `h` is genepy.utils.helper — confirm which module exposes this function.
norm, mapped = chip.getSpikeInControlScales(refgenome="../../data/ref/reference_droso.fna",
fastQfolder='../../data/'+project+'/fastqs/',
pairedEnd=True, cores=12,
tofilter=True,
totrim=True,
tomap=True,
results="../../data/"+project+"/",
pathtotrim_galore="../../TrimGalore-0.6.5/trim_galore")
norm, mapped

Computing the scales from the Excel sheet.

In [None]:
scales = [0.3011826465, # 'MV411_RNP_IRF8-H3K27AC_R1',
0.4865371752, # 'MV411_RNP_IRF8-H3K27AC_R2',
0.5670857556, # 'MV411_RNP_MEF2D-H3K27AC_R1',
0.7349663619, # 'MV411_RNP_MEF2D-H3K27AC_R2',
0.7548163023, # 'MV411_RNP_MYB-H3K27AC_R1',
0.5976325206, # 'MV411_RNP_MYB-H3K27AC_R2',
1.400750948, # 'MV411_RNP_RUNX1-H3K27AC_R1',
0.849637265, # 'MV411_RNP_RUNX1-H3K27AC_R2',
0.7539535476, # 'MV411_RNP_RUNX1_RUNX2-H3K27AC_R1',
0.6905692051, # 'MV411_RNP_RUNX1_RUNX2-H3K27AC_R2',
1.50505384, # 'MV411_RNP_RUNX2-H3K27AC_R1',
0.8442345485, # 'MV411_RNP_RUNX2-H3K27AC_R2',
0.5019100631, # 'MV411_RNP_SPI1-H3K27AC_R1',
0.8688220473, # 'MV411_RNP_SPI1-H3K27AC_R2'
         ]

In [None]:
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

In [None]:
bams[6]

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/ && ! mkdir ../../data/$project/diffData/

In [None]:
! mkdir ../../data/$project/droso_aligned

In [None]:
! mv ../../data/$project/mp* ../../data/$project/droso_aligned

In [None]:
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig

In [None]:
ls ../../data/$project/bwa/mergedLibrary/bigwig/

In [None]:
! gsutil -m cp -r ../../data/$project/bwa/mergedLibrary/bigwig/ gs://amlproject/Chip/$project/bwa/mergedLibrary/
! gsutil -m cp -r ../../data/$project/droso_aligned gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/recalib_bigwig gs://amlproject/Chip/$project/

In [None]:
for val in wigs:
    bedg = val[:-6]+'bdg'
    ! bigWigToBedGraph $val $bedg

In [None]:
bdg=! ls ../../data/knockouts_v1/bwa/mergedLibrary/bigwig/*.bdg
bdg

In [None]:
# diffPeak on scaled data 
size= 240
for i in range(int(len(bdg)-3)):
    name1 = bdg[1+i%2]
    name2 = bdg[3+i]
    print(name1,name2)
    print(chip.diffPeak(name1, name2, control1=bdg[0], control2=bdg[0], res_directory="../../data/"+project+"/diffPeaks/", scaling1=1, scaling2=scales[i], size=size))

In [None]:
bams

In [None]:
bams[10]

In [None]:
# diffPeak on scaled data (full reprocessing), for every knock-out sample.
# (A previous `if i < 8: continue` guard restricted the run to the last samples only.)
size= 240
# bams[0] is used as the control; bams[1] and bams[2] alternate as the reference
# of each comparison, and the samples to compare start at bams[3].
for i in range(len(bams)-3):
    bam1 = bams[1+i%2]
    bam2 = bams[3+i]
    print(bam1,bam2)
    print(chip.fullDiffPeak(bam1,bam2, control1=bams[0], compute_size=False, size=size, scaling=[scales[i],1], directory = "../../data/"+project+"/diffData/", res_directory = "../../data/"+project+"/diffPeaks/",pairedend=True))

In [None]:
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt

In [None]:
# Multiply each spike-in scale by the value read at offset 3+i in `initscales`
# (the contents of the bigwig scale/*.txt files).
# NOTE(review): `scales` is overwritten in place, so running this cell twice applies
# the factors twice — re-run the cell that defines `scales` before re-running this one.
# NOTE(review): the 3+i offset presumably skips the INPUT and the two AAVS1 entries — verify
# against the order of the scale/*.txt files.
scales = [val*float(initscales[3+i]) for i, val in enumerate(scales)]

In [None]:
scales

In [None]:
chip.bigWigFrom(bams[3:], 
                genome='GRCh38',scaling=scales,
               numthreads=8)

In [None]:
ls ../../data/$project/recalib_bigwig/

In [None]:
!mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
!cp ../../data/$project/bwa/mergedLibrary/bigwig/MV411_RNP_AAVS1-*.bigWig ../../data/$project/recalib_bigwig/

In [None]:
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

In [None]:
# GENOME WIDE comparison

In [None]:
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak

In [None]:
names = ["AAVS1", "AAVS1_v2", "IRF8","IRF8_v2","MEF2D","MEF2D_v2","MYB","MYB_v2","RUNX1","RUNX1_v2","RUNX1_RUNX2","RUNX1_RUNX2_v2","RUNX2","RUNX2_v2", "SPI1","SPI1_v2"]

In [None]:
# Genome-wide comparison: every track from index 2 onwards is plotted against
# bw[0] over the first peak file. Indices 0 and 1 are left out of the plots
# but still visited, so `i` and `val` end up as in a plain enumerate loop.
for i in range(len(bw)):
    val = bw[i]
    if i >= 2:
        name = names[i]
        print(name)
        chip.getPeaksAt(peaks[0], bigwigs = [val,bw[0]], bigwignames= [name,"AAVS1"],peaknames=['Macs2_Peaks'], window=3000, folder="", title=name+"_vs_AAVS1", numthreads=8, refpoint="center", name='../../data/'+project+'/'+name+'_mat.pdf', withDeeptools=True, torecompute=True, vmax=4, legendLoc="lower-left")

In [None]:
! cp ../../data/additional_degraded_v1/*.pdf ../results/$project/plots/scaled/heatmaps/

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
names = ["IRF8", "MEF2D", "MYB","RUNX1","RUNX1_RUNX2","RUNX2", "SPI1", "IRF8_v2", "MEF2D_v2", "MYB_v2", "RUNX1_v2", "RUNX1_RUNX2_v2", "RUNX2_v2", "SPI1_v2"]

In [None]:
bw

In [None]:
bw[(6*2)+3]

In [None]:
int(len(bw[2:])/2)

In [None]:
bw[round(7/6)]

In [None]:
# Heatmaps of control vs knock-out signal over each comparison's differential peaks.
# (A previous `if i < 8: continue` guard restricted the run to the last comparisons only.)
# Layout assumed for `bw`: the two AAVS1 control replicates first, then the
# (replicate 1, replicate 2) tracks of each knock-out — TODO confirm against the `bw` listing.
ko_tracks = bw[2:]
cl = len(ko_tracks)
n_ko = cl // 2
for i in range(cl):
    # the first half of `names` is replicate 1 (rep == 0), the second half replicate 2 (rep == 1)
    rep, ko = divmod(i, n_ko)
    name1 = bw[rep]
    name2 = ko_tracks[2*ko + rep]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    # labels name the actual conditions (AAVS1 control vs knock-out) rather than DMSO/VHL
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['AAVS1', name], peaknames=['AAVS1_peaks', 'common', name+'_peaks'], window=3000, folder="", title=name, numthreads=7, refpoint="center", name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=False)

In [None]:
! mkdir ../results/$project/plots/scaled/diffPeaks/
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

### on unscaled data

In [None]:
bams = !ls ../../data/$project/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
#on unscalled data 
for i in range(int(len(bams)/2)):
    if i < 5:
        continue
    name1 = bams[i]
    name2 = bams[6+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
names = ["MED1","MED1_v2","FLAG_IRF2BP2","POLII_total","POLII_S2","POLII_S5"]

In [None]:
for i in range(int((len(bw)-1)/2)):
    name1 = bw[1+i]
    name2 = bw[7+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    #for val in peak:
     #   chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, torecompute=True, refpoint='center', name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
! gsutil -m cp -r ../../data/$project gs://amlproject/Chip/

In [None]:
! cp ../../$project/*_mat.pdf ../results/$project/unscaled/
! cp ../../$project/*_mat.pdf ../results/$project/scaled/

! cp ../../$project/*_mat.pdf ../results/$project/unscaled/
! cp ../../$project/*_mat.pdf ../results/$project/unscaled/

! cp -r ../data/$project/bwa/mergedLibrary/deepTools/plot/* ../results/$project/plots

## MEF2D_degraded_v1

In [None]:
project="MEF2D_degraded_v1"

In [None]:
! gsutil ls gs://transfer-amlproject/201001_MP8262_fastq/

In [None]:
mkdir ../data/$project/ && mkdir ../data/$project/qc/

In [None]:
!gsutil -m cp gs://transfer-amlproject/201001_MP8262_fastq/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/201001_MP8262_fastq/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/201001_MP8262_fastq/multiqc_data/ ../data/$project/qc/

In [None]:
! gsutil -m cp gs://transfer-amlproject/201001_MP8262_fastq/*  gs://transfer-amlproject/$project/

### analysis

In [None]:
! mkdir ../../data/$project && mkdir ../../data/$project/fastqs && gsutil -m cp gs://transfer-amlproject/$project/* ../../data/$project/fastqs

In [None]:
! rm ../../data/$project/fastqs/multiqc_report.html

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
rename = {
"20201001_FLAG_DMSO_MP8262_S93": "mp881-MV411_MEF2D_NT_SC_63_DMSO-FLAG_MEF2D-r3",
"20201001_FLAG_VHL_MP8262_S99": "mp882-MV411_MEF2D_NT_SC_63_VHL-FLAG_MEF2D-r1",
"20201001_IRF8_DMSO_MP8262_S88": "mp883-MV411_MEF2D_NT_SC_63_DMSO-IFR8-r1",
"20201001_IRF8_VHL_MP8262_S94": "mp884-MV411_MEF2D_NT_SC_63_VHL-IFR8-r1",
"20201001_MED1_DMSO_MP8262_S91": "mp885-MV411_MEF2D_NT_SC_63_DMSO-MED1-r1",
"20201001_MED1_VHL_MP8262_S97": "mp886-MV411_MEF2D_NT_SC_63_VHL-MED1-r1",
"20201001_MEF2C_DMSO_MP8262_S90": "mp887-MV411_MEF2D_NT_SC_63_DMSO-MEF2C-r1",
"20201001_MEF2C_VHL_MP8262_S96": "mp888-MV411_MEF2D_NT_SC_63_VHL-MEF2C-r1",
"20201001_MYC_DMSO_MP8262_S89": "mp889-MV411_MEF2D_NT_SC_63_DMSO-MYC-r1",
"20201001_MYC_VHL_MP8262_S95": "mp890-MV411_MEF2D_NT_SC_63_VHL-MYC-r1",
"20201001_POL_II_Total_DMSO_MP8262_S92": "mp891-MV411_MEF2D_NT_SC_63_DMSO-POLII-r1",
"20201001_POL_II_Total_VHL_MP8262_S98": "mp892-MV411_MEF2D_NT_SC_63_VHL-POLII-r1",}

In [None]:
for val in a:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    !mv ../../data/$project/fastqs/$val ../../data/$project/fastqs/$rep

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
gsheet

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
# Build the design table: one record per consecutive pair of fastq files,
# followed by a single record for the shared INPUT control.
# Key order matters: the export cell reorders columns by position.
design_rows = []
for pair in h.grouped(a, 2):
    # the sample id is the part of the file name before the first '-'
    sample_id = pair[0].split('/')[-1].split('-')[0]
    sheet_row = gsheet[gsheet.id == sample_id]
    sample_name = sheet_row['name'].values[0]
    design_rows.append({
        "fastq_1": project+"/fastqs/"+pair[0],
        "fastq_2": project+"/fastqs/"+pair[1],
        "antibody": sheet_row['protein'].values[0],
        # group = sample name without its "<id>-" prefix and "-r<N>" suffix
        "group": "MV4"+sample_name.split('-r')[0].split('-MV4')[1],
        "replicate": sample_name.split('-r')[1],
        "control": "INPUT",
    })
# control sample: no antibody and no control of its own
design_rows.append({
    "fastq_1": 'ref/mp845-MV411-INPUT-r2_R1.fastq.gz',
    "fastq_2": 'ref/mp845-MV411-INPUT-r2_R2.fastq.gz',
    "antibody": "",
    "group": 'INPUT',
    "replicate": 1,
    "control": "",
})
df = pd.DataFrame(design_rows)

In [None]:
df.loc[0].tolist()

In [None]:
cat ../nextflow/chipseq_MEF2D_degraded_design.csv

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/'+project+'_design.csv',index=False)

In [None]:
! cd ../../data/ && sudo ../nextflow log ## to get access to the previous runs

In [None]:
#process chips
! cd ../../data/ && sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input ../AMLproject/nextflow/$project_design.csv --genome GRCh38 --skip_preseq --max_cpus 16 -profile docker -w work -resume exotic_bartik

In [None]:
! sudo mv ../../data/results/* ../../data/$project/ && sudo rm -r ..data/work

In [None]:
ls ../../TrimGalore-0.6.5/trim_galore

In [None]:
# get scaling values
norm, mapped = h.getSpikeInControlScales(refgenome="../../data/ref/reference_droso.fna",
fastQfolder='../../data/'+project+'/fastqs/',
pairedEnd=True, cores=12,
tofilter=True,
totrim=True,
tomap=True,
results="../../data/"+project+"/",
pathtotrim_galore="../../TrimGalore-0.6.5/trim_galore")
norm, mapped

In [None]:
mapped

computing scales from the excel sheet

In [None]:
scales = [1.0,
1.0,
0.9644136372,
0.9068238696,
1.0,
1.0,
0.3954022974,
0.8368988329,
1.0,
1.0,
0.5551449949,
0.844680746]

In [None]:
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

In [None]:
bams[6]

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/ && ! mkdir ../../data/$project/diffData/

In [None]:
! mkdir ../../data/$project/droso_aligned

In [None]:
! mv ../../data/$project/mp* ../../data/$project/droso_aligned

In [None]:
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig

In [None]:
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt

In [None]:
! ls ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt

In [None]:
rescales = [val*float(initscales[1+i]) for i, val in enumerate(scales)]

In [None]:
rescales

In [None]:
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=rescales,
               numthreads=12)

In [None]:
!mkdir ../../data/$project/recalib_bigwig/ && mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
#! gsutil -m cp -r ../../data/$project/droso_aligned gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/recalib_bigwig gs://amlproject/Chip/$project/

In [None]:
rm -r ../../data/$project/droso_aligned/

In [None]:
# diffPeak on scaled data 
for i in range(int((len(bams)-1)/2)):
    if i==4:
        continue
    name1 = bams[i+1]
    name2 = bams[i+7]
    print('\n')
    print(name1,name2)
    print(chip.fullDiffPeak(name1, name2, control1=bams[0], 
                            res_directory="../../data/"+project+"/diffPeaks/", 
                            scaling=[scales[i],scales[i+6]], 
                            directory="../../data/"+project+'/diffData/'))

In [None]:
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

In [None]:
# GENOME WIDE comparison

In [None]:
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak

In [None]:
names = ["DMSO_FLAG_MEF2D",
"DMSO_IFR8",
"DMSO_MED1",
"DMSO_MEF2C",
"DMSO_MYC",
"DMSO_POLII",
"VHL_FLAG_MEF2D",
"VHL_IFR8",
"VHL_MED1",
"VHL_MEF2C",
"VHL_MYC",
"VHL_POLII"]

In [None]:
! mkdir ../../data/$project/peakplot/

In [None]:
for i, val in enumerate(bw):
    if i <0:
        continue
    name = names[i]
    print(name)
    chip.getPeaksAt(peaks[i], bigwigs = val, bigwignames= name, peaknames=['Macs2_Peaks'], window=3000, folder="", title=name, numthreads=8, refpoint="center", name='../../data/'+project+'/peakplot/'+name+'_mat.pdf', withDeeptools=True, torecompute=True, vmax=2.5, legendLoc="lower-left")

In [None]:
rm ../../data/chipseq_MEF2D_degraded/diffPeaks/MV411_MEF2D_NT_SC_63_DMSO-FLAG_MEF2D_R1_treat_pileup_vs_MV411_MEF2D_NT_SC_63_VHL-POLII_R1_tre*

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
names = ["FLAG_MEF2D",
"IFR8",
"MED1",
"MEF2C",
"MYC",
"POLII"]

In [None]:
for i in range(int(len(bw)/2)):
    if i <0:
        continue
    cl = len(bw)
    name1 = bw[i]
    name2 = bw[i+6]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
   # for val in peak:
      #  chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, refpoint="center", name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=True)

In [None]:
#h.createFoldersFor('../results/'+project+'/plots/scaled/diffPeaks/')
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

In [None]:
!cp ../../data/$project/peakplot/*_mat.pdf ../results/$project/plots/scaled/

### looking at TSS

In [None]:
# Query the Ensembl BioMart human gene dataset and load the tab-separated
# response (with a header row) into a DataFrame.
server = BiomartServer( "http://www.ensembl.org/biomart" )
ensembl = server.datasets['hsapiens_gene_ensembl']
# use the dataset bound just above (`ensembl`); the previous `ensmbl` spelling was an undefined name here
ensembltss = pd.read_csv(io.StringIO(ensembl.search({
  'attributes': ['ensembl_gene_id','gene_biotype', "transcription_start_site", "3_utr_start","start_position","external_gene_name", 'chromosome_name']
}, header=1).content.decode()), sep='\t')

In [None]:
# List the attributes available on the BioMart dataset loaded in the previous cell.
ensembl.show_attributes_by_page()

In [None]:
MEF2Dtargets = h.fileToList('../results/slamseqMax/MEF2Dtargets.txt')
MEF2Dtargets.pop(0)

In [None]:
peaksVHL = pd.read_csv('gs://amlproject/Chip/chipseq_IRF8_degraded/bwa/mergedLibrary/macs/broadPeak/MV411_IRF8_NT_VHL-POLII_total_R1_peaks.broadPeak', sep='\t',header=None,names=['chr','start','end','name','score','.','1','2','3'])

In [None]:
ensembltss

In [None]:
set(ensembltss['Chromosome/scaffold name'])

In [None]:
ensembltss['Chromosome/scaffold name'] = ensembltss['Chromosome/scaffold name'].astype(str)

In [None]:
ensembltss = ensembltss[ensembltss['Chromosome/scaffold name'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9','X','Y'])]

In [None]:
ensembltss = ensembltss.drop_duplicates('Gene start (bp)')

### on unscaled data

In [None]:
bams = !ls ../../data/$project/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
bams

In [None]:
#on unscalled data 
for i in range(int(len(bams[1:])/2)):
    if i < 0:
        continue
    name1 = bams[1+i]
    name2 = bams[7+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
names = ["FLAG_MEF2D",
"IFR8",
"MED1",
"MEF2C",
"MYC",
"POLII"]

In [None]:
for i in range(int(len(bw)/2)):
    name1 = bw[i]
    name2 = bw[6+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    #for val in peak:
     #   chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, torecompute=True, refpoint='center', name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
h.createFoldersFor('../results/'+project+'/plots/unscaled/diffPeaks/')
! cp ../../data/$project/diffPeaks_unscaled/*.pdf ../results/$project/plots/unscaled/diffPeaks/

! gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/deepTools/**.pdf ../results/$project/plots/

In [None]:
! gsutil -m cp -r ../../data/$project/diffPeaks gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffPeaks_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/peakplot gs://amlproject/Chip/$project/

## MEF2CMEF2D_knockout_v1

In [32]:
project="MEF2CMEF2D_knockout_v1"

In [None]:
! gsutil ls gs://transfer-amlproject/200924_MP8230_fastq/

In [None]:
mkdir ../data/$project/ && mkdir ../data/$project/qc/

In [None]:
!gsutil -m cp gs://transfer-amlproject/200924_MP8230_fastq/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/200924_MP8230_fastq/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/200924_MP8230_fastq/multiqc_data/ ../data/$project/qc/

In [None]:
! gsutil -m cp gs://transfer-amlproject/200924_MP8230_fastq/*  gs://transfer-amlproject/$project/

### analysis

In [None]:
! mkdir ../../data/$project \
&& mkdir ../../data/$project/fastqs \
&& gsutil -m cp gs://transfer-amlproject/$project/* ../../data/$project/fastqs

In [None]:
! rm ../../data/$project/fastqs/multiqc_report.html

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
rename = {
"20200924_MP1_MP8230_S113": "mp893-MV411_RNP_AAVS1-H3K27AC-r3",
"20200924_MP2_MP8230_S114": "mp894-MV411_RNP_AAVS1-H3K27AC-r4",
"20200924_MP3_MP8230_S115": "mp895-MV411_RNP_MEF2C-H3K27AC-r1",
"20200924_MP4_MP8230_S116": "mp896-MV411_RNP_MEF2C-H3K27AC-r2",
"20200924_MP5_MP8230_S117": "mp897-MV411_RNP_MEF2D-H3K27AC-r3",
"20200924_MP6_MP8230_S118": "mp898-MV411_RNP_MEF2D-H3K27AC-r4",
"20200924_MP7_MP8230_S119": "mp899-MV411_RNP_MEF2C_MEF2D-H3K27AC-r1",
"20200924_MP8_MP8230_S120": "mp900-MV411_RNP_MEF2C_MEF2D-H3K27AC-r2",}

In [None]:
for val in a:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    !mv ../../data/$project/fastqs/$val ../../data/$project/fastqs/$rep

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
gsheet

In [None]:
a = ! ls ../../data/$project/fastqs
a

In [None]:
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
for val in h.grouped(a,2):
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    df['replicate'].append(row['name'].values[0].split('-r')[1])
    df['fastq_1'].append(project+"/fastqs/"+val[0])
    df['fastq_2'].append(project+"/fastqs/"+val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('ref/mp845-MV411-INPUT-r2_R1.fastq.gz')
df['fastq_2'].append('ref/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [None]:
df

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/'+project+'_design.csv',index=False)

In [None]:
! cd ../../data/ && sudo ../nextflow log ## to get access to the previous runs

In [None]:
#process chips
! cd ../../data/ && sudo ../nextflow run nf-core/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input ../AMLproject/nextflow/$project_design.csv --genome GRCh38 --skip_preseq --max_cpus 16 -profile docker -w work -resume exotic_bartik

In [None]:
! gsutil -m cp -r gs://workamlproject/MEF2CMEF2D/output/* gs://amlproject/Chip/MEF2CMEF2D_knockout_v1/

In [None]:
# get scaling values
norm, mapped = h.getSpikeInControlScales(refgenome="../../data/ref/reference_droso.fna",
fastQfolder='../../data/'+project+'/fastqs/',
pairedEnd=True, cores=12,
tofilter=True,
totrim=True,
tomap=True,
results="../../data/"+project+"/",
pathtotrim_galore="../../TrimGalore-0.6.5/trim_galore")
norm, mapped

computing scales from the excel sheet

In [None]:
scales = [
    1.00,
    1.00,
    0.98,
    1.08,
    0.74,
    0.99,
    1.13,
    0.94
]

In [None]:
#! mkdir ../../data/$project/bwa 
#!mkdir ../../data/$project/bwa/mergedLibrary
#!gsutil cp gs://amlproject/Chip/$project/bwa/mergedLibrary/*.ba* ../../data/$project/bwa/mergedLibrary/
#! gsutil -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/bigwig/ ../../data/$project/bwa/mergedLibrary/
! gsutil  -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/macs/ ../../data/$project/bwa/mergedLibrary/

In [None]:
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/
! mkdir ../../data/$project/diffData/
! mkdir ../../data/$project/droso_aligned
! mv ../../data/$project/mp* ../../data/$project/droso_aligned

In [None]:
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
wigs

In [None]:
# diffPeak on scaled data (full reprocessing)
for i in range(len(bams[3:])):
    if i < 3:
        continue
    bam1 = bams[1+(i%2)]
    bam2 = bams[3+i]
    print(bam1,bam2)
    print(chip.fullDiffPeak(bam1, bam2, control1=bams[0], 
                            scaling=[1.0, scales[2+i]], 
                            directory = "../../data/"+project+"/diffData/", 
                            res_directory = "../../data/"+project+"/diffPeaks/", 
                            pairedend=True))

In [None]:
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt
initscales

In [None]:
rescales = [val*float(initscales[1+i]) for i, val in enumerate(scales)]
rescales

In [None]:
chip.bigWigFrom(bams[3:], 
                genome='GRCh38',scaling=rescales[2:],
               numthreads=8)

In [None]:
!mkdir ../../data/$project/recalib_bigwig/ 
!mv bigwig/* ../../data/$project/recalib_bigwig/
!cp ../../data/$project/bwa/mergedLibrary/bigwig/*AAVS1*.bigWig ../../data/$project/recalib_bigwig/

In [None]:
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
! gsutil -m cp -r ../../data/$project/droso_aligned gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/recalib_bigwig gs://amlproject/Chip/$project/

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

In [None]:
# GENOME WIDE comparison

In [None]:
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak

In [None]:
names = ["AAVS1", "AAVS1_v2", "MEF2C","MEF2C_v2","MEF2C-MEF2D","MEF2C-MEF2D_v2", "MEF2D","MEF2D_v2"]

In [None]:
for i, val in enumerate(bw):
    if i <0:
        continue
    name = names[i]
    print(name)
    chip.getPeaksAt(peaks[i], bigwigs = val, bigwignames= name, peaknames=['Macs2_Peaks'], window=3000, folder="", title=name, numthreads=8, refpoint="center", name='../../data/'+project+'/peakplot/'+name+'_mat.pdf', withDeeptools=True, torecompute=True, vmax=2.5, legendLoc="lower-left")

In [38]:
h.createFoldersFor('../results/'+project+'/plots/scaled/heatmaps/')
! cp ../../data/$project/peakplot/*.pdf ../results/$project/plots/scaled/heatmaps/

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
for i in range(int(len(bw[2:]))):
    if i <1:
        continue
    name1 = bw[i%2]
    name2 = bw[i+2]
    a = int(i/2)+3 if i%2 else int(i/2)
    peak = [cond1peak[a], commonpeak[a], cond2peak[a]]
   # for val in peak:
      #  chip.dropWeirdChromosomes(val)
    name = names[i+2]
    print(name,name1,name2,a)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, refpoint="center", name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=False)

In [None]:
! mkdir ../results/$project/plots/scaled/diffPeaks/
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

### on unscaled data

In [None]:
bams = !ls ../../data/$project/bwa/mergedLibrary/mp*.bam
bams

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
#on unscalled data 
for i in range(int(len(bams[3:]))):
    if i < 4:
        continue
    name1 = bams[1+i%2]
    name2 = bams[3+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
for i in range(int(len(bw[2:]))):
    name1 = bw[i%2]
    name2 = bw[2+i]
    a = int(i/2)+3 if i%2 else int(i/2)
    peak = [cond1peak[a], commonpeak[a], cond2peak[a]]
    #for val in peak:
     #   chip.dropWeirdChromosomes(val)
    name = names[i+2]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=7, torecompute=True, refpoint='center', name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf', withDeeptools=True)

In [None]:
h.createFoldersFor('../results/'+project+'/plots/unscaled/diffPeaks/')
! cp ../../data/$project/diffPeaks_unscaled/*.pdf ../results/$project/plots/unscaled/diffPeaks/

! gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/deepTools/**.pdf ../results/$project/plots/

In [None]:
! gsutil -m cp -r ../../data/$project/diffPeaks gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffPeaks_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/peakplot gs://amlproject/Chip/$project/

## IRF8_degraded_v1

In [None]:
project="IRF8_degraded_v1"

In [None]:
fastq = ! gsutil ls gs://transfer-amlproject/201023_MP8292_fastq/
fastq

In [None]:
mkdir ../data/$project/ && mkdir ../data/$project/qc/

In [None]:
!gsutil -m cp gs://transfer-amlproject/201023_MP8292_fastq/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/201023_MP8292_fastq/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/201023_MP8292_fastq/multiqc_data/ ../data/$project/qc/

In [None]:
! gsutil -m cp gs://transfer-amlproject/201023_MP8292_fastq/*  gs://transfer-amlproject/$project/

### analysis

In [None]:
rename = {
"20201023_IRF8_DMSO_MP8292_S119": "mp901-MV411_IRF8_NT_DMSO-IRF8-r1",
"20201023_IRF8_VHL_MP8292_S125": "mp902-MV411_IRF8_NT_VHL-IRF8-r1",
"20201023_MED1_DMSO_MP8292_S122": "mp903-MV411_IRF8_NT_DMSO-MED1-r1",
"20201023_MED1_VHL_MP8292_S128": "mp904-MV411_IRF8_NT_VHL-MED1-r1",
"20201023_MEF2C_DMSO_MP8292_S121": "mp905-MV411_IRF8_NT_DMSO-MEF2C-r1",
"20201023_MEF2C_VHL_MP8292_S127": "mp906-MV411_IRF8_NT_VHL-MEF2C-r1",
"20201023_MEF2D_DMSO_MP8292_S120": "mp907-MV411_IRF8_NT_DMSO-MEF2D-r1",
"20201023_MEF2D_VHL_MP8292_S126": "mp908-MV411_IRF8_NT_VHL-MEF2D-r1",
"20201023_MYC_DMSO_MP8292_S123": "mp909-MV411_IRF8_NT_DMSO-MYC-r1",
"20201023_MYC_VHL_MP8292_S129": "mp910-MV411_IRF8_NT_VHL-MYC-r1",
"20201023_POL_II_total_DMSO_MP8292_S124": "mp911-MV411_IRF8_NT_DMSO-POLII_total-r1",
"20201023_POL_II_total_VHL_MP8292_S130": "mp912-MV411_IRF8_NT_VHL-POLII_total-r1",}

In [None]:
for val in fastq:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    rep = rep.replace('transfer-amlproject/201023_MP8292_fastq/',"amlproject/Chip/"+project+'/fastqs')
    !gsutil cp $val $rep

In [None]:
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
fastq = ! gsutil ls gs://amlproject/Chip/$project/fastqs/
fastq

In [None]:
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
for val in h.grouped(fastq,2):
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    df['replicate'].append(row['name'].values[0].split('-r')[1])
    df['fastq_1'].append(val[0])
    df['fastq_2'].append(val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('gs://amlproject/Chip/IRF2BP2_degraded_rep3/fastqs/mp845-MV411-INPUT-r2_R1.fastq.gz')
df['fastq_2'].append('gs://amlproject/Chip/IRF2BP2_degraded_rep3/fastqs/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)

In [None]:
df

In [None]:
df[df.columns[[3,4,0,1,2,5]]].to_csv('../nextflow/'+project+'_design.csv',index=False)

### about

0. You need to have a Google Cloud project set up with a billing account.
1. You need to activate your APIs as described here: https://cloud.google.com/life-sciences/docs/tutorials/nextflow?hl=fr
2. Nextflow needs to be installed with this command: `export NXF_MODE=google && curl https://get.nextflow.io | bash`

In [None]:
! cd ../../nextflow log ## to get access to the previous runs

In [None]:
#process chips 
! ../../nextflow run jkobject/chipseq \
    --paired_end \
    --seq_center 'DFCI' \
    --email 'jkobject@gmail.com' \
    --input ../nextflow/chipseq_IRF8_degraded_design.csv \
    --genome GRCh38 \
    --max_cpus 16 \
    -profile jkcloud \ #my profile for the cloud (to edit for your account)
    -w gs://workamlproject/IRF8res \ #where the withinput will be stored
    --spiking \ #I have spiking using drosophilia genome (default)
    --outdir gs://workamlproject/IRF8 \ #specify aa bucket+folderr where the results will be
    --tracedir ../nextflow/IRF8info/ \ #you need to specify a local place for that

In [None]:
h.createFoldersFor('../../data/'+project+'/bwa/mergedLibrary/')
!gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/*.ba* ../../data/$project/bwa/mergedLibrary/
! gsutil -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/bigwig/ ../../data/$project/bwa/mergedLibrary/
! gsutil  -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/macs/ ../../data/$project/bwa/mergedLibrary/

computing scales from the excel sheet

In [None]:
scales = [
        1.00,
        0.79,
        0.59,
        1.00,
        0.77,
        1.00,
        1.00,
        0.72,
        0.91,
        1.00,
        1.00,
        0.23,
         ]

In [None]:
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/ && ! mkdir ../../data/$project/diffData/

In [None]:
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
wigs

In [None]:
# diffPeak on scaled data (full reprocessing)
# Each iteration compares bams[1+i] against bams[7+i], with bams[0] passed as the control.
# NOTE(review): scales is indexed here as consecutive pairs (2*i, 2*i+1), whereas the
# rescaling cell below multiplies scales[i] with initscales[1+i] in plain list order —
# confirm which ordering the scales list actually follows.
for i in range(int(len(bams)/2)):
    # never true as written; presumably a manual guard whose bound is raised to skip finished pairs
    if i < 0:
        continue
    bam1 = bams[1+i]
    bam2 = bams[7+i]
    print(bam1,bam2)
    print(chip.fullDiffPeak(bam1,bam2, control1=bams[0], 
                            scaling=[scales[(i*2)], scales[(i*2)+1]], 
                            directory = "../../data/"+project+"/diffData/", 
                            res_directory = "../../data/"+project+"/diffPeaks/", 
                            pairedend=True))
    

In [None]:
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt
initscales

In [None]:
rescales = [val*float(initscales[1+i]) for i, val in enumerate(scales)]
rescales

In [None]:
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=rescales,
               numthreads=8)

In [None]:
!mkdir ../../data/$project/recalib_bigwig/

In [None]:
!mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

#### GENOME WIDE comparison

In [None]:
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak

In [None]:
names = ["IRF8","MED1","MEF2C","MEF2D","MYC","POLII_total"]

In [None]:
for i, val in enumerate(bw):
    if i <1:
        continue
    name = names[i-6]+'_wIRF8ko' if i//6 else names[i]+'_DMSO'
    print(name)
    chip.getPeaksAt(peaks[i], bigwigs = val, bigwignames= name, peaknames=['Macs2_Peaks'], window=3000, folder="", title=name, numthreads=8, refpoint="center", name='../../data/'+project+'/'+name+'_mat.pdf', withDeeptools=True, torecompute=True, legendLoc="lower-left")

In [None]:
h.createFoldersFor('../results/'+project+'/plots/scaled/heatmaps/')
! cp ../../data/$project/*.pdf ../results/$project/plots/scaled/heatmaps/

In [None]:
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
for i in range(int(len(bw)/2)):
    if i <0:
        continue
    name1 = bw[i]
    name2 = bw[i+6]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
   # for val in peak:
      #  chip.dropWeirdChromosomes(val)
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(peak, [name1, name2], bigwignames=['DMSO', 'VHL'], peaknames=['DMSO_peaks', 'common', 'VHL_peaks'], window=3000, folder="", title=name, numthreads=8, refpoint="center", name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf', withDeeptools=True, torecompute=False)

In [None]:
! mkdir ../results/$project/plots/scaled/diffPeaks/
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

### on unscaled data

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
# Differential peak calling on unscaled data: the i-th BAM is compared with the BAM
# 6 positions later, with the INPUT BAM as control and no `scaling` argument.
# NOTE(review): `bams` comes from an earlier cell; confirm its ordering matches the
# i / 6+i pairing used here.
# NOTE(review): pairedend=False here, whereas the scaled fullDiffPeak calls in this
# notebook pass pairedend=True; confirm this is intended.
for i in range(int(len(bams)/2)):
    if i < 0:  # never true for range(); only useful as a manual "resume from i" knob
        continue
    name1 = bams[i]
    name2 = bams[6+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
# Rebind `bw` to the pipeline's original bigwigs (this replaces the recalib_bigwig list used above).
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
# Same three BED lists as for the scaled analysis, now read from diffPeaks_unscaled/
# (this overwrites cond1peak / cond2peak / commonpeak).
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
# Unscaled counterpart of the diffPeaks plot: one figure per target over the
# cond1-only / common / cond2-only peak sets. One entry of `bw` is left out of the
# pair count (hence the -1).
# (chip.dropWeirdChromosomes on each peak file was tried here and is disabled.)
for i in range((len(bw)-1)//2):
    name1 = bw[i]
    name2 = bw[6+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(
        peak, [name1, name2],
        bigwignames=['DMSO', 'VHL'],
        peaknames=['DMSO_peaks', 'common', 'VHL_peaks'],
        window=3000, folder="", title=name, numthreads=8,
        torecompute=True, refpoint='center',
        name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf',
        withDeeptools=True)

In [None]:
# Publish the unscaled diffPeaks PDFs into the results tree.
h.createFoldersFor('../results/'+project+'/plots/unscaled/diffPeaks/')
! cp ../../data/$project/diffPeaks_unscaled/*.pdf ../results/$project/plots/unscaled/diffPeaks/

# Also fetch the deepTools PDFs produced by the pipeline from the project bucket.
! gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/deepTools/**.pdf ../results/$project/plots/

In [None]:
# Upload the locally computed folders (scaled/unscaled diff peaks, their intermediate
# data, and peakplot) back to the project bucket.
! gsutil -m cp -r ../../data/$project/diffPeaks gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffPeaks_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/peakplot gs://amlproject/Chip/$project/

## MEF2D_degraded_v2

In [None]:
# `project` names the data/results/bucket sub-folders used throughout this section;
# `loc` is the folder of the sequencing delivery inside gs://transfer-amlproject/.
project="MEF2D_degraded_v2"
loc= "201222_MP8533_fastq"

In [None]:
# Everything in the delivery folder (captured as a list of gs:// paths).
fastq = ! gsutil ls gs://transfer-amlproject/$loc/
fastq

In [None]:
# Download the sequencing QC (MultiQC report, Reports/ and multiqc_data/) next to the data.
# NOTE(review): the target is ../data/<project>/qc/ while the rest of the notebook uses
# ../../data/<project>/, and no cell in this section creates the qc/ folder; confirm.
!gsutil -m cp gs://transfer-amlproject/$loc/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/$loc/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/$loc/multiqc_data/ ../data/$project/qc/

In [None]:
# Copy the top-level files of the delivery into a folder named after the project (same transfer bucket).
! gsutil -m cp gs://transfer-amlproject/$loc/*  gs://transfer-amlproject/$project/

In [None]:
# Copy each delivered file to gs://amlproject/Chip/<project>/fastqs/ under its final name:
# every key of `rename` found in the path is replaced by its value, then the bucket
# prefix is rewritten.
# NOTE(review): `rename` is not defined in this section; it must already be in the
# kernel from another cell, so confirm it holds the mapping meant for this delivery.
for val in fastq:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    rep = rep.replace('transfer-amlproject/'+loc+'/',"amlproject/Chip/"+project+'/fastqs/')
    !gsutil cp $val $rep

In [None]:
# Load the sample sheet from Google Sheets: third tab (index 2) of the spreadsheet,
# converted to a DataFrame. Credentials are read from the two files in the home folder.
# The design-table cells read its `id`, `name` and `protein` columns.
from gsheets import Sheets
sheets = Sheets.from_files('~/.client_secret.json', '~/.storage.json')
url="https://docs.google.com/spreadsheets/d/1yFLjYB1McU530JnLgL0QIMAKIkVl3kl0_LCHje2gk8U"
gsheet = sheets.get(url).sheets[2].to_frame()

In [None]:
# The MultiQC report was copied along with the fastqs; remove it so the folder only lists fastq files.
! gsutil rm gs://amlproject/Chip/$project/fastqs/multiqc_report.html

In [None]:
# Rebind `fastq` to the renamed files in the project bucket; the design table is built from this list.
fastq = ! gsutil ls gs://amlproject/Chip/$project/fastqs/
fastq

### Analysis

In [None]:
# Build the design table for the nextflow chipseq run, one row per sample.
# `df` starts as a dict of column lists and is turned into a DataFrame at the end;
# the key order here fixes the column order that the to_csv cell relies on.
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# presumably h.grouped(fastq, 2) yields consecutive (R1, R2) path pairs; confirm the listing is sorted that way
for val in h.grouped(fastq,2):
    # sample-sheet row whose id equals the file-name prefix before the first '-'
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    # group = "MV4" + the part of the sheet name between '-MV4' and '-r'
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    df['replicate'].append(1)
    df['fastq_1'].append(val[0])
    df['fastq_2'].append(val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
# Shared INPUT control, taken from another project's fastqs folder; it has no antibody and no control.
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('gs://amlproject/Chip/IRF2BP2_degraded_rep3/fastqs/mp845-MV411-INPUT-r2_R1.fastq.gz')
df['fastq_2'].append('gs://amlproject/Chip/IRF2BP2_degraded_rep3/fastqs/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)
df

In [None]:
# Write the design file with the columns in the order
# group, replicate, fastq_1, fastq_2, antibody, control (positions 3,4,0,1,2,5 of the table built above).
df[['group', 'replicate', 'fastq_1', 'fastq_2', 'antibody', 'control']].to_csv(
    '../nextflow/'+project+'_design.csv', index=False)

In [None]:
project

### about

0. you need to have a google project set up with a billing account
1. you need to activate your APIs this way: https://cloud.google.com/life-sciences/docs/tutorials/nextflow?hl=fr
2. nextflow needs to be installed with this installation command `export NXF_MODE=google && curl https://get.nextflow.io | bash`

In [None]:
! cd ../../nextflow log ## to get access to the previous runs

In [None]:
#process chips 
# Run the jkobject/chipseq pipeline on the cloud profile (paired-end, GRCh38, with
# spike-in handling); work dir and outputs go to gs://workamlproject/.
# NOTE(review): --input points at chipseq_MF2C_degraded_v2_design.csv, while the design
# written above is ../nextflow/<project>_design.csv (MEF2D_degraded_v2); confirm
# that this is the intended design file.
! cd ../../ && ./nextflow run jkobject/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input AMLproject/nextflow/chipseq_MF2C_degraded_v2_design.csv --genome GRCh38 --max_cpus 16 -profile jkcloud -w gs://workamlproject/MEF2Cv2res --spiking --outdir gs://workamlproject/MEF2C_v2 --tracedir AMLproject/nextflow/MEF2Cdegradedv2/

In [None]:
project

In [None]:
# Copy the pipeline output from the work bucket into the project folder of the main bucket.
! gsutil -m cp -r gs://workamlproject/MEF2C_v2/* gs://amlproject/Chip/$project/

In [None]:
# Print the contents of the droso_aligned/counts files, then list them so each value can be matched to its file.
! gsutil cat gs://amlproject/Chip/$project/droso_aligned/counts/*
! gsutil ls gs://amlproject/Chip/$project/droso_aligned/counts/*

In [None]:
# now use the total counts in gs://amlproject/Chip/MEF2D_degraded_v2/multiqc/broadPeak/multiqc_report.html
# https://console.cloud.google.com/storage/browser/amlproject/Chip/MEF2D_degraded_v2/multiqc/broadPeak/multiqc_report.html

In [None]:
# Download what the local analysis needs: the BAM files and their indexes (*.ba*),
# the bigwig/ folder and the macs/ peak folder.
h.createFoldersFor('../../data/'+project+'/bwa/mergedLibrary/')
!gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/*.ba* ../../data/$project/bwa/mergedLibrary/
! gsutil -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/bigwig/ ../../data/$project/bwa/mergedLibrary/
! gsutil  -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/macs/ ../../data/$project/bwa/mergedLibrary/

computing scales from the excel sheet

In [None]:
# Per-sample scaling factors (values come from the excel sheet mentioned above).
# Index-aligned with bams[1:]: the cells below pair scales[i] with bams[1+i] and
# scales[i+len(names)] with bams[1+len(names)+i], so the first five values belong to
# the first group of samples and the last five to the second group.
scales = [
1.00,
0.79,
1.00,
0.86,
0.26,
    
0.79,
1.00,
0.83,
1.00,
1.00,
         ]

In [None]:
# Local BAM files; later cells use bams[0] as the control and bams[1:] as the samples.
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

In [None]:
# Target labels for this project; len(names) is also used as the offset between the two conditions.
names = ["FLAG_MEF2D","MED1","MEF2C","MYC","POLII_total"]

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/ && ! mkdir ../../data/$project/diffData/

In [None]:
# Bigwig tracks produced by the pipeline (displayed for inspection).
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
wigs

In [None]:
# Scaled differential peak calling (full reprocessing).
# bams[0] is the control; sample i of the first group (bams[1+i]) is compared with
# sample i of the second group (bams[1+len(names)+i]), each with its own scale factor.
for i in range(len(bams)//2):
    bam1 = bams[1+i]
    bam2 = bams[1+len(names)+i]
    print(bam1,bam2)
    print(chip.fullDiffPeak(
        bam1, bam2,
        control1=bams[0],
        scaling=[scales[i], scales[i+len(names)]],
        directory = "../../data/"+project+"/diffData/",
        res_directory = "../../data/"+project+"/diffPeaks/",
        pairedend=True))

In [None]:
# Scale factors already applied by the pipeline, read as strings from bigwig/scale/*.txt.
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt
initscales

In [None]:
# Combined factor per sample = manual scale * pipeline scale.
# initscales[0] is skipped (presumably the INPUT entry, mirroring bams[0]; confirm).
rescales = [val*float(initscales[1+i]) for i, val in enumerate(scales)]
rescales

In [None]:
# Regenerate bigwigs for every BAM except bams[0], applying the combined factors.
# Output presumably lands in ./bigwig/, which a later cell moves into recalib_bigwig/; confirm.
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=rescales,
               numthreads=8)

In [None]:
!mkdir ../../data/$project/recalib_bigwig/

In [None]:
# Move the freshly generated tracks from ./bigwig/ into the project's recalib_bigwig/ folder.
!mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
# Back up the recalibrated tracks to the project bucket.
! gsutil -m cp -r ../../data/$project/recalib_bigwig gs://amlproject/Chip/$project/

In [None]:
# Line count (`wc -l`) of every BED file in diffPeaks/, returned as a list of
# "<count> <path>" strings (the last element is empty because of the trailing newline).
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
# Recalibrated bigwig tracks used for all "scaled" plots below.
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

In [None]:
# MACS2 broadPeak files; the merge cell below pairs peaks[i] with peaks[i+len(names)].
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak
peaks

#### merging peaks VHL/DMSO

In [None]:
# For each target, pool the peaks of both conditions, merge them with
# chip.simpleMergePeaks and write the result as a headerless tab-separated BED file.
# `mpeaks` collects the written paths in the order of `names`.
mpeaks = []
for i, val in enumerate(names):
    print(val)
    dmso = peaks[i]
    vhl = peaks[i+len(names)]
    chip.simpleMergePeaks(
        pd.concat([chip.loadPeaks(dmso), chip.loadPeaks(vhl)])
    ).to_csv(
        '../../data/' + project + '/' + val + '_genomewide_merged.bed',
        sep='\t', header=False, index=False)
    mpeaks.append('../../data/'+project+'/'+val+'_genomewide_merged.bed')

#### GENOME WIDE comparison

In [None]:
# Suffixes appended to the target name to build per-condition plot and matrix file names.
condname="_MEF2D_ko"
dmsoname="_DMSO"

In [None]:
# Plot every recalibrated track over the merged peak set of its target.
# Tracks with index < len(names) get the DMSO suffix, the others the degraded suffix;
# i % len(names) picks the merged peak file of the matching target.
for i, val in enumerate(bw):
    name = names[i-len(names)]+condname if i//len(names) else names[i]+dmsoname
    print(name)
    chip.getPeaksAt(
        mpeaks[i%len(names)],
        bigwigs = val, bigwignames= name, peaknames=['Macs2_Peaks'],
        window=3000, folder="", title=name, numthreads=8, refpoint="center",
        name='../../data/'+project+'/'+name+'_mat.pdf',
        withDeeptools=True, torecompute=True, legendLoc="lower-left")

#### making overlapping profiles

In [None]:
# Overlay, for each target, the DMSO and degraded profiles stored in the two
# `<target><suffix>_mat.gz` matrices and write `<target>_combined_mat.pdf`.
for i, val in enumerate(names):
    val1 = '../../data/'+project+'/' + val +dmsoname+'_mat.gz'
    val2 = '../../data/'+project+'/' + val +condname+'_mat.gz'
    print(val)
    chip.makeProfiles(
        matx=[val1,val2], matnames=['DMSO','VHL'], title=val, refpoint="center",
        name='../../data/'+project+'/'+val+'_combined_mat.pdf',
        legendLoc="lower-left")

In [None]:
# Publish every PDF written at the top of the project's data folder under plots/scaled/heatmaps/.
h.createFoldersFor('../results/'+project+'/plots/scaled/heatmaps/')
! cp ../../data/$project/*.pdf ../results/$project/plots/scaled/heatmaps/

In [None]:
# BED outputs of the scaled fullDiffPeak run, split by suffix (cond1 / cond2 / common);
# indexed in parallel in the next cell.
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
# One figure per target: both conditions' tracks over the cond1-only, common and
# cond2-only peak sets. torecompute=False is passed, unlike the genome-wide plots.
# (chip.dropWeirdChromosomes on each peak file was tried here and is disabled.)
for i in range(len(bw)//2):
    name1 = bw[i]
    name2 = bw[i+len(names)]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(
        peak, [name1, name2],
        bigwignames=['DMSO', 'VHL'],
        peaknames=['DMSO_peaks', 'common', 'VHL_peaks'],
        window=3000, folder="", title=name, numthreads=8, refpoint="center",
        name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf',
        withDeeptools=True, torecompute=False)

In [None]:
! mkdir ../results/$project/plots/scaled/diffPeaks/
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

### Unscaled

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
# Differential peak calling on unscaled data: same sample pairing as the scaled run
# (bams[1+i] vs bams[1+len(names)+i]) but without a `scaling` argument, and with the
# INPUT BAM given by path as control.
# NOTE(review): pairedend=False here while the scaled run above passes pairedend=True; confirm.
for i in range(int(len(bams)/2)):
    if i < 0:  # never true for range(); only useful as a manual "resume from i" knob
        continue
    name1 = bams[1+i]
    name2 = bams[1+len(names)+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
# Rebind `bw` to the pipeline's original bigwigs; the loop below skips bw[0] (index offset of 1).
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
# Same three BED lists as for the scaled analysis, now read from diffPeaks_unscaled/.
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
# Unscaled diffPeaks figures. bw[0] is skipped: the tracks of the two conditions start
# at bw[1] and bw[1+len(names)].
# (chip.dropWeirdChromosomes on each peak file was tried here and is disabled.)
for i in range((len(bw)-1)//2):
    name1 = bw[1+i]
    name2 = bw[1+len(names)+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(
        peak, [name1, name2],
        bigwignames=['DMSO', 'VHL'],
        peaknames=['DMSO_peaks', 'common', 'VHL_peaks'],
        window=3000, folder="", title=name, numthreads=8,
        torecompute=True, refpoint='center',
        name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf',
        withDeeptools=True)

In [None]:
# Publish the unscaled diffPeaks PDFs into the results tree.
h.createFoldersFor('../results/'+project+'/plots/unscaled/diffPeaks/')
! cp ../../data/$project/diffPeaks_unscaled/*.pdf ../results/$project/plots/unscaled/diffPeaks/

# Also fetch the deepTools PDFs produced by the pipeline from the project bucket.
! gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/deepTools/**.pdf ../results/$project/plots/

In [None]:
# Upload the locally computed folders back to the project bucket.
! gsutil -m cp -r ../../data/$project/diffPeaks gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffPeaks_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/peakplot gs://amlproject/Chip/$project/

## MEF2D_degraded_v3

In [None]:
# This project combines two sequencing deliveries; loc1 / loc2 are their folders in
# gs://transfer-amlproject/.
project="MEF2D_degraded_v3"
loc1= "201222_MP8548_fastq"
loc2= "201204_MP8489_fastq"

In [2]:
! gsutil ls gs://transfer-amlproject

gs://transfer-amlproject/201120_MP8439_fastq/
gs://transfer-amlproject/210331_MP8777_fastq/
gs://transfer-amlproject/210514_MP8847_fastq/
gs://transfer-amlproject/Cobinding_additional/
gs://transfer-amlproject/MEF2D_degraded_v4/
gs://transfer-amlproject/MONOMAC_synMEF2CD/
gs://transfer-amlproject/RNP_MEIS1/
gs://transfer-amlproject/RNPv5/
gs://transfer-amlproject/additional_degraded_v1/
gs://transfer-amlproject/chipseq_MEF2C_2h_degraded/
gs://transfer-amlproject/chipseq_MEF2D_degraded/
gs://transfer-amlproject/chipseq_MEFF2CMEF2D_degraded/
gs://transfer-amlproject/chipseq_MF2C_degraded_v2/
gs://transfer-amlproject/slamseq_IRF8/
gs://transfer-amlproject/slamseq_YLK_MV411/
gs://transfer-amlproject/slamseq_inhibitor_spikeins/


In [4]:
#! gsutil ls gs://transfer-amlproject/210331_MP8777_fastq

gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_FLAG_A_MP8777_S94_R1_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_FLAG_A_MP8777_S94_R2_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_FLAG_B_MP8777_S95_R1_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_FLAG_B_MP8777_S95_R2_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_H3K27ac_A_MP8777_S98_R1_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_H3K27ac_A_MP8777_S98_R2_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_H3K27ac_B_MP8777_S99_R1_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_H3K27ac_B_MP8777_S99_R2_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_MED1_A_MP8777_S96_R1_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/20210331_DMSO_MED1_A_MP8777_S96_R2_001.fastq.gz
gs://transfer-amlproject/210331_MP8777_fastq/202

In [None]:
# Files of the second delivery (loc2).
fastq2 = ! gsutil ls gs://transfer-amlproject/$loc2/
fastq2

In [None]:
# Files of the first delivery (loc1).
fastq1 = ! gsutil ls gs://transfer-amlproject/$loc1/
fastq1

In [None]:
mkdir ../data/$project/ && mkdir ../data/$project/qc/

In [None]:
# Sequencing QC of the first delivery goes to qc/ ...
!gsutil -m cp gs://transfer-amlproject/$loc1/multiqc_report.html ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/$loc1/Reports/ ../data/$project/qc/
!gsutil -m cp -r gs://transfer-amlproject/$loc1/multiqc_data/ ../data/$project/qc/
    
# ... and that of the second delivery to qc2/, so the two reports do not overwrite each other.
!gsutil -m cp gs://transfer-amlproject/$loc2/multiqc_report.html ../data/$project/qc2/
!gsutil -m cp -r gs://transfer-amlproject/$loc2/Reports/ ../data/$project/qc2/
!gsutil -m cp -r gs://transfer-amlproject/$loc2/multiqc_data/ ../data/$project/qc2/

In [None]:
# Gather the top-level files of both deliveries in one folder named after the project.
# NOTE(review): both deliveries contain a multiqc_report.html (see the QC cell above),
# so the second copy overwrites the first one in this folder.
! gsutil -m cp gs://transfer-amlproject/$loc1/*  gs://transfer-amlproject/$project/
! gsutil -m cp gs://transfer-amlproject/$loc2/*  gs://transfer-amlproject/$project/

### analysis

In [None]:
# Delivered file-name stem -> final sample name (mpNNN-<cell line/condition>-<target>-rN).
# Applied by substring replacement in the copy loop below.
# NOTE(review): "20201204_VHL_MED1_MEF2D_degron_2h_2_MP8489_S56" is mapped to a MEF2C
# sample (mp948) although its stem says MED1; the 20201204 delivery has no VHL_MEF2C
# stem, so this is presumably a mislabelled tube. Confirm against the sample sheet.
rename = {
"20201222_FLAG_DMSO_2h_MP8548_S1": "mp931-MV411_MEF2D_NT_SC_63_DMSO-FLAG_MEF2D-r5",
"20201222_FLAG_VHL_2h_MP8548_S6": "mp932-MV411_MEF2D_NT_SC_63_VHL_2h-FLAG_MEF2D-r5",
"20201222_MED1_DMSO_2h_MP8548_S3": "mp933-MV411_MEF2D_NT_SC_63_DMSO-MED1-r3",
"20201222_MED1_VHL_2h_MP8548_S8": "mp934-MV411_MEF2D_NT_SC_63_VHL_2h-MED1-r3",
"20201222_MEF2C_DMSO_2h_MP8548_S5": "mp935-MV411_MEF2D_NT_SC_63_DMSO-MEF2C-r3",
"20201222_MEF2C_VHL_2h_MP8548_S10": "mp936-MV411_MEF2D_NT_SC_63_VHL_2h-MEF2C-r3",
"20201222_MYC_DMSO_2h_MP8548_S2": "mp937-MV411_MEF2D_NT_SC_63_DMSO-MYC-r3",
"20201222_MYC_VHL_2h_MP8548_S7": "mp938-MV411_MEF2D_NT_SC_63_VHL_2h-MYC-r3",
"20201222_POL_II_DMSO_2h_MP8548_S4": "mp939-MV411_MEF2D_NT_SC_63_DMSO-POLII_total-r3",
"20201222_POL_II_VHL_2h_MP8548_S9": "mp940-MV411_MEF2D_NT_SC_63_VHL_2h-POLII_total-r3",
"20201204_DMSO_FLAG_MEF2D_degron_2h_MP8489_S47": "mp941-MV411_MEF2D_NT_SC_63_DMSO-FLAG_MEF2D-r6",
"20201204_DMSO_MED1_MEF2D_degron_2h_MP8489_S49": "mp942-MV411_MEF2D_NT_SC_63_DMSO-MED1-r4",
"20201204_DMSO_MEF2C_MEF2D_degron_2h_MP8489_S51": "mp943-MV411_MEF2D_NT_SC_63_DMSO-MEF2C-r4",
"20201204_DMSO_MYC_MEF2D_degron_2h_MP8489_S48": "mp944-MV411_MEF2D_NT_SC_63_DMSO-MYC-r4",
"20201204_DMSO_POL_II_MEF2D_degron_2h_MP8489_S50": "mp945-MV411_MEF2D_NT_SC_63_DMSO-POLII_total-r4",
"20201204_VHL_FLAG_MEF2D_degron_2h_MP8489_S52": "mp946-MV411_MEF2D_NT_SC_63_VHL_2h-FLAG_MEF2D-r6",
"20201204_VHL_MED1_MEF2D_degron_2h_1_MP8489_S54": "mp947-MV411_MEF2D_NT_SC_63_VHL_2h-MED1-r4",
"20201204_VHL_MED1_MEF2D_degron_2h_2_MP8489_S56": "mp948-MV411_MEF2D_NT_SC_63_VHL_2h-MEF2C-r4",
"20201204_VHL_MYC_MEF2D_degron_2h_MP8489_S53": "mp949-MV411_MEF2D_NT_SC_63_VHL_2h-MYC-r4",
"20201204_VHL_POL_II_MEF2D_degron_2h_MP8489_S55": "mp950-MV411_MEF2D_NT_SC_63_VHL_2h-POLII_total-r4",}

In [None]:
# Copy every file of both deliveries to gs://amlproject/Chip/<project>/fastqs/ under
# its final name: apply all `rename` substitutions, then rewrite whichever delivery
# prefix (loc1 or loc2) the path carries.
for val in fastq1+fastq2:
    rep = val
    for k,v in rename.items():
        rep = rep.replace(k,v)
    rep = rep.replace('transfer-amlproject/'+loc1+'/',"amlproject/Chip/"+project+'/fastqs/').replace('transfer-amlproject/' + loc2 + '/', "amlproject/Chip/"+project+'/fastqs/')
    !gsutil cp $val $rep

In [None]:
# The MultiQC report was copied along with the fastqs; remove it so the folder only lists fastq files.
! gsutil rm gs://amlproject/Chip/$project/fastqs/multiqc_report.html

In [None]:
# Renamed files in the project bucket; the design table below is built from this list.
fastq = ! gsutil ls gs://amlproject/Chip/$project/fastqs/
fastq

In [1]:
# Build the design table for the nextflow chipseq run, one row per sample.
# `df` starts as a dict of column lists and becomes a DataFrame at the end; the key
# order here fixes the column order that the to_csv cell relies on.
# Requires `h` (genepy helper) and `gsheet` (sample sheet) to be loaded in the kernel:
# the recorded NameError came from running this cell before the imports cell.
df = {
"fastq_1": [],
"fastq_2": [],
"antibody": [],
"group": [],
"replicate": [],
"control": []
}
# presumably h.grouped(fastq, 2) yields consecutive (R1, R2) path pairs; confirm the listing is sorted that way
for val in h.grouped(fastq,2):
    # sample-sheet row whose id equals the file-name prefix before the first '-'
    row = gsheet[gsheet.id==val[0].split('/')[-1].split('-')[0]]
    # group = "MV4" + the part of the sheet name between '-MV4' and '-r'
    df['group'].append("MV4"+row['name'].values[0].split('-r')[0].split('-MV4')[1])
    # replicate number = how many rows of this group exist so far (1 for the first, 2 for the second, ...)
    df['replicate'].append(df['group'].count(df['group'][-1]))
    df['fastq_1'].append(val[0])
    df['fastq_2'].append(val[1])
    df['antibody'].append(row['protein'].values[0])
    df['control'].append("INPUT")
# Shared INPUT control; it has no antibody and no control of its own.
df['group'].append('INPUT')
df['replicate'].append(1)
df['fastq_1'].append('gs://amlproject/Chip/fastqs/paired_end/mp845-MV411-INPUT-r2_R1.fastq.gz')
# FIX: fastq_2 used to repeat the R1 file, which would feed the same mate twice to a
# paired-end run; point it at the R2 mate, as the MEF2D_degraded_v2 design does.
df['fastq_2'].append('gs://amlproject/Chip/fastqs/paired_end/mp845-MV411-INPUT-r2_R2.fastq.gz')
df['antibody'].append("")
df['control'].append("")
df = pd.DataFrame(df)
df

NameError: name 'h' is not defined

In [None]:
# Write the design file with the columns in the order
# group, replicate, fastq_1, fastq_2, antibody, control (positions 3,4,0,1,2,5 of the table built above).
df[['group', 'replicate', 'fastq_1', 'fastq_2', 'antibody', 'control']].to_csv(
    '../nextflow/'+project+'_design.csv', index=False)

In [None]:
project

### about

0. you need to have a google project set up with a billing account
1. you need to activate your APIs this way: https://cloud.google.com/life-sciences/docs/tutorials/nextflow?hl=fr
2. nextflow needs to be installed with this installation command `export NXF_MODE=google && curl https://get.nextflow.io | bash`

In [None]:
# Run `nextflow log` from the folder two levels up, where the ./nextflow launcher lives.
! cd ../../ && ./nextflow log ## to get access to the previous runs

In [None]:
#process chips 
# Run the jkobject/chipseq pipeline on the cloud profile (paired-end, GRCh38, spike-in
# handling, differential analysis skipped).
# NOTE(review): --input, -w, --outdir and --tracedir all refer to "MEF2C_2h" names,
# while this section's project is MEF2D_degraded_v3 and its design is written to
# ../nextflow/<project>_design.csv; confirm these are the intended paths.
! cd ../../ && ./nextflow run jkobject/chipseq --paired_end --seq_center 'DFCI' --email 'jkobject@gmail.com' --input AMLproject/nextflow/chipseq_MEF2C_2h_degraded_design.csv --genome GRCh38 --max_cpus 16 --skip_diff_analysis -profile jkcloud -w gs://workamlproject/MEF2C2hres --spiking --outdir gs://workamlproject/MEF2C2h --tracedir AMLproject/nextflow/MEF2Cdegraded2h/

In [None]:
# Copy the pipeline output from the work bucket into the project folder of the main bucket.
! gsutil -m cp -r gs://workamlproject/MEF2C2h/* gs://amlproject/Chip/$project/

In [None]:
# Print the contents of the droso_aligned/counts files, then list the folder so each value can be matched to its file.
! gsutil cat gs://amlproject/Chip/$project/droso_aligned/counts/*
! gsutil ls gs://amlproject/Chip/$project/droso_aligned/counts/

In [None]:
project

In [None]:
# now use the total counts in gs://amlproject/Chip/chipseq_MF2C_degraded_v2/multiqc/broadPeak/multiqc_report.html
# https://console.cloud.google.com/storage/browser/amlproject/Chip/chipseq_MF2C_degraded_v2/multiqc/broadPeak/multiqc_report.html

In [None]:
# Download what the local analysis needs: the BAM files and their indexes (*.ba*),
# the bigwig/ folder and the macs/ peak folder.
h.createFoldersFor('../../data/'+project+'/bwa/mergedLibrary/')
!gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/*.ba* ../../data/$project/bwa/mergedLibrary/
! gsutil -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/bigwig/ ../../data/$project/bwa/mergedLibrary/
! gsutil  -m cp -r gs://amlproject/Chip/$project/bwa/mergedLibrary/macs/ ../../data/$project/bwa/mergedLibrary/

computing scales from the excel sheet

In [None]:
# Per-sample scaling factors (values come from the excel sheet mentioned above).
# Index-aligned with bams[1:]: the fullDiffPeak cell pairs scales[i] with bams[1+i] and
# scales[i+10] with bams[11+i], so the first ten values belong to the first group of
# samples and the last ten to the second group.
scales = [
1.00,
1.00,
1.00,
0.70,
1.00,
0.10,
0.38,
1.00,
0.92,
1.00,
    
0.26,
0.83,
0.12,
1.00,
0.39,
1.00,
1.00,
0.40,
1.00,
0.03,
         ]

In [None]:
# Local BAM files; later cells use bams[0] as the control and bams[1:] as the samples.
bams = ! ls ../../data/$project/bwa/mergedLibrary/*.bam
bams

### on scaled data

In [None]:
! mkdir ../../data/$project/diffPeaks/ && ! mkdir ../../data/$project/diffData/

In [None]:
# Bigwig tracks produced by the pipeline (displayed for inspection).
wigs = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
wigs

In [None]:
# Scaled differential peak calling (full reprocessing).
# bams[0] is the control; sample i of the first group (bams[1+i]) is compared with
# sample i of the second group (bams[11+i]), each with its own scale factor
# (scales[i] and scales[i+10]).
for i in range(len(bams)//2):
    bam1 = bams[1+i]
    bam2 = bams[11+i]
    print(bam1,bam2)
    print(chip.fullDiffPeak(
        bam1, bam2,
        control1=bams[0],
        scaling=[scales[i], scales[i+10]],
        directory = "../../data/"+project+"/diffData/",
        res_directory = "../../data/"+project+"/diffPeaks/",
        pairedend=True))

In [None]:
# Scale factors already applied by the pipeline, read as strings from bigwig/scale/*.txt.
initscales = ! cat ../../data/$project/bwa/mergedLibrary/bigwig/scale/*.txt
initscales

In [None]:
# Combined factor per sample = manual scale * pipeline scale.
# initscales[0] is skipped (presumably the INPUT entry, mirroring bams[0]; confirm).
rescales = [val*float(initscales[1+i]) for i, val in enumerate(scales)]
rescales

In [None]:
# Regenerate bigwigs for every BAM except bams[0], applying the combined factors.
# Output presumably lands in ./bigwig/, which a later cell moves into recalib_bigwig/; confirm.
chip.bigWigFrom(bams[1:], 
                genome='GRCh38',scaling=rescales,
               numthreads=8)

In [None]:
!mkdir ../../data/$project/recalib_bigwig/

In [None]:
# Move the freshly generated tracks from ./bigwig/ into the project's recalib_bigwig/ folder.
!mv bigwig/* ../../data/$project/recalib_bigwig/

In [None]:
# Back up the recalibrated tracks to the project bucket.
! gsutil -m cp -r ../../data/$project/recalib_bigwig gs://amlproject/Chip/$project/

In [None]:
# Line count (`wc -l`) of every BED file in diffPeaks/, returned as a list of
# "<count> <path>" strings (the last element is empty because of the trailing newline).
os.popen('for i in $(ls ../../data/'+project+'/diffPeaks/*.bed); \
            do echo $(wc -l $i); \
            done').read().split('\n')

In [None]:
# Recalibrated bigwig tracks used for all "scaled" plots below.
bw = ! ls ../../data/$project/recalib_bigwig/*
bw

In [None]:
!mkdir ../results/$project/
!mkdir ../results/$project/plots
!mkdir ../results/$project/plots/heatmaps/

In [None]:
# MACS2 broadPeak files; the merge cell below pairs peaks[i] with peaks[i+10].
peaks = ! ls ../../data/$project/bwa/mergedLibrary/macs/broadPeak/*.broadPeak
peaks

In [None]:
# Target labels, two replicates (R1, R2) per target: ten entries, matching the offset of
# 10 used between the two conditions in the cells below.
names = ["FLAG_MEF2D_R1", "FLAG_MEF2D_R2", "MED1_R1", "MED1_R2", "MEF2C_R1", "MEF2C_R2", "MYC_R1", "MYC_R2", "POLII_total_R1", "POLII_total_R2",]

#### merging peaks VHL/DMSO

In [None]:
# For each target/replicate, pool the peaks of both conditions (peaks[i] and
# peaks[i+10]), merge them with chip.simpleMergePeaks and write the result as a
# headerless tab-separated BED file. `mpeaks` collects the paths in the order of `names`.
mpeaks = []
for i, val in enumerate(names):
    print(val)
    dmso = peaks[i]
    vhl = peaks[i+10]
    chip.simpleMergePeaks(
        pd.concat([chip.loadPeaks(dmso), chip.loadPeaks(vhl)])
    ).to_csv(
        '../../data/' + project + '/' + val + '_genomewide_merged.bed',
        sep='\t', header=False, index=False)
    mpeaks.append('../../data/'+project+'/'+val+'_genomewide_merged.bed')

#### GENOME WIDE comparison

In [None]:
# Suffixes appended to the target name to build per-condition plot and matrix file names.
dmsoname="_DMSO"
condname="_wMEF2D_ko_at2h"

In [None]:
# Plot every recalibrated track over the merged peak set of its target.
# Tracks 0-9 get the DMSO suffix and tracks 10-19 the degraded suffix; i % 10 picks the
# merged peak file of the matching target.
# FIX: the guard used to read `if i > 0: continue`, which skipped every track except
# the first one. The profile-overlay cell below reads a `<name>_mat.gz` for every
# target and condition (presumably written by this call), so all tracks must be processed.
for i, val in enumerate(bw):
    name = names[i-10]+condname if i//10 else names[i]+dmsoname
    print(name)
    chip.getPeaksAt(mpeaks[i%10], bigwigs = val, bigwignames= name, 
                    peaknames=['Macs2_Peaks'], window=3000, folder="", title=name, 
                    numthreads=8, refpoint="center", name='../../data/'+project+'/'+name+'_mat.pdf', 
                    withDeeptools=True, torecompute=True, legendLoc="lower-left")

#### making overlapping profiles

In [None]:
# Overlay, for each target/replicate, the DMSO and degraded profiles stored in the two
# `<target><suffix>_mat.gz` matrices and write `<target>_combined_mat.pdf`.
for i, val in enumerate(names):
    val1 = '../../data/'+project+'/' + val + dmsoname+'_mat.gz'
    val2 = '../../data/'+project+'/' + val + condname+'_mat.gz'
    print(val)
    chip.makeProfiles(
        matx=[val1,val2], matnames=['DMSO','VHL'], title=val, refpoint="center",
        name='../../data/'+project+'/'+val+'_combined_mat.pdf',
        legendLoc="lower-left")

In [None]:
# Publish every PDF written at the top of the project's data folder under plots/scaled/heatmaps/.
h.createFoldersFor('../results/'+project+'/plots/scaled/heatmaps/')
! cp ../../data/$project/*.pdf ../results/$project/plots/scaled/heatmaps/

In [None]:
# BED outputs of the scaled fullDiffPeak run, split by suffix (cond1 / cond2 / common);
# indexed in parallel in the next cell.
cond1peak = ! ls ../../data/$project/diffPeaks/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks/*common.bed
cond1peak

In [None]:
# One figure per target/replicate: both conditions' tracks over the cond1-only, common
# and cond2-only peak sets. torecompute=False is passed, unlike the genome-wide plots.
# (chip.dropWeirdChromosomes on each peak file was tried here and is disabled.)
for i in range(len(bw)//2):
    name1 = bw[i]
    name2 = bw[i+len(names)]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(
        peak, [name1, name2],
        bigwignames=['DMSO', 'VHL'],
        peaknames=['DMSO_peaks', 'common', 'VHL_peaks'],
        window=3000, folder="", title=name, numthreads=8, refpoint="center",
        name='../../data/'+project+'/diffPeaks/'+name+'_mat.pdf',
        withDeeptools=True, torecompute=False)

In [None]:
! mkdir ../results/$project/plots/scaled/diffPeaks/
! cp ../../data/$project/diffPeaks/*.pdf ../results/$project/plots/scaled/diffPeaks/

#### Looking at TSS

In [None]:
# Load the IRF8 target list from the slamseq results; it is matched against the
# 'Gene name' column of `ensembltss` further down.
IRF8targets = h.fileToList('../results/slamseqMax/IRF8targets.txt')

In [None]:
ls ../../data/chipseq_IRF8_degraded/recalib_bigwig/*

In [None]:
# POLII cond2 peaks of the chipseq_IRF8_degraded project (a different project from `project`).
# skiprows=1 drops the first line of the BED file (presumably a header/track line; confirm).
peaksVHL = pd.read_csv('../../data/chipseq_IRF8_degraded/diffPeaks/MV411_IRF8_NT_DMSO-POLII_total_R1_treat_pileup_vs_MV411_IRF8_NT_VHL-POLII_total_R1_treat_pileup_c3.0_cond2.bed', sep='\t',header=None, skiprows=1, names=['chrom','start','end','name','score'])
# drop the first three characters of the chromosome name (presumably the "chr" prefix,
# to match the bare names '1'..'22','X','Y' kept in `ensembltss`)
peaksVHL['chrom']= [i[3:] for i in peaksVHL['chrom']]
peaksVHL[['start','end']] = peaksVHL[['start','end']].astype(int)

In [None]:
# POLII cond1 peaks of the chipseq_IRF8_degraded project, loaded the same way as peaksVHL.
# skiprows=1 drops the first line of the BED file (presumably a header/track line; confirm).
peaksDMSO = pd.read_csv('../../data/chipseq_IRF8_degraded/diffPeaks/MV411_IRF8_NT_DMSO-POLII_total_R1_treat_pileup_vs_MV411_IRF8_NT_VHL-POLII_total_R1_treat_pileup_c3.0_cond1.bed', sep='\t', header=None, skiprows=1, names=['chrom','start','end','name','score'])
# drop the first three characters of the chromosome name (presumably the "chr" prefix)
peaksDMSO['chrom']= [i[3:] for i in peaksDMSO['chrom']]
peaksDMSO[['start','end']] = peaksDMSO[['start','end']].astype(int)

In [None]:
# POLII peaks common to both conditions (chipseq_IRF8_degraded project), loaded the same way.
# skiprows=1 drops the first line of the BED file (presumably a header/track line; confirm).
peakcommon = pd.read_csv('../../data/chipseq_IRF8_degraded/diffPeaks/MV411_IRF8_NT_DMSO-POLII_total_R1_treat_pileup_vs_MV411_IRF8_NT_VHL-POLII_total_R1_treat_pileup_c3.0_common.bed', sep='\t',header=None, skiprows=1, names=['chrom','start','end','name','score'])
# drop the first three characters of the chromosome name (presumably the "chr" prefix)
peakcommon['chrom']= [i[3:] for i in peakcommon['chrom']]
peakcommon[['start','end']] = peakcommon[['start','end']].astype(int)

In [None]:
# Clean the TSS table in place.
# NOTE(review): `ensembltss` is not created in this section; it must already be in the
# kernel. The cell rebinds it, so re-running it operates on the already-filtered table.
ensembltss['Chromosome/scaffold name'] = ensembltss['Chromosome/scaffold name'].astype(str)
# keep the autosomes plus X and Y only
ensembltss = ensembltss[ensembltss['Chromosome/scaffold name'].isin(['1', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2', '20', '21', '22', '3', '4', '5', '6', '7', '8', '9','X','Y'])]
# keep one row per distinct gene start position
ensembltss = ensembltss.drop_duplicates('Gene start (bp)')
ensembltss = ensembltss.reset_index(drop=True)
ensembltss = ensembltss.astype(str)
# add `loci` (TSS as int) and `chrom`, then sort by them; chrom is a string, so the
# chromosome order is lexicographic
ensembltss['loci'] = ensembltss['Transcription start site (TSS)'].astype(int)
ensembltss['chrom'] = ensembltss['Chromosome/scaffold name']
ensembltss = ensembltss.sort_values(by=['chrom','loci']).reset_index(drop=True)

In [None]:
# Pass each peak set through chip.substractPeaksTo together with the TSS table and write
# the result as a headerless tab-separated BED (the "TSSonly" naming suggests the peaks
# at a TSS are the ones kept; confirm against the helper).
# NOTE(review): the inputs come from chipseq_IRF8_degraded but the files are written
# into the current `project`'s diffPeaks folder; confirm this is intended.
chip.substractPeaksTo(peaksDMSO, ensembltss).to_csv('../../data/'+project+'/diffPeaks/polII_DMSO_TSSonly.bed',sep='\t',index=None,header=False)
chip.substractPeaksTo(peakcommon, ensembltss).to_csv('../../data/'+project+'/diffPeaks/polII_common_TSSonly.bed',sep='\t',index=None,header=False)
chip.substractPeaksTo(peaksVHL, ensembltss).to_csv('../../data/'+project+'/diffPeaks/polII_VHL_TSSonly.bed',sep='\t',index=None,header=False)

In [None]:
# Plot the two POLII tracks of the IRF8 degron experiment over the three TSS-only peak
# sets written by the previous cell (DMSO-only, common, VHL-only).
chip.getPeaksAt(['../../data/'+project+'/diffPeaks/polII_DMSO_TSSonly.bed', '../../data/'+project+'/diffPeaks/polII_common_TSSonly.bed', '../../data/'+project+'/diffPeaks/polII_VHL_TSSonly.bed'],
                bigwigs = ['../../data/chipseq_IRF8_degraded/recalib_bigwig/MV411_IRF8_NT_DMSO-POLII_total_R1.bw', '../../data/chipseq_IRF8_degraded/recalib_bigwig/MV411_IRF8_NT_VHL-POLII_total_R1.bw'], 
                bigwignames= ['IRF8_DMSO', 'IRF8_VHL'], 
                peaknames=['DMSO','common','VHL'], window=3000, folder="", title='polII_with_IRF8_degron', numthreads=4, 
                refpoint="center", 
                name='../../data/'+project+'/diffPeaks/polII_with_IRF8_degron_mat.pdf', 
                withDeeptools=True, torecompute=True, legendLoc="lower-left")

In [None]:
# TSS rows whose gene name is in the IRF8 target list.
IRF8tss = ensembltss[ensembltss['Gene name'].isin(IRF8targets)].reset_index(drop=True)

In [None]:
# `a[0]` is "<line count> <path>" for the DMSO TSS-only BED file.
a = ! wc -l ../../data/$project/diffPeaks/polII_DMSO_TSSonly.bed 

In [None]:
# (lines in the DMSO TSS-only file / number of DMSO peaks, number of DMSO peaks)
int(a[0].split(' ')[0])/len(peaksDMSO), len(peaksDMSO)

In [None]:
# DMSO peaks returned by substractPeaksTo for the IRF8-target TSSs, relative to the line count of the TSS-only file.
len(chip.substractPeaksTo(peaksDMSO, IRF8tss))/int(a[0].split(' ')[0])

In [None]:
# Rebind `a` to the line count of the common TSS-only BED file.
a = ! wc -l ../../data/$project/diffPeaks/polII_common_TSSonly.bed 

In [None]:
# (lines in the common TSS-only file / number of common peaks, number of common peaks)
int(a[0].split(' ')[0])/ len(peakcommon),  len(peakcommon)

In [None]:
# Common peaks returned by substractPeaksTo for the IRF8-target TSSs, relative to the line count of the TSS-only file.
len(chip.substractPeaksTo(peakcommon, IRF8tss))/int(a[0].split(' ')[0])

In [None]:
# Rebind `a` to the line count of the VHL TSS-only BED file.
a = ! wc -l ../../data/$project/diffPeaks/polII_VHL_TSSonly.bed 

In [None]:
len(peaksVHL)

In [None]:
# (lines in the VHL TSS-only file / number of VHL peaks, number of VHL peaks)
int(a[0].split(' ')[0])/len(peaksVHL),  len(peaksVHL)

In [None]:
# VHL peaks returned by substractPeaksTo for the IRF8-target TSSs, relative to the line count of the TSS-only file.
len(chip.substractPeaksTo(peaksVHL, IRF8tss))/int(a[0].split(' ')[0])

### on unscaled data

In [None]:
! mkdir ../../data/$project/diffPeaks_unscaled

In [None]:
# Differential peak calling on unscaled data: bams[1+i] vs bams[1+len(names)+i], without
# a `scaling` argument, and with the INPUT BAM given by path as control.
# NOTE(review): pairedend=False here while the scaled run above passes pairedend=True; confirm.
for i in range(int(len(bams)/2)):
    if i < 0:  # never true for range(); only useful as a manual "resume from i" knob
        continue
    name1 = bams[1+i]
    name2 = bams[1+len(names)+i]
    print(name1,name2)
    chip.fullDiffPeak(name1,name2, control1='../../data/'+project+'/bwa/mergedLibrary/INPUT_R1.mLb.clN.sorted.bam', directory = "../../data/"+project+"/diffData_unscaled/", res_directory = "../../data/"+project+"/diffPeaks_unscaled/",pairedend=False)

In [None]:
# Rebind `bw` to the pipeline's original bigwigs; the loop below skips bw[0] (index offset of 1).
bw = ! ls ../../data/$project/bwa/mergedLibrary/bigwig/*.bigWig
bw

In [None]:
# Same three BED lists as for the scaled analysis, now read from diffPeaks_unscaled/.
cond1peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond1.bed
cond2peak = ! ls ../../data/$project/diffPeaks_unscaled/*cond2.bed
commonpeak = ! ls ../../data/$project/diffPeaks_unscaled/*common.bed
commonpeak

In [None]:
# Unscaled diffPeaks figures. bw[0] is skipped: the tracks of the two conditions start
# at bw[1] and bw[1+len(names)].
# (chip.dropWeirdChromosomes on each peak file was tried here and is disabled.)
for i in range((len(bw)-1)//2):
    name1 = bw[1+i]
    name2 = bw[1+len(names)+i]
    peak = [cond1peak[i], commonpeak[i], cond2peak[i]]
    name = names[i]
    print(name1,name2)
    chip.getPeaksAt(
        peak, [name1, name2],
        bigwignames=['DMSO', 'VHL'],
        peaknames=['DMSO_peaks', 'common', 'VHL_peaks'],
        window=3000, folder="", title=name, numthreads=8,
        torecompute=True, refpoint='center',
        name='../../data/'+project+'/diffPeaks_unscaled/'+name+'_mat.pdf',
        withDeeptools=True)

In [None]:
# Publish the unscaled diffPeaks PDFs into the results tree.
h.createFoldersFor('../results/'+project+'/plots/unscaled/diffPeaks/')
! cp ../../data/$project/diffPeaks_unscaled/*.pdf ../results/$project/plots/unscaled/diffPeaks/

# Also fetch the deepTools PDFs produced by the pipeline from the project bucket.
! gsutil -m cp gs://amlproject/Chip/$project/bwa/mergedLibrary/deepTools/**.pdf ../results/$project/plots/

In [None]:
# Upload the locally computed folders back to the project bucket.
! gsutil -m cp -r ../../data/$project/diffPeaks gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffPeaks_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/diffData_unscaled gs://amlproject/Chip/$project/
! gsutil -m cp -r ../../data/$project/peakplot gs://amlproject/Chip/$project/