In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

# Specify HOMER 5' Peak files

In [None]:
# Count-table columns that are summed per peak for the minimum-count filter.
count_cols = ['nexus']
# Threshold compared against that per-peak sum (peaks with sum >= min_count pass).
min_count = -1
# Both input tables share the same peak-file prefix and differ only in suffix.
peaks_prefix = 'nanog-chip-nexus-sites.bowtie2.dedup.minraw_4.txt'
peaks_quantifications_filepath = f'{peaks_prefix}.rlogs.txt'
peaks_counts_filepath = f'{peaks_prefix}.counts.txt'

# Load peak quantifications

In [None]:
# Read the tab-separated quantification table.
peak_quantifications_df = pd.read_csv(peaks_quantifications_filepath, sep='\t')
cols = list(peak_quantifications_df.columns)

# Add stable alias columns by position: the first column is taken as the peak
# identifier, and the last two columns as the nexus and patchcap values.
for alias, source_col in (('PeakID', cols[0]), ('nexus', cols[-2]), ('patchcap', cols[-1])):
    peak_quantifications_df[alias] = peak_quantifications_df[source_col]

peak_quantifications_df

# Load peak counts

In [None]:
# Read the tab-separated counts table.
peak_counts_df = pd.read_csv(peaks_counts_filepath, sep='\t')
cols = list(peak_counts_df.columns)

# Add stable alias columns by position: the first column is taken as the peak
# identifier, and the last two columns as the nexus and patchcap counts.
for alias, source_col in (('PeakID', cols[0]), ('nexus', cols[-2]), ('patchcap', cols[-1])):
    peak_counts_df[alias] = peak_counts_df[source_col]

# Per-peak total over the configured count columns; used for the min_count filter.
peak_counts_df['sum'] = peak_counts_df.loc[:, count_cols].sum(axis=1)


# Calculate Log2 Fold Change between nexus and patchcap control

In [None]:
# IDs of peaks whose summed counts reach the min_count threshold.
meets_min_count = peak_counts_df['sum'] >= min_count
peak_ids_with_min_count = list(peak_counts_df.loc[meets_min_count, 'PeakID'])

In [None]:
# Per-peak nexus / patchcap values, indexed by PeakID.
signal_df = peak_quantifications_df.set_index('PeakID')[['nexus', 'patchcap']]

# Shift both columns by one common offset so that the smallest value across
# both samples becomes exactly 1. The offset must SUBTRACT the global minimum:
# adding it (as before) leaves non-positive values whenever the minimum is
# <= -0.5, which turns the ratio below into NaN/inf under log2.
log2fc_offset = 1 - signal_df.min().min()
log2fc_df = signal_df + log2fc_offset

# Log2 ratio of the shifted nexus signal over the shifted patchcap control.
log2fc_df['log2fc'] = np.log2(log2fc_df['nexus'] / log2fc_df['patchcap'])
log2fc_df

# Calculate Rank of Log2FC score, as well as sum of coverage at each peak

In [None]:
# Bring the per-peak log2 fold change back onto the quantification table,
# looking each PeakID up in a PeakID -> log2fc dictionary.
log2fc_by_peak_id = log2fc_df['log2fc'].to_dict()
peak_quantifications_df['log2fc'] = peak_quantifications_df['PeakID'].map(log2fc_by_peak_id)

# Rank of each peak by log2 fold change (pandas default ranking options).
peak_quantifications_df['log2fc_rank'] = peak_quantifications_df['log2fc'].rank()

# Row-wise total of the nexus and patchcap columns.
peak_quantifications_df['sum'] = peak_quantifications_df.loc[:, ['nexus', 'patchcap']].sum(axis=1)

peak_quantifications_df

In [None]:
# One 100-bin histogram per score column.
hist_columns = ['log2fc', 'log2fc_rank', 'sum', 'nexus']
peak_quantifications_df[hist_columns].hist(bins=100)

In [None]:
# Underscore-joined names of the count columns, used as a tag in output file names.
count_cols_str = '_'.join(count_cols)

# Score peaks by 5' coverage in nexus samples

In [None]:
# Column of peak_quantifications_df written as the BED score.
score_type = 'nexus'
# Genome FASTA path; its `.fai` index is also used as the bedtools genome file.
genome_fa = 'mm10.fa'
# Bases added on both sides of each interval by `bedtools slop -b`.
slop = 200
# Value passed to MEPP's --margin option.
margin = 5
# Value passed to MEPP's --pval option. (A dead `pval=0.05` assignment that was
# immediately overwritten by this one has been removed.)
pval = 0.0001
# Output paths encode the parameters that produced them.
bed_filepath = f'{peaks_quantifications_filepath}.min_count_{min_count}_in_{count_cols_str}.scored_by_{score_type}_nexus_vs_patchcap.bed'
mepp_filepath = f'{bed_filepath}.slop_{slop}.margin_{margin}.pval_{pval}.mepp'
motifs_filepath = 'homer.motifs.txt'


In [None]:
bed_df = peak_quantifications_df[['Chr','Start','End','PeakID',score_type,'Strand','sum']].sort_values(by = ['Chr', 'Start']).copy()
bed_df.to_csv(bed_filepath, sep = '\t', header = False, index = False)
! wc -l {bed_filepath}
! head {bed_filepath}

# Download external data

In [None]:
%%bash -s "$genome_fa"
# The genome FASTA path is handed over from the Python variable `genome_fa` as
# the first positional argument: Python variables are NOT visible inside a
# %%bash cell, so the previous "$genome_fa" / "$GENOME_FILEPATH" references
# expanded to empty strings.
GENOME_FA="${1:?genome FASTA path must be passed as the first argument}"

# Download genome
GENOME_URL="http://hgdownload.cse.ucsc.edu/goldenpath/mm10/bigZips/mm10.fa.gz"
wget -nc -O "$GENOME_FA".gz "$GENOME_URL"
zcat "$GENOME_FA".gz > "$GENOME_FA"

# Index genome (writes "$GENOME_FA".fai, which bedtools slop uses later)
samtools faidx "$GENOME_FA"

# Download motifs
wget -nc -O homer.motifs.txt https://raw.githubusercontent.com/npdeloss/mepp/main/data/homer.motifs.txt


# Run MEPP Analysis
* Start with scored bed file of 5' end peaks
* Expand intervals to get sequences from +/- 200bp of 5' ends
* Deduplicate clusters of overlapping intervals, keeping the one with the most coverage
* Create scored fasta file from scored bed file
* Analyze scored sequences with MEPP

In [None]:
# Shell pipeline that turns the scored BED file into MEPP results:
#   1. pad each interval by `slop` bp on both sides,
#   2. cluster overlapping same-strand intervals (cluster id is appended as column 8),
#   3. keep one interval per cluster, the one with the largest column 7 (`sum`),
#   4. re-sort, drop the helper columns, and extract scored sequences,
#   5. run MEPP (timed) on the scored FASTA read from stdin.
# Changes from the previous version: `--margin` was passed twice and is now
# passed once, and the bash-only `&>` redirect is replaced by the POSIX form
# `> file 2>&1` (under a plain `sh`, `&>` backgrounds the pipeline instead of
# redirecting stderr).
mepp_cmd = (
    f'bedtools slop -i {bed_filepath} -g {genome_fa}.fai -b {slop} '
    f'| bedtools cluster -s -i - '
    f'| sort -k8,8n -k7,7nr | awk \'!a[$8]++\' '
    f'| bedtools sort -i - |cut -f1-6 '
    f'| python -m mepp.get_scored_fasta -fi {genome_fa} '
    f'-bed - '
    f'| $(which time) --verbose python -m mepp.cli '
    f'--fa - '
    f'--motifs {motifs_filepath} '
    f'--out {mepp_filepath} '
    f'--perms 100 '
    f'--batch 1000 '
    f'--dgt 50 '
    f'--jobs 15 '
    f'--margin {margin} '
    f'--pval {pval} '
    f'--gjobs 15 '
    f'--nogpu '
    f'--dpi 100 '
    f'--orientations +/- '
    f'> {mepp_filepath}.log 2>&1'
)
print(mepp_cmd)

In [None]:
%%time

for cmd in tqdm([mepp_cmd]):
    print(cmd)
    ! {cmd}

# Display links to HTML output

In [None]:
from IPython.display import display, Markdown

In [None]:

def _mepp_report_link(label, report_name):
    """Return a Markdown link labelled `label` to `report_name` under `mepp_filepath`."""
    return '[' + label + '](' + mepp_filepath + '/' + report_name + ')'


# Links for the + orientation reports.
mepp_results_table_fwd_md = _mepp_report_link('Results table, + orientation', 'results_table_orientation_fwd.html')
mepp_clustermap_fwd_md = _mepp_report_link('Clustermap, + orientation', 'clustermap_orientation_fwd.html')

# Links for the - orientation reports.
mepp_results_table_rev_md = _mepp_report_link('Results table, - orientation', 'results_table_orientation_rev.html')
mepp_clustermap_rev_md = _mepp_report_link('Clustermap, - orientation', 'clustermap_orientation_rev.html')

# Links for the combined-orientation reports.
mepp_results_table_both_md = _mepp_report_link('Results table, both orientations', 'results_table_orientation_fwd-rev.html')
mepp_clustermap_both_md = _mepp_report_link('Clustermap, both orientations', 'clustermap_orientation_fwd-rev.html')

In [None]:
# Show only the combined-orientation links; the per-orientation (fwd / rev)
# link strings are defined as well but are not displayed here.
for link_md in (mepp_results_table_both_md, mepp_clustermap_both_md):
    display(Markdown(link_md))