# DAP-seq Analysis

This notebook visualizes the output of the DAP-seq Snakemake pipeline.

**Usage:**
1. Copy this notebook into your project directory (or open it directly).
2. Set `RESULTS_DIR` in the next cell to the path of your Snakemake output directory.
3. Optionally set `CONTROL` to the name of your input-control sample to exclude it from the tables and plots.
4. Run all cells.

In [None]:
# ── Configuration ────────────────────────────────────────────────────────────
# Point this at the Snakemake output directory (the 'output_dir' from config.yaml)
RESULTS_DIR = "/path/to/snakemake/output"

# Name of the input-control sample to exclude from analysis (set to None if none)
CONTROL = None

# ── Auto-discover samples ────────────────────────────────────────────────────
from pathlib import Path

results = Path(RESULTS_DIR)
# Raise explicitly instead of asserting: assertions are stripped under `python -O`.
if not results.is_dir():
    raise NotADirectoryError(f"RESULTS_DIR not found: {RESULTS_DIR}")

# Samples are those with MACS summits files. Only the trailing "_summits.bed"
# is cut off, so a sample whose own name contains "_summits" is kept intact.
_SUMMITS_SUFFIX = "_summits.bed"
all_samples = sorted(
    p.name[: -len(_SUMMITS_SUFFIX)]
    for p in (results / "MACS").glob(f"*{_SUMMITS_SUFFIX}")
)
samples = [s for s in all_samples if s != CONTROL]

# A mistyped CONTROL would otherwise silently exclude nothing.
if CONTROL is not None and CONTROL not in all_samples:
    print(f"Warning: CONTROL sample {CONTROL!r} not found among discovered samples.")

print(f"Found {len(all_samples)} total samples, {len(samples)} treatment samples:")
for s in samples:
    print(f"  - {s}")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib_venn import venn3

## QC Summary

In [None]:
# Report where the MultiQC HTML summary lives, if the pipeline produced one.
multiqc_report = results / "multiqc_report.html"
print(
    f"MultiQC report: {multiqc_report}"
    if multiqc_report.exists()
    else "MultiQC report not found."
)

## Peak Calling Summary

In [None]:
def _count_lines(path):
    """Return the number of lines in the text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon as
    counting finishes instead of being left for the garbage collector.
    """
    with open(path) as handle:
        return sum(1 for _ in handle)


# One row per treatment sample: peak counts from each caller, or None when the
# corresponding output file is missing.
rows = []
for sample in samples:
    macs_file = results / "MACS" / f"{sample}_summits.bed"
    gem_file = results / "GEM" / sample / f"{sample}.GEM_events.txt"

    macs_count = _count_lines(macs_file) if macs_file.exists() else None
    # GEM events file has a header line; clamp at 0 so an empty file is not -1
    gem_count = max(0, _count_lines(gem_file) - 1) if gem_file.exists() else None

    rows.append({"sample": sample, "MACS_peaks": macs_count, "GEM_peaks": gem_count})

df_peaks = pd.DataFrame(rows)
df_peaks

## Venn Diagrams — Peak Caller & Motif Overlap

In [None]:
def _venn_counts(sample):
    """Return the seven venn3 region sizes for *sample*.

    The list is ordered as (MACS only, GEM only, MACS&GEM only, Motif only,
    MACS&Motif only, GEM&Motif only, all three). The arithmetic treats
    ``compare_peak.bed`` as the MACS/GEM overlap, ``MACS_peak.bed`` as the
    MACS/Motif overlap and ``GEM_peak.bed`` as the GEM/Motif overlap; the
    triple overlap is the number of distinct column-3 values shared by the
    latter two tables.
    """
    compare_dir = results / "compare_bed"

    # Headerless tab-separated tables; only their row counts and column 3 are used.
    macs = pd.read_table(compare_dir / f"{sample}.MACS.bed", header=None)
    gem = pd.read_table(compare_dir / f"{sample}.GEM.bed", header=None)
    motif = pd.read_table(results / "fimo" / sample / "fimo.bed", header=None)
    macs_motif = pd.read_table(compare_dir / f"{sample}.MACS_peak.bed", header=None)
    gem_motif = pd.read_table(compare_dir / f"{sample}.GEM_peak.bed", header=None)
    macs_gem = pd.read_table(compare_dir / f"{sample}.compare_peak.bed", header=None)

    n_macs_motif = len(macs_motif)
    n_gem_motif = len(gem_motif)
    n_macs_gem = len(macs_gem)

    in_all_three = len(set(macs_motif[3]).intersection(gem_motif[3]))

    # Inclusion-exclusion: total minus both pairwise overlaps, plus the triple
    # overlap that was subtracted twice.
    return [
        len(macs) - n_macs_gem - n_macs_motif + in_all_three,
        len(gem) - n_macs_gem - n_gem_motif + in_all_three,
        n_macs_gem - in_all_three,
        len(motif) - n_macs_motif - n_gem_motif + in_all_three,
        n_macs_motif - in_all_three,
        n_gem_motif - in_all_three,
        in_all_three,
    ]


# Compute venn sets per sample; a sample whose inputs cannot be read is
# reported and skipped rather than aborting the whole cell.
venn_set = {}
for sample in samples:
    try:
        venn_set[sample] = _venn_counts(sample)
    except Exception as e:
        print(f"Skipping {sample}: {e}")

print(f"Computed venn sets for {len(venn_set)}/{len(samples)} samples.")

In [None]:
# Plot venn diagrams with first-round motif logos.
# Each sample takes two adjacent axes (motif logo on the left, venn diagram on
# the right), so a four-column grid holds two samples per row. Allocate exactly
# ceil(n / 2) rows (at least one) so the figure has no empty trailing rows.
n_rows = max(1, (len(venn_set) + 1) // 2)
f, ax = plt.subplots(n_rows, 4,
                     figsize=(24, max(8, len(venn_set) * 2)))
ax = ax.ravel()
# Hide every axis up front so unused cells stay blank.
for axi in ax:
    axi.set_axis_off()

for ix, sample in enumerate(venn_set):
    logo_ax, venn_ax = ax[2 * ix], ax[2 * ix + 1]

    venn_ax.set_title(sample)
    venn3(subsets=venn_set[sample],
          set_labels=('MACS', 'GEM', 'Motif'), ax=venn_ax)

    logo_ax.set_title(sample)
    logo = results / "meme" / sample / "logo1.png"
    # The logo is optional: a missing image leaves a titled, empty cell.
    if logo.exists():
        img = mpimg.imread(str(logo))
        logo_ax.imshow(img)

plt.tight_layout()
plt.show()

## Refined Motifs (Intersection Peaks)

In [None]:
# Plot venn diagrams with refined (intersection) motif logos.
# Each sample takes two adjacent axes (motif logo on the left, venn diagram on
# the right), so a four-column grid holds two samples per row. Allocate exactly
# ceil(n / 2) rows (at least one) so the figure has no empty trailing rows.
n_rows = max(1, (len(venn_set) + 1) // 2)
f, ax = plt.subplots(n_rows, 4,
                     figsize=(24, max(8, len(venn_set) * 2)))
ax = ax.ravel()
# Hide every axis up front so unused cells stay blank.
for axi in ax:
    axi.set_axis_off()

for ix, sample in enumerate(venn_set):
    logo_ax, venn_ax = ax[2 * ix], ax[2 * ix + 1]

    venn_ax.set_title(sample)
    venn3(subsets=venn_set[sample],
          set_labels=('MACS', 'GEM', 'Motif'), ax=venn_ax)

    logo_ax.set_title(sample)
    logo = results / "meme" / f"{sample}-intersection" / "logo1.png"
    # The logo is optional: a missing image leaves a titled, empty cell.
    if logo.exists():
        img = mpimg.imread(str(logo))
        logo_ax.imshow(img)

plt.tight_layout()
plt.show()