In [1]:
# Python-side setup for the R bridge used by the %%R cells in this notebook.
import rpy2
from rpy2.robjects import pandas2ri
# Switch on rpy2's pandas conversion support for this session.
pandas2ri.activate()

In [None]:
# Change the notebook's working directory to the visualization folder.
# NOTE(review): this is an absolute, user-specific cluster path, and a later
# %%R cell calls setwd() on a different directory (custom_pipeline) - confirm
# which directory the relative output paths (the PDFs) should resolve against.
%cd /beegfs/scratch/ric.broccoli/kubacki.michal/SRF_ChipSeq/Visualization

In [2]:
# Load the rpy2 IPython extension; it provides the %%R cell magic used below.
%load_ext rpy2.ipython

In [None]:
%%R
# Attach the R packages this notebook relies on, in a fixed order so that
# function masking between packages stays the same from run to run.
required_packages <- c(
  "ChIPseeker",
  "TxDb.Mmusculus.UCSC.mm10.knownGene",
  "clusterProfiler",
  "ggplot2"
)
for (pkg in required_packages) {
  library(pkg, character.only = TRUE)
}

# Make the pipeline directory the R working directory; relative output paths
# used by later cells resolve against it.
setwd("/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_ChipSeq/custom_pipeline")

In [7]:
%%R
# Sample identifiers for the two experimental groups. Later cells use these
# strings to index peak_list by name, and those names are derived from the
# narrowPeak file names, so the spelling (including letter case) must match
# the files exactly.
endogenous_samples <- c(
  "NeuM2", "NeuM3",
  "NSCM1", "NSCM2", "NSCM3",
  "IgM"
)
exogenous_samples <- c(
  "NeuV1", "NeuV2", "NeuV3",
  "NSCv1", "NSCv2", "NSCv3"
)

In [None]:
%%R
# Read every non-empty narrowPeak file in the peaks directory into peak_list,
# keyed by sample name (file name with the "_peaks.narrowPeak" suffix removed).
#
# list.files() interprets `pattern` as a regular expression, not a shell glob,
# so the suffix is written as an anchored regex with the dot escaped. The same
# regex is reused to strip the suffix, which keeps matching and naming in sync.
peaks_dir <- "/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_ChipSeq/custom_pipeline/results/peaks"
peak_suffix_regex <- "_peaks\\.narrowPeak$"

peak_files <- list.files(path = peaks_dir, pattern = peak_suffix_regex, full.names = TRUE)

# Fail early when the directory holds no matching files.
if (length(peak_files) == 0) {
  stop("No peak files found in the specified directory")
}

peak_list <- list()
for (file in peak_files) {
  size_bytes <- file.size(file)
  # file.size() returns NA for unreadable/vanished files; treat those like
  # empty files instead of letting `if (NA > 0)` abort the whole cell.
  if (!is.na(size_bytes) && size_bytes > 0) {
    sample_name <- sub(peak_suffix_regex, "", basename(file))
    peak_list[[sample_name]] <- readPeakFile(file)
  } else {
    warning(paste("Skipping empty file:", file))
  }
}

# Nothing usable was read: stop before the downstream cells run on an empty list.
if (length(peak_list) == 0) {
  stop("No valid peak files were read")
}

In [11]:
%%R
# Restrict peak_list to the configured samples, in the configured order.
#
# Indexing a list with names it does not contain silently yields NULL entries
# whose names are NA; those then surface as bogus rows in the peak-count table.
# Report missing samples explicitly and keep only the ones that were loaded.
requested_samples <- c(endogenous_samples, exogenous_samples)
missing_samples <- setdiff(requested_samples, names(peak_list))
if (length(missing_samples) > 0) {
  warning(paste("No peaks loaded for sample(s):",
                paste(missing_samples, collapse = ", ")))
}

# intersect() preserves the order of its first argument.
peak_list <- peak_list[intersect(requested_samples, names(peak_list))]
if (length(peak_list) == 0) {
  stop("None of the requested samples have loaded peaks")
}

In [12]:
%%R
# Bind the mm10 knownGene TxDb object to a short name; later cells pass it as
# the TxDb argument of getPromoters() and annotatePeak().
txdb <- TxDb.Mmusculus.UCSC.mm10.knownGene

In [None]:
%%R
# Build one tag matrix per sample over windows of 3000 bp upstream and
# downstream around promoters taken from txdb.
promoter <- getPromoters(TxDb=txdb, upstream=3000, downstream=3000)

# Samples with missing or unusable peaks are skipped with a warning, so
# tagMatrixList only ever contains entries that were actually computed.
tagMatrixList <- list()
for (sample_name in names(peak_list)) {
  sample_peaks <- peak_list[[sample_name]]

  # Guard 1: no peaks object at all for this sample.
  if (is.null(sample_peaks)) {
    warning(paste("Skipping NULL peaks for:", sample_name))
    next
  }

  # Guard 2: wrong class, or a GRanges with zero ranges.
  if (!is(sample_peaks, "GRanges") || length(sample_peaks) == 0) {
    warning(paste("Skipping invalid peaks for:", sample_name))
    next
  }

  tagMatrixList[[sample_name]] <- getTagMatrix(sample_peaks, windows=promoter)
}


In [17]:
%%R
# Separate endogenous and exogenous samples
tagMatrixList_endo <- tagMatrixList[endogenous_samples]
tagMatrixList_exo <- tagMatrixList[exogenous_samples]

In [None]:
%%R
# Write average TSS-centred binding profiles to a PDF, one page per sample group.
#
# plotAvgProf() returns a ggplot object. Inside an if-block of an %%R cell that
# object is never auto-printed, so it must be print()ed explicitly to reach the
# PDF device. For the same reason the page title is added with ggtitle(): base
# title() only annotates base-graphics plots.

# TRUE when `x` is a non-empty tag matrix.
is_valid_tag_matrix <- function(x) {
  !is.null(x) && is.matrix(x) && nrow(x) > 0
}

# Draw one PDF page for a sample group.
#   tag_matrices: named list of tag matrices (entries may be NULL)
#   group_label:  "Endogenous" or "Exogenous"; used in the page title
# Samples without a usable tag matrix are dropped with a warning; a placeholder
# page is drawn only when no sample in the group is usable.
draw_avg_profile_page <- function(tag_matrices, group_label) {
  usable <- vapply(tag_matrices, is_valid_tag_matrix, logical(1))
  if (any(!usable)) {
    warning(paste(sum(!usable), "sample(s) without a valid tag matrix skipped for",
                  tolower(group_label), "samples"))
  }
  if (any(usable)) {
    profile_plot <- plotAvgProf(tag_matrices[usable], xlim=c(-3000, 3000),
                                conf=0.95, resample=500, facet="row")
    print(profile_plot + ggplot2::ggtitle(paste(group_label, "Samples")))
  } else {
    plot.new()
    title(main=paste("No valid data for", tolower(group_label), "samples"))
  }
  invisible(NULL)
}

pdf("./ChIP_seq_avg_profiles.pdf", width=12, height=8)
# Close the device even when plotting fails, so a half-written PDF does not
# stay open and capture the output of later cells.
tryCatch({
  draw_avg_profile_page(tagMatrixList_endo, "Endogenous")
  draw_avg_profile_page(tagMatrixList_exo, "Exogenous")
}, finally = dev.off())
invisible(NULL)

In [23]:
%%R
# Annotate the peaks of every sample against txdb, treating +/- 3000 bp as the
# TSS region.
#
# The validity check is the same one used when building the tag matrices: a
# GRanges object with at least one range. Testing is.data.frame() here would
# reject those GRanges objects and silently turn every annotation into NULL.
#
# Result: peakAnnoList, a list with the same names as peak_list; samples
# without usable peaks are kept as NULL entries so that downstream cells can
# filter them with is.null().
peakAnnoList <- lapply(peak_list, function(peaks) {
  if (is.null(peaks) || !is(peaks, "GRanges") || length(peaks) == 0) {
    return(NULL)
  }
  annotatePeak(peaks, TxDb=txdb, tssRegion=c(-3000, 3000), verbose=FALSE)
})
names(peakAnnoList) <- names(peak_list)

In [None]:
%%R
# Write genomic-annotation bar charts to a PDF, one page per sample group.
#
# For a list of annotations plotAnnoBar() returns a ggplot object; it is
# print()ed explicitly because nothing is auto-printed inside an if-block of an
# %%R cell, and it is titled with ggtitle() because base title() cannot
# annotate a ggplot.

# Draw one PDF page for a sample group.
#   anno_list:   named list of peak annotations (entries may be NULL)
#   group_label: "Endogenous" or "Exogenous"; used in the page title
# NULL annotations are dropped; a placeholder page is drawn when none remain.
draw_anno_bar_page <- function(anno_list, group_label) {
  has_anno <- !vapply(anno_list, is.null, logical(1))
  if (any(has_anno)) {
    bar_plot <- plotAnnoBar(anno_list[has_anno])
    print(bar_plot + ggplot2::ggtitle(paste(group_label, "Samples")))
  } else {
    plot.new()
    title(main=paste("No valid data for", tolower(group_label), "samples"))
  }
  invisible(NULL)
}

pdf("ChIP_seq_genomic_annotation.pdf", width=12, height=8)
# Always close the device, even if one of the plots raises an error.
tryCatch({
  draw_anno_bar_page(peakAnnoList[endogenous_samples], "Endogenous")
  draw_anno_bar_page(peakAnnoList[exogenous_samples], "Exogenous")
}, finally = dev.off())
invisible(NULL)

In [None]:
%%R
# Write genomic-annotation pie charts to a PDF, one page per sample.
# (These pies show the annotation categories of each sample's peaks, not a
# per-chromosome distribution.)
#
# plotAnnoPie() draws a single annotation object with base graphics, so it is
# called once per sample rather than once on the whole list.

# Draw the pie pages for one sample group.
#   anno_list:   named list of peak annotations (entries may be NULL)
#   group_label: "Endogenous" or "Exogenous"; used in the page titles
# NULL annotations are dropped; a placeholder page is drawn when none remain.
draw_anno_pie_pages <- function(anno_list, group_label) {
  anno_list <- anno_list[!vapply(anno_list, is.null, logical(1))]
  if (length(anno_list) == 0) {
    plot.new()
    title(main=paste("No valid data for", tolower(group_label), "samples"))
    return(invisible(NULL))
  }
  for (i in seq_along(anno_list)) {
    plotAnnoPie(anno_list[[i]])
    # Page-level heading placed in the outer margin so it does not depend on
    # which panel plotAnnoPie() left active.
    mtext(paste0(group_label, " Samples - Peak Distribution: ", names(anno_list)[i]),
          side=3, line=-2, outer=TRUE, font=2)
  }
  invisible(NULL)
}

pdf("ChIP_seq_peak_distribution.pdf", width=12, height=8)
# Always close the device, even if one of the plots raises an error.
tryCatch({
  draw_anno_pie_pages(peakAnnoList[endogenous_samples], "Endogenous")
  draw_anno_pie_pages(peakAnnoList[exogenous_samples], "Exogenous")
}, finally = dev.off())
invisible(NULL)

In [None]:
%%R
# Tabulate the number of peaks per sample together with its group labels.
peak_numbers <- sapply(peak_list, length)
sample_ids <- names(peak_numbers)

# Condition: membership in endogenous_samples; everything else is "Exogenous".
condition_labels <- ifelse(sample_ids %in% endogenous_samples, "Endogenous", "Exogenous")

# Cell type: names containing "Neu" are neurons; every other name is "NSC".
# NOTE(review): this means the "IgM" sample is labelled "NSC" - confirm that is
# the intended grouping for that sample.
cell_type_labels <- ifelse(grepl("Neu", sample_ids), "Neuron", "NSC")

peak_numbers_df <- data.frame(
  Sample = sample_ids,
  PeakCount = peak_numbers,
  Condition = condition_labels,
  CellType = cell_type_labels
)

In [None]:
%%R
# Bar chart of peak counts per sample, one facet per condition, coloured by
# cell type.
#
# The plot is bound to a name so that it can be both displayed in the notebook
# and handed to ggsave() explicitly; without `plot =`, ggsave() falls back to
# whatever ggplot happened to be created or printed last.
peak_counts_plot <- ggplot(peak_numbers_df, aes(x = Sample, y = PeakCount, fill = CellType)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ Condition, scales = "free_x") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Peak Counts Across Samples", y = "Number of Peaks")

# Only the value of a cell's last expression is auto-displayed, so print here.
print(peak_counts_plot)
ggsave("peak_counts_comparison.pdf", plot = peak_counts_plot, width = 12, height = 6)