In [None]:
suppressMessages({
    library(data.table)
    library(dplyr)
    library(tidyverse)
    library(ggplot2)
    library(ChIPseeker)
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
    library(GenomicRanges)
    library(rtracklayer)
    library(IRanges)
    library(org.Hs.eg.db)
    library(ChIPpeakAnno)
})

In [None]:
# Load the annotated peak table holding all individual brain/sample peaks.
df <- data.table::fread(
    input = '../../_m/chipseeker_controlpeaks_stringent_annotation_homertss.tsv'
)


In [None]:
# Load the stringent H3K4me3 overlap peaks and label every row with one
# shared sample_id ('stringent_me3'); then show the table dimensions.
df_stringent <- as.data.frame(
    mutate(
        fread('../../overlap_peaks/_m/STRINGENT_H3K4me3_overlap_peaks_allregions_annotated.tsv'),
        sample_id = 'stringent_me3'
    )
)
dim(df_stringent)

In [None]:
# Build a lookup table that maps each H3K4me3 sample_id found in `df` to a
# short label (Brain1_me3, Brain2_me3, Brain3_me3).
# NOTE(review): labels are assigned purely by position, i.e. in the order the
# distinct sample_ids first appear in `df`. Confirm that this order really is
# Brain3, Brain2, Brain1 before trusting the mapping.

all_me3_tag <- data.frame(
    sample_id = unique(df$sample_id[grepl('H3K4me3', df$sample_id)])
) %>%
    mutate(old_peak_id = c('Brain3_me3','Brain2_me3','Brain1_me3'))

# Display the mapping for a visual check.
all_me3_tag


#7810

In [None]:
# Create a data frame with all (unique) H3K4me3 peaks identified in the 3 brains.
# Each peak gets a location key (loc_id = "seqnames:start-end") and a per-brain
# running identifier (peak_id = "<old_peak_id>_<k>"), which is also used as the
# row name.

all_me3 <- df %>%
    filter(grepl('H3K4me3', sample_id)) %>%
    # Join on an explicit key instead of relying on a natural join over
    # whatever columns the two tables happen to share.
    left_join(all_me3_tag, by = 'sample_id') %>%
    mutate(loc_id = paste0(seqnames, ':', start, '-', end)) %>%
    # NOTE(review): duplicates are dropped across ALL samples, before grouping,
    # so a location shared by several brains is kept only for the first row
    # encountered. Confirm this is the intended definition of "unique".
    distinct(loc_id, .keep_all = TRUE) %>%
    group_by(old_peak_id) %>%
    # seq_len() is safe for empty groups, unlike 1:n().
    mutate(peak_id = paste0(old_peak_id, '_', seq_len(n()))) %>%
    ungroup() %>%
    as.data.frame()


row.names(all_me3) <- all_me3$peak_id
                             
                             

In [None]:
# Give every stringent H3K4me3 peak a running identifier
# (peak_id = "<sample_id>_<k>") and use it as the row name.
me3_stringent <- df_stringent %>%
    # seq_len(n()) yields an empty sequence for an empty table, whereas
    # 1:n() would yield c(1, 0) and make mutate() fail.
    mutate(peak_id = paste0(sample_id, '_', seq_len(n()))) %>%
    as.data.frame()

row.names(me3_stringent) <- me3_stringent$peak_id

#me3_stringent           

In [None]:
#head(me3_stringent)

In [None]:
# Sanity check: number of unique H3K4me3 peaks per brain label, first as a
# table of old_peak_id, then by pattern-matching the brain name in peak_id.

table(all_me3$old_peak_id)


print(paste0('Brain 1 - H3K4me3: ', sum(grepl('Brain1', all_me3$peak_id))))
print(paste0('Brain 2 - H3K4me3: ', sum(grepl('Brain2', all_me3$peak_id))))
print(paste0('Brain 3 - H3K4me3: ', sum(grepl('Brain3', all_me3$peak_id))))

# Total number of rows, i.e. unique peaks across all three brains.
print(paste0('Total Unique H3K4me3 peaks: ', nrow(all_me3)))


In [None]:
# Now we will work with all the SETD1A samples (atlas & cst, brain1, brain2, brain3)


In [None]:


# Keep only the SETD1A rows of `df`.
setd1a <- filter(df, grepl('SETD1A', sample_id))

# Lookup table mapping each SETD1A sample_id to a short label
# (Brain<N>_cst / Brain<N>_atlas).
# NOTE(review): labels are assigned by position, in the order the distinct
# sample_ids first appear in `setd1a`. Confirm that order matches the label
# vector below.
setd1a_tags <- data.frame(sample_id = unique(setd1a$sample_id)) %>%
    mutate(old_peak_id = c('Brain3_cst','Brain3_atlas','Brain2_cst','Brain2_atlas','Brain1_cst','Brain1_atlas'))

# Display the mapping for a visual check.
setd1a_tags

In [None]:
# Create a data frame with all (unique) SETD1A atlas and SETD1A cst peaks
# identified in the 3 brains. Each peak gets a location key
# (loc_id = "seqnames:start-end") and a per-sample running identifier
# (peak_id = "<old_peak_id>_<k>"), which is also used as the row name.


all_setd1a <- df %>%
    filter(grepl('SETD1A', sample_id)) %>%
    # Join on an explicit key instead of relying on a natural join over
    # whatever columns the two tables happen to share.
    left_join(setd1a_tags, by = 'sample_id') %>%
    mutate(loc_id = paste0(seqnames, ':', start, '-', end)) %>%
    # NOTE(review): duplicates are dropped across ALL SETD1A samples, before
    # grouping, so a location shared by several samples is kept only for the
    # first row encountered. Confirm this is the intended definition of "unique".
    distinct(loc_id, .keep_all = TRUE) %>%
    group_by(old_peak_id) %>%
    # seq_len() is safe for empty groups, unlike 1:n().
    mutate(peak_id = paste0(old_peak_id, '_', seq_len(n()))) %>%
    ungroup() %>%
    as.data.frame()


row.names(all_setd1a) <- all_setd1a$peak_id

In [None]:
#all_setd1a

In [None]:
#tail(all_setd1a)

In [None]:
#sum(duplicated(liberal_me3$loc_id))
#4

In [None]:
# Sanity check: number of unique SETD1A peaks per brain and per sample type
# (atlas / cst), found by pattern-matching on peak_id, followed by the totals
# per sample type.


print(paste0('Brain 1 - SETD1A_Atlas: ', with(all_setd1a, sum(grepl('Brain1', peak_id) & grepl('atlas', peak_id)))))
print(paste0('Brain 1 - SETD1A_CST: ', with(all_setd1a, sum(grepl('Brain1', peak_id) & grepl('cst', peak_id)))))

print(paste0('Brain 2 - SETD1A_Atlas: ', with(all_setd1a, sum(grepl('Brain2', peak_id) & grepl('atlas', peak_id)))))
print(paste0('Brain 2 - SETD1A_CST: ', with(all_setd1a, sum(grepl('Brain2', peak_id) & grepl('cst', peak_id)))))

print(paste0('Brain 3 - SETD1A_Atlas: ', with(all_setd1a, sum(grepl('Brain3', peak_id) & grepl('atlas', peak_id)))))
print(paste0('Brain 3 - SETD1A_CST: ', with(all_setd1a, sum(grepl('Brain3', peak_id) & grepl('cst', peak_id)))))

print(paste0('Total Atlas: ', sum(grepl('atlas', all_setd1a$peak_id))))
print(paste0('Total CST: ', sum(grepl('cst', all_setd1a$peak_id))))



In [None]:
#ok, everything seems right! let's calculate the overlaps

In [None]:
# Overlap all unique H3K4me3 peaks (A) with all unique SETD1A peaks (B).
# NOTE(review): the two makeGRangesFromDataFrame(...) argument expressions are
# kept exactly as written. findOverlapsOfPeaks may label its outputs from the
# unevaluated argument expressions, so confirm before refactoring them.
ol <- findOverlapsOfPeaks(
    A = makeGRangesFromDataFrame(all_me3, keep.extra.columns = T),
    B = makeGRangesFromDataFrame(all_setd1a, keep.extra.columns = T),
    ignore.strand = TRUE,
    connectedPeaks = "merge"
)
#ol$overlappingPeaks

# Work with the first overlapping-peaks table as a plain data frame.
df_overlap <- as.data.frame(ol$overlappingPeaks[[1]])

# Append '_peaks2' to the names of columns 27 to 50.
# NOTE(review): the positions are hard-coded and presumably span the B-side
# (SETD1A) columns. Re-check them if the input column set changes.
names(df_overlap)[27:50] <- paste0(names(df_overlap)[27:50], '_peaks2')

# Write the full overlap table, including row names, as a tab-separated file.
fwrite(df_overlap, file = 'overlap_peakids_all_me3xsetd1a.tsv', sep = '\t', quote = FALSE, row.names = TRUE)

In [None]:
#now we have a large dataframe describing every overlap between each brain's SETD1A peaks and each brain's H3K4me3 peaks

In [None]:
# Inspect the size and the first rows of the overlap table.
dim(df_overlap)
head(df_overlap, n = 6L)

# How to read a row (example noted for row 1):
# peak 'Brain3_me3_1' from the Brain 3 H3K4me3 sample overlapped with
# 'Brain3_atlas_1', which is peak 1 of SETD1A Atlas from Brain 3.

In [None]:
#we could use this information to create a dataframe with all the quantities / percentage of overlapped peaks:

In [None]:
# Hard-coded universe size (total peak count) for each H3K4me3 set.
# NOTE(review): these numbers are typed in by hand. Presumably they come from
# the per-brain counts printed earlier; verify after any change to the input.
seta_tag <- local({
    universe <- c(Brain1_me3 = 7807, Brain2_me3 = 7822, Brain3_me3 = 8089)
    data.frame(SetA = names(universe), SetA_universe = unname(universe))
})


# Hard-coded universe size (total peak count) for each SETD1A set.
setb_tag <- local({
    universe <- c(
        Brain1_atlas = 6425, Brain1_cst = 5512,
        Brain2_atlas = 8516, Brain2_cst = 3836,
        Brain3_atlas = 9730, Brain3_cst = 4461
    )
    data.frame(SetB = names(universe), SetB_universe = unname(universe))
})




In [None]:
#df_overlap$peak_id[1:10]

In [None]:
#individual overlap by each brain me3 x each brain setd1a

# Cross-tabulate the overlap rows by H3K4me3 set (SetA) and SETD1A set (SetB),
# attach the hard-coded universe sizes, and express the count as a share of
# the SetB universe.
# NOTE: Overlap counts rows of df_overlap (overlapping peak pairs), and
# percentage_overlap is stored as a fraction (it is not multiplied by 100).
df_overlap_all <- table(df_overlap$old_peak_id, df_overlap$old_peak_id_peaks2) %>%
    as.data.frame() %>%
    # as.data.frame() on a table yields Var1 / Var2 / Freq.
    # NOTE(review): the bare rename('Var1' = 'SetA', ...) used before follows
    # the old = "new" convention, which presumably only worked because
    # S4Vectors::rename masks dplyr::rename under the current library load
    # order. Calling dplyr::rename explicitly (new = old) removes that
    # dependency.
    dplyr::rename(SetA = Var1, SetB = Var2, Overlap = Freq) %>%
    # Explicit join keys instead of natural joins.
    left_join(seta_tag, by = 'SetA') %>%
    left_join(setb_tag, by = 'SetB') %>%
    #mutate(percentage_overlap = Overlap / (SetA_universe + SetB_universe - Overlap)) %>%
    mutate(percentage_overlap = Overlap / SetB_universe) %>%
    relocate(SetA, SetB, SetA_universe, SetB_universe, Overlap, percentage_overlap)


df_overlap_all

# Write the summary without row names, as a tab-separated file.
fwrite(df_overlap_all, 'overlap_summary_all_me3xsetd1a.tsv', quote = FALSE, sep = '\t', row.names = FALSE)

In [None]:
# Example: to extract (e.g. for saving) all overlaps between one pair of sets,
# filter on both label columns. Here:
# Brain 3 H3K4me3 (Brain3_me3) x Brain 2 SETD1A atlas (Brain2_atlas)

df_cut <- subset(
    df_overlap,
    old_peak_id_peaks2 == 'Brain2_atlas' & old_peak_id == 'Brain3_me3'
)
dim(df_cut)
head(df_cut, n = 6L)


In [None]:
#head(df_overlap[,1:25])

### do the same thing, but now with the overlap between each setd1a marker/brain with stringent me3 peaks:

In [None]:
# Overlap the stringent H3K4me3 peaks (A) with all unique SETD1A peaks (B).
# NOTE(review): the two makeGRangesFromDataFrame(...) argument expressions are
# kept exactly as written. findOverlapsOfPeaks may label its outputs from the
# unevaluated argument expressions, so confirm before refactoring them.
ol <- findOverlapsOfPeaks(
    A = makeGRangesFromDataFrame(me3_stringent, keep.extra.columns = T),
    B = makeGRangesFromDataFrame(all_setd1a, keep.extra.columns = T),
    ignore.strand = TRUE,
    connectedPeaks = "merge"
)
#ol$overlappingPeaks

# Work with the first overlapping-peaks table as a plain data frame.
df_overlap <- as.data.frame(ol$overlappingPeaks[[1]])

# Append '_peaks2' to the names of columns 23 to 46.
# NOTE(review): the positions are hard-coded and presumably span the B-side
# (SETD1A) columns. Re-check them if the input column set changes.
names(df_overlap)[23:46] <- paste0(names(df_overlap)[23:46], '_peaks2')

# Write the full overlap table, including row names, as a tab-separated file.
fwrite(df_overlap, file = 'overlap_peakids_stringent_me3xsetd1a.tsv', sep = '\t', quote = FALSE, row.names = TRUE)

In [None]:
#colnames(df_overlap)[23:46]

In [None]:
#colnames(df_overlap)

In [None]:
# Hard-coded universe size (total peak count) for the stringent H3K4me3 set.
# NOTE(review): typed in by hand. Presumably this equals the row count of
# me3_stringent; verify after any change to the input.
seta_tag <- local({
    universe <- c(stringent_me3 = 7889)
    data.frame(SetA = names(universe), SetA_universe = unname(universe))
})


# Hard-coded universe size (total peak count) for each SETD1A set.
setb_tag <- local({
    universe <- c(
        Brain1_atlas = 6425, Brain1_cst = 5512,
        Brain2_atlas = 8516, Brain2_cst = 3836,
        Brain3_atlas = 9730, Brain3_cst = 4461
    )
    data.frame(SetB = names(universe), SetB_universe = unname(universe))
})


In [None]:
# Show rows x columns of the stringent H3K4me3 table, so the row count can be
# compared with the hard-coded SetA_universe.
c(nrow(me3_stringent), ncol(me3_stringent))

In [None]:
# Cross-tabulate the overlap rows by stringent H3K4me3 set (SetA) and SETD1A
# set (SetB), attach the hard-coded universe sizes, and express the count as a
# share of the SetB universe.
# NOTE: Overlap counts rows of df_overlap (overlapping peak pairs), and
# percentage_overlap is stored as a fraction (it is not multiplied by 100).
df_overlap_all <- table(df_overlap$sample_id, df_overlap$old_peak_id_peaks2) %>%
    as.data.frame() %>%
    # as.data.frame() on a table yields Var1 / Var2 / Freq.
    # NOTE(review): the bare rename('Var1' = 'SetA', ...) used before follows
    # the old = "new" convention, which presumably only worked because
    # S4Vectors::rename masks dplyr::rename under the current library load
    # order. Calling dplyr::rename explicitly (new = old) removes that
    # dependency.
    dplyr::rename(SetA = Var1, SetB = Var2, Overlap = Freq) %>%
    # Explicit join keys instead of natural joins.
    left_join(seta_tag, by = 'SetA') %>%
    left_join(setb_tag, by = 'SetB') %>%
    #mutate(percentage_overlap = Overlap / (SetA_universe + SetB_universe - Overlap)) %>%
    mutate(percentage_overlap = Overlap / SetB_universe) %>%
    relocate(SetA, SetB, SetA_universe, SetB_universe, Overlap, percentage_overlap)


df_overlap_all


# NOTE(review): this summary is written WITH row names, while the parallel
# 'overlap_summary_all_me3xsetd1a.tsv' is written without them. The setting is
# kept so the output file format does not change; align the two if no
# downstream reader depends on the extra index column.
fwrite(df_overlap_all, 'overlap_summary_stringent_me3xsetd1a.tsv', quote = FALSE, sep = '\t', row.names = TRUE)

In [None]:


# Example: keep only the overlap rows whose SETD1A side is Brain2_atlas,
# then show the size and the first rows of that subset.
df_cut <- subset(x = df_overlap, subset = old_peak_id_peaks2 == 'Brain2_atlas')
dim(df_cut)
head(df_cut, n = 6L)