In [None]:
suppressMessages({
    library(data.table)
    library(dplyr)
    library(tidyverse)
    library(ggplot2)
    library(readxl)
    library(purrr)
    library(regioneR)
    #library(BSgenome.Hsapiens.UCSC.hg19)
    library(GenomicRanges)
    library(TxDb.Hsapiens.UCSC.hg38.knownGene)
})

In [None]:
# Keep only the dataset TSVs whose path mentions the sz_pgc3_gwas_byregion set.
filepath <- grep(
    "sz_pgc3_gwas_byregion",
    Sys.glob("../../../../../datasets/*/_m/*.tsv"),
    value = TRUE
)
filepath

# File stems (basename without extension). The overlap loop further down compares
# them against the `class` column and uses them as output folder names.
paths <- filepath %>%
    basename() %>%
    tools::file_path_sans_ext()
paths

# Read every matching table and stack them into a single data.table.
annotation_groups <- filepath %>%
    lapply(fread) %>%
    rbindlist()
head(annotation_groups)
tail(annotation_groups)
table(annotation_groups$class)

In [None]:
# Paths of the "*_10set.tsv" peak tables (under chipseeker/peaks_annotated, so
# presumably ChIPseeker-annotated peaks - confirm), one entry per matching file.
files_path <- "../../../../chipseeker/peaks_annotated/*/_m/*_10set.tsv" %>%
    Sys.glob()
files_path

In [None]:
# Load the BED file as a GRanges object. Its first three columns (V1-V3, the
# names fread assigns when the file has no header row) become seqnames/start/end.
# NOTE(review): judging by the file name this is an hg38 blacklist limited to
# 24 chromosomes - confirm; the object is not referenced in the cells shown here.
custom_hg38 <- makeGRangesFromDataFrame(
    dplyr::rename(
        as.data.frame(fread("../../../input/custom_hg38/_m/hg38_blacklist_24chr.bed")),
        seqnames = V1,
        start = V2,
        end = V3
    )
)

In [None]:
# Preview which sample_id values each peak file contributes once the sample
# filter is applied. This cell only prints; `df` and `samples` are left holding
# the values from the last file read.
# seq_along() is used instead of 1:length() so that an empty `files_path`
# (glob matched nothing) runs zero iterations instead of trying to read
# files_path[1] == NA.
for (i in seq_along(files_path)) {

    # Keep only the SETD1A peak sets and the promoter/enhancer classes of interest.
    df <- fread(files_path[i]) %>%
        as.data.frame() %>%
        subset(grepl('STRINGENT_SETD1A_AtlasAndCST_POSITIVE|LIBERAL_SETD1A_AtlasAndCST_NEGATIVE|LIBERAL_SETD1A_AtlasAndCST_POSITIVE|^Active_Promoter|^Active_Enhancers|^Inactive_Enhancers',sample_id))

    # Broader alternative filter, kept for reference:
    # df <- fread(files_path[i]) %>%
    #     as.data.frame() %>%
    #     subset(grepl('STRINGENT|LIBERAL',sample_id))

    samples <- unique(df$sample_id)

    print('#####')
    print(samples)
    print('####')

}
    

In [None]:
# Build the region annotation table from sheet 1 of the supplementary Excel file.
# Every original column is kept; four columns are added:
#   seqnames - "chr" + Chromosome, with chr23/chr24 relabelled as chrX/chrY
#   start    - `merge-LEFT`
#   end      - `merge-RIGHT` (switch both to another column pair if necessary)
#   class    - constant label 'sz_pgc3_gwas_byregion'
# Warnings raised while reading the workbook are suppressed.
df <- suppressWarnings(
    read_excel(
        "../../../../../datasets/sz_pgc3_gwas_byregion/_h/Supplementary Table 3.xls",
        sheet = 1,
        skip = 0
    )
) %>%
    plyr::arrange(Chromosome, as.numeric(Chromosome)) %>%
    mutate(
        seqnames = paste0('chr', Chromosome),
        seqnames = sub('^chr23$', 'chrX', seqnames),
        seqnames = sub('^chr24$', 'chrY', seqnames),
        start = `merge-LEFT`,
        end = `merge-RIGHT`,
        class = 'sz_pgc3_gwas_byregion'
    ) %>%
    as.data.frame()

# `sz_annot` is the name the overlap loop joins against; `df` is reused there.
sz_annot <- df
head(sz_annot)
tail(sz_annot)


In [None]:
# For every peak file, every annotation class (`paths`) and every retained
# sample_id, write two TSVs into a folder named after the class:
#   <sample>_overlap-regions.tsv      - peak/region overlaps joined to sz_annot
#   <sample>_peaknumber_by-region.tsv - regions hit by >= 1 peak, with the count
#
# NOTE(review): output file names do not include the peak file, so a sample_id
# that occurs in more than one file is overwritten by the last file processed -
# confirm this is intended.
#
# seq_along()/seq_len() replace 1:length()/1:nrow() so that empty inputs run
# zero iterations instead of iterating over c(1, 0).
for (i in seq_along(files_path)) {

    # Keep only the SETD1A peak sets and the promoter/enhancer classes of interest.
    df <- fread(files_path[i]) %>%
        as.data.frame() %>%
        subset(grepl('STRINGENT_SETD1A_AtlasAndCST_POSITIVE|LIBERAL_SETD1A_AtlasAndCST_NEGATIVE|LIBERAL_SETD1A_AtlasAndCST_POSITIVE|^Active_Promoter|^Active_Enhancers|^Inactive_Enhancers',sample_id))

    samples <- unique(df$sample_id)

    for (i2 in seq_along(paths)) {

        # Regions of the current class, kept as a plain table. The GRanges that
        # was previously built just before this line was overwritten by this
        # subset before any use, and nrow()/filter()/left_join() below operate on
        # a data.frame, so that dead conversion has been removed. The variable
        # name is historical.
        annotation_granges <- subset(annotation_groups, class == paths[i2])

        # Output folder. It already exists from the second peak file onwards, so
        # the "already exists" warning is silenced.
        dir.create(paths[i2], showWarnings = FALSE)

        for (z in seq_along(samples)) {

            df_sample <- df %>%
                filter(sample_id == samples[z]) %>%
                #dplyr::sample_frac(0.10) %>% ## testing only: subsample the peaks
                dplyr::select(seqnames, start, end, sample_id) %>%
                makeGRangesFromDataFrame(keep.extra.columns = TRUE)

            output_overlap <- overlapRegions(df_sample, annotation_granges)

            # Assigning a length-1 value to a column of a zero-row data.frame is
            # an error, so a sample/class pair without any overlap used to abort
            # the whole run. Skip it instead; there is nothing to write.
            if (NROW(output_overlap) == 0) {
                message('No overlaps for ', samples[z], ' in ', paths[i2], '; skipping')
                next
            }

            output_overlap$sample_id <- df_sample$sample_id[1]
            df3 <- dplyr::left_join(
                output_overlap, sz_annot,
                by = c("chr" = "seqnames", "startB" = "start", "endB" = "end")
            )
            fwrite(df3, paste0(paths[i2],'/',samples[z],'_overlap-regions.tsv'),
                   sep = '\t', quote = FALSE, row.names = FALSE)

            # Count, region by region, how many peaks of this sample overlap it.
            x <- annotation_granges
            x$n_peaks <- 0
            x$peak_class <- samples[z]

            for (npeak in seq_len(nrow(annotation_granges))) {
                n_peaks <- numOverlaps(df_sample, annotation_granges[npeak, ],
                                       count.once = FALSE)
                x$n_peaks[npeak] <- n_peaks
            }

            # Keep only regions with at least one peak, then attach the
            # annotation columns of the matching sz_annot region.
            x <- x %>%
                filter(n_peaks > 0)
            x <- dplyr::left_join(
                x, sz_annot,
                by = c("seqnames" = "seqnames", "start" = "start", "end" = "end")
            )

            fwrite(x, paste0(paths[i2],'/',samples[z],'_peaknumber_by-region.tsv'),
                   sep = '\t', quote = FALSE, row.names = FALSE)

        }
    }
}
                

In [None]:
# Show the R session details (R version, platform, attached packages) for this run.
sessionInfo()