In [2]:
library(goldmine)
library(ggplot2)
library(dplyr)
library(parallel)
library(pbapply)

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: generics


Attaching package: ‘generics’


The following objects are masked from ‘package:base’:

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union



Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
    unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following object is masked from ‘pac

In [3]:
repeat_elt_annot <- function(bin, repeat_elt, ncores = detectCores() - 1) {
  bin$repNameID <- NA
  bin$repName <- NA
  bin$repClass <- NA
  bin$repFamily <- NA
  bin$re_width <- NA
  
  message(Sys.time(), ': Finding overlaps between bins and repetitive elements...')
  ovl <- findOverlaps(bin, repeat_elt) %>% as.data.frame()
  ovl_spl <- split(ovl, ovl$queryHits)
  
  message(Sys.time(), ': Annotating ', length(ovl_spl), ' bins with repetitive elements using ', ncores, ' cores...')
  
  if(length(ovl_spl) > 0) {
    # Parallel processing function
    process_overlap <- function(x) {
      query_idx <- unique(x$queryHits)
      list(
        query_idx = query_idx,
        repNameID = paste0(x$subjectHits, collapse = ','),
        repName = paste0(repeat_elt$repName[x$subjectHits], collapse = ','),
        repClass = paste0(repeat_elt$repClass[x$subjectHits], collapse = ','),
        repFamily = paste0(repeat_elt$repFamily[x$subjectHits], collapse = ','),
        re_width = paste0(repeat_elt$re_width[x$subjectHits], collapse = ',')
      )
    }
    
    # Set progress bar options
    pboptions(type = "txt", char = "=", style = 3)
    
    # Process in parallel with progress bar
    results <- pblapply(ovl_spl, process_overlap, cl = ncores)
    
    # Vectorized application of results back to bin
    message(Sys.time(), ': Applying results back to bins (vectorized)...')
    
    # Extract all indices and values
    all_indices <- sapply(results, function(x) x$query_idx)
    repNameIDs <- sapply(results, function(x) x$repNameID)
    repNames <- sapply(results, function(x) x$repName)
    repClasses <- sapply(results, function(x) x$repClass)
    repFamilies <- sapply(results, function(x) x$repFamily)
    re_widths <- sapply(results, function(x) x$re_width)
    
    # Vectorized assignment
    bin$repNameID[all_indices] <- repNameIDs
    bin$repName[all_indices] <- repNames
    bin$repClass[all_indices] <- repClasses
    bin$repFamily[all_indices] <- repFamilies
    bin$re_width[all_indices] <- re_widths
  }
  
  message(Sys.time(), ': Annotation complete!')
  return(bin)
}

rep_enrich_calc <- function(bin_annot_re,repFamily='Alu',repeat_elt,bin_size=10,threshold=0.7,
                            re.filter=T,max.ovl.re.only=T) {
  message(Sys.time(), ': Compute repeat enrichment for ', repFamily, '...')
  boolean_column <- paste0(repFamily,'_or_not')
  bin_annot_re <- bin_annot_re[bin_annot_re[[boolean_column]],]
  bin_annot_re <- tidyr::separate_rows(bin_annot_re,
                                               c('repNameID','repName','repClass','repFamily'),
                                               sep = ",") %>% as.data.frame()
  rebin_count <- bin_annot_re %>% count(X,repNameID,repName)
  rebin_count$re_width <- repeat_elt$re_width[as.integer(rebin_count$repNameID)]
  rebin_count$perct_width_match <- rebin_count$n*bin_size/rebin_count$re_width
  rebin_count$pass <- rebin_count$perct_width_match >= threshold
  # Filter based on percentage width match
  if (re.filter) {
    message(Sys.time(), ': Only retaining REs with percentage width match at least ', threshold*100, '% ...')
    rebin_count <- rebin_count[rebin_count$pass,]
  }
  # Keep only the RE with the highest percentage width match per motif cluster
  if (max.ovl.re.only) {
    rebin_count <- rebin_count %>% group_by(X) %>% slice_max(perct_width_match, n = 1) %>% 
      slice_max(re_width, n = 1) %>% as.data.frame()
  }
  rebin_count$repFamily <- sub(":.*", "",rebin_count$repName)
  return(rebin_count)
}

In [4]:
# Variables
wd <- '~/Desktop/Research/scPASU_pipeline_runs/Ureter10_scPASU_run/outputs/differentiation_stage_cellranger_peakcount/'
setwd(wd)
fprefix <- 'all_TU_disjoint_utr3'

In [4]:
repeat_elt <- read.delim('~/Downloads/hg38_rmsk/rmsk.txt',header = F)
colnames(repeat_elt) <- c('bin','swScore','milliDiv','milliDel','milliIns','genoName',
                          'genoStart','genoEnd','genoLeft','strand','repName','repClass',
                          'repFamily','repStart','repEnd','repLeft','id')
colnames(repeat_elt)[c(6,7,8)] <- c('chr','start','end')
repeat_elt$repClass <- apply(repeat_elt,1,function(x){
  conc <- paste0(x[13:12],collapse = ':')
  return(conc)
})
repeat_elt$repName <- apply(repeat_elt,1,function(x){
  conc <- paste0(x[12:11],collapse = ':')
  return(conc)
})
repeat_elt <- repeat_elt %>% mutate(re_width = end - start + 1)
repeat_elt_gr <- makeGRanges(repeat_elt,strand = T)

#### Lengthening

In [5]:
filedir=paste0('sea_out_lgthn_',fprefix)
lgthn_sea_clst_grouped <- read.delim(paste0(filedir,'/',"motif_cluster_detection_res_all_TU_disjoint_utr3.tsv"), header = T)
head(lgthn_sea_clst_grouped)

Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_downstream_motif,most_downstream_motif_start
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<chr>,<int>
1,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-5,MEME-1,MEME-3",False,MEME-5,8875487,MEME-3,8875701
2,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-3,MEME-1,MEME-5",False,MEME-3,8876388,MEME-5,8876606
3,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-2,MEME-1,MEME-3",False,MEME-5,69482286,MEME-3,69482067
4,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,69459527,MEME-4,69459270
5,AAMDC,TU20683,TU20683_UTR0,chr11,+,77918097,77918433,"MEME-3,MEME-2,MEME-5,MEME-4",False,MEME-3,77918146,MEME-4,77918387
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,116942715,MEME-4,116942970


In [6]:
lgthn_sea_res <- read.delim(paste0(filedir,'/sea_res_processed_',fprefix,'.tsv'), header = T)
head(lgthn_sea_res)

Unnamed: 0_level_0,seq_name,site_strand,site_start,pval,flank1,site,flank2,meme_name,strand,cis,chr,range_start,range_end,site_width,start,end
Unnamed: 0_level_1,<chr>,<chr>,<int>,<lgl>,<lgl>,<chr>,<lgl>,<chr>,<chr>,<lgl>,<chr>,<int>,<int>,<int>,<int>,<int>
1,TU13400_UTR5644,+,653,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,-,False,chr6,116274858,116278520,32,116277836,116277868
2,TU17531_UTR8696,+,1803,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,+,True,chr9,124810410,124814886,32,124812212,124812244
3,TU15342_UTR7163,+,303,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,-,False,chr7,139043515,139047597,32,139047263,139047295
4,TU24917_UTR14518,-,640,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,-,True,chr13,113324845,113325550,32,113324879,113324911
5,TU23065_UTR13222,+,1965,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,+,True,chr12,131943584,131945897,32,131945548,131945580
6,TU8804_UTR28907,-,3429,,,TGCAGTGGCGCGATCTCGGCTCACTGCAACCT,,MEME-5,+,False,chr4,127832872,127840734,32,127836300,127836332


In [7]:
table(lgthn_sea_res[lgthn_sea_res$meme_name == 'MEME-1',]$site_width) # All 47
table(lgthn_sea_res[lgthn_sea_res$meme_name == 'MEME-2',]$site_width) # All 50
table(lgthn_sea_res[lgthn_sea_res$meme_name == 'MEME-3',]$site_width) # All 40
table(lgthn_sea_res[lgthn_sea_res$meme_name == 'MEME-4',]$site_width) # All 41
table(lgthn_sea_res[lgthn_sea_res$meme_name == 'MEME-5',]$site_width) # All 32


  47 
5194 


   50 
10147 


  40 
7558 


  41 
4262 


  32 
8308 

In [8]:
lgthn_motif_width <- c('MEME-1' = 47, 'MEME-2' = 50, 'MEME-3' = 40, 'MEME-4' = 41, 'MEME-5' = 32)
lgthn_sea_clst_grouped$most_upstream_motif_end <- apply(lgthn_sea_clst_grouped,1,function(x) {
    meme_name <- x[10]
    width <- lgthn_motif_width[meme_name] %>% unname() %>% as.integer()
    most_upstream_motif_end <- as.integer(x[11]) + width - 1
    return(most_upstream_motif_end)
})
lgthn_sea_clst_grouped$most_downstream_motif_end <- apply(lgthn_sea_clst_grouped,1,function(x) {
    meme_name <- x[12]
    width <- lgthn_motif_width[meme_name] %>% unname() %>% as.integer()
    most_downstream_motif_end <- as.integer(x[13]) + width - 1
    return(most_downstream_motif_end)
})
lgthn_sea_clst_grouped$start <- ifelse(lgthn_sea_clst_grouped$strand=='+',
                                         lgthn_sea_clst_grouped$most_upstream_motif_start,
                                         lgthn_sea_clst_grouped$most_downstream_motif_start)
lgthn_sea_clst_grouped$end <- ifelse(lgthn_sea_clst_grouped$strand=='+',
                                       lgthn_sea_clst_grouped$most_downstream_motif_end,
                                       lgthn_sea_clst_grouped$most_upstream_motif_end)
lgthn_sea_clst_grouped <- lgthn_sea_clst_grouped %>% group_by(seq_name) %>% 
                            mutate(motif_cluster_id = paste0(seq_name,'_',row_number())) %>% 
                            as.data.frame() %>%
                            ungroup()
lgthn_sea_clst_grouped$start <- ifelse(lgthn_sea_clst_grouped$strand=='+',
                                         lgthn_sea_clst_grouped$most_upstream_motif_start,
                                         lgthn_sea_clst_grouped$most_downstream_motif_start)
lgthn_sea_clst_grouped$end <- ifelse(lgthn_sea_clst_grouped$strand=='+',
                                       lgthn_sea_clst_grouped$most_downstream_motif_end,
                                       lgthn_sea_clst_grouped$most_upstream_motif_end)
head(lgthn_sea_clst_grouped)


Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_downstream_motif,most_downstream_motif_start,most_upstream_motif_end,most_downstream_motif_end,start,end,motif_cluster_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<chr>,<int>,<dbl>,<dbl>,<int>,<dbl>,<chr>
1,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-5,MEME-1,MEME-3",False,MEME-5,8875487,MEME-3,8875701,8875518,8875740,8875487,8875740,TU22196_UTR1_1
2,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-3,MEME-1,MEME-5",False,MEME-3,8876388,MEME-5,8876606,8876427,8876637,8876388,8876637,TU22196_UTR2_1
3,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-2,MEME-1,MEME-3",False,MEME-5,69482286,MEME-3,69482067,69482317,69482106,69482067,69482317,TU5339_UTR1_1
4,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,69459527,MEME-4,69459270,69459566,69459310,69459270,69459566,TU5339_UTR2_1
5,AAMDC,TU20683,TU20683_UTR0,chr11,+,77918097,77918433,"MEME-3,MEME-2,MEME-5,MEME-4",False,MEME-3,77918146,MEME-4,77918387,77918185,77918427,77918146,77918427,TU20683_UTR0_1
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,116942715,MEME-4,116942970,116942754,116943010,116942715,116943010,TU16086_UTR0_1


In [9]:
lgthn_sea_clst_grouped_spl <- split(lgthn_sea_clst_grouped,lgthn_sea_clst_grouped$motif_cluster_id)
lnuc <- lapply(lgthn_sea_clst_grouped_spl,function(x){
    width <- x$end - x$start + 1
    df <- matrix(nrow = width, ncol = 5) %>% as.data.frame()
    colnames(df) <- c('motif_cluster_id','chr','strand','start','end')
    df$motif_cluster_id <- x$motif_cluster_id
    df$chr <- x$chr
    df$start <- seq(x$start,x$end)
    df$end <- seq(x$start,x$end)
    df$strand <- x$strand
    return(df)
})
lnuc <- do.call('rbind',lnuc)
lnuc_gr <- makeGRanges(lnuc,strand=T)

In [10]:
lnuc_re_annot <- repeat_elt_annot(lnuc_gr, repeat_elt_gr, ncores = 4)
lnuc_re_annot_bu <- lnuc_re_annot
lnuc_re_annot <- lnuc_re_annot %>% as.data.frame()

2025-11-04 21:05:00.225226: Finding overlaps between bins and repetitive elements...

2025-11-04 21:05:09.33501: Annotating 632251 bins with repetitive elements using 4 cores...





2025-11-04 21:08:38.240283: Applying results back to bins (vectorized)...

2025-11-04 21:08:40.739501: Annotation complete!



In [11]:
repFamily_idx <- grep('repFamily',colnames(lnuc_re_annot))
lnuc_re_annot$Alu_or_not <- apply(lnuc_re_annot,1,function(x){
  if(grepl('Alu',x[repFamily_idx])){
    return(TRUE)
  } else {
    return(FALSE)
  }
})
head(lnuc_re_annot)

Unnamed: 0_level_0,seqnames,start,end,width,strand,X,repNameID,repName,repClass,repFamily,re_width,Alu_or_not
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
1,chr5,38934887,38934887,1,+,TU10009_UTR0_1,,,,,,False
2,chr5,38934888,38934888,1,+,TU10009_UTR0_1,,,,,,False
3,chr5,38934889,38934889,1,+,TU10009_UTR0_1,,,,,,False
4,chr5,38934890,38934890,1,+,TU10009_UTR0_1,,,,,,False
5,chr5,38934891,38934891,1,+,TU10009_UTR0_1,,,,,,False
6,chr5,38934892,38934892,1,+,TU10009_UTR0_1,,,,,,False


In [12]:
lnuc_re_annot_Alu_enrich <- rep_enrich_calc(bin_annot_re = lnuc_re_annot,
                repFamily='Alu',repeat_elt = repeat_elt_gr, bin_size = 1, 
                threshold = 0.5, re.filter = T, max.ovl.re.only = T)

2025-11-04 21:08:46.670341: Compute repeat enrichment for Alu...



2025-11-04 21:08:57.607461: Only retaining REs with percentage width match at least 50% ...



In [13]:
head(lnuc_re_annot_Alu_enrich)

Unnamed: 0_level_0,X,repNameID,repName,n,re_width,perct_width_match,pass,repFamily
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<lgl>,<chr>
1,TU10026_UTR0_1,3827340,Alu:SINE:AluJb,294,308,0.9545455,True,Alu
2,TU10040_UTR0_1,3828992,Alu:SINE:AluSz6,153,298,0.5134228,True,Alu
3,TU10056_UTR0_1,3836602,Alu:SINE:AluSg,289,299,0.9665552,True,Alu
4,TU10102_UTR0_1,3852976,Alu:SINE:AluSx,294,303,0.970297,True,Alu
5,TU10109_UTR2_1,3857056,Alu:SINE:AluSz,294,312,0.9423077,True,Alu
6,TU10109_UTR6_1,3857486,Alu:SINE:AluYm1,293,311,0.9421222,True,Alu


In [14]:
summary(lnuc_re_annot_Alu_enrich$perct_width_match)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5000  0.8019  0.9114  0.8622  0.9733  1.0000 

In [15]:
lgthn_sea_clst_grouped$Alu_enriched <- lnuc_re_annot_Alu_enrich$repName[match(lgthn_sea_clst_grouped$motif_cluster_id,
                                                              lnuc_re_annot_Alu_enrich$X)]
lgthn_sea_clst_grouped$Alu_percent_width_overlap <- lnuc_re_annot_Alu_enrich$perct_width_match[match(lgthn_sea_clst_grouped$motif_cluster_id,
                                                              lnuc_re_annot_Alu_enrich$X)]
lgthn_sea_clst_grouped$start <- NULL
lgthn_sea_clst_grouped$end <- NULL
lgthn_sea_clst_grouped$motif_cluster_id <- NULL
lgthn_sea_clst_grouped <- lgthn_sea_clst_grouped %>% 
        relocate(most_upstream_motif_end, .after = most_upstream_motif_start) %>%
        relocate(most_downstream_motif_end, .after = most_downstream_motif_start)
head(lgthn_sea_clst_grouped)
write.table(lgthn_sea_clst_grouped,
            file = paste0(filedir,'/','motif_cluster_detection_res_',fprefix,'_Alu_annotation.tsv'),
            sep = '\t',quote = F,row.names = F)

Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_upstream_motif_end,most_downstream_motif,most_downstream_motif_start,most_downstream_motif_end,Alu_enriched,Alu_percent_width_overlap
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<dbl>,<chr>,<int>,<dbl>,<chr>,<dbl>
1,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-5,MEME-1,MEME-3",False,MEME-5,8875487,8875518,MEME-3,8875701,8875740,,
2,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-3,MEME-1,MEME-5",False,MEME-3,8876388,8876427,MEME-5,8876606,8876637,Alu:SINE:AluSz,0.8372881
3,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-2,MEME-1,MEME-3",False,MEME-5,69482286,69482317,MEME-3,69482067,69482106,,
4,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,69459527,69459566,MEME-4,69459270,69459310,Alu:SINE:AluJb,0.9671053
5,AAMDC,TU20683,TU20683_UTR0,chr11,+,77918097,77918433,"MEME-3,MEME-2,MEME-5,MEME-4",False,MEME-3,77918146,77918185,MEME-4,77918387,77918427,Alu:SINE:AluJb,0.9554795
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-3,MEME-1,MEME-2,MEME-5,MEME-4",True,MEME-3,116942715,116942754,MEME-4,116942970,116943010,Alu:SINE:AluY,0.9638158


In [16]:
turef_lgthn_mc_genes <- read.delim(paste0(wd,'turef_lgthn_mc_genes.txt'))
lgthn_sea_clst_grouped_outside_discovery_set <- lgthn_sea_clst_grouped[!(lgthn_sea_clst_grouped$tu %in% turef_lgthn_mc_genes$tu),]
write.table(lgthn_sea_clst_grouped_outside_discovery_set,
            file = paste0(wd,filedir,'/','motif_cluster_detection_res_outside_discovery_set_',fprefix,'_Alu_annotation.tsv'),
            sep = '\t',quote = F,row.names = F)

In [17]:
message('Number of TUs with full or partial lengthening motif clusters: ', lgthn_sea_clst_grouped %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of TUs with full or partial lengthening motif cluster with Alu enrichment: ', lgthn_sea_clst_grouped %>%
      filter(!is.na(Alu_enriched)) %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of motif clusters: ', nrow(lgthn_sea_clst_grouped))
message('Number of motif clusters with Alu enrichment: ', lgthn_sea_clst_grouped %>%
      filter(!is.na(Alu_enriched)) %>%
      nrow()
    )
message('Number of motif clusters without Alu enrichment: ', lgthn_sea_clst_grouped %>%
      filter(is.na(Alu_enriched)) %>%
      nrow()
    )

Number of TUs with full or partial lengthening motif clusters: 3791

Number of TUs with full or partial lengthening motif cluster with Alu enrichment: 2126

Number of motif clusters: 4822

Number of motif clusters with Alu enrichment: 2436

Number of motif clusters without Alu enrichment: 2386



In [7]:
message('Number of TUs outside discovery set with full or partial lengthening motif clusters: ', lgthn_sea_clst_grouped_outside_discovery_set %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of TUs outside discovery set with full or partial lengthening motif cluster with Alu enrichment: ', lgthn_sea_clst_grouped_outside_discovery_set %>%
      filter(!is.na(Alu_enriched)) %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of motif clusters: ', nrow(lgthn_sea_clst_grouped_outside_discovery_set))
message('Number of motif clusters with Alu enrichment: ', lgthn_sea_clst_grouped_outside_discovery_set %>%
      filter(!is.na(Alu_enriched)) %>%
      nrow()
    )
message('Number of motif clusters without Alu enrichment: ', lgthn_sea_clst_grouped_outside_discovery_set %>%
      filter(is.na(Alu_enriched)) %>%
      nrow()
    )

Number of TUs outside discovery set with full or partial lengthening motif clusters: 3772

Number of TUs outside discovery set with full or partial lengthening motif cluster with Alu enrichment: 2114

Number of motif clusters: 4797

Number of motif clusters with Alu enrichment: 2422

Number of motif clusters without Alu enrichment: 2375



#### Shortening

In [18]:
filedir=paste0('sea_out_shrtn_',fprefix)
shtn_sea_clst_grouped <- read.delim(paste0(filedir,'/',"motif_cluster_detection_res_all_TU_disjoint_utr3.tsv"), header = T)
head(shtn_sea_clst_grouped)

Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_downstream_motif,most_downstream_motif_start
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<chr>,<int>
1,A1BG,TU34454,TU34454_UTR0,chr19,-,58345183,58347025,"MEME-5,MEME-3,MEME-1",False,MEME-5,58345641,MEME-1,58345403
2,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-3,MEME-2,MEME-1",False,MEME-3,8875503,MEME-1,8875676
3,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-1,MEME-2,MEME-3",False,MEME-1,8876412,MEME-3,8876579
4,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-3,MEME-2,MEME-1;MEME-5,MEME-4,MEME-2,MEME-1",False,MEME-5,69482325,MEME-1,69482091
5,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-5,MEME-3,MEME-2,MEME-1",False,MEME-5,69462648,MEME-1,69462420
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-1,MEME-2,MEME-3,MEME-5",False,MEME-1,116942739,MEME-5,116942971


In [19]:
shtn_sea_res <- read.delim(paste0(filedir,'/sea_res_processed_',fprefix,'.tsv'), header = T)
head(shtn_sea_res)

Unnamed: 0_level_0,seq_name,site_strand,site_start,pval,flank1,site,flank2,meme_name,strand,cis,chr,range_start,range_end,site_width,start,end
Unnamed: 0_level_1,<chr>,<chr>,<int>,<lgl>,<lgl>,<chr>,<lgl>,<chr>,<chr>,<lgl>,<chr>,<int>,<int>,<int>,<int>,<int>
1,TU3972_UTR23332,-,1579,,,GATCTGCCTGCCTCGGCCTCCCAAAGTGCTGGGCTTACAAG,,MEME-1,+,False,chr2,69879002,69881385,41,69880580,69880621
2,TU4470_UTR23681,+,2122,,,GATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAAG,,MEME-1,+,True,chr2,167870018,167874046,41,167872139,167872180
3,TU27298_UTR16133,-,890,,,GATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAAG,,MEME-1,-,True,chr15,52115100,52122760,41,52121830,52121871
4,TU483_UTR400,+,759,,,GATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAAG,,MEME-1,+,True,chr1,42798223,42800828,41,42798981,42799022
5,TU18362_UTR9436,-,4039,,,GATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAAG,,MEME-1,-,True,chr9,134903232,134909801,41,134905722,134905763
6,TU19025_UTR9914,+,733,,,GATCCGCCCGCCTCGGCCTCCCAAAGTGCTGGGATTACAAG,,MEME-1,+,True,chr10,102900698,102901900,41,102901430,102901471


In [20]:
table(shtn_sea_res[shtn_sea_res$meme_name == 'MEME-1',]$site_width) # All 41
table(shtn_sea_res[shtn_sea_res$meme_name == 'MEME-2',]$site_width) # All 41
table(shtn_sea_res[shtn_sea_res$meme_name == 'MEME-3',]$site_width) # All 43
table(shtn_sea_res[shtn_sea_res$meme_name == 'MEME-4',]$site_width) # All 50
table(shtn_sea_res[shtn_sea_res$meme_name == 'MEME-5',]$site_width) # All 41


   41 
14080 


  41 
6996 


  43 
6807 


  50 
1615 


  41 
5963 

In [21]:
shtn_motif_width <- c('MEME-1' = 41, 'MEME-2' = 41, 'MEME-3' = 43, 'MEME-4' = 50, 'MEME-5' = 41)
shtn_sea_clst_grouped$most_upstream_motif_end <- apply(shtn_sea_clst_grouped,1,function(x) {
    meme_name <- x[10]
    width <- shtn_motif_width[meme_name] %>% unname() %>% as.integer()
    most_upstream_motif_end <- as.integer(x[11]) + width - 1
    return(most_upstream_motif_end)
})
shtn_sea_clst_grouped$most_downstream_motif_end <- apply(shtn_sea_clst_grouped,1,function(x) {
    meme_name <- x[12]
    width <- shtn_motif_width[meme_name] %>% unname() %>% as.integer()
    most_downstream_motif_end <- as.integer(x[13]) + width - 1
    return(most_downstream_motif_end)
})
shtn_sea_clst_grouped$start <- ifelse(shtn_sea_clst_grouped$strand=='+',
                                         shtn_sea_clst_grouped$most_upstream_motif_start,
                                         shtn_sea_clst_grouped$most_downstream_motif_start)
shtn_sea_clst_grouped$end <- ifelse(shtn_sea_clst_grouped$strand=='+',
                                       shtn_sea_clst_grouped$most_downstream_motif_end,
                                       shtn_sea_clst_grouped$most_upstream_motif_end)
shtn_sea_clst_grouped <- shtn_sea_clst_grouped %>% group_by(seq_name) %>% 
                            mutate(motif_cluster_id = paste0(seq_name,'_',row_number())) %>% 
                            as.data.frame() %>%
                            ungroup()
shtn_sea_clst_grouped$start <- ifelse(shtn_sea_clst_grouped$strand=='+',
                                         shtn_sea_clst_grouped$most_upstream_motif_start,
                                         shtn_sea_clst_grouped$most_downstream_motif_start)
shtn_sea_clst_grouped$end <- ifelse(shtn_sea_clst_grouped$strand=='+',
                                       shtn_sea_clst_grouped$most_downstream_motif_end,
                                       shtn_sea_clst_grouped$most_upstream_motif_end)
head(shtn_sea_clst_grouped)


Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_downstream_motif,most_downstream_motif_start,most_upstream_motif_end,most_downstream_motif_end,start,end,motif_cluster_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<chr>,<int>,<dbl>,<dbl>,<int>,<dbl>,<chr>
1,A1BG,TU34454,TU34454_UTR0,chr19,-,58345183,58347025,"MEME-5,MEME-3,MEME-1",False,MEME-5,58345641,MEME-1,58345403,58345681,58345443,58345403,58345681,TU34454_UTR0_1
2,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-3,MEME-2,MEME-1",False,MEME-3,8875503,MEME-1,8875676,8875545,8875716,8875503,8875716,TU22196_UTR1_1
3,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-1,MEME-2,MEME-3",False,MEME-1,8876412,MEME-3,8876579,8876452,8876621,8876412,8876621,TU22196_UTR2_1
4,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-3,MEME-2,MEME-1;MEME-5,MEME-4,MEME-2,MEME-1",False,MEME-5,69482325,MEME-1,69482091,69482365,69482131,69482091,69482365,TU5339_UTR1_1
5,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-5,MEME-3,MEME-2,MEME-1",False,MEME-5,69462648,MEME-1,69462420,69462688,69462460,69462420,69462688,TU5339_UTR2_1
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-1,MEME-2,MEME-3,MEME-5",False,MEME-1,116942739,MEME-5,116942971,116942779,116943011,116942739,116943011,TU16086_UTR0_1


In [22]:
shtn_sea_clst_grouped_spl <- split(shtn_sea_clst_grouped,shtn_sea_clst_grouped$motif_cluster_id)
snuc <- lapply(shtn_sea_clst_grouped_spl,function(x){
    width <- x$end - x$start + 1
    df <- matrix(nrow = width, ncol = 5) %>% as.data.frame()
    colnames(df) <- c('motif_cluster_id','chr','strand','start','end')
    df$motif_cluster_id <- x$motif_cluster_id
    df$chr <- x$chr
    df$start <- seq(x$start,x$end)
    df$end <- seq(x$start,x$end)
    df$strand <- x$strand
    return(df)
})
snuc <- do.call('rbind',snuc)
snuc_gr <- makeGRanges(snuc,strand=T)

In [23]:
snuc_re_annot <- repeat_elt_annot(snuc_gr, repeat_elt_gr, ncores = 4)
snuc_re_annot_bu <- snuc_re_annot
snuc_re_annot <- snuc_re_annot %>% as.data.frame()

2025-11-04 21:08:59.751874: Finding overlaps between bins and repetitive elements...

2025-11-04 21:09:09.859893: Annotating 482805 bins with repetitive elements using 4 cores...





2025-11-04 21:15:52.430446: Applying results back to bins (vectorized)...

2025-11-04 21:15:56.795108: Annotation complete!



In [24]:
repFamily_idx <- grep('repFamily',colnames(snuc_re_annot))
snuc_re_annot$Alu_or_not <- apply(snuc_re_annot,1,function(x){
  if(grepl('Alu',x[repFamily_idx])){
    return(TRUE)
  } else {
    return(FALSE)
  }
})
head(snuc_re_annot)

Unnamed: 0_level_0,seqnames,start,end,width,strand,X,repNameID,repName,repClass,repFamily,re_width,Alu_or_not
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
1,chr5,38934839,38934839,1,+,TU10009_UTR0_1,,,,,,False
2,chr5,38934840,38934840,1,+,TU10009_UTR0_1,,,,,,False
3,chr5,38934841,38934841,1,+,TU10009_UTR0_1,,,,,,False
4,chr5,38934842,38934842,1,+,TU10009_UTR0_1,,,,,,False
5,chr5,38934843,38934843,1,+,TU10009_UTR0_1,,,,,,False
6,chr5,38934844,38934844,1,+,TU10009_UTR0_1,,,,,,False


In [25]:
snuc_re_annot_Alu_enrich <- rep_enrich_calc(bin_annot_re = snuc_re_annot,
                repFamily='Alu',repeat_elt = repeat_elt_gr, bin_size = 1, 
                threshold = 0.5, re.filter = T, max.ovl.re.only = T)

2025-11-04 21:16:01.230126: Compute repeat enrichment for Alu...

2025-11-04 21:16:08.822901: Only retaining REs with percentage width match at least 50% ...



In [26]:
head(snuc_re_annot_Alu_enrich)

Unnamed: 0_level_0,X,repNameID,repName,n,re_width,perct_width_match,pass,repFamily
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>,<lgl>,<chr>
1,TU10026_UTR0_1,3827340,Alu:SINE:AluJb,274,308,0.8896104,True,Alu
2,TU10040_UTR0_1,3828992,Alu:SINE:AluSz6,274,298,0.9194631,True,Alu
3,TU10056_UTR0_1,3836602,Alu:SINE:AluSg,269,299,0.8996656,True,Alu
4,TU10071_UTR0_1,3840132,Alu:SINE:AluSz,168,324,0.5185185,True,Alu
5,TU10102_UTR0_1,3852976,Alu:SINE:AluSx,274,303,0.9042904,True,Alu
6,TU10109_UTR2_1,3857056,Alu:SINE:AluSz,274,312,0.8782051,True,Alu


In [27]:
summary(snuc_re_annot_Alu_enrich$perct_width_match)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.5107  0.7416  0.8907  0.8444  0.9155  1.0000 

In [28]:
shtn_sea_clst_grouped$Alu_enriched <- snuc_re_annot_Alu_enrich$repName[match(shtn_sea_clst_grouped$motif_cluster_id,
                                                              snuc_re_annot_Alu_enrich$X)]
shtn_sea_clst_grouped$Alu_percent_width_overlap <- snuc_re_annot_Alu_enrich$perct_width_match[match(shtn_sea_clst_grouped$motif_cluster_id,
                                                              snuc_re_annot_Alu_enrich$X)]
shtn_sea_clst_grouped$start <- NULL
shtn_sea_clst_grouped$end <- NULL
shtn_sea_clst_grouped$motif_cluster_id <- NULL
shtn_sea_clst_grouped <- shtn_sea_clst_grouped %>% 
        relocate(most_upstream_motif_end, .after = most_upstream_motif_start) %>%
        relocate(most_downstream_motif_end, .after = most_downstream_motif_start)
head(shtn_sea_clst_grouped)
write.table(shtn_sea_clst_grouped,
            file = paste0(filedir,'/','motif_cluster_detection_res_',fprefix,'_Alu_annotation.tsv'),
            sep = '\t',quote = F,row.names = F)

Unnamed: 0_level_0,gene,tu,seq_name,chr,strand,range_start,range_end,meme_seqs,full_motif_cluster,most_upstream_motif,most_upstream_motif_start,most_upstream_motif_end,most_downstream_motif,most_downstream_motif_start,most_downstream_motif_end,Alu_enriched,Alu_percent_width_overlap
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<lgl>,<chr>,<int>,<dbl>,<chr>,<int>,<dbl>,<chr>,<dbl>
1,A1BG,TU34454,TU34454_UTR0,chr19,-,58345183,58347025,"MEME-5,MEME-3,MEME-1",False,MEME-5,58345641,58345681,MEME-1,58345403,58345443,,
2,A2ML1,TU22196,TU22196_UTR1,chr12,+,8875461,8875815,"MEME-3,MEME-2,MEME-1",False,MEME-3,8875503,8875545,MEME-1,8875676,8875716,,
3,A2ML1,TU22196,TU22196_UTR2,chr12,+,8876058,8876788,"MEME-1,MEME-2,MEME-3",False,MEME-1,8876412,8876452,MEME-3,8876579,8876621,Alu:SINE:AluSz,0.7118644
4,AAK1,TU5339,TU5339_UTR1,chr2,-,69481056,69482589,"MEME-5,MEME-3,MEME-2,MEME-1;MEME-5,MEME-4,MEME-2,MEME-1",False,MEME-5,69482325,69482365,MEME-1,69482091,69482131,,
5,AAK1,TU5339,TU5339_UTR2,chr2,-,69457997,69475872,"MEME-5,MEME-3,MEME-2,MEME-1",False,MEME-5,69462648,69462688,MEME-1,69462420,69462460,,
6,AARD,TU16086,TU16086_UTR0,chr8,+,116942699,116944488,"MEME-1,MEME-2,MEME-3,MEME-5",False,MEME-1,116942739,116942779,MEME-5,116942971,116943011,Alu:SINE:AluY,0.8980263


In [29]:
turef_shtn_mc_genes <- read.delim(paste0(wd,'turef_shtn_mc_genes.txt'))
shtn_sea_clst_grouped_outside_discovery_set <- shtn_sea_clst_grouped[!(shtn_sea_clst_grouped$tu %in% turef_shtn_mc_genes$tu),]
write.table(shtn_sea_clst_grouped_outside_discovery_set,
            file = paste0(wd,filedir,'/','motif_cluster_detection_res_outside_discovery_set_',fprefix,'_Alu_annotation.tsv'),
            sep = '\t',quote = F,row.names = F)

In [30]:
message('Number of TUs with full or partial shortening motif clusters: ', shtn_sea_clst_grouped %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of TUs with full or partial shortening motif cluster with Alu enrichment: ', shtn_sea_clst_grouped %>%
      filter(!is.na(Alu_enriched)) %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of motif clusters: ', nrow(shtn_sea_clst_grouped))
message('Number of motif clusters with Alu enrichment: ', shtn_sea_clst_grouped %>%
      filter(!is.na(Alu_enriched)) %>%
      nrow()
    )
message('Number of motif clusters without Alu enrichment: ', shtn_sea_clst_grouped %>%
      filter(is.na(Alu_enriched)) %>%
      nrow()
    )

Number of TUs with full or partial shortening motif clusters: 3443

Number of TUs with full or partial shortening motif cluster with Alu enrichment: 1808

Number of motif clusters: 3722

Number of motif clusters with Alu enrichment: 1887

Number of motif clusters without Alu enrichment: 1835



In [31]:
message('Number of TUs outside discovery set with full or partial shortening motif clusters: ', shtn_sea_clst_grouped_outside_discovery_set %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of TUs outside discovery set with full or partial shortening motif cluster with Alu enrichment: ', shtn_sea_clst_grouped_outside_discovery_set %>%
      filter(!is.na(Alu_enriched)) %>%
      distinct(tu) %>%
      nrow()
    )
message('Number of motif clusters: ', nrow(shtn_sea_clst_grouped_outside_discovery_set))
message('Number of motif clusters with Alu enrichment: ', shtn_sea_clst_grouped_outside_discovery_set %>%
      filter(!is.na(Alu_enriched)) %>%
      nrow()
    )
message('Number of motif clusters without Alu enrichment: ', shtn_sea_clst_grouped_outside_discovery_set %>%
      filter(is.na(Alu_enriched)) %>%
      nrow()
    )

Number of TUs outside discovery set with full or partial shortening motif clusters: 3419

Number of TUs outside discovery set with full or partial shortening motif cluster with Alu enrichment: 1792

Number of motif clusters: 3695

Number of motif clusters with Alu enrichment: 1870

Number of motif clusters without Alu enrichment: 1825

