In [1]:
library(goldmine)
library(dplyr)

Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: BiocGenerics

Loading required package: generics


Attaching package: ‘generics’


The following objects are masked from ‘package:base’:

    as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
    setequal, union



Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, saveRDS, table, tapply, unique,
    unsplit, which.max, which.min


Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following object is masked from ‘pac

In [2]:
wd <- "/Users/ninhle/Desktop/Research/scPASU_pipeline_runs/Ureter10_scPASU_run/outputs/differentiation_stage_cellranger_peakcount/"
setwd(wd)
genes <- readRDS("/Users/ninhle/Desktop/Research/mcast_analyses/genes.rds")

In [3]:
# Check if any ranges overlap in a GRanges object
check_overlaps <- function(gr) {
  overlaps <- findOverlaps(gr, gr, ignore.strand = TRUE)
  # Remove self-overlaps
  overlaps <- overlaps[queryHits(overlaps) != subjectHits(overlaps)]
  has_overlaps <- length(overlaps) > 0
  
  return(list(
    has_overlaps = has_overlaps,
    n_overlapping_pairs = length(overlaps),
    overlapping_indices = overlaps
  ))
}

In [4]:
library(pbapply)

# Sanity check to make sure no UTRs are overlapping within the same transcript unit
unique_tus <- unique(genes$utr3$tu)
start_time <- Sys.time()
cat("Starting overlap checking for", length(unique_tus), "transcript units at", format(start_time), "\n")

# Force progress bar to show
pboptions(type = "txt")

tu_ovl_results <- pblapply(unique_tus, function(tu) {
  gr_subset <- genes$utr3[genes$utr3$tu == tu]
  result <- check_overlaps(gr_subset)
  result$tu <- tu
  return(result)
})

names(tu_ovl_results) <- unique_tus

total_time <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
cat(sprintf("Overlap checking completed! Total time: %.2f seconds (%.2f minutes)\n", 
           total_time, total_time/60))

Starting overlap checking for 19359 transcript units at 2025-12-09 03:08:54 
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100%
Overlap checking completed! Total time: 112.03 seconds (1.87 minutes)


In [5]:
# flank <- as.data.frame(genes$flank)
df <- as.data.frame(genes$utr3)
# df <- df %>%
#  group_by(name) %>%
#  arrange(name, ifelse(strand == '+', start, -start)) %>%
#  mutate(most_downstream = row_number() == n()) %>%
#  ungroup()
# df$flank_start <- flank$start[match(df$name, flank$name)]
# df$flank_end <- flank$end[match(df$name, flank$name)]
# df$new_coord <- ifelse(df$strand == '+',
#                        ifelse(df$most_downstream, df$end + 5000, df$end),
#                        ifelse(df$most_downstream, df$start - 5000, df$start)) 
# df$end <- ifelse(df$strand == '+', df$new_coord, df$end)
# df$start <- ifelse(df$strand == '-', df$new_coord, df$start)
# df$width <- df$end - df$start+1
head(df)

Unnamed: 0_level_0,seqnames,start,end,width,strand,tu,gene.id,name,utr
Unnamed: 0_level_1,<fct>,<int>,<int>,<int>,<fct>,<chr>,<chr>,<chr>,<chr>
1,chr1,70006,71585,1580,+,TU4,ENSG00000186092,OR4F5,UTR1
2,chr1,944151,944575,425,+,TU17,ENSG00000187634,SAMD11,UTR2
3,chr1,965189,965719,531,+,TU18,ENSG00000187961,KLHL17,UTR3
4,chr1,974573,975865,1293,+,TU19,ENSG00000187583,PLEKHN1,UTR4
5,chr1,1014476,1014540,65,+,TU20,ENSG00000187608,ISG15,UTR5
6,chr1,1054979,1056118,1140,+,TU21,ENSG00000188157,AGRN,UTR6


In [6]:
# Sequences with length less than 8 are too short for SEA
df <- df[df$width >= 8,]
df$score <- 1
df$tu <- paste0(df$tu,'_',df$utr)
all_TU_disjoint_utr3 <- df %>% select(seqnames, start, end, tu, score, strand)
colnames(all_TU_disjoint_utr3) <- c("chr", "start", "end", "name", "score", "strand")
all_TU_disjoint_utr3$end <- all_TU_disjoint_utr3$end + 1
head(all_TU_disjoint_utr3)
# write.table(all_TU_disjoint_utr3,paste0('all_TU_disjoint_utr3_5kbflank.bed'),sep='\t',col.names=FALSE,row.names=FALSE,quote=FALSE)

Unnamed: 0_level_0,chr,start,end,name,score,strand
Unnamed: 0_level_1,<fct>,<int>,<dbl>,<chr>,<dbl>,<fct>
1,chr1,70006,71586,TU4_UTR1,1,+
2,chr1,944151,944576,TU17_UTR2,1,+
3,chr1,965189,965720,TU18_UTR3,1,+
4,chr1,974573,975866,TU19_UTR4,1,+
5,chr1,1014476,1014541,TU20_UTR5,1,+
6,chr1,1054979,1056119,TU21_UTR6,1,+


In [7]:
num_TUs <- gsub('_UTR[0-9]+$', '', all_TU_disjoint_utr3$name) %>% unique() %>% length()
message('Number of TUs: ', num_TUs)

Number of TUs: 19002



In [8]:
message('Number of UTR3s: ', nrow(all_TU_disjoint_utr3))

Number of UTR3s: 28581



In [9]:
sessionInfo()

R version 4.5.1 (2025-06-13)
Platform: aarch64-apple-darwin20
Running under: macOS Ventura 13.0

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1

locale:
[1] en_US/en_US/en_US/C/en_US/en_US

time zone: America/New_York
tzcode source: internal

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] pbapply_1.7-4        dplyr_1.1.4          goldmine_1.0        
 [4] ggplot2_4.0.0        stringr_1.5.2        data.table_1.17.8   
 [7] GenomicRanges_1.60.0 GenomeInfoDb_1.44.1  IRanges_2.42.0      
[10] S4Vectors_0.46.0     BiocGenerics_0.54.0  generics_0.1.4      

loaded via a namespace (and not attached):
 [1] gtable_0.3.6            jsonlite_2.0.0          compiler_4.5.1         
 [4] crayon_1.5.3            tidyselect_1.2

### Extract UTR3 sequences

In [None]:
!ref_genome=/Users/ninhle/Desktop/Research/bioinformatics_tools/ref_genomes/refdata-gex-GRCh38-2020-A/fasta/genome.fa
!bed_input=all_TU_disjoint_utr3.bed
!fa_output=all_TU_disjoint_utr3.fa
!bedtools getfasta -fi $ref_genome -bed $bed_input -s -nameOnly -fo $fa_output

### Run SEA motif enrichment analysis

In [None]:
!docker run -v `pwd`:/home/meme/ memesuite/memesuite:latest sea --verbosity 1 --thresh 10.0 --align center --p /home/meme/all_TU_disjoint_utr3.fa --m /home/meme/meme_motif_results/raw_results/appMEME_5.5.517125209323241165022785_meme.txt -o sea_out_lgthn_all_TU_disjoint_utr3

In [None]:
!docker run -v `pwd`:/home/meme/ memesuite/memesuite:latest sea --verbosity 1 --thresh 10.0 --align center --p /home/meme/all_TU_disjoint_utr3_5kbflank.fa --m /home/meme/meme_motif_results/raw_results/appMEME_5.5.517125209323241165022785_meme.txt -o sea_out_lgthn_all_TU_disjoint_utr3_5kbflank

In [None]:
!docker run -v `pwd`:/home/meme/ memesuite/memesuite:latest sea --verbosity 1 --thresh 10.0 --align center --p /home/meme/all_TU_disjoint_utr3_5kbflank.fa --m /home/meme/meme_motif_results/raw_results/appMEME_5.5.51712521030229-1466392474_meme.txt -o sea_out_shrtn_all_TU_disjoint_utr3_5kbflank

In [None]:
!docker run -v `pwd`:/home/meme/ memesuite/memesuite:latest sea --verbosity 1 --thresh 10.0 --align center --p /home/meme/all_TU_disjoint_utr3.fa --m /home/meme/meme_motif_results/raw_results/appMEME_5.5.51712521030229-1466392474_meme.txt -o sea_out_shrtn_all_TU_disjoint_utr3