In [1]:
suppressPackageStartupMessages({

    library(data.table)
    library(tidyverse)
    library(ggpubr)
    library(DESeq2)
    library(GenomicRanges)
    library(rtracklayer)
    library(GenomicFeatures)
    library(SummarizedExperiment)
    })

# Write plot `p` to disk once per supported format: <fn>.pdf and <fn>.png.
# `fn` is the output path without extension; `w` and `h` are forwarded to
# ggsave() as its width and height.
save_plot <- function(p, fn, w, h){
    extensions <- c(".pdf", ".png")
    for(suffix in extensions){
        out_file <- paste0(fn, suffix)
        ggsave(filename=out_file, plot=p, width=w, height=h)
    }
}

# Make sure the output directory exists, then switch into it so that every
# relative path used afterwards is resolved from '../_m'.
# dir.create() is only called when the directory is missing: on a re-run the
# directory is already there and creating it again merely emits an
# "already exists" warning. Genuine creation failures still warn.
out_dir <- '../_m'
if (!dir.exists(out_dir)) {
    dir.create(out_dir, showWarnings = TRUE, recursive = TRUE)
}
setwd(out_dir)

“'../_m' already exists”


In [2]:
#taf1_transcripts <- gtf %>% subset(type == "transcript" & gene_name=="TAF1")

In [3]:
# Collect the exons of the TAF1 canonical transcript.

# Path to the transcript annotation (GENCODE v42 / hg38, going by the file name)
gtf_file <- "../../../expression_profiles/exons_ratio/_h/transcripts_hg38_gencode_v42_primary.gtf"

# Load all annotation records from the GTF
gtf <- rtracklayer::import(gtf_file, format = "gtf")

# Building a transcript database (TxDb) is currently disabled
#txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

# Keep only the records of type "exon" that belong to the TAF1 gene
taf1_exons <- subset(gtf, type == "exon" & gene_name == "TAF1")
print(paste0('total # TAF1 exons: ', length(taf1_exons)))

# Of those, keep the exons whose `tag` value is "appris_principal_3"
taf1_canonical_exons <- subset(taf1_exons, tag == "appris_principal_3")
print(paste0('total # TAF1 canonical exons: ', length(taf1_canonical_exons)))

[1] "total # TAF1 exons: 418"


[1] "total # TAF1 canonical exons: 38"


In [4]:
# Drop the full TAF1 exon set; the cells below only use taf1_canonical_exons
rm(list = "taf1_exons")

In [5]:
# Preview the first rows of columns 15 through 27 (selected by position)
# of the canonical exon table
head(dplyr::select(as.data.frame(taf1_canonical_exons), 15:27))

Unnamed: 0_level_0,transcript_id,transcript_type,transcript_name,transcript_support_level,havana_transcript,exon_number,exon_id,hgnc_id,havana_gene,ont,protein_id,ccdsid,artif_dupl
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,1,ENSE00002057736.2,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,
2,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,2,ENSE00001693606.1,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,
3,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,3,ENSE00001806491.1,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,
4,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,4,ENSE00001762844.1,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,
5,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,5,ENSE00001615535.1,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,
6,ENST00000423759.6,protein_coding,TAF1-204,5,OTTHUMT00000058996.3,6,ENSE00001657433.1,HGNC:11535,OTTHUMG00000022723.7,,ENSP00000406549.2,,


In [6]:
# List every column available once the exon ranges are flattened to a data frame
colnames(as.data.frame(taf1_canonical_exons))

In [7]:
# Find every exon-level DESeq2 result table one directory below ../../_m,
# then keep only the main "Mature_organoids_all_comparison_design1_deseq2"
# model and the directories whose name starts with "Day".
all_result_files <- Sys.glob('../../_m/*/exon_deseq2_results.tsv')
is_wanted <- grepl('../../_m/Mature_organoids_all_comparison_design1_deseq2|../../_m/Day',
                   all_result_files)
files_path <- all_result_files[is_wanted]
files_path

In [8]:
#files_path <- Sys.glob('../../_m/Mature_organoids_all_comparison_design1_deseq2/exon_deseq2_results.tsv')

#df <- fread(files_path) %>% filter(Symbol == 'TAF1')

In [9]:
# Accumulator: one joined data frame per DESeq2 result file
df_list <- list()

# Process the result files one at a time
for (i in seq_along(files_path)) {
    result_file <- files_path[i]

    # Show which file is being handled (displayed in loop order)
    IRdisplay::display(result_file)

    # Load the result table
    exon_results <- fread(result_file)

    # Label every row with the comparison directory name: strip everything up
    # to '_m/' and everything from '/exon_' onwards
    exon_results$filename <- gsub('.*_m/|/exon_.*','', result_file)

    # Keep the TAF1 rows only, restricted to the first 15 columns (by position)
    # plus the label; the nominal p-value filter is left disabled here
    #   filter(pvalue < 0.05)
    taf1_results <- dplyr::select(
        filter(exon_results, grepl('^TAF1$', Symbol)),
        1:15, filename
    )

    # Attach the results to the canonical exons by exon ID. This is a left
    # join, so canonical exons without a match keep NA in the result columns.
    df_list[[i]] <- left_join(
        as.data.frame(taf1_canonical_exons),
        dplyr::select(taf1_results, -c(gene_type, gene_id)),
        by=c('exon_id'='exon_gencodeID')
    )
}


# Stack the per-file data frames into a single one
combined_df <- do.call(rbind, df_list)

# Show the stacked table
IRdisplay::display(combined_df)
    

seqnames,start,end,width,strand,source,type,score,phase,gene_id,⋯,lfcSE,stat,pvalue,padj,Length,gencodeID,ensemblID,Symbol,EntrezID,filename
<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<int>,<chr>
chrX,71366357,71366494,138,+,HAVANA,exon,,,ENSG00000147133.17,⋯,2.8839956,-0.25836738,0.79612339,,138,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71367499,71367613,115,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.6299088,-0.43478295,0.66371998,,115,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71368054,71368170,117,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.4510797,-0.33422636,0.73820876,0.9876136,117,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71375167,71375286,120,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.4413288,-0.08731141,0.93042398,0.9967020,120,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71376950,71377191,242,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3927517,-0.76318352,0.44535396,0.9440471,242,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71377603,71377821,219,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3745720,-0.52766825,0.59772961,0.9744175,219,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71378235,71378453,219,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3011086,-0.32109339,0.74813962,0.9882795,219,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71378824,71379031,208,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3333531,-0.59458448,0.55212126,0.9674580,208,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71381743,71381919,177,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3471289,-0.67620451,0.49891081,0.9565303,177,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2
chrX,71382536,71382663,128,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3197973,-1.08140808,0.27951563,0.8734960,128,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day120_All_Male_Pairwise_controldelta_deseq2


In [10]:
# Rows with an unadjusted p-value below 0.05, across all comparisons
filter(combined_df, pvalue < 0.05)

seqnames,start,end,width,strand,source,type,score,phase,gene_id,⋯,lfcSE,stat,pvalue,padj,Length,gencodeID,ensemblID,Symbol,EntrezID,filename
<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<int>,<chr>
chrX,71368054,71368170,117,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.8186369,-1.988405,0.04676691,,117,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71377603,71377821,219,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.7343438,-2.134115,0.03283337,0.2267539,219,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71392875,71392994,120,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.7729061,-2.215155,0.02674943,0.2082915,120,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71393301,71393476,176,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.7208449,-2.354975,0.01852395,0.1830618,176,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71397253,71397466,214,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.7462627,-2.860077,0.00423538,0.1059736,214,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71398572,71398737,166,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.5953926,-1.9838,0.04727819,0.2633737,166,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controldelta_deseq2
chrX,71398572,71398737,166,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.6367382,-2.292999,0.02184804,0.9994948,166,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Day90_All_Male_Pairwise_controlxdp_deseq2
chrX,71367499,71367613,115,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.5907289,-2.261614,0.02372127,0.9999996,115,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71398572,71398737,166,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.328855,-2.049217,0.04044088,0.9999996,166,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2


In [11]:
# Rows with an adjusted p-value below 0.05, across all comparisons
filter(combined_df, padj < 0.05)

“number of rows of result is not a multiple of vector length (arg 2)”


“number of rows of result is not a multiple of vector length (arg 2)”


“number of rows of result is not a multiple of vector length (arg 2)”


“number of rows of result is not a multiple of vector length (arg 2)”


seqnames,start,end,width,strand,source,type,score,phase,gene_id,⋯,lfcSE,stat,pvalue,padj,Length,gencodeID,ensemblID,Symbol,EntrezID,filename
<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<int>,<chr>


In [12]:
# Rows that come from the main DESeq2 model
# (Mature_organoids_all_comparison_design1_deseq2)
filter(combined_df, filename == 'Mature_organoids_all_comparison_design1_deseq2')



seqnames,start,end,width,strand,source,type,score,phase,gene_id,⋯,lfcSE,stat,pvalue,padj,Length,gencodeID,ensemblID,Symbol,EntrezID,filename
<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<int>,<chr>
chrX,71366357,71366494,138,+,HAVANA,exon,,,ENSG00000147133.17,⋯,1.4956803,0.26455388,0.79135315,0.9999996,138,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71367499,71367613,115,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.5907289,-2.2616138,0.02372127,0.9999996,115,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71368054,71368170,117,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.4443199,-1.52056516,0.128369,0.9999996,117,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71375167,71375286,120,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3765804,-1.59396114,0.11094474,0.9999996,120,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71376950,71377191,242,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3760069,-1.01306669,0.31102831,0.9999996,242,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71377603,71377821,219,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.368527,-1.08945763,0.27595213,0.9999996,219,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71378235,71378453,219,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.2618394,-0.43819839,0.66124248,0.9999996,219,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71378824,71379031,208,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.2513687,-0.22846286,0.81928643,0.9999996,208,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71381743,71381919,177,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.3178317,-1.20564629,0.2279539,0.9999996,177,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71382536,71382663,128,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.2606408,-0.03877354,0.96907094,0.9999996,128,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2


In [13]:
# Main-model rows that also have an unadjusted p-value below 0.05
filter(combined_df,
       filename == 'Mature_organoids_all_comparison_design1_deseq2',
       pvalue < 0.05)


seqnames,start,end,width,strand,source,type,score,phase,gene_id,⋯,lfcSE,stat,pvalue,padj,Length,gencodeID,ensemblID,Symbol,EntrezID,filename
<fct>,<int>,<int>,<int>,<fct>,<fct>,<fct>,<dbl>,<int>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>,<chr>,<int>,<chr>
chrX,71367499,71367613,115,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.5907289,-2.261614,0.02372127,0.9999996,115,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2
chrX,71398572,71398737,166,+,HAVANA,exon,,,ENSG00000147133.17,⋯,0.328855,-2.049217,0.04044088,0.9999996,166,ENSG00000147133.17,ENSG00000147133,TAF1,6872,Mature_organoids_all_comparison_design1_deseq2


In [14]:
# Write the TAF1 canonical-exon tables as tab-separated files in the current
# working directory.
# - TRUE/FALSE are spelled out: `T`/`F` are ordinary variables that can be
#   reassigned, which would silently change the fwrite() options.
# - The main-model subset is computed once and reused for both exports.
# - Output file names are kept exactly as before (including their spelling),
#   presumably so anything reading these files still finds them.

# All comparisons
fwrite(combined_df, 'all_taf1exons_detected_cannonical.tsv',
       quote = FALSE, sep = '\t', row.names = FALSE)

# Rows from the main DESeq2 model only
main_model_df <- combined_df %>%
        filter(filename == 'Mature_organoids_all_comparison_design1_deseq2')

fwrite(main_model_df, 'deseq2_mainmodel_taf1exons_cannonical.tsv',
       quote = FALSE, sep = '\t', row.names = FALSE)

# Main-model rows with an unadjusted p-value below 0.05
main_model_df %>%
        filter(pvalue < 0.05) %>%
        fwrite('deseq2_mainmodel_taf1exons__cannonical_nominal_pval.tsv',
               quote = FALSE, sep = '\t', row.names = FALSE)


In [15]:
# Close the current graphics device, if there is one. dev.off() raises an
# error when only the null device (device 1) is current, so check dev.cur()
# before calling it.
if (dev.cur() > 1L) {
    dev.off()
}