# 2CR_diff_rna_cancer_normal

Margaret Guo
11/2/2021 -

2. How many TFs are differentially expressed between cancer and normal? (SCC vs KC, MM vs MC)


In [2]:
library(pheatmap)  
library(tidyverse)
library(RColorBrewer)
library(viridis)
library(reshape2)
library( org.Hs.eg.db ) 
library(AnnotationDbi) 
library(Biobase)
library(limma)
library(tximport)

library(stringr)

library(Rtsne)
library(caret)  
library(clusterProfiler)
library(pheatmap)
library(ReactomePA)
library(annotate)
library(seqinr)
# library(qlcMatrix)
save_pheatmap_png <- function(x, filename, width=1200, height=1000, res = 200) {
  png(filename, width = width, height = height, res = res)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}
save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}

In [3]:
save_prefix = '../data/processed/fig1/rna/'

# import data

In [56]:
tfs = read.table('../data/external//HOCOMOCOv11_full_annotation_HUMAN_mono.tsv', sep='\t',header=1)$Transcription.factor
length(tfs)

In [4]:
rna_files = as.array(list.files('../data/raw/rna/', pattern=('*/*.genes.results'),recursive=TRUE))
# data.frame(file_path = rna_files)
tissues = as.array(lapply(rna_files, function(x) strsplit(x, '/')[[1]][1]))
sample = as.array(lapply(rna_files, function(x) strsplit(x, '/')[[1]][2]))
sample = as.array(lapply(sample, function(x) strsplit(x, '.genes.results')[[1]][1]))

full_path = as.array(lapply(rna_files, function(x) paste0('../data/raw/rna/',x)))
                
rna_genes_result = data.frame("tissues" = tissues,"sample"=sample,"full_path"=full_path )  
rownames(rna_genes_result) = rna_genes_result$sample        
head(rna_genes_result  )                          

Unnamed: 0_level_0,tissues,sample,full_path
Unnamed: 0_level_1,<list>,<list>,<list>
A431-1-CTRLi,A431-CTRLi,A431-1-CTRLi,../data/raw/rna/A431-CTRLi/A431-1-CTRLi.genes.results
A431-2-CTRLi,A431-CTRLi,A431-2-CTRLi,../data/raw/rna/A431-CTRLi/A431-2-CTRLi.genes.results
A431-1-p63i,A431-p63i,A431-1-p63i,../data/raw/rna/A431-p63i/A431-1-p63i.genes.results
A431-2-p63i,A431-p63i,A431-2-p63i,../data/raw/rna/A431-p63i/A431-2-p63i.genes.results
Airway-B1,Airway,Airway-B1,../data/raw/rna/Airway/Airway-B1.genes.results
Airway-B2,Airway,Airway-B2,../data/raw/rna/Airway/Airway-B2.genes.results


In [35]:

createSig <- function (data, metadata, col_sel,tissue=TRUE, logFC_thres=.1, p_thres=0.05,max_return = 500) {
    # Differential expression analysis with limmma
    # This function takes in a target
    # Output is a result table of differential expression analysis for target vs control

    
#     metadata = data.frame("column" =cols_all)
    if (tissue){
    metadata = metadata %>% 
        mutate(label = if_else(tissues==col_sel, 'target', 'control'))
    }
    else{
         metadata = metadata %>% 
        mutate(label = if_else(group==col_sel, 'target', 'control'))
    }

    # set up the design
    labels <- factor(metadata$label)
    design <- model.matrix(~ labels + 1)
    colnames(design) <- levels(labels)
    rownames(design) <- metadata$tissue

    # proceed with analysis
    fit <- lmFit(data, design)
    fit <- eBayes(fit, trend=TRUE)
    tT = topTable(fit, coef=ncol(design),adjust="fdr", sort.by="p", number=Inf)
    tT$gene = rownames(tT)
    tT = na.omit(tT)
    tT_filt = tT[abs(tT$logFC)>logFC_thres  & tT$adj.P.Val<p_thres,]
    tT_filt = tT_filt%>% arrange(desc(logFC))
    print(dim(tT_filt))
    tT_filt = tT_filt[1:min(max_return, dim(tT_filt)[1]),]

  return(tT_filt)
}

In [36]:
rna_tpm_file_tissue = '../data/interim/rna/tissue_tpm_sym.csv'
rna_df_tissue = read.csv(rna_tpm_file_tissue, row.names=1,stringsAsFactors=F,check.names = FALSE)
rna_df_tissue_log = log2(rna_df_tissue+1e-2)
rna_df_tissue_norm = as.data.frame(scale(rna_df_tissue_log, center = TRUE, scale = TRUE))

rna_tpm_file = '../data/interim/rna/sample_tpm_sym.csv'
rna_df = read.csv(rna_tpm_file, row.names=1,stringsAsFactors=F,check.names = FALSE)
rna_df_log = log2(rna_df+1) ## used log2 tpm values!
rna_df_norm = as.data.frame(scale(rna_df_log, center = TRUE, scale = TRUE))
head(rna_df_log)

Unnamed: 0_level_0,A431-1-CTRLi,A431-1-p63i,A431-2-CTRLi,A431-2-p63i,Airway-B1,Airway-B2,Astrocytes-B1,Astrocytes-B2,Bladder-B1,Bladder-B2,⋯,Uterine-B1,Uterine-B2,WM2664-SCR-DMSO-BR1,WM2664-SCR-DMSO-BR2,WM2664-SCR-PLX-BR1,WM2664-SCR-PLX-BR2,WM2664-shMITF-DMSO-BR1,WM2664-shMITF-DMSO-BR2,WM2664-shMITF-PLX-BR1,WM2664-shMITF-PLX-BR2
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A1BG,1.78659636,2.13093087,2.592158,2.47248777,2.00360224,2.36176836,4.2794713,4.31469653,1.97819563,2.2265085,⋯,1.84799691,1.70043972,6.49553528,6.58330846,7.05734196,7.27211653,6.91635721,6.7265588,7.1604765,7.2303567
A1BG-AS1,0.81557543,0.07038933,0.07038933,0.05658353,0.07038933,0.08406426,0.36737107,0.65076456,0.08406426,0.0,⋯,0.07038933,0.01435529,0.67807191,0.21412481,0.26303441,0.68706069,0.47508488,0.3334237,0.5459684,0.4005379
A1CF,0.0,0.0,0.0,0.0,0.0,0.01435529,0.01435529,0.01435529,0.02856915,0.0,⋯,0.0,0.01435529,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.16349873,0.07038933,0.0,0.0,0.0,6.14282184,5.03033608,0.38956681,1.5310695,⋯,0.79077204,1.11769504,5.39814512,6.11103131,5.70099449,6.76738996,5.94649694,6.0127921,6.053763,6.1965284
A2M-AS1,0.0,0.0,0.0,0.01435529,0.0,0.07038933,0.05658353,0.13750352,0.0,0.0,⋯,0.0,0.0,0.08406426,0.05658353,0.02856915,0.05658353,0.04264434,0.0976108,0.1375035,0.2265085
A2ML1,0.04264434,0.05658353,0.0,0.0,1.01435529,0.95605665,0.08406426,0.07038933,1.21412481,0.9107327,⋯,0.50589093,0.16349873,0.0,0.0,0.0,0.0,0.08406426,0.0,0.1634987,0.0


In [37]:
# get total number of counts rsem per sample
rna_counts_file = '../data/interim/rna/sample_count_sym.csv'
rna_count_df = read.csv(rna_counts_file, row.names=1,stringsAsFactors=F,check.names = FALSE)
rna_count_df[is.na(rna_count_df)] = 0 
rna_count_df= log2(rna_count_df+1)# pseudocount
# round(colSums(rna_count_df))

In [38]:
# # get per tissue correlation
# for (tissue in unique(rna_genes_result$tissues)){
#     print(tissue)
#     samples = rna_genes_result[rna_genes_result$tissues==tissue,]$sample
# #     print(samples)
#     print(cor(rna_count_df[,as.character(samples)],method='pearson')[1,2])
#     print(cor(rna_count_df[,as.character(samples)],method='spearman')[1,2])
# }

In [39]:
# normal_tissues = c('Airway','Astrocytes','Bladder','Colon','Esophageal',
# # 'GDSD0',
# # 'GDSD3',
# 'GDSD6',
# 'GM12878',
# 'HMEC',
# 'Melanocytes',
# 'Ovarian',
# 'Pancreas',
# 'Prostate',
# 'Renal',
# 'Thyroid',
# 'Uterine')

unique(rna_genes_result$tissues)

# Run differential expression,
- run all three cancer lines separately take intersection find tfs

In [40]:
cancer_tissues_scc = c('A431-CTRLi','CAL27-CTRLi','SCC13-CTRLi')
normal_tissues_scc = c('GDSD6')#GDSD0

cancer_tissues_mm = c('COLO_SCR_DMSO','SKMEL5_SCR_DMSO','WM_SCR_DMSO')
normal_tissues_mm = c('Melanocytes')


In [82]:
sig_genes_list_scc = list()
tT_filt_scc = data.frame()
for (cancer_tissue in cancer_tissues_scc) {
    for (normal_tissue in normal_tissues_scc)
        group = paste(cancer_tissue,normal_tissue,sep='|')
        rna_genes_result_sel = rna_genes_result[rna_genes_result$tissues %in% c(cancer_tissue,normal_tissue),]
        sel_samples = as.array(rna_genes_result_sel$sample)
        rna_sel = rna_count_df[,colnames(rna_df) %in% sel_samples]

        tT_filt = createSig(rna_sel, rna_genes_result_sel,  col_sel=normal_tissue,
                    tissue=TRUE,logFC_thres=.1, p_thres=0.05,max_return = 10000)
        print(group)
        print(dim(tT_filt))
        sig_genes_list_scc[[group]] = tT_filt$gene
        tT_filt$cancer_tissue = cancer_tissue
        tT_filt$normal_tissue = normal_tissue
        tT_filt_scc = rbind(tT_filt_scc, tT_filt)
}

    
    
    

“Zero sample variances detected, have been offset away from zero”


[1] 5244    7
[1] "A431-CTRLi|GDSD6"
[1] 5244    7


“Zero sample variances detected, have been offset away from zero”


[1] 7175    7
[1] "CAL27-CTRLi|GDSD6"
[1] 7175    7


“Zero sample variances detected, have been offset away from zero”


[1] 8050    7
[1] "SCC13-CTRLi|GDSD6"
[1] 8050    7


In [83]:
genes_in_all = tT_filt_scc%>%
    group_by(gene)%>%
    summarise(count_gene = n())%>%
    filter(count_gene==3)
genes_in_all = sort(unique(genes_in_all$gene))
length(genes_in_all)
tT_filt_scc = tT_filt_scc%>%
    filter(gene %in% genes_in_all) %>%
    group_by(gene)%>%
    summarise(logFC=mean(logFC), adj.P.Val = max(adj.P.Val))%>%
    arrange(logFC)%>%
    mutate(is_TF = gene %in% tfs)
#     summarise()

In [84]:
tT_filt_scc%>%summary()

     gene               logFC           adj.P.Val           is_TF        
 Length:3209        Min.   :-11.844   Min.   :0.0003503   Mode :logical  
 Class :character   1st Qu.: -3.985   1st Qu.:0.0191281   FALSE:3094     
 Mode  :character   Median : -2.811   Median :0.0264588   TRUE :115      
                    Mean   : -2.162   Mean   :0.0280216                  
                    3rd Qu.: -1.827   3rd Qu.:0.0368338                  
                    Max.   : 14.827   Max.   :0.0499891                  

In [85]:
write.csv(tT_filt_scc,paste0(save_prefix,'tT_filt_scc.csv'))

In [96]:
sort(tT_filt_scc[tT_filt_scc$is_TF,]$gene)

## for mm

In [86]:
sig_genes_list_mm = list()
tT_filt_mm = data.frame()
for (cancer_tissue in cancer_tissues_mm) {
    for (normal_tissue in normal_tissues_mm)
        group = paste(cancer_tissue,normal_tissue,sep='|')
        rna_genes_result_sel = rna_genes_result[rna_genes_result$tissues %in% c(cancer_tissue,normal_tissue),]
        sel_samples = as.array(rna_genes_result_sel$sample)
        rna_sel = rna_count_df[,colnames(rna_df) %in% sel_samples]

        tT_filt = createSig(rna_sel, rna_genes_result_sel,  col_sel=normal_tissue,
                    tissue=TRUE,logFC_thres=.1, p_thres=0.05,max_return = 10000)
        print(group)
        print(dim(tT_filt))
        sig_genes_list_mm[[group]] = tT_filt$gene
        tT_filt$cancer_tissue = cancer_tissue
        tT_filt$normal_tissue = normal_tissue
        tT_filt_mm = rbind(tT_filt_mm, tT_filt)
}

    
    
    

“Zero sample variances detected, have been offset away from zero”


[1] 1229    7
[1] "COLO_SCR_DMSO|Melanocytes"
[1] 1229    7


“Zero sample variances detected, have been offset away from zero”


[1] 4023    7
[1] "SKMEL5_SCR_DMSO|Melanocytes"
[1] 4023    7


“Zero sample variances detected, have been offset away from zero”


[1] 2597    7
[1] "WM_SCR_DMSO|Melanocytes"
[1] 2597    7


In [88]:
genes_in_all = tT_filt_mm%>%
    group_by(gene)%>%
    summarise(count_gene = n())%>%
    filter(count_gene==3)
genes_in_all = sort(unique(genes_in_all$gene))
length(genes_in_all)
tT_filt_mm = tT_filt_mm%>%
    filter(gene %in% genes_in_all) %>%
    group_by(gene)%>%
    summarise(logFC=mean(logFC), adj.P.Val = max(adj.P.Val))%>%
    arrange(logFC)%>%
    mutate(is_TF = gene %in% tfs)
#     summarise()

In [90]:
tT_filt_mm%>%summary()

     gene               logFC           adj.P.Val           is_TF        
 Length:694         Min.   :-12.737   Min.   :0.0001995   Mode :logical  
 Class :character   1st Qu.: -4.333   1st Qu.:0.0282855   FALSE:674      
 Mode  :character   Median : -2.988   Median :0.0388041   TRUE :20       
                    Mean   : -2.821   Mean   :0.0357307                  
                    3rd Qu.: -2.109   3rd Qu.:0.0457767                  
                    Max.   : 12.468   Max.   :0.0499854                  

In [91]:
write.csv(tT_filt_mm,paste0(save_prefix,'tT_filt_mm.csv'))

In [95]:
sort(tT_filt_mm[tT_filt_mm$is_TF,]$gene)