*For demonstration, the batch 1 data and overall survival (OS) are used.*

In [None]:
# Install the required packages only when they are missing, so that re-running
# the notebook does not trigger a new installation (and network access) each time.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

if (!requireNamespace("RegParallel", quietly = TRUE)) {
  BiocManager::install("RegParallel")
}

library(RegParallel)
library(survival)

In [None]:
# Load the two batch 1 input tables; in both files the first column is used
# as the row names.

# gene expression dataset
ori_data <- read.csv(
  file = "/localhome/bs22tmhn/[ResearchProject]/Batch1/survData_batch1.csv",
  row.names = 1
)

# clinical dataset
clin <- read.csv(
  file = "/localhome/bs22tmhn/[ResearchProject]/ClinData_batch1.csv",
  row.names = 1
)

In [None]:
# Load the gene annotation table
# (Ensembl ID, gene name and gene biotype for each gene)
gene_info <- read.csv(
  file = "/localhome/bs22tmhn/[ResearchProject]/Batch1/genes.csv"
)

1. Univariate Cox PH analysis using continuous gene expression values

In [None]:
# Build the input table for the continuous-expression analysis:
#   - rows: samples whose status is not missing in ori_data
#   - columns: the gene expression columns of ori_data (column 35 onwards),
#     followed by OS_time and status taken from clin
# NOTE(review): the row mask derived from ori_data is also applied to clin,
# which assumes both tables list the samples in the same order - confirm.
has_status <- !is.na(ori_data$status)
expr_cols <- 35:ncol(ori_data)

survana_OS <- cbind(
  ori_data[has_status, expr_cols],
  clin[has_status, c("OS_time", "status")]
)

In [None]:
# Run the analysis: RegParallel fits the Cox PH model given in FUN once per
# entry of `variables`, substituting each one for the [*] placeholder in the
# formula. P-values are adjusted with the Benjamini-Hochberg ("BH") method.
#
# The last two columns of survana_OS are OS_time and status, so every column
# except those two is tested. head(x, -2L) is used instead of
# 1:(ncol(x) - 2), because the latter evaluates to c(1, 0) when there are no
# gene columns and would then silently select the wrong column.
res_OS <- RegParallel(
  data = survana_OS,
  formula = "Surv(OS_time, status) ~ [*]",
  FUN = function(formula, data)
    coxph(
      formula = formula,
      data = data,
      ties = "breslow",
      singular.ok = TRUE
    ),
  FUNtype = "coxph",
  variables = head(colnames(survana_OS), -2L),
  blocksize = 2000,
  p.adjust = "BH"
)

In [None]:
# Drop everything from the first "." onwards in the Ensembl IDs of res_OS so
# that they match the ID format used in gene_info
res_OS$Variable <- sub("\\..*", "", res_OS$Variable)

# Attach the gene_info columns (gene name, biotype) to each result row;
# all.x = TRUE keeps result rows that have no match in gene_info
res_OS_merged <- merge(
  x = res_OS,
  y = gene_info,
  by.x = "Variable",
  by.y = "ensembl_gene_id",
  all.x = TRUE
)

# beta_sig: absolute value of Beta, i.e. the magnitude of the effect
res_OS_merged$beta_sig <- abs(res_OS_merged$Beta)

In [None]:
# Write the annotated continuous-expression Cox results to CSV
# (row names are included in the output)
write.csv(
  res_OS_merged,
  file = "/localhome/bs22tmhn/[ResearchProject]/batch1_OS_coxcont.csv",
  row.names = TRUE
)

2. Univariate Cox PH analysis + log-rank test using dichotomised gene expression levels

In [None]:
# create a data frame containing only gene expression values
# (gene expression columns start at column number 35 of ori_data);
# drop = FALSE keeps a data frame even if only one gene column is present
readcounts <- ori_data[, 35:ncol(ori_data), drop = FALSE]

# per-gene median across all samples, used as the dichotomisation threshold
med <- vapply(readcounts, median, numeric(1))

# split each gene into 2 groups using its median as threshold:
# group 1 = expression at or below the median, group 2 = above the median.
# Starting from a copy of readcounts keeps its row and column names, and the
# column-wise Map() replaces the per-column assignment loop.
surv_50 <- readcounts
surv_50[] <- Map(
  function(expr, threshold) ifelse(expr <= threshold, 1, 2),
  readcounts,
  med
)

# combine dichotomised gene expression with OS_time and status
# NOTE(review): cbind pairs rows by position, which assumes clin and ori_data
# list the samples in the same order; unlike the continuous analysis, samples
# with missing status are not removed here - confirm both are intended.
surv_50 <- cbind(surv_50, clin[, c("OS_time", "status")])

In [None]:
# Perform Cox PH on the dichotomised expression: RegParallel fits the model
# given in FUN once per entry of `variables`, substituting each one for the
# [*] placeholder in the formula. P-values are adjusted with the
# Benjamini-Hochberg ("BH") method.
#
# The last two columns of surv_50 are OS_time and status, so every column
# except those two is tested. head(x, -2L) is used instead of
# 1:(ncol(x) - 2), because the latter evaluates to c(1, 0) when there are no
# gene columns and would then silently select the wrong column.
res_cox50 <- RegParallel(
  data = surv_50,
  formula = "Surv(OS_time, status) ~ [*]",
  FUN = function(formula, data)
    coxph(
      formula = formula,
      data = data,
      ties = "breslow",
      singular.ok = TRUE
    ),
  FUNtype = "coxph",
  variables = head(colnames(surv_50), -2L),
  blocksize = 2000,
  p.adjust = "BH"
)

In [None]:
# Drop everything from the first "." onwards in the Ensembl IDs of res_cox50
# so that they match the ID format used in gene_info
res_cox50$Variable <- sub("\\..*", "", res_cox50$Variable)

# Attach the gene_info columns (gene name, biotype) to each result row;
# all.x = TRUE keeps result rows that have no match in gene_info
res_cox50_merged <- merge(
  x = res_cox50,
  y = gene_info,
  by.x = "Variable",
  by.y = "ensembl_gene_id",
  all.x = TRUE
)

# beta_sig: absolute value of Beta, i.e. the magnitude of the effect
res_cox50_merged$beta_sig <- abs(res_cox50_merged$Beta)

In [None]:
# Write the annotated dichotomised-expression Cox results to CSV
# (row names are included in the output)
write.csv(
  res_cox50_merged,
  file = "/localhome/bs22tmhn/[ResearchProject]/cox50_batch1_OS.csv",
  row.names = TRUE
)

3. Overlap the significant genes across the log-rank test, Cox PH on dichotomised expression (cox50), and Cox PH on continuous expression (coxcont)

In [None]:
# Keep the genes that pass the 0.05 threshold in each of the three analyses.
# which() is used so that rows with a missing p-value are dropped instead of
# being returned as all-NA rows (the behaviour of logical subsetting on a
# plain data frame).
# NOTE(review): the filters use the LRT / LogRank columns as returned, not
# multiple-testing-adjusted values, although RegParallel was run with
# p.adjust = "BH" - confirm this is intended.

# genes with LRT < 0.05 in cox50 (dichotomised expression)
cox50 <- res_cox50_merged[which(res_cox50_merged$LRT < 0.05), ]

# genes with LRT < 0.05 in coxcont (continuous expression)
coxcont <- res_OS_merged[which(res_OS_merged$LRT < 0.05), ]

# genes with LogRank < 0.05 in cox50
logrank <- res_cox50_merged[which(res_cox50_merged$LogRank < 0.05), ]

# Ensembl IDs present in all 3 data frames above
common_genes <- intersect(
  cox50$Variable,
  intersect(coxcont$Variable, logrank$Variable)
)

cox50_filtered <- cox50[cox50$Variable %in% common_genes, ]
coxcont_filtered <- coxcont[coxcont$Variable %in% common_genes, ]

# merge the 2 data frames on the Ensembl ID; columns present in both inputs
# get the suffix "_cox50" or "_coxcont" (e.g. beta_sig_cox50, LogRank_cox50)
overlap <- merge(
  cox50_filtered,
  coxcont_filtered,
  by = "Variable",
  suffixes = c("_cox50", "_coxcont"),
  all.x = TRUE
)

# export the result table
write.csv(
  overlap,
  "/localhome/bs22tmhn/[ResearchProject]/overlap_batch1_OS.csv",
  row.names = TRUE
)

4. Perform candidate genes selection

- For all genes

In [None]:
# Rank the overlapping genes by three criteria and keep up to 100 per ranking.
#
# The effect-magnitude column is created as `beta_sig` in both result tables
# and receives the merge suffixes in `overlap`, so the columns to rank by are
# beta_sig_cox50 and beta_sig_coxcont. (The previously used names
# abs_beta_cox50 / abs_beta_coxcont do not exist in `overlap`, which made every
# ranking consist of NA rows only.)
# head(..., 100) is used instead of [1:100] so that fewer than 100 available
# rows do not get padded with NA rows.

# ranked by beta_sig_cox50 (largest absolute effect first)
top100_cox50 <- overlap[
  head(order(overlap$beta_sig_cox50, decreasing = TRUE), 100),
]
# ranked by beta_sig_coxcont (largest absolute effect first)
top100_coxcont <- overlap[
  head(order(overlap$beta_sig_coxcont, decreasing = TRUE), 100),
]
# ranked by LogRank_cox50 (smallest value first)
top100_logrank <- overlap[
  head(order(overlap$LogRank_cox50, decreasing = FALSE), 100),
]

# select the genes that appear in all 3 lists above
candidate_all_genes <- intersect(
  top100_cox50$Variable,
  intersect(top100_coxcont$Variable, top100_logrank$Variable)
)

- For protein-coding genes (pcg) only

In [None]:
# Restrict the overlap table to protein-coding genes. which() drops rows whose
# biotype is missing (no match in gene_info) instead of returning them as
# all-NA rows.
overlap_pcg <- overlap[which(overlap$gene_biotype_cox50 == "protein_coding"), ]

# Rank by three criteria and keep up to 100 per ranking.
# The effect-magnitude column is created as `beta_sig` and receives the merge
# suffixes in `overlap`, so the columns to rank by are beta_sig_cox50 and
# beta_sig_coxcont. (The previously used names abs_beta_cox50 /
# abs_beta_coxcont do not exist, which made every ranking consist of NA rows.)
# head(..., 100) avoids the NA-row padding that [1:100] produces when fewer
# than 100 rows are available.

# ranked by beta_sig_cox50 (largest absolute effect first)
top100_cox50_pcg <- overlap_pcg[
  head(order(overlap_pcg$beta_sig_cox50, decreasing = TRUE), 100),
]
# ranked by beta_sig_coxcont (largest absolute effect first)
top100_coxcont_pcg <- overlap_pcg[
  head(order(overlap_pcg$beta_sig_coxcont, decreasing = TRUE), 100),
]
# ranked by LogRank_cox50 (smallest value first)
top100_logrank_pcg <- overlap_pcg[
  head(order(overlap_pcg$LogRank_cox50, decreasing = FALSE), 100),
]

# select the genes that appear in all 3 lists above
candidate_pc_genes <- intersect(
  top100_cox50_pcg$Variable,
  intersect(top100_coxcont_pcg$Variable, top100_logrank_pcg$Variable)
)

In [None]:
# Final candidate list: every gene selected either among all genes or among
# the protein-coding genes
candidate_genes <- union(candidate_all_genes, candidate_pc_genes)

# Pull the rows of "overlap" that belong to the candidate genes
is_candidate <- overlap$Variable %in% candidate_genes
overlap100 <- overlap[is_candidate, ]

# Export the candidate genes (row names are included in the output)
write.csv(
  overlap100,
  file = "/localhome/bs22tmhn/[ResearchProject]/candidate_genes_OS.csv",
  row.names = TRUE
)