The analyses below are performed for the overall survival (OS) event as an example.

In [None]:
library(survival)
library(ggplot2)

1. Import data (e.g. batch 1)

In [None]:
# Load the batch 1 survival table; the first CSV column supplies the row names.
ori_data <- read.csv(
  file = "/localhome/bs22tmhn/[ResearchProject]/Batch1/survData_batch1.csv",
  row.names = 1
)

# Shorten the Ensembl IDs used as column names: everything from the first "."
# onwards is removed.
# NOTE(review): the pattern is applied to every column name, not only to gene
# columns - confirm that no other column contains a "." and that the shortened
# names stay unique.
names(ori_data) <- sub("\\..*", "", names(ori_data))

2. 23-gene signature

- analyse the signature score

In [None]:
# Read the coefficient table of the 23-gene signature.
# Later cells use its ID column and its third column.
coef <- read.csv(file = "/localhome/bs22tmhn/[ResearchProject]/23sig.csv")

In [None]:
# Columns needed for the OS analysis: the signature genes (coef$ID) followed
# by the outcome columns. as.character() keeps the IDs usable as column names
# even if read.csv() returned them as a factor.
variables <- c(as.character(coef$ID), "OS_time", "status", "NHLDeath")
# Stop early if any requested column is absent from ori_data.
stopifnot(all(variables %in% colnames(ori_data)))
# retrieve the above variables from ori_data dataframe
OS <- ori_data[, variables]

# remove samples without status information (applicable only to OS and PFS)
OS <- OS[!is.na(OS$status), ]
# convert status=1 of cases with NHLDeath=2 (non-NHLDeath) to status=0;
# %in% never yields NA, so rows with a missing NHLDeath are left untouched
OS$status[OS$NHLDeath %in% 2] <- 0

# Create a column in OS for the signature score: the gene expression matrix
# multiplied by the coefficient vector. The gene columns are selected by ID
# rather than by position (1:23), so the result does not depend on the column
# order of OS or on the number of genes in the signature.
# NOTE(review): the coefficients are read from the 3rd column of coef -
# confirm against the layout of 23sig.csv.
OS$score <- drop(
  as.matrix(OS[, as.character(coef$ID), drop = FALSE]) %*% as.matrix(coef[, 3])
)

In [None]:
# Survival tests on the dichotomised 23-gene signature score.
# For each cut-off the samples are split into two groups and a univariate
# Cox PH model is fitted on the group indicator.
# Coding: 1 = score at or below the cut-off, 2 = score above the cut-off.

# --- Cut-off: median ---
med <- median(OS$score)
OS$median <- 1 + (OS$score > med)
sig23_50 <- coxph(Surv(OS_time, status) ~ median, data = OS)


# --- Cut-off: 25th percentile ---
q25th <- quantile(OS$score, probs = 0.25)
OS$q25th <- 1 + (OS$score > q25th)
sig23_25 <- coxph(Surv(OS_time, status) ~ q25th, data = OS)


# --- Cut-off: 75th percentile ---
q75th <- quantile(OS$score, probs = 0.75)
OS$q75th <- 1 + (OS$score > q75th)
sig23_75 <- coxph(Surv(OS_time, status) ~ q75th, data = OS)

- analyse each of the 23 genes separately

In [None]:
# Forest plot of the hazard ratios of the individual signature genes.
# The results were taken from the univariate Cox PH analysis using continuous
# gene expression values.
OS_cont <- read.csv("/localhome/bs22tmhn/[ResearchProject]/cox_cont_batch1.csv")
# Keep only the rows whose Variable is one of the signature gene IDs.
# NOTE(review): this reuses the name OS and therefore replaces the survival
# data frame built in the earlier cells.
OS <- OS_cont[OS_cont$Variable %in% coef$ID, ]

# create forest plot
# Gene positions and axis breaks are derived from the number of rows actually
# present in OS instead of a hard-coded 1:23, so the plot still builds when a
# signature gene is missing from the results file.
ggplot(data = OS, aes(y = seq_len(nrow(OS)), x = HR,
                      xmin = HRlower,
                      xmax = HRupper)) +
  geom_errorbarh(height = .1) +
  scale_y_continuous(labels = OS$external_gene_name,
                     breaks = seq_len(nrow(OS))) +
  geom_point(color = "red", pch = 18, size = 3) +
  labs(title = '23 genes', x = 'Hazard ratio', y = 'Genes') +
  # reference line at HR = 1
  geom_vline(xintercept = 1, color = 'red', linetype = 'dashed', alpha = .8) +
  theme(panel.grid = element_blank(),
        panel.background = element_blank(),
        axis.line = element_line(colour = "black")) +
  # swap the axes: genes along the horizontal axis, HR along the vertical axis
  coord_flip() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

3. IR1 and IR2 signatures

In [None]:
# Read the gene list of the IR1 and IR2 signatures.
# Later cells use its ensembl_gene_id and IR columns.
sig <- read.csv(file = "/localhome/bs22tmhn/[ResearchProject]/IR1andIR2.csv")

- analyse the signature score

In [None]:
# retrieve the signature genes from ori_data
genes <- colnames(ori_data)[colnames(ori_data) %in% sig$ensembl_gene_id]
# retrieve the IR1 signature genes from ori_data
IR1 <- colnames(ori_data)[
  colnames(ori_data) %in% sig$ensembl_gene_id[sig$IR == 1]
]
# retrieve the IR2 signature genes from ori_data
IR2 <- colnames(ori_data)[
  colnames(ori_data) %in% sig$ensembl_gene_id[sig$IR == 2]
]
# With no matching gene for a signature the row means below would all be NaN,
# so stop here instead of producing an unusable score.
stopifnot(length(IR1) > 0, length(IR2) > 0)

variables <- c(genes, "OS_time", "status", "NHLDeath")
# retrieve the above variables from ori_data dataframe
OS <- ori_data[, variables]

# remove samples without status information (applicable only to OS and PFS)
OS <- OS[!is.na(OS$status), ]
# convert status=1 of cases with NHLDeath=2 (non-NHLDeath) to status=0;
# %in% never yields NA, so rows with a missing NHLDeath are left untouched
OS$status[OS$NHLDeath %in% 2] <- 0

# calculate the mean of IR1 genes' readcounts for each row (sample);
# drop = FALSE keeps a data frame even when only one gene matches, which
# rowMeans() requires
OS$IR1 <- rowMeans(OS[, colnames(OS) %in% IR1, drop = FALSE], na.rm = TRUE)
# calculate the mean of IR2 genes' readcounts for each row (sample)
OS$IR2 <- rowMeans(OS[, colnames(OS) %in% IR2, drop = FALSE], na.rm = TRUE)

# calculate signature score for each row (sample)
OS$score <- 2.71 * OS$IR2 - 2.36 * OS$IR1

In [None]:
# Survival tests on the dichotomised IR signature score.
# For each cut-off the samples are split into two groups and a univariate
# Cox PH model is fitted on the group indicator.
# Coding: 1 = score at or below the cut-off, 2 = score above the cut-off.

# --- Cut-off: median ---
med <- median(OS$score)
OS$median <- 1 + (OS$score > med)
OS_50 <- coxph(Surv(OS_time, status) ~ median, data = OS)


# --- Cut-off: 25th percentile ---
q25th <- quantile(OS$score, probs = 0.25)
OS$q25th <- 1 + (OS$score > q25th)
OS_25 <- coxph(Surv(OS_time, status) ~ q25th, data = OS)


# --- Cut-off: 75th percentile ---
q75th <- quantile(OS$score, probs = 0.75)
OS$q75th <- 1 + (OS$score > q75th)
OS_75 <- coxph(Surv(OS_time, status) ~ q75th, data = OS)

- analyse each IR signature separately

In [None]:
# IR1

# Survival tests on the dichotomised average reads of the IR1 genes.
# For each cut-off the samples are split into two groups and a univariate
# Cox PH model is fitted on the group indicator.
# Coding: 1 = value at or below the cut-off, 2 = value above the cut-off.

# --- Cut-off: median ---
med <- median(OS$IR1)
OS$median_IR1 <- 1 + (OS$IR1 > med)
OS_50_IR1 <- coxph(Surv(OS_time, status) ~ median_IR1, data = OS)


# --- Cut-off: 25th percentile ---
q25th <- quantile(OS$IR1, probs = 0.25)
OS$q25th_IR1 <- 1 + (OS$IR1 > q25th)
OS_25_IR1 <- coxph(Surv(OS_time, status) ~ q25th_IR1, data = OS)


# --- Cut-off: 75th percentile ---
q75th <- quantile(OS$IR1, probs = 0.75)
OS$q75th_IR1 <- 1 + (OS$IR1 > q75th)
OS_75_IR1 <- coxph(Surv(OS_time, status) ~ q75th_IR1, data = OS)

In [None]:
# IR2

# Survival tests on the dichotomised average reads of the IR2 genes.
# For each cut-off the samples are split into two groups and a univariate
# Cox PH model is fitted on the group indicator.
# Coding: 1 = value at or below the cut-off, 2 = value above the cut-off.

# --- Cut-off: median ---
med <- median(OS$IR2)
OS$median_IR2 <- 1 + (OS$IR2 > med)
OS_50_IR2 <- coxph(Surv(OS_time, status) ~ median_IR2, data = OS)


# --- Cut-off: 25th percentile ---
q25th <- quantile(OS$IR2, probs = 0.25)
OS$q25th_IR2 <- 1 + (OS$IR2 > q25th)
OS_25_IR2 <- coxph(Surv(OS_time, status) ~ q25th_IR2, data = OS)


# --- Cut-off: 75th percentile ---
q75th <- quantile(OS$IR2, probs = 0.75)
OS$q75th_IR2 <- 1 + (OS$IR2 > q75th)
OS_75_IR2 <- coxph(Surv(OS_time, status) ~ q75th_IR2, data = OS)