In [11]:
library(breastCancerMAINZ)
library(breastCancerTRANSBIG)
library(breastCancerUPP)
library(breastCancerUNT)
library(breastCancerNKI)

suppressPackageStartupMessages(library("genefu"))
suppressPackageStartupMessages(library("AIMS"))
suppressPackageStartupMessages(library("caret"))

“replacing previous import ‘ellipsis::check_dots_unnamed’ by ‘rlang::check_dots_unnamed’ when loading ‘hms’”
“replacing previous import ‘ellipsis::check_dots_used’ by ‘rlang::check_dots_used’ when loading ‘hms’”
“replacing previous import ‘ellipsis::check_dots_empty’ by ‘rlang::check_dots_empty’ when loading ‘hms’”
“replacing previous import ‘ellipsis::check_dots_unnamed’ by ‘rlang::check_dots_unnamed’ when loading ‘tibble’”
“replacing previous import ‘ellipsis::check_dots_used’ by ‘rlang::check_dots_used’ when loading ‘tibble’”
“replacing previous import ‘ellipsis::check_dots_empty’ by ‘rlang::check_dots_empty’ when loading ‘tibble’”


In [6]:
data_folder <- "preprocessed/"
# read the file mapping gene ids in expression matrix to Entrez ids 
#gene_annot_file <- "../Xena_gene_info.tsv"
gene_annot_file <- "gene_id_mapping.tsv"


### input

### log2(x+1) but *not* z-score transformed expressions
exprs_file <- "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_v6.tsv" 
#exprs_file <-"METABRIC_1904_17Kgenes.log2_exprs_v6.tsv"


### output
output_file <- "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.subtypes_and_signatures_v6.tsv"
#output_file  <- "METABRIC_1904_17Kgenes.subtypes_and_signatures_v6.tsv"

In [7]:
exprs <- t(read.delim(paste0(data_folder,exprs_file), row.names = 1))
rownames(exprs) <- gsub("\\.", "-",rownames(exprs))
exprs <- exprs[,sort(colnames(exprs))]

head(exprs,3)

Unnamed: 0,A1BG,A1CF,A2M,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA-3C-AAAU-01,7.9692,0.0,13.3261,0.0,6.9353,3.9756,10.3116,10.9377,0.0,0,⋯,9.5575,10.7186,8.6854,10.7141,10.6614,8.8382,11.0071,12.591,11.6938,11.2516
TCGA-3C-AALI-01,8.6285,3.3945,13.7408,0.0,8.1574,0.9647,10.8011,10.5831,1.5376,0,⋯,11.0446,10.8088,7.6605,9.5963,10.9938,8.487,10.0526,13.2659,11.208,9.7164
TCGA-3C-AALJ-01,9.3174,2.3532,13.7631,1.2448,9.7843,0.0,10.2706,11.2474,1.2448,0,⋯,8.9435,11.9863,6.7841,9.6513,9.2955,9.0229,9.6966,13.0363,10.5635,9.9067


In [8]:
gene_anno <- read.delim(paste0(data_folder,gene_annot_file), row.names = 1,sep = "\t")
gene_anno <- gene_anno[,c("probe","EntrezGene.ID","Gene.Symbol")]
gene_anno <- gene_anno[colnames(exprs),]
head(gene_anno,3)

Unnamed: 0_level_0,probe,EntrezGene.ID,Gene.Symbol
Unnamed: 0_level_1,<fct>,<dbl>,<fct>
A1BG,A1BG,1,A1BG
A1CF,A1CF,29974,A1CF
A2M,A2M,2,A2M


In [9]:
dim(gene_anno)
dim(exprs)

## PAM50

In [9]:
SubtypePredictions<-molecular.subtyping(sbt.model = "pam50",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
# missed genes
#df <- pam50$centroids.map
#df[!(df$EntrezGene.ID %in% gene_anno$EntrezGene.ID),]
pam50_subt <- SubtypePredictions$subtype
table(pam50_subt)
table(pam50_subt)/dim(exprs)[[1]]

pam50_subt
 Basal   Her2   LumB   LumA Normal 
   195    108    318    426     32 

pam50_subt
     Basal       Her2       LumB       LumA     Normal 
0.18072289 0.10009268 0.29471733 0.39481001 0.02965709 

In [10]:
# missed genes
df <- pam50$centroids.map
df[!(df$EntrezGene.ID %in% gene_anno$EntrezGene.ID),]

probe,probe.centroids,EntrezGene.ID
<chr>,<chr>,<int>


In [11]:
SubtypePredictions<-molecular.subtyping(sbt.model = "scmod2",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = TRUE)
scmod2_subt <- SubtypePredictions$subtype
table(scmod2_subt)
table(scmod2_subt)/dim(exprs)[[1]]

scmod2_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
                  218                   405                   360 
                HER2+ 
                   96 

scmod2_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
           0.20203892            0.37534754            0.33364226 
                HER2+ 
           0.08897127 

In [12]:
SubtypePredictions<-molecular.subtyping(sbt.model = "scmod1", data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
scmod1_subt <- SubtypePredictions$subtype
table(scmod1_subt)
table(scmod1_subt)/dim(exprs)[[1]]

scmod1_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
                  199                   398                   331 
                HER2+ 
                  151 

scmod1_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
            0.1844300             0.3688601             0.3067655 
                HER2+ 
            0.1399444 

In [13]:
SubtypePredictions<-molecular.subtyping(sbt.model = "intClust",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
intClust_subt <- SubtypePredictions$subtype
table(intClust_subt)
table(intClust_subt)/dim(exprs)[[1]]

Found  570  out of  612  Exp features
running classifier with only expression...
12345678910111213141516171819202122232425262728293012345678910Fold 1 :123456789101112131415161718192021222324252627282930
Fold 2 :123456789101112131415161718192021222324252627282930
Fold 3 :123456789101112131415161718192021222324252627282930
Fold 4 :123456789101112131415161718192021222324252627282930
Fold 5 :123456789101112131415161718192021222324252627282930
Fold 6 :123456789101112131415161718192021222324252627282930
Fold 7 :123456789101112131415161718192021222324252627282930
Fold 8 :123456789101112131415161718192021222324252627282930
Fold 9 :123456789101112131415161718192021222324252627282930
Fold 10 :123456789101112131415161718192021222324252627282930


intClust_subt
 iC1  iC2  iC3  iC4  iC5  iC6  iC7  iC8  iC9 iC10 
  93   33  214  163   66   39  108  120   78  165 

intClust_subt
       iC1        iC2        iC3        iC4        iC5        iC6        iC7 
0.08619092 0.03058387 0.19833179 0.15106580 0.06116775 0.03614458 0.10009268 
       iC8        iC9       iC10 
0.11121409 0.07228916 0.15291937 

In [14]:
SubtypePredictions<-molecular.subtyping(sbt.model = "AIMS",data = scale(exprs, scale = F, center = F),
                                            annot = gene_anno,do.mapping = T)
AIMS_subt <- SubtypePredictions$subtype
table(AIMS_subt)
table(AIMS_subt)/dim(exprs)[[1]]
AIMS_subt <- AIMS_subt[,"20"]

You are missing the pair or have more than one 1466<79682 in 

You are missing the pair or have more than one 29842<55765 in 

You are missing the pair or have more than one 18<79682 in 

You are missing the pair or have more than one 2330<7272 in 

You are missing the pair or have more than one 142<7450 in 

Current k = 20



AIMS_subt
 Basal   Her2   LumA   LumB Normal 
   189    105    390    321     74 

AIMS_subt
     Basal       Her2       LumA       LumB     Normal 
0.17516219 0.09731233 0.36144578 0.29749768 0.06858202 

### Claudin-low

In [15]:
data(claudinLowData)

# from https://github.com/clfougner/ClaudinLow/blob/master/Code/METABRIC_patientData.r
entrezID_CLgenes <- claudinLowData$fnames
# genes x samples, Entrez gene ids
exprs_entrez <- t(exprs)
rownames(exprs_entrez) <-gene_anno[rownames(exprs_entrez),"EntrezGene.ID"]
exprs_entrez <- exprs_entrez[na.omit(rownames(exprs_entrez)),]

overlappingCL_entrezID <- intersect(entrezID_CLgenes, rownames(exprs_entrez))
length(overlappingCL_entrezID )


In [16]:
# Select relevant rows
exprs_CLGenes <- exprs_entrez[row.names(exprs_entrez) %in% overlappingCL_entrezID, ]
dim(exprs_CLGenes )

# gene means = 0, sd =1
exprs_CLGenes_scaled <- t(scale(t(exprs_CLGenes), scale = T, center = T)) 


In [17]:
# Train centroids based on available genes
trainingData <- claudinLowData
trainingData$xd <- medianCtr(trainingData$xd)
trainingData$xd <- trainingData$xd[rownames(trainingData$xd) %in% rownames(exprs_CLGenes), ]
dim(trainingData$xd)

# gene means = 0, sd =1
trainingData_scaled <- t(scale(t(trainingData$xd), scale = TRUE, center = TRUE))

In [18]:
#mean(exprs_CLGenes_scaled["6712",])
#sd(exprs_CLGenes_scaled["6712",])
#mean(trainingData_scaled ["6712",])
#sd(trainingData_scaled  ["6712",])

In [19]:
cl_class <- claudinLow(x = trainingData_scaled, 
                       classes = as.matrix(trainingData$classes$Group, ncol = 1),
                       y = exprs_CLGenes_scaled,
                       distm = "euclidean")

pred_cl <- cl_class$predictions
pred_cl <- data.frame(sample_id = gsub("\\.", "-", rownames(pred_cl)),
                      ClaudinLow = as.character(pred_cl$Call),
                      stringsAsFactors = FALSE)
pred_cl[pred_cl$ClaudinLow=="Others","claudin_low"] <- 0
length(pred_cl[pred_cl$ClaudinLow=="Claudin","claudin_low"])
pred_cl[pred_cl$ClaudinLow=="Claudin","claudin_low"] <- 1


[1] "Number of genes used: 745"


In [20]:
sample_id <- pred_cl$sample_id
claudin_low <- pred_cl[,"claudin_low"]
pam50_subt <- as.character(pam50_subt)
subtypes <- cbind("sample_id"=sample_id,"PAM50"=pam50_subt,
                  "claudin_low"=claudin_low,
                  "SCMOD2"=scmod2_subt,"SCMOD1"=scmod1_subt,
                  "intClust"=intClust_subt,"AIMS"=AIMS_subt)
head(subtypes,3)
#write.table(subtypes,paste0(data_folder,subtype_file),sep = "\t",quote = FALSE,row.names = F)

Unnamed: 0,sample_id,PAM50,claudin_low,SCMOD2,SCMOD1,intClust,AIMS
TCGA-3C-AAAU-01,TCGA-3C-AAAU-01,LumB,0,ER+/HER2- High Prolif,ER+/HER2- High Prolif,2,LumB
TCGA-3C-AALI-01,TCGA-3C-AALI-01,Her2,0,HER2+,HER2+,1,Her2
TCGA-3C-AALJ-01,TCGA-3C-AALJ-01,LumB,0,ER+/HER2- High Prolif,ER+/HER2- High Prolif,6,LumB


In [21]:
unique(subtypes[,"SCMOD2"])
#rownames(subtypes)[subtypes[,"scmod2_subt"]=="HER2+"]

# Known signatures and risk scores 

In [22]:
data(mod1)
signatures <- NULL
# 
print("SCMGENE- three gene")
modt <- scmgene.robust$mod$AURKA
signatures <- cbind(signatures, "AURKA"=sig.score(x=modt, data=exprs, annot=gene_anno,do.mapping=TRUE, verbose = TRUE)$score)
modt <- scmgene.robust$mod$ESR1
signatures <- cbind(signatures, "ESR1"=sig.score(x=modt, data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)
modt <- scmgene.robust$mod$ERBB2
signatures <- cbind(signatures, "ERBB2"=sig.score(x=modt, data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

# seven modules from Desmedt2008 et al
for (module in names(mod1)){
    x <- sig.score(mod1[[module]], exprs, gene_anno, do.mapping = TRUE, signed = TRUE, verbose = TRUE)$score
    module_name <- paste0('Desmedt2008_',module) 
    print(module_name)
    signatures <- cbind(signatures, x)
    colnames(signatures)[length(colnames(signatures))] <- module_name
}

print("GENIUS")
# GENIUSM1 Risk score from the ER-/HER2- subtype signature in GENIUS model.
# GENIUSM2 HER2+ subtype signature
# GENIUSM3 ER+/HER2- subtype 
genius_results <- genius(data=exprs, annot=gene_anno, do.mapping=TRUE)
signatures <- cbind(signatures, "GENIUS"=genius_results$score,"GENIUSM1"=genius_results$GENIUSM1,
              "GENIUSM2"=genius_results$GENIUSM2,
              "GENIUSM3"=genius_results$GENIUSM3)

# Filipits et al. (2011). "A new molecular predictor of distant recurrence in ER-positive, HER2-negative breast cancer adds independent information to conventional clinical risk factors." 
print("EndoPredict")
ep_results <- genefu::endoPredict(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "EndoPredict"=ep_results$score, "EndoPredict_risk"=ep_results$risk)

# Paik et al. (2004) "A Multigene Assay to Predict Recurrence of Tamoxifen-Treated, Node-Negative Breast Cancer"
print("OncotypeDx - GENE21")
od_results <- genefu::oncotypedx(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "OncotypeDx"=od_results$score, "OncotypeDx_risk"=od_results$risk)

print("GGI - grade index")
signatures <- cbind(signatures, "GGI"=ggi(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

print("TAMR13 - Tamoxifen response")
signatures <- cbind(signatures, "TAMR13"=tamr13(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

# mammaPrint
print("Mammaprint - GENE70")
gene70_results <- gene70(data=exprs, annot=gene_anno, std="none",do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "GENE70"=gene70_results$score,"GENE70_risk"=gene70_results$risk)

print("PIK3CA")
signatures <- cbind(signatures, "PIK3CA"=pik3cags(data=exprs, annot=gene_anno, do.mapping=TRUE))
print("RorS")
rors_results <- rorS(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "rorS"=rors_results$score,"rorS_risk"=rors_results$risk)

head(signatures,5)

[1] "SCMGENE- three gene"


probe candidates: 1/1

probe candidates: 1/1

probe candidates: 1/1

probe candidates: 433/469



[1] "Desmedt2008_ESR1"


probe candidates: 28/28



[1] "Desmedt2008_ERBB2"


probe candidates: 223/229



[1] "Desmedt2008_AURKA"


probe candidates: 67/68



[1] "Desmedt2008_PLAU"


probe candidates: 13/14



[1] "Desmedt2008_VEGF"


probe candidates: 88/95



[1] "Desmedt2008_STAT1"


probe candidates: 9/10



[1] "Desmedt2008_CASP3"
[1] "GENIUS"
[1] "EndoPredict"
[1] "OncotypeDx - GENE21"
[1] "GGI - grade index"


probe candidates: 102/128



[1] "TAMR13 - Tamoxifen response"


probe candidates: 4/7

probe candidates: 27/45

probe candidates: 10/14

probe candidates: 24/38

probe candidates: 3/8

probe candidates: 25/36

probe candidates: 4/7

probe candidates: 7/21

probe candidates: 15/26

probe candidates: 15/30

probe candidates: 3/7



[1] "Mammaprint - GENE70"


49/70 probes are used to compute the score

no standardization of the gene expressions



[1] "PIK3CA"
[1] "RorS"


Unnamed: 0,AURKA,ESR1,ERBB2,Desmedt2008_ESR1,Desmedt2008_ERBB2,Desmedt2008_AURKA,Desmedt2008_PLAU,Desmedt2008_VEGF,Desmedt2008_STAT1,Desmedt2008_CASP3,⋯,EndoPredict_risk,OncotypeDx,OncotypeDx_risk,GGI,TAMR13,GENE70,GENE70_risk,PIK3CA,rorS,rorS_risk
TCGA-3C-AAAU-01,9.7389,12.5624,13.6124,3.164715,6.849571,2.710936,7.014367,-0.1717,6.980868,-2.32,⋯,1,70.66625,1,6.860194,-34.29274,-0.125612245,1,2.104245,51.80392,2
TCGA-3C-AALI-01,10.6689,7.1768,18.4282,2.287569,8.262425,2.822298,7.860093,-0.4111538,8.815984,-2.485467,⋯,1,100.0,1,7.559643,-35.46239,-0.009591837,1,2.304907,93.86951,3
TCGA-3C-AALJ-01,10.2321,13.4848,14.0763,2.735389,6.095989,2.688204,7.83241,-0.3576615,8.30949,-2.541533,⋯,1,100.0,1,7.047194,-34.90484,-0.060106641,1,1.889693,71.83608,3
TCGA-3C-AALK-01,8.6073,12.065,15.9389,2.498951,7.551696,1.922101,8.318599,-0.7630923,7.996899,-2.438433,⋯,1,56.37148,1,6.394736,-40.19144,-0.419387755,0,2.510134,18.22183,1
TCGA-4H-AAAK-01,8.4108,13.1673,14.3222,2.431919,6.1946,1.911874,8.180442,-0.3607615,7.878992,-2.438244,⋯,1,33.23366,1,6.154126,-40.12471,-0.412142857,0,2.445472,17.64936,1


In [23]:
colnames(cbind(subtypes,signatures))
write.table(cbind(subtypes,signatures),paste0(data_folder,output_file),sep = "\t",quote = FALSE,row.names = F)

## OncotypeDx and EndoPredict run separately because they require all genes
- for METABRIC, because several genes are missed
- functions are modified such that it replaces NAs with 0 
- (NAs appear due to sd=0 for added genes)

In [None]:
# if important genes are not in expressions, add zero genes
for (gene in c("BAG1","DHCR7")){
 if (!(gene %in% colnames(exprs))){ 
  e <- rep(0,dim(exprs)[[1]])
  exprs <- cbind(exprs,e)
  colnames(exprs)[length(colnames(exprs))] <- gene
 }
}

exprs <- exprs[,sort(colnames(exprs))]

# reload gene annotation
# read the file mapping gene ids in expression matrix to Entrez ids 
gene_anno <- read.delim(paste0(data_folder,gene_annot_file), row.names = 1,sep = "\t")
gene_anno <- gene_anno[,c("probe","EntrezGene.ID","Gene.Symbol")]
head(gene_anno,3)



In [None]:
oncotypedx2 <-
function(data, annot, do.mapping=FALSE, mapping, verbose=FALSE) {
    
	## the reference genes are not taken into account due to their absence from most platforms
	sig2 <- sig.oncotypedx[sig.oncotypedx[ , "group"] != "reference",  , drop=FALSE]
	dimnames(sig2)[[1]] <- sig2[ , "probe.affy"]
	gt <- nrow(sig2)
	if(do.mapping) { ## not an affy HGU platform
		gid1 <- as.numeric(as.character(sig2[ ,"EntrezGene.ID"]))
		names(gid1) <- dimnames(sig2)[[1]]
		gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
		names(gid2) <- dimnames(annot)[[1]]
		## remove missing and duplicated geneids from the gene list
		rm.ix <- is.na(gid1) | duplicated(gid1)
		gid1 <- gid1[!rm.ix]
		## mqpping
		rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
		gm <- length(rr$geneid2)
		mymapping <- c("mapped"=gm, "total"=gt)
        
		if(length(rr$geneid1) != gt) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the EntrezGene.ID column of annot... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
		gid1 <- rr$geneid2
		gid2 <- rr$geneid1
		data <- rr$data1
        
		myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
        
		## change the names of probes in the data
		dimnames(data)[[2]] <- names(gid2) <- names(gid1)
        
	} else {
		myprobe <- NA
		data <- data[ ,intersect(dimnames(sig2)[[1]], dimnames(data)[[2]])]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
		if(nrow(sig2) != ncol(data)) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the colnames of data... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=myprobe))
		}
	}
	## rename gene names by the gene symbols
    dimnames(data)[[2]] <- dimnames(sig2)[[1]] <- sig2[ , "symbol"]
    
	## scaling between 0 and 15
	data <- apply(data, 2, function(x) { xx <- (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE)); return(xx * 15) })
    ######## change 1###########
    data[is.na(data)] <- 0
	
	## OcotypeDX recurrence score
	## GRB7 group score = 0.9 * GRB7 + 0.1 * HER2 if result < 8, then result = 8
	## ER group score = (0.8 * ER + 1.2 * PGR + BCL2 + SCUBE2) / 4
	## proliferation group score = ( survivin + KI67 + MYBL2 + CCNB1 + STK15) / 5 if result < 6.5, then result = 6.5
	## invasion group score = (CTSL2 + MMP11) / 2
	## RSU = + 0.47 * GRB7 group score - 0.34 * ER group score + 1.04 * proliferation group score + 0.10 * invasion group score + 0.05 * CD68 - 0.08 GSTM1 - 0.07 * BAG1 
    
	cc.ix <- complete.cases(data)
	rs <- rs.unscaled <- rsrisk <- NULL
	for (i in 1:nrow(data)) {
		if(cc.ix[i]) {
			grb7.gs <- 0.9 * data[i, "GRB7"] + 0.1 * data[i, "ERBB2"]
			if (grb7.gs < 8) { grb7.gs <- 8 }

			er.gs <- (0.8 * data[i, "ESR1"] + 1.2 * data[i, "PGR"] + data[i, "BCL2"] + data[i, "SCUBE2"]) / 4

			proliferation.gs <- (data[i, "BIRC5"] + data[i, "MKI67"] + data[i, "MYBL2"] + data[i, "CCNB1"] + data[i, "AURKA"]) / 5
			if (proliferation.gs < 6.5) { proliferation.gs <- 6.5 }

			invasion.gs <- (data[i, "CTSL2"] + data[i, "MMP11"]) / 2
            
			rsu <- 0.47 * (grb7.gs) - 0.34 * (er.gs) + 1.04 * (proliferation.gs) + 0.1 * (invasion.gs) + 0.05 * data[i, "CD68"] - 0.08 * data[i, "GSTM1"] - 0.07 * data[i, "BAG1"]
			## rescale the score
			rsu2 <- rsu
			if(rsu >= 0 & rsu <= 100) { rsu <- 20 * (rsu - 6.7) }
			if(rsu < 0) { rsu <- 0 }
			if(rsu > 100) { rsu <- 100 }
			## use of the official curoffs
			if(rsu < 18) { rsr <- 0 }
			if(rsu >= 18 & rsu < 31) { rsr <- 0.5 }
			if(rsu >= 31) { rsr <- 1 }
		}
		else { rsu <- rsr <- rsu2 <- NA }
		rs.unscaled <- c(rs.unscaled, rsu2)
		rs <- c(rs, rsu)
		rsrisk <- c(rsrisk, rsr)
	}
	names(rs) <- names(rs.unscaled) <- names(rsrisk) <- dimnames(data)[[1]]
	return(list("score"=rs, "risk"=rsrisk, "mapping"=mymapping, "probe"=myprobe))
}

endoPredict2 <-
function(data, annot, do.mapping=FALSE, mapping, verbose=FALSE) {

  ## the reference genes are not taken into account due to their absence from most platforms
  sig2 <- sig.endoPredict[sig.endoPredict[ , "group"] != "REFERENCE", , drop=FALSE]
	rownames(sig2) <- sig2[ , "probe.affy"]
	gt <- nrow(sig2)
	if(do.mapping) { ## not an affy HGU platform
		gid1 <- as.numeric(as.character(sig2[ ,"EntrezGene.ID"]))
		names(gid1) <- dimnames(sig2)[[1]]
		gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
		names(gid2) <- dimnames(annot)[[1]]
		## remove missing and duplicated geneids from the gene list
		rm.ix <- is.na(gid1) | duplicated(gid1)
		gid1 <- gid1[!rm.ix]
		## mqpping
		rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
		gm <- length(rr$geneid2[!is.na(rr$geneid2)])
		mymapping <- c("mapped"=gm, "total"=gt)
		if(!all(is.element(sig2[sig2[ , "group"] == "GOI", "EntrezGene.ID"], rr$geneid1))) { ## if genes of intesignatures are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the EntrezGene.ID column of annot... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
		gid1 <- rr$geneid2
		gid2 <- rr$geneid1
		data <- rr$data1
		myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
		## change the names of probes in the data
		colnames(data) <- names(gid2) <- names(gid1)
    sig2 <- sig2[colnames(data), , drop=FALSE]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
	} else {
		myprobe <- NA
    	nn <- intersect(dimnames(sig2)[[1]], dimnames(data)[[2]])
		data <- data[ , nn]
    	sig2 <- sig2[nn, , drop=FALSE]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
		if (length(nn) < 1) {
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning("No overalp between the gene signature EntrezGene.IDs",
				"and the colnames of your data... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
	}
	## rename gene names by the gene symbols
	colnames(data) <- rownames(sig2) <- sig2[ , "symbol"]
	
	if(do.mapping) {
    ## transform expressions so they match approximately the scale of Affymetrix data
    data <- apply(data, 2, function(x) {
      xx <- (x - quantile(x, probs=0.025, na.rm=TRUE)) / (quantile(x, probs=0.975, na.rm=TRUE) - quantile(x, probs=0.025, na.rm=TRUE)) 
      return((xx * 8) + 6)
    })
    data[!is.na(data) & data < 1] <- 1
    data[!is.na(data) & data > 15] <- 15
  }
  
  ######## change ###########
    data[is.na(data)] <- 0
    
  data <- (data - apply(data, 1, mean, na.rm=TRUE)) + log2(500)
  ## apply transformation factor and offset
  datat <- t(apply(data, 1, function(x, a, b) {
    return((x - b) / a)
  }, a=sig2[ , "a"], b=sig2[ , "b"]))
  data <- matrix(NA, nrow=nrow(data), ncol=ncol(data), dimnames=dimnames(data))
  data[rownames(datat), colnames(datat)] <- datat

	rs <- rs.unscaled <- rsrisk <- rep(NA, nrow(data))
  rs.unscaled <- drop((sig2[ , "weight"] %*% t(data)) - 2.63)
  rs <- sapply(rs.unscaled, function(x) {
    if(!is.na(x)) {
      x <- 1.5 * x + 18.95
      if(x < 0) {
        x <- 0
      } else {
        if(x > 15) {
          x <- 15
        }
      }
    }
    return(x)
  })
  rsrisk <- ifelse(rs >= 5, 1, 0)
	names(rs) <- names(rs.unscaled) <- names(rsrisk) <- dimnames(data)[[1]]
	return(list("score"=rs, "risk"=rsrisk, "mapping"=mymapping, "probe"=myprobe))
}



# Filipits et al. (2011). "A new molecular predictor of distant recurrence in ER-positive, HER2-negative breast cancer adds independent information to conventional clinical risk factors." 
print("EndoPredict")
ep_results <- endoPredict2(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures[,"EndoPredict"] <- ep_results$score
signatures[,"EndoPredict_risk"] <- ep_results$risk

# Paik et al. (2004) "A Multigene Assay to Predict Recurrence of Tamoxifen-Treated, Node-Negative Breast Cancer"
print("OncotypeDx - GENE21")
od_results <- oncotypedx2(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures[,"OncotypeDx"] <- od_results$score
signatures[,"OncotypeDx_risk"] <-od_results$risk

head(signatures,5)

In [None]:
colnames(cbind(subtypes,signatures))
write.table(cbind(subtypes,signatures),paste0(data_folder,output_file),sep = "\t",quote = FALSE,row.names = F)