In [1]:
library(breastCancerMAINZ)
library(breastCancerTRANSBIG)
library(breastCancerUPP)
library(breastCancerUNT)
library(breastCancerNKI)

suppressPackageStartupMessages(library("genefu"))
suppressPackageStartupMessages(library("AIMS"))
suppressPackageStartupMessages(library("caret"))

In [4]:
data_folder <- "preprocessed/"
# read the file mapping gene ids in expression matrix to Entrez ids 
#gene_annot_file <- "../Xena_gene_info.tsv"
gene_annot_file <- "gene_id_mapping.tsv"


### input

### log2(x+1) but *not* z-score transformed expressions
#exprs_file <- "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.log2_exprs_v5.tsv" 
exprs_file <-"METABRIC_1904_17Kgenes.log2_exprs_v5.tsv"


### output
#output_file <- "TCGA-BRCA_1079_17Kgenes.Xena_TCGA_PanCan.signatures_v5.tsv"
output_file  <- "METABRIC_1904_17Kgenes.signatures_v5.tsv"




In [5]:
exprs <- t(read.delim(paste0(data_folder,exprs_file), row.names = 1))
rownames(exprs) <- gsub("\\.", "-",rownames(exprs))
exprs <- exprs[,sort(colnames(exprs))]

head(exprs,3)

Unnamed: 0,A1BG,A1CF,A2M,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,AADACL2,⋯,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
MB-0000,5.369883,5.464824,9.840336,4.959595,6.651367,5.655177,5.590128,7.637542,5.632396,5.257766,⋯,5.259177,5.650032,6.27976,5.709677,6.623938,5.165483,9.003693,9.19709,7.536208,7.689014
MB-0002,5.414432,5.275869,8.316607,5.366424,6.644259,5.316182,6.239036,8.702124,5.43429,5.367004,⋯,5.520065,6.825942,6.464244,6.80913,6.812159,5.357121,8.756802,8.870436,7.555341,7.87396
MB-0005,5.527208,5.453274,9.924913,5.477048,6.243935,5.228924,6.08625,6.781157,5.58391,5.465849,⋯,6.265199,7.510925,6.47933,6.419635,5.744392,5.094948,9.156367,8.921823,6.44628,8.105143


In [9]:
gene_anno <- read.delim(paste0(data_folder,gene_annot_file), row.names = 1,sep = "\t")
gene_anno <- gene_anno[,c("probe","EntrezGene.ID","Gene.Symbol")]
gene_anno <- gene_anno[colnames(exprs),]
head(gene_anno,3)

Unnamed: 0_level_0,probe,EntrezGene.ID,Gene.Symbol
Unnamed: 0_level_1,<fct>,<dbl>,<fct>
A1BG,A1BG,1,A1BG
A1CF,A1CF,29974,A1CF
A2M,A2M,2,A2M


In [10]:
dim(gene_anno)
dim(exprs)

## PAM50

In [11]:
SubtypePredictions<-molecular.subtyping(sbt.model = "pam50",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
# missed genes
#df <- pam50$centroids.map
#df[!(df$EntrezGene.ID %in% gene_anno$EntrezGene.ID),]
pam50_subt <- SubtypePredictions$subtype
table(pam50_subt)
table(pam50_subt)/dim(exprs)[[1]]

pam50_subt
 Basal   Her2   LumB   LumA Normal 
   244    244    764    598     54 

pam50_subt
     Basal       Her2       LumB       LumA     Normal 
0.12815126 0.12815126 0.40126050 0.31407563 0.02836134 

In [12]:
# missed genes
df <- pam50$centroids.map
df[!(df$EntrezGene.ID %in% gene_anno$EntrezGene.ID),]

Unnamed: 0_level_0,probe,probe.centroids,EntrezGene.ID
Unnamed: 0_level_1,<chr>,<chr>,<int>
BAG1,BAG1,BAG1,573
GPR160,GPR160,GPR160,26996
TMEM45B,TMEM45B,TMEM45B,120224


In [13]:
SubtypePredictions<-molecular.subtyping(sbt.model = "scmod2",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = TRUE)
scmod2_subt <- SubtypePredictions$subtype
table(scmod2_subt)
table(scmod2_subt)/dim(exprs)[[1]]

scmod2_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
                  365                   692                   645 
                HER2+ 
                  202 

scmod2_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
            0.1917017             0.3634454             0.3387605 
                HER2+ 
            0.1060924 

In [14]:
SubtypePredictions<-molecular.subtyping(sbt.model = "scmod1", data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
scmod1_subt <- SubtypePredictions$subtype
table(scmod1_subt)
table(scmod1_subt)/dim(exprs)[[1]]

scmod1_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
                  299                   795                   562 
                HER2+ 
                  248 

scmod1_subt
            ER-/HER2- ER+/HER2- High Prolif  ER+/HER2- Low Prolif 
            0.1570378             0.4175420             0.2951681 
                HER2+ 
            0.1302521 

In [15]:
SubtypePredictions<-molecular.subtyping(sbt.model = "intClust",data = scale(exprs, scale = F, center = T),
                                            annot = gene_anno,do.mapping = T, verbose = T)
intClust_subt <- SubtypePredictions$subtype
table(intClust_subt)
table(intClust_subt)/dim(exprs)[[1]]

Found  570  out of  612  Exp features
running classifier with only expression...
12345678910111213141516171819202122232425262728293012345678910Fold 1 :123456789101112131415161718192021222324252627282930
Fold 2 :123456789101112131415161718192021222324252627282930
Fold 3 :123456789101112131415161718192021222324252627282930
Fold 4 :123456789101112131415161718192021222324252627282930
Fold 5 :123456789101112131415161718192021222324252627282930
Fold 6 :123456789101112131415161718192021222324252627282930
Fold 7 :123456789101112131415161718192021222324252627282930
Fold 8 :123456789101112131415161718192021222324252627282930
Fold 9 :123456789101112131415161718192021222324252627282930
Fold 10 :123456789101112131415161718192021222324252627282930


intClust_subt
 iC1  iC2  iC3  iC4  iC5  iC6  iC7  iC8  iC9 iC10 
 126   68  340  337  176   73  189  251  122  222 

intClust_subt
       iC1        iC2        iC3        iC4        iC5        iC6        iC7 
0.06617647 0.03571429 0.17857143 0.17699580 0.09243697 0.03834034 0.09926471 
       iC8        iC9       iC10 
0.13182773 0.06407563 0.11659664 

In [16]:
SubtypePredictions<-molecular.subtyping(sbt.model = "AIMS",data = scale(exprs, scale = F, center = F),
                                            annot = gene_anno,do.mapping = T)
AIMS_subt <- SubtypePredictions$subtype
table(AIMS_subt)
table(AIMS_subt)/dim(exprs)[[1]]
AIMS_subt <- AIMS_subt[,"20"]

You are missing the pair or have more than one 1466<79682 in 

You are missing the pair or have more than one 29842<55765 in 

You are missing the pair or have more than one 18<79682 in 

You are missing the pair or have more than one 2330<7272 in 

You are missing the pair or have more than one 142<7450 in 

Current k = 20



AIMS_subt
 Basal   Her2   LumA   LumB Normal 
   330    269    510    543    252 

AIMS_subt
    Basal      Her2      LumA      LumB    Normal 
0.1733193 0.1412815 0.2678571 0.2851891 0.1323529 

### Claudin-low

In [17]:
data(claudinLowData)

# from https://github.com/clfougner/ClaudinLow/blob/master/Code/METABRIC_patientData.r
entrezID_CLgenes <- claudinLowData$fnames
# genes x samples, Entrez gene ids
exprs_entrez <- t(exprs)
rownames(exprs_entrez) <-gene_anno[rownames(exprs_entrez),"EntrezGene.ID"]
exprs_entrez <- exprs_entrez[na.omit(rownames(exprs_entrez)),]

overlappingCL_entrezID <- intersect(entrezID_CLgenes, rownames(exprs_entrez))
length(overlappingCL_entrezID )


In [18]:
# Select relevant rows
exprs_CLGenes <- exprs_entrez[row.names(exprs_entrez) %in% overlappingCL_entrezID, ]
dim(exprs_CLGenes )

# gene means = 0, sd =1
exprs_CLGenes_scaled <- t(scale(t(exprs_CLGenes), scale = T, center = T)) 


In [19]:
# Train centroids based on available genes
trainingData <- claudinLowData
trainingData$xd <- medianCtr(trainingData$xd)
trainingData$xd <- trainingData$xd[rownames(trainingData$xd) %in% rownames(exprs_CLGenes), ]
dim(trainingData$xd)

# gene means = 0, sd =1
trainingData_scaled <- t(scale(t(trainingData$xd), scale = TRUE, center = TRUE))

In [20]:
#mean(exprs_CLGenes_scaled["6712",])
#sd(exprs_CLGenes_scaled["6712",])
#mean(trainingData_scaled ["6712",])
#sd(trainingData_scaled  ["6712",])

In [21]:
cl_class <- claudinLow(x = trainingData_scaled, 
                       classes = as.matrix(trainingData$classes$Group, ncol = 1),
                       y = exprs_CLGenes_scaled,
                       distm = "euclidean")

pred_cl <- cl_class$predictions
pred_cl <- data.frame(sample_id = gsub("\\.", "-", rownames(pred_cl)),
                      ClaudinLow = as.character(pred_cl$Call),
                      stringsAsFactors = FALSE)
pred_cl[pred_cl$ClaudinLow=="Others","claudin_low"] <- 0
length(pred_cl[pred_cl$ClaudinLow=="Claudin","claudin_low"])
pred_cl[pred_cl$ClaudinLow=="Claudin","claudin_low"] <- 1


[1] "Number of genes used: 745"


In [22]:
sample_id <- pred_cl$sample_id
claudin_low <- pred_cl[,"claudin_low"]
pam50_subt <- as.character(pam50_subt)
subtypes <- cbind("sample_id"=sample_id,"PAM50"=pam50_subt,
                  "claudin_low"=claudin_low,
                  "SCMOD2"=scmod2_subt,"SCMOD1"=scmod1_subt,
                  "inClust"=intClust_subt,"AIMS"=AIMS_subt)
head(subtypes,3)
#write.table(subtypes,paste0(data_folder,subtype_file),sep = "\t",quote = FALSE,row.names = F)

Unnamed: 0,sample_id,PAM50,claudin_low,SCMOD2,SCMOD1,inClust,AIMS
MB-0000,MB-0000,Normal,1,ER-/HER2-,ER-/HER2-,4,Normal
MB-0002,MB-0002,LumA,0,ER+/HER2- Low Prolif,ER+/HER2- High Prolif,4,LumA
MB-0005,MB-0005,LumB,0,ER+/HER2- High Prolif,ER+/HER2- Low Prolif,3,LumB


In [23]:
unique(subtypes[,"SCMOD2"])
#rownames(subtypes)[subtypes[,"scmod2_subt"]=="HER2+"]

# Known signatures and risk scores 

In [24]:
data(mod1)
signatures <- NULL
# 
print("SCMGENE- three gene")
modt <- scmgene.robust$mod$AURKA
signatures <- cbind(signatures, "AURKA"=sig.score(x=modt, data=exprs, annot=gene_anno,do.mapping=TRUE, verbose = TRUE)$score)
modt <- scmgene.robust$mod$ESR1
signatures <- cbind(signatures, "ESR1"=sig.score(x=modt, data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)
modt <- scmgene.robust$mod$ERBB2
signatures <- cbind(signatures, "ERBB2"=sig.score(x=modt, data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

# seven modules from Desmedt2008 et al
for (module in names(mod1)){
    x <- sig.score(mod1[[module]], exprs, gene_anno, do.mapping = TRUE, signed = TRUE, verbose = TRUE)$score
    module_name <- paste0('Desmedt2008_',module) 
    print(module_name)
    signatures <- cbind(signatures, x)
    colnames(signatures)[length(colnames(signatures))] <- module_name
}

print("GENIUS")
# GENIUSM1 Risk score from the ER-/HER2- subtype signature in GENIUS model.
# GENIUSM2 HER2+ subtype signature
# GENIUSM3 ER+/HER2- subtype 
genius_results <- genius(data=exprs, annot=gene_anno, do.mapping=TRUE)
signatures <- cbind(signatures, "GENIUS"=genius_results$score,"GENIUSM1"=genius_results$GENIUSM1,
              "GENIUSM2"=genius_results$GENIUSM2,
              "GENIUSM3"=genius_results$GENIUSM3)

# Filipits et al. (2011). "A new molecular predictor of distant recurrence in ER-positive, HER2-negative breast cancer adds independent information to conventional clinical risk factors." 
print("EndoPredict")
ep_results <- genefu::endoPredict(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "EndoPredict"=ep_results$score, "EndoPredict_risk"=ep_results$risk)

# Paik et al. (2004) "A Multigene Assay to Predict Recurrence of Tamoxifen-Treated, Node-Negative Breast Cancer"
print("OncotypeDx - GENE21")
od_results <- genefu::oncotypedx(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "OncotypeDx"=od_results$score, "OncotypeDx_risk"=od_results$risk)

print("GGI - grade index")
signatures <- cbind(signatures, "GGI"=ggi(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

print("TAMR13 - Tamoxifen response")
signatures <- cbind(signatures, "TAMR13"=tamr13(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)$score)

# mammaPrint
print("Mammaprint - GENE70")
gene70_results <- gene70(data=exprs, annot=gene_anno, std="none",do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "GENE70"=gene70_results$score,"GENE70_risk"=gene70_results$risk, verbose = TRUE)

print("PIK3CA")
signatures <- cbind(signatures, "PIK3CA"=pik3cags(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE))
print("RorS")
rors_results <- rorS(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures <- cbind(signatures, "rorS"=rors_results$score,"rorS_risk"=rors_results$risk)

head(signatures,5)

[1] "SCMGENE- three gene"


probe candidates: 1/1

probe candidates: 1/1

probe candidates: 1/1

probe candidates: 433/469



[1] "Desmedt2008_ESR1"


probe candidates: 28/28



[1] "Desmedt2008_ERBB2"


probe candidates: 223/229



[1] "Desmedt2008_AURKA"


probe candidates: 67/68



[1] "Desmedt2008_PLAU"


probe candidates: 13/14



[1] "Desmedt2008_VEGF"


probe candidates: 88/95



[1] "Desmedt2008_STAT1"


probe candidates: 9/10



[1] "Desmedt2008_CASP3"
[1] "GENIUS"
[1] "EndoPredict"


probe candidates: 7/8



[1] "OncotypeDx - GENE21"


probe candidates: 15/16



[1] "GGI - grade index"


probe candidates: 102/128



[1] "TAMR13 - Tamoxifen response"


probe candidates: 4/7

probe candidates: 27/45

probe candidates: 10/14

probe candidates: 24/38

probe candidates: 3/8

probe candidates: 25/36

probe candidates: 4/7

probe candidates: 7/21

probe candidates: 15/26

probe candidates: 15/30

probe candidates: 3/7



[1] "Mammaprint - GENE70"


49/70 probes are used to compute the score

no standardization of the gene expressions



[1] "PIK3CA"


probe candidates: 225/278



[1] "RorS"


Unnamed: 0,AURKA,ESR1,ERBB2,Desmedt2008_ESR1,Desmedt2008_ERBB2,Desmedt2008_AURKA,Desmedt2008_PLAU,Desmedt2008_VEGF,Desmedt2008_STAT1,Desmedt2008_CASP3,⋯,OncotypeDx,OncotypeDx_risk,GGI,TAMR13,GENE70,GENE70_risk,verbose,PIK3CA,rorS,rorS_risk
MB-0000,6.14895,8.929817,9.333972,1.122729,4.362967,1.069513,5.382351,-1.6399264,6.741159,-1.569133,⋯,,,4.085216,-37.47346,-0.38765306,0,1,1.478941,0.0,1
MB-0002,7.247495,10.047059,9.729606,1.98526,4.531835,1.714929,5.440198,-0.9928992,6.338358,-1.717078,⋯,,,4.654126,-30.2515,-0.22755102,1,1,1.367254,46.30432,2
MB-0005,7.317185,10.041281,9.725825,1.629221,4.374361,1.630036,5.730484,-1.5755017,6.810293,-1.211142,⋯,,,4.855819,-31.82141,-0.0455102,1,1,1.748682,58.4786,3
MB-0006,9.283242,10.404685,10.334979,1.650139,4.581135,1.772883,5.488209,-1.7953025,7.017693,-1.322573,⋯,,,4.79051,-30.57637,-0.05163265,1,1,1.3863,58.7549,3
MB-0008,8.02149,11.276581,9.956267,1.77877,4.817622,2.014978,5.192804,-1.3017143,7.232848,-1.412366,⋯,,,5.395154,-28.44974,0.10163265,1,1,1.453307,63.83198,3


In [25]:
#write.table(cbind(subtypes,signatures),paste0(data_folder,output_file),sep = "\t",quote = FALSE,row.names = F)

## OncotypeDx and EndoPredict run separately because they require all genes
- functions are modified such that it replaces NAs with 0 
- (NAs appear due to sd=0 for added genes)

In [26]:
# if important genes are not in expressions, add zero genes
for (gene in c("BAG1","DHCR7")){
 if (!(gene %in% colnames(exprs))){ 
  e <- rep(0,dim(exprs)[[1]])
  exprs <- cbind(exprs,e)
  colnames(exprs)[length(colnames(exprs))] <- gene
 }
}

exprs <- exprs[,sort(colnames(exprs))]

# reload gene annotation
# read the file mapping gene ids in expression matrix to Entrez ids 
gene_anno <- read.delim(paste0(data_folder,gene_annot_file), row.names = 1,sep = "\t")
gene_anno <- gene_anno[,c("probe","EntrezGene.ID","Gene.Symbol")]
head(gene_anno,3)



Unnamed: 0_level_0,probe,EntrezGene.ID,Gene.Symbol
Unnamed: 0_level_1,<fct>,<dbl>,<fct>
RERE,RERE,473,RERE
RNF165,RNF165,494470,RNF165
PHF7,PHF7,51533,PHF7


In [27]:
oncotypedx2 <-
function(data, annot, do.mapping=FALSE, mapping, verbose=FALSE) {
    
	## the reference genes are not taken into account due to their absence from most platforms
	sig2 <- sig.oncotypedx[sig.oncotypedx[ , "group"] != "reference",  , drop=FALSE]
	dimnames(sig2)[[1]] <- sig2[ , "probe.affy"]
	gt <- nrow(sig2)
	if(do.mapping) { ## not an affy HGU platform
		gid1 <- as.numeric(as.character(sig2[ ,"EntrezGene.ID"]))
		names(gid1) <- dimnames(sig2)[[1]]
		gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
		names(gid2) <- dimnames(annot)[[1]]
		## remove missing and duplicated geneids from the gene list
		rm.ix <- is.na(gid1) | duplicated(gid1)
		gid1 <- gid1[!rm.ix]
		## mqpping
		rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
		gm <- length(rr$geneid2)
		mymapping <- c("mapped"=gm, "total"=gt)
        
		if(length(rr$geneid1) != gt) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the EntrezGene.ID column of annot... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
		gid1 <- rr$geneid2
		gid2 <- rr$geneid1
		data <- rr$data1
        
		myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
        
		## change the names of probes in the data
		dimnames(data)[[2]] <- names(gid2) <- names(gid1)
        
	} else {
		myprobe <- NA
		data <- data[ ,intersect(dimnames(sig2)[[1]], dimnames(data)[[2]])]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
		if(nrow(sig2) != ncol(data)) { ## some genes are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the colnames of data... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=myprobe))
		}
	}
	## rename gene names by the gene symbols
    dimnames(data)[[2]] <- dimnames(sig2)[[1]] <- sig2[ , "symbol"]
    
	## scaling between 0 and 15
	data <- apply(data, 2, function(x) { xx <- (x - min(x, na.rm=TRUE)) / (max(x, na.rm=TRUE) - min(x, na.rm=TRUE)); return(xx * 15) })
    ######## change 1###########
    data[is.na(data)] <- 0
	
	## OcotypeDX recurrence score
	## GRB7 group score = 0.9 * GRB7 + 0.1 * HER2 if result < 8, then result = 8
	## ER group score = (0.8 * ER + 1.2 * PGR + BCL2 + SCUBE2) / 4
	## proliferation group score = ( survivin + KI67 + MYBL2 + CCNB1 + STK15) / 5 if result < 6.5, then result = 6.5
	## invasion group score = (CTSL2 + MMP11) / 2
	## RSU = + 0.47 * GRB7 group score - 0.34 * ER group score + 1.04 * proliferation group score + 0.10 * invasion group score + 0.05 * CD68 - 0.08 GSTM1 - 0.07 * BAG1 
    
	cc.ix <- complete.cases(data)
	rs <- rs.unscaled <- rsrisk <- NULL
	for (i in 1:nrow(data)) {
		if(cc.ix[i]) {
			grb7.gs <- 0.9 * data[i, "GRB7"] + 0.1 * data[i, "ERBB2"]
			if (grb7.gs < 8) { grb7.gs <- 8 }

			er.gs <- (0.8 * data[i, "ESR1"] + 1.2 * data[i, "PGR"] + data[i, "BCL2"] + data[i, "SCUBE2"]) / 4

			proliferation.gs <- (data[i, "BIRC5"] + data[i, "MKI67"] + data[i, "MYBL2"] + data[i, "CCNB1"] + data[i, "AURKA"]) / 5
			if (proliferation.gs < 6.5) { proliferation.gs <- 6.5 }

			invasion.gs <- (data[i, "CTSL2"] + data[i, "MMP11"]) / 2
            
			rsu <- 0.47 * (grb7.gs) - 0.34 * (er.gs) + 1.04 * (proliferation.gs) + 0.1 * (invasion.gs) + 0.05 * data[i, "CD68"] - 0.08 * data[i, "GSTM1"] - 0.07 * data[i, "BAG1"]
			## rescale the score
			rsu2 <- rsu
			if(rsu >= 0 & rsu <= 100) { rsu <- 20 * (rsu - 6.7) }
			if(rsu < 0) { rsu <- 0 }
			if(rsu > 100) { rsu <- 100 }
			## use of the official curoffs
			if(rsu < 18) { rsr <- 0 }
			if(rsu >= 18 & rsu < 31) { rsr <- 0.5 }
			if(rsu >= 31) { rsr <- 1 }
		}
		else { rsu <- rsr <- rsu2 <- NA }
		rs.unscaled <- c(rs.unscaled, rsu2)
		rs <- c(rs, rsu)
		rsrisk <- c(rsrisk, rsr)
	}
	names(rs) <- names(rs.unscaled) <- names(rsrisk) <- dimnames(data)[[1]]
	return(list("score"=rs, "risk"=rsrisk, "mapping"=mymapping, "probe"=myprobe))
}

endoPredict2 <-
function(data, annot, do.mapping=FALSE, mapping, verbose=FALSE) {

  ## the reference genes are not taken into account due to their absence from most platforms
  sig2 <- sig.endoPredict[sig.endoPredict[ , "group"] != "REFERENCE", , drop=FALSE]
	rownames(sig2) <- sig2[ , "probe.affy"]
	gt <- nrow(sig2)
	if(do.mapping) { ## not an affy HGU platform
		gid1 <- as.numeric(as.character(sig2[ ,"EntrezGene.ID"]))
		names(gid1) <- dimnames(sig2)[[1]]
		gid2 <- as.numeric(as.character(annot[ ,"EntrezGene.ID"]))
		names(gid2) <- dimnames(annot)[[1]]
		## remove missing and duplicated geneids from the gene list
		rm.ix <- is.na(gid1) | duplicated(gid1)
		gid1 <- gid1[!rm.ix]
		## mqpping
		rr <- geneid.map(geneid1=gid2, data1=data, geneid2=gid1, verbose=FALSE)
		gm <- length(rr$geneid2[!is.na(rr$geneid2)])
		mymapping <- c("mapped"=gm, "total"=gt)
		if(!all(is.element(sig2[sig2[ , "group"] == "GOI", "EntrezGene.ID"], rr$geneid1))) { ## if genes of intesignatures are missing
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning(sprintf("Probe candidates: %i/%i", gm, gt),
				"\nIncomplete overlap between the gene signature EntrezGene.IDs",
				" and the EntrezGene.ID column of annot... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
		gid1 <- rr$geneid2
		gid2 <- rr$geneid1
		data <- rr$data1
		myprobe <- cbind("probe"=names(gid1), "EntrezGene.ID"=gid1, "new.probe"=names(gid2))
		## change the names of probes in the data
		colnames(data) <- names(gid2) <- names(gid1)
    sig2 <- sig2[colnames(data), , drop=FALSE]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
	} else {
		myprobe <- NA
    	nn <- intersect(dimnames(sig2)[[1]], dimnames(data)[[2]])
		data <- data[ , nn]
    	sig2 <- sig2[nn, , drop=FALSE]
		gm <- ncol(data)
		mymapping <- c("mapped"=gm, "total"=gt)
		if (length(nn) < 1) {
			res <- rep(NA, nrow(data))
			names(res) <- dimnames(data)[[1]]
			warning("No overalp between the gene signature EntrezGene.IDs",
				"and the colnames of your data... Returning all NAs.")
			return(list("score"=res, "risk"=res, "mapping"=mymapping, "probe"=NA))
		}
	}
	## rename gene names by the gene symbols
	colnames(data) <- rownames(sig2) <- sig2[ , "symbol"]
	
	if(do.mapping) {
    ## transform expressions so they match approximately the scale of Affymetrix data
    data <- apply(data, 2, function(x) {
      xx <- (x - quantile(x, probs=0.025, na.rm=TRUE)) / (quantile(x, probs=0.975, na.rm=TRUE) - quantile(x, probs=0.025, na.rm=TRUE)) 
      return((xx * 8) + 6)
    })
    data[!is.na(data) & data < 1] <- 1
    data[!is.na(data) & data > 15] <- 15
  }
  
  ######## change ###########
    data[is.na(data)] <- 0
    
  data <- (data - apply(data, 1, mean, na.rm=TRUE)) + log2(500)
  ## apply transformation factor and offset
  datat <- t(apply(data, 1, function(x, a, b) {
    return((x - b) / a)
  }, a=sig2[ , "a"], b=sig2[ , "b"]))
  data <- matrix(NA, nrow=nrow(data), ncol=ncol(data), dimnames=dimnames(data))
  data[rownames(datat), colnames(datat)] <- datat

	rs <- rs.unscaled <- rsrisk <- rep(NA, nrow(data))
  rs.unscaled <- drop((sig2[ , "weight"] %*% t(data)) - 2.63)
  rs <- sapply(rs.unscaled, function(x) {
    if(!is.na(x)) {
      x <- 1.5 * x + 18.95
      if(x < 0) {
        x <- 0
      } else {
        if(x > 15) {
          x <- 15
        }
      }
    }
    return(x)
  })
  rsrisk <- ifelse(rs >= 5, 1, 0)
	names(rs) <- names(rs.unscaled) <- names(rsrisk) <- dimnames(data)[[1]]
	return(list("score"=rs, "risk"=rsrisk, "mapping"=mymapping, "probe"=myprobe))
}



# Filipits et al. (2011). "A new molecular predictor of distant recurrence in ER-positive, HER2-negative breast cancer adds independent information to conventional clinical risk factors." 
print("EndoPredict")
ep_results <- endoPredict2(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures[,"EndoPredict"] <- ep_results$score
signatures[,"EndoPredict_risk"] <- ep_results$risk

# Paik et al. (2004) "A Multigene Assay to Predict Recurrence of Tamoxifen-Treated, Node-Negative Breast Cancer"
print("OncotypeDx - GENE21")
od_results <- oncotypedx2(data=exprs, annot=gene_anno, do.mapping=TRUE, verbose = TRUE)
signatures[,"OncotypeDx"] <- od_results$score
signatures[,"OncotypeDx_risk"] <-od_results$risk

head(signatures,5)

[1] "EndoPredict"
[1] "OncotypeDx - GENE21"


Unnamed: 0,AURKA,ESR1,ERBB2,Desmedt2008_ESR1,Desmedt2008_ERBB2,Desmedt2008_AURKA,Desmedt2008_PLAU,Desmedt2008_VEGF,Desmedt2008_STAT1,Desmedt2008_CASP3,⋯,OncotypeDx,OncotypeDx_risk,GGI,TAMR13,GENE70,GENE70_risk,verbose,PIK3CA,rorS,rorS_risk
MB-0000,6.14895,8.929817,9.333972,1.122729,4.362967,1.069513,5.382351,-1.6399264,6.741159,-1.569133,⋯,43.89323,1.0,4.085216,-37.47346,-0.38765306,0,1,1.478941,0.0,1
MB-0002,7.247495,10.047059,9.729606,1.98526,4.531835,1.714929,5.440198,-0.9928992,6.338358,-1.717078,⋯,29.95655,0.5,4.654126,-30.2515,-0.22755102,1,1,1.367254,46.30432,2
MB-0005,7.317185,10.041281,9.725825,1.629221,4.374361,1.630036,5.730484,-1.5755017,6.810293,-1.211142,⋯,52.73028,1.0,4.855819,-31.82141,-0.0455102,1,1,1.748682,58.4786,3
MB-0006,9.283242,10.404685,10.334979,1.650139,4.581135,1.772883,5.488209,-1.7953025,7.017693,-1.322573,⋯,45.10648,1.0,4.79051,-30.57637,-0.05163265,1,1,1.3863,58.7549,3
MB-0008,8.02149,11.276581,9.956267,1.77877,4.817622,2.014978,5.192804,-1.3017143,7.232848,-1.412366,⋯,38.5366,1.0,5.395154,-28.44974,0.10163265,1,1,1.453307,63.83198,3


In [28]:
colnames(signatures)
write.table(cbind(subtypes,signatures),paste0(data_folder,output_file),sep = "\t",quote = FALSE,row.names = F)

In [23]:
#data(sig.endoPredict)
#data(sig.oncotypedx)
#data(sig.ggi)
#data(sig.gene70)
#data(sig.pik3cags)
#data(mod1)