/
apply_RF_model.R
197 lines (158 loc) · 8.24 KB
/
apply_RF_model.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
<<<<<<< HEAD
library(randomForest)
library(ROCR)
require(Hmisc)
#Set working directory and filenames for Input/output
#setwd("/Users/ogriffit/git/biostar-tutorials/MachineLearning")
setwd("/Users/nspies/biostar-tutorials/MachineLearning")
RF_model_file="RF_model"
datafile="testset_gcrma.txt" #combined (standardCDF + customCDF)
clindatafile="testset_clindetails.txt"
outfile="testset_RFoutput.txt"
case_pred_outfile="testset_CasePredictions.txt"
ROC_pdffile="testset_ROC.pdf"
vote_dist_pdffile="testset_vote_dist.pdf"
#Read in data (expecting a tab-delimited file with header line and rownames)
data_import=read.table(datafile, header = TRUE, na.strings = "NA", sep="\t")
clin_data_import=read.table(clindatafile, header = TRUE, na.strings = "NA", sep="\t")
#NEED TO GET CLINICAL DATA IN SAME ORDER AS GCRMA DATA
clin_data_order=order(clin_data_import[,"geo_accn"])
clindata=clin_data_import[clin_data_order,]
data_order=order(colnames(data_import)[4:length(colnames(data_import))])+3 #Order data without first three columns, then add 3 to get correct index in original file
rawdata=data_import[,c(1:3,data_order)] #grab first three columns, and then remaining columns in order determined above
header=colnames(rawdata)
#Get predictor variables
predictor_data=t(rawdata[,4:length(header)])
predictor_names=as.vector((rawdata[,3])) #gene symbol + probe ids
colnames(predictor_data)=predictor_names
#Load RandomForests classifier from file (object "rf_output" which was saved previously)
load(file=RF_model_file)
#Run test data through forest
RF_predictions_responses=predict(rf_output, predictor_data, type="response")
#RF_predictions_probs=predict(rf_output, predictor_data, type="prob")
RF_predictions_votes=predict(rf_output, predictor_data, type="vote")
#Join predictions with clinical data
clindata_plusRF=cbind(clindata,RF_predictions_responses,RF_predictions_votes)
#Exclude rows with missing clinical data needed for subsequent steps
clindata_plusRF=clindata_plusRF[which(!is.na(clindata_plusRF[,"event.rfs"])),]
#write results to file
write.table(clindata_plusRF,file=case_pred_outfile, sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
#Determine performance statistics
confusion=table(clindata_plusRF[,c("event.rfs","RF_predictions_responses")])
rownames(confusion)=c("NoRelapse","Relapse")
sensitivity=(confusion[2,2]/(confusion[2,2]+confusion[2,1]))*100
specificity=(confusion[1,1]/(confusion[1,1]+confusion[1,2]))*100
overall_error=((confusion[1,2]+confusion[2,1])/sum(confusion))*100
overall_accuracy=((confusion[1,1]+confusion[2,2])/sum(confusion))*100
class1_error=confusion[1,2]/(confusion[1,1]+confusion[1,2])
class2_error=confusion[2,1]/(confusion[2,2]+confusion[2,1])
#Prepare stats for output to file
sens_out=paste("sensitivity=",sensitivity, sep="")
spec_out=paste("specificity=",specificity, sep="")
err_out=paste("overall error rate=",overall_error,sep="")
acc_out=paste("overall accuracy=",overall_accuracy,sep="")
misclass_1=paste(confusion[1,2], colnames(confusion)[1],"misclassified as", colnames(confusion)[2], sep=" ")
misclass_2=paste(confusion[2,1], colnames(confusion)[2],"misclassified as", colnames(confusion)[1], sep=" ")
#Prepare confusion table for writing to file
confusion_out=confusion[1:2,1:2]
confusion_out=cbind(rownames(confusion_out), confusion_out)
#Create ROC curve plot and calculate AUC
#Can use Relapse vote fractions fractions as predictive variable
=======
#Load necessary libraries
library(randomForest)
library(ROCR)
require(Hmisc)
#Set working directory and filenames for Input/output
setwd("/Users/ogriffit/git/biostar-tutorials/MachineLearning")
RF_model_file="RF_model"
datafile="testset_gcrma.txt"
clindatafile="testset_clindetails.txt"
outfile="testset_RFoutput.txt"
case_pred_outfile="testset_CasePredictions.txt"
ROC_pdffile="testset_ROC.pdf"
vote_dist_pdffile="testset_vote_dist.pdf"
#Read in data (expecting a tab-delimited file with header line and rownames)
data_import=read.table(datafile, header = TRUE, na.strings = "NA", sep="\t")
clin_data_import=read.table(clindatafile, header = TRUE, na.strings = "NA", sep="\t")
#NEED TO GET CLINICAL DATA IN SAME ORDER AS GCRMA DATA
clin_data_order=order(clin_data_import[,"geo_accn"])
clindata=clin_data_import[clin_data_order,]
data_order=order(colnames(data_import)[4:length(colnames(data_import))])+3 #Order data without first three columns, then add 3 to get correct index in original file
rawdata=data_import[,c(1:3,data_order)] #grab first three columns, and then remaining columns in order determined above
header=colnames(rawdata)
#Get predictor variables
predictor_data=t(rawdata[,4:length(header)])
predictor_names=as.vector((rawdata[,3])) #gene symbol
colnames(predictor_data)=predictor_names
#Load RandomForests classifier from file (object "rf_output" which was saved previously)
load(file=RF_model_file)
#Run test data through forest
RF_predictions_responses=predict(rf_output, predictor_data, type="response")
RF_predictions_votes=predict(rf_output, predictor_data, type="vote")
#Join predictions with clinical data
clindata_plusRF=cbind(clindata,RF_predictions_responses,RF_predictions_votes)
#Exclude rows with missing clinical data needed for subsequent steps
clindata_plusRF=clindata_plusRF[which(!is.na(clindata_plusRF[,"event.rfs"])),]
#write results to file
write.table(clindata_plusRF,file=case_pred_outfile, sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE)
#Determine performance statistics
confusion=table(clindata_plusRF[,c("event.rfs","RF_predictions_responses")])
rownames(confusion)=c("NoRelapse","Relapse")
sensitivity=(confusion[2,2]/(confusion[2,2]+confusion[2,1]))*100
specificity=(confusion[1,1]/(confusion[1,1]+confusion[1,2]))*100
overall_error=((confusion[1,2]+confusion[2,1])/sum(confusion))*100
overall_accuracy=((confusion[1,1]+confusion[2,2])/sum(confusion))*100
class1_error=confusion[1,2]/(confusion[1,1]+confusion[1,2])
class2_error=confusion[2,1]/(confusion[2,2]+confusion[2,1])
#Prepare stats for output to file
sens_out=paste("sensitivity=",sensitivity, sep="")
spec_out=paste("specificity=",specificity, sep="")
err_out=paste("overall error rate=",overall_error,sep="")
acc_out=paste("overall accuracy=",overall_accuracy,sep="")
misclass_1=paste(confusion[1,2], colnames(confusion)[1],"misclassified as", colnames(confusion)[2], sep=" ")
misclass_2=paste(confusion[2,1], colnames(confusion)[2],"misclassified as", colnames(confusion)[1], sep=" ")
#Prepare confusion table for writing to file
confusion_out=confusion[1:2,1:2]
confusion_out=cbind(rownames(confusion_out), confusion_out)
#Print results to file
write("confusion table", file=outfile)
write.table(confusion_out,file=outfile, sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE, append=TRUE)
write(c(sens_out,spec_out,acc_out,err_out,misclass_1,misclass_2,AUC_out), file=outfile, append=TRUE)
#Create ROC curve plot and calculate AUC
#Can use Relapse vote fractions fractions as predictive variable
>>>>>>> 5bd7c52fbf394c54d848092ca1abf0d5e4a82730
#The ROC curve will be generated by stepping up through different thresholds for calling Response vs NoResponse
target=clindata_plusRF[,"event.rfs"]
target[target==1]="Relapse"
target[target==0]="NoRelapse"
relapse_scores=clindata_plusRF[,"Relapse"]
<<<<<<< HEAD
pred=prediction(relapse_scores,target)
#First calculate the AUC value
perf_AUC=performance(pred,"auc")
AUC=perf_AUC@y.values[[1]]
AUC_out=paste("AUC=",AUC,sep="")
#Then, plot the actual ROC curve
perf_ROC=performance(pred,"tpr","fpr")
pdf(file=ROC_pdffile)
plot(perf_ROC, main="ROC plot")
text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
dev.off()
#Print results to file
write("confusion table", file=outfile)
write.table(confusion_out,file=outfile, sep="\t", quote=FALSE, col.names=TRUE, row.names=FALSE, append=TRUE)
write(c(sens_out,spec_out,acc_out,err_out,misclass_1,misclass_2,AUC_out), file=outfile, append=TRUE)
=======
pred=prediction(relapse_scores,target)
#First calculate the AUC value
perf_AUC=performance(pred,"auc")
AUC=perf_AUC@y.values[[1]]
AUC_out=paste("AUC=",AUC,sep="")
#Then, plot the actual ROC curve
perf_ROC=performance(pred,"tpr","fpr")
pdf(file=ROC_pdffile)
plot(perf_ROC, main="ROC plot")
text(0.5,0.5,paste("AUC = ",format(AUC, digits=5, scientific=FALSE)))
dev.off()
>>>>>>> 5bd7c52fbf394c54d848092ca1abf0d5e4a82730