-
Notifications
You must be signed in to change notification settings - Fork 0
/
Test_repr_code.R
55 lines (33 loc) · 2.6 KB
/
Test_repr_code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#AFTER you complete Info section 1 and obtain two files:
#one file with 1444 Padel descriptors as "file_with_1444_Padel_descriptors.csv"
#and the other file with 17 rdkit descriptors as "file_with_17_rdkit_descriptors.csv"
# THEN: Open R in the same directory as those files:
# V<-read.csv("file_with_17_rdkit_descriptors.csv",sep=',',header=TRUE) # file with the 17 rdkit descriptors. Check the separator symbol and whether the file has a header. Also check whether the first column is the first descriptor: if it is instead a plain column of row numbers, that column must be removed so that the first column is the first molecular descriptor.
# V1<-read.csv('file_with_1444_Padel_descriptors.csv',sep=',',header=TRUE) # file with the 1444 Padel descriptors. Check the separator symbol and whether the file has a header. Also check whether the first column is the first descriptor: if it is instead a plain column of row numbers, that column must be removed so that the first column is the first molecular descriptor.
#Fe1<-cbind(V,V1)
#dim(Fe1) ### the second dimension must be 1461, as 17+1444 = 1461 mol. descriptors
#Before we continue: V and V1 above are YOUR OWN FILES, IF YOU CREATED THEM. We already provide our own test file with these 1461 descriptors, so you can use our test file to test our results.
#In that case you can also load our file as:
# Load the provided test-set descriptor table.
Fe1 <- read.csv("AqSolDB-n_test_set_mol_descr.csv", sep = ",", header = TRUE)
# Remove the first column, which is not a descriptor.
# drop = FALSE keeps a data frame even if only one column would remain.
Fe1 <- Fe1[, -1, drop = FALSE]
# Check the dimensions. print() is explicit so the check is also shown when
# the script is run through source(), which does not echo top-level values.
print(dim(Fe1))
# Expected for the provided test file: [1] 220 1461 (1461 descriptor columns).
Fe1 <- as.matrix(Fe1)

# The 26 AqSolDB-n descriptors, in the order used by the scaling step below.
# "MDEC.33" is spelled with a dot because read.csv() (check.names = TRUE by
# default) turns characters that are invalid in R names into dots.
descriptor_names <- c(
  "MolLogP", "XLogP", "ATS0p", "ZMIC1", "piPC2", "MDEC.33", "MPC7", "piPC1",
  "piPC3", "ZMIC2", "piPC6", "AATS5p", "MolMR", "TpiPC", "AATS6v", "piPC4",
  "AATS1i", "piPC10", "GATS2c", "AATS1e", "AATS4v", "TWC", "nH", "ATS1m",
  "Mi", "MPC8"
)

# Fail with a readable message instead of "subscript out of bounds" when the
# descriptor file lacks one of the required columns.
missing_descriptors <- setdiff(descriptor_names, colnames(Fe1))
if (length(missing_descriptors) > 0) {
  stop(
    "Descriptor file is missing required column(s): ",
    paste(missing_descriptors, collapse = ", "),
    call. = FALSE
  )
}

# Select the 26 descriptors. drop = FALSE keeps F a matrix even when the
# input holds a single molecule; without it F would collapse to a vector and
# ncol(F) would be NULL in the scaling step.
# NOTE: in the global environment `F` shadows R's built-in alias for FALSE;
# the name is kept because the later steps of this script refer to it.
F <- Fe1[, descriptor_names, drop = FALSE]

# as.matrix() yields a character matrix if any column of the table is not
# numeric; stop here rather than failing later inside the arithmetic.
if (!is.numeric(F)) {
  stop(
    "Descriptor matrix is not numeric; check the descriptor file for ",
    "non-numeric columns.",
    call. = FALSE
  )
}
### Load the mean vector and the standard-deviation vector, then scale the
### descriptor matrix F column by column: (value - mean) / sd.
meanvect <- as.vector(t(read.csv("meanvect.csv", header = FALSE)))
sdvect <- as.vector(t(read.csv("sdvect.csv", header = FALSE)))

# The column-wise scaling below needs F to be a matrix.
if (!is.matrix(F)) {
  stop("F must be a matrix of descriptors (molecules x descriptors).",
       call. = FALSE)
}
n_descriptors <- ncol(F)

# A vector shorter than ncol(F) would silently turn the trailing columns
# into NA, so refuse to continue in that case.
if (length(meanvect) < n_descriptors || length(sdvect) < n_descriptors) {
  stop(
    "meanvect.csv / sdvect.csv must provide at least ", n_descriptors,
    " values (one per descriptor column); got ", length(meanvect),
    " and ", length(sdvect), ".",
    call. = FALSE
  )
}

# Only the first ncol(F) entries are used; any extra entries are ignored.
col_means <- meanvect[seq_len(n_descriptors)]
col_sds <- sdvect[seq_len(n_descriptors)]
if (!is.numeric(col_means) || !is.numeric(col_sds) ||
    anyNA(col_means) || anyNA(col_sds)) {
  stop("meanvect.csv / sdvect.csv must contain numeric, non-missing values.",
       call. = FALSE)
}

# Vectorised equivalent of looping over the columns: subtract each column's
# mean, then divide by its standard deviation (MARGIN = 2 means columns).
F <- sweep(sweep(F, 2, col_means, FUN = "-"), 2, col_sds, FUN = "/")
# Packages: xgboost provides xgb.load() for the saved model, caret provides
# RMSE(). Attached in the same order as before (xgboost, then caret).
library(xgboost)
library(caret)

# Load the test-set LogS values and flatten them into a plain vector.
Y <- read.csv("AqSolDB-n_test_set_LogS.csv", header = TRUE)
B2 <- as.vector(t(Y))

# Load the saved AqSolDB-N model and predict from the scaled descriptors F.
bst <- xgb.load("FSTI-XGB_1619")
pred_y <- predict(bst, F)

# RMSE subtracts the two vectors element-wise; with unequal lengths R would
# recycle the shorter one (e.g. if the LogS file carried an extra column) and
# return a meaningless value, so check the lengths first.
if (length(B2) != length(pred_y)) {
  stop(
    "Number of LogS values (", length(B2),
    ") does not match number of predictions (", length(pred_y), ").",
    call. = FALSE
  )
}

# Root-mean-square error between predictions and the loaded LogS values.
# Arguments are named to match caret's RMSE(pred, obs) signature; print() is
# explicit so the value is also shown when the script is run via source().
print(caret::RMSE(pred = pred_y, obs = B2))
# [1] 0.5943423
#Bravo, you have correctly reproduced our result for the internal independent test set (Table 4, last row, RMSEV value).