-
Notifications
You must be signed in to change notification settings - Fork 2
/
protein_analysis_test.R
238 lines (176 loc) · 9.48 KB
/
protein_analysis_test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
library(keras)
library(tensortree) # for global install:
# devtools::install_github("oneilsh/tensortree", lib = "/usr/lib/R/site-library") # if root...
# on uniprot: search e.g. "P450 length:[450 TO 700]", then Download -> Preview first 10, grab url, change 10 to 2000...
# wget 'https://www.uniprot.org/uniprot/?query=P450%20length:[450%20TO%20700]&format=fasta&limit=2000&sort=score' -O datasets/proteins/p450s.fa
# read dataframe of metadata about the two files
seqids_df <- fastas_to_targets_df(c("p450s.fa", "fmos.fa"), directory = "datasets/proteins")
print(head(seqids_df))
# shuffle the dataset
shuffled_indices <- sample(1:nrow(seqids_df))
seqids_df <- seqids_df[shuffled_indices, ]
# a placeholder for results, 5 rows, three columns
metrics_df <- data.frame(loss = rep(NA, 5), accuracy = rep(NA, 5), fold = rep(NA, 5))
for(i in 0:4) {
message("Training fold ", i)
potential_indices <- 1:nrow(seqids_df)
train_indices <- potential_indices[potential_indices %% 5 != i]
metrics <- train_model(seqids_df, train_indices)
metrics_df[i+1, ] <- c(metrics$loss, metrics$acc, i)
}
# split it into two dfs, one with 80% of rows, the other with the rest
#train_indices <- sample(1:nrow(seqids_df), size = nrow(seqids_df) * 0.8)
# a function which, given a dataframe with training and validation data,
# and a vector of indices to use for training (others will be used for validation)
train_model <- function(seqids_df, train_indices) {
seqids_df_train <- seqids_df[train_indices, ]
seqids_df_validate <- seqids_df[-train_indices, ]
# figure out the minimum length of all the sequences (both training and validation), we'll
# trim all the sequences to this length to pack them into tensors
min_length <- min(seqids_df$seq_len) # minimum over all sequences; 450
# we have two classes, so we'll use "binary" class_mode rather than "categorical"
train_generator <- flow_sequences_from_fasta_df(seqids_df_train,
targets_col = "class",
alphabet = "protein",
batch_size = 64,
class_mode = "binary",
trim_to = min_length)
validate_generator <- flow_sequences_from_fasta_df(seqids_df_validate,
targets_col = "class",
alphabet = "protein",
batch_size = 64,
class_mode = "binary",
trim_to = min_length)
# let's see what we've got
#batch <- train_generator()
#batch[[1]] %>% tt() %>% print(bottom = "2d", end_n = 21)
#batch[[2]] %>% tt() %>% print(bottom = "1d", end_n = 64)
# we'll do a 1-d CNN, the analog for 2d CNNs for image data
# note the activation of "sigmoid" on the final layer, which only has one node - good for predicting
# a single number between 0 and 1 (to go along with a binary classification task)
network <- keras_model_sequential() %>%
layer_conv_1d(filters = 16, kernel_size = 7, activation = "relu", input_shape = c(450, 21)) %>%
layer_max_pooling_1d(pool_size = 2) %>%
layer_conv_1d(filters = 32, kernel_size = 7, activation = "relu") %>%
layer_max_pooling_1d(pool_size = 2) %>%
layer_flatten() %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1, activation = "sigmoid") # 1 unit sigmoid -> good output for "binary" prediction
# note the use of binary_crossentropy rather than categorical_crossentropy
compile(network,
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = c("accuracy"))
# 3200 rows in training df, batch size 64; 3200/64 = 50
# 800 in validation, batch size 64, 800 / 64 ~= 12 (10, 12, whateves)
history <- fit_generator(network,
train_generator,
epochs = 5,
steps_per_epoch = 20)
#validation_data = validate_generator,
#validation_steps = 10)
# list with entries for $loss and $acc
metrics <- evaluate_generator(network, generator = validate_generator, steps = 10)
return(metrics)
}
# try an RNN
# note the activation of "sigmoid" on the final layer, which only has one node - good for predicting
# a single number between 0 and 1 (to go along with a binary classification task)
#network <- keras_model_sequential() %>%
# layer_gru(input_shape = c(450, 21), units = 32, dropout = 0.3, return_sequences = TRUE) %>%
# layer_batch_normalization() %>%
# layer_flatten() %>%
# layer_dropout(rate = 0.3) %>%
# layer_batch_normalization() %>%
# layer_dense(units = 64, activation = "relu") %>%
# layer_dense(units = 1, activation = "sigmoid") # 1 unit sigmoid -> good output for "binary" prediction
# note the use of binary_crossentropy rather than categorical_crossentropy
compile(network,
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = c("accuracy"))
# 3200 rows in training df, batch size 64; 3200/64 = 50
# 800 in validation, batch size 64, 800 / 64 ~= 12 (10, 12, whateves)
history <- fit_generator(network,
train_generator,
epochs = 5,
steps_per_epoch = 50,
validation_data = validate_generator,
validation_steps = 10)
## Same thing, except this time we want to predict the percent of the sequences that are
# non-polar letters
library(keras)
library(tensortree)
# a function that, given a single sequence, returns a percent
# e.g. get_percent_nonpolar("MEPFVVL") -> 0.85
get_percent_nonpolar <- function(seq) {
# F, L, W, etc are "nonpolar" - we're going replace anything that's not one of these with "" (delete them)
only_nonpolar <- gsub("[^FLWPIMVA]", "", seq)
return(nchar(only_nonpolar)/nchar(seq))
}
# now when we read the metadata df, we can specify a named list of functions to use to produce extra columns
seqids_df <- fastas_to_targets_df(c("p450s.fa", "fmos.fa"),
directory = "datasets/proteins",
function_list = list(percent_nonpolar = get_percent_nonpolar),
alphabet = "protein")
# see:
print(head(seqids_df))
# split into train and validate
train_indices <- sample(1:nrow(seqids_df), size = nrow(seqids_df) * 0.8)
seqids_df_train <- seqids_df[train_indices, ]
seqids_df_validate <- seqids_df[-train_indices, ]
# figure out what we'll trim to
min_length <- min(seqids_df$seq_len) # minimum over all sequences
# as above, except: targets_col is the new "percent_nonpolar"
# class_mode = "identity" (don't alter the targets)
train_generator <- flow_sequences_from_fasta_df(seqids_df_train,
targets_col = "percent_nonpolar",
alphabet = "protein",
batch_size = 64,
class_mode = "identity",
trim_to = min_length)
validate_generator <- flow_sequences_from_fasta_df(seqids_df_validate,
targets_col = "percent_nonpolar",
alphabet = "protein",
batch_size = 64,
class_mode = "identity",
trim_to = min_length)
# as above, except activation is linear instead of sigmoid - so that network can output any number, positive or
# negative (though since our percents are between 0 and 1 anyway, we could have tried sigmoid)
network <- keras_model_sequential() %>%
layer_conv_1d(filters = 16, kernel_size = 3, activation = "relu", input_shape = c(450, 21)) %>%
layer_max_pooling_1d(pool_size = 2) %>%
layer_conv_1d(filters = 32, kernel_size = 3, activation = "relu") %>%
layer_max_pooling_1d(pool_size = 2) %>%
layer_flatten() %>%
layer_dense(units = 64, activation = "relu") %>%
layer_dense(units = 1, activation = "linear")
# loss function is "mean squared error" - appropriate for regression
# metric is "mean absolute error" - appropriate for regression ("accuracy" is not appropriate for regression,
# even though keras will let us specify it and it will look like it makes sense)
compile(network,
optimizer = "rmsprop",
loss = "mse",
metrics = c("mae"))
fit_generator(network,
train_generator,
epochs = 5,
steps_per_epoch = 50,
validation_data = validate_generator,
validation_steps = 10)
data_generator <- flow_sequences_from_fasta_df(seqids_df,
targets_col = "class",
alphabet = "protein",
batch_size = 1000,
class_mode = "binary",
trim_to = min_length)
batch <- data_generator()
actuals <- batch[[2]]
preds <- predict(network, batch[[1]]) %>% # returns a shape (1, 1000) tensor...
array_reshape(dim = c(1000)) # make it a vector
head(preds)
head(actuals)
pred_classes <- preds * 0
pred_classes[preds > 0.5] <- 1
table(actuals, pred_classes)
library(PRROC)