forked from ckproc/Msdnet_classification_tf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
train_main.py
372 lines (336 loc) · 18 KB
/
train_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
"""Training a face recognizer with TensorFlow using softmax cross entropy loss
"""
# MIT License
#
# Copyright (c) 2016 David Sandberg
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import os.path
import time
import sys
import random
import tensorflow as tf
import numpy as np
import importlib
import argparse
import h5py
import math
import tensorflow.contrib.slim as slim
from tensorflow.python.ops import data_flow_ops
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
#import models.msdnet as msdnet
#import cifar
import msdnet
'''
def testontest():
num_examples=10000
num_iter = int(math.ceil(num_examples / args.batch_size))
#true_count = 0 # Counts the number of correct predictions.
top1_count=0
top5_count=0
test_step=0
total_sample_count = num_iter * args.batch_size
while test_step<num_iter:
#predictions,targets = sess.run([top_k_op,labels],{image_paths_placeholder: testfilenames})
predictions,targets = sess.run([top_k_op,labels_test])
#print (targets.shape[0])
for i in range(targets.shape[0]):
if targets[i] in predictions.indices[i]:
top5_count += 1
if targets[i] == predictions.indices[i][0]:
top1_count += 1
#top1_count += np.sum(predictions)
#top5_count += np.sum(predictions)
test_step += 1
precision1=top1_count/total_sample_count
precision5=top5_count/total_sample_count
print ('total test step:',test_step)
print('%s: precision @ 1 = %.3f' % (datetime.now(), precision1),'%s: precision @ 5 = %.3f' % (datetime.now(), precision5))
with open('test_result.txt','at') as f:
f.write('%d\t%.5f\t%.5f\n' % (test_step, precision1, precision5))
'''
def softmax(x):
    """Return the softmax of a 1-D vector, or of each row of a 2-D array.

    Args:
        x: array-like of scores; converted to a float ndarray.

    Returns:
        A float ndarray with the same shape as ``x`` whose entries along the
        last axis sum to 1, or ``None`` (after printing a message) when ``x``
        is neither 1- nor 2-dimensional.
    """
    x = np.asarray(x).astype(float)
    if x.ndim not in (1, 2):
        print("The input array is not 1- or 2-dimensional.")
        return None
    if x.size == 0:
        # Nothing to normalize; np.max would raise on an empty reduction.
        return x
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps np.exp from overflowing to inf (and the ratio from becoming
    # nan) when the scores are large.
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=-1, keepdims=True)
def computeScore(prediction, labels_test, k=5):
    """Count top-1 and top-k classification errors for one batch.

    Args:
        prediction: 2-D ndarray of per-class scores, one row per example.
        labels_test: sequence of integer class labels; the first
            ``prediction.shape[0]`` entries are used.
        k: size of the "top-k" set (default 5).

    Returns:
        ``(top1_errors, topk_errors)`` as Python ints: the number of rows
        whose label is not the highest-scoring class, and the number of rows
        whose label is not among the ``k`` highest-scoring classes.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    scores = np.asarray(prediction)
    batchsize = scores.shape[0]
    if batchsize == 0:
        return 0, 0
    targets = np.asarray(labels_test).reshape(-1)[:batchsize]

    # argsort is ascending, so the best class is the last column and the
    # k best classes are the last k columns.
    order = np.argsort(scores, axis=1)
    top1_count = int(np.sum(order[:, -1] != targets))
    # The original tested membership in the k *lowest* scores, which only
    # equals "not in the top k" when there are exactly 2*k classes.
    in_top_k = np.any(order[:, -k:] == targets[:, None], axis=1)
    topk_count = int(np.sum(~in_top_k))
    return top1_count, topk_count
def _save_checkpoint(sess, saver, train_dir, step):
    """Save the variables for `step` and, if not yet present, the metagraph."""
    print('Saving variables')
    start_time = time.time()
    checkpoint_path = os.path.join(train_dir, 'model-%s.ckpt' % str(step))
    saver.save(sess, checkpoint_path, global_step=step, write_meta_graph=False)
    save_time_variables = time.time() - start_time
    print('Variables saved in %.2f seconds' % save_time_variables)

    # The graph definition does not change between steps, so it is exported
    # only once.
    metagraph_filename = os.path.join(train_dir, 'model.meta')
    if not os.path.exists(metagraph_filename):
        print('Saving metagraph')
        # Restart the timer so the reported time covers only the export.
        start_time = time.time()
        saver.export_meta_graph(metagraph_filename)
        save_time_metagraph = time.time() - start_time
        print('Metagraph saved in %.2f seconds' % save_time_metagraph)


def main(args):
    """Build the MSDNet training graph and run the training loop.

    Args:
        args: namespace produced by `parse_arguments`. This function reads
            `grFactor` and `bnFactor` (dash-separated integer strings, which
            are replaced in place by lists of ints), `batch_size`,
            `max_steps`, `num_gpus`, `log_device_placement`, `logs` and
            `train_dir`; the whole namespace is also handed to the `msdnet`
            helpers.

    Side effects:
        Prints the loss every 10 steps, writes the graph to `args.logs` and
        writes checkpoints (plus a one-time metagraph) to `args.train_dir`.
    """
    args.grFactor = [int(factor) for factor in args.grFactor.split('-')]
    args.bnFactor = [int(factor) for factor in args.bnFactor.split('-')]

    # NOTE(review): 703 was hard-coded in the original; presumably the number
    # of steps per epoch for the default batch size - confirm.
    checkpoint_every = 703
    log_every = 10

    with tf.Graph().as_default():
        # NOTE(review): the original indentation was lost; the variable scope
        # is assumed to cover the model and optimizer construction only.
        with tf.variable_scope("model"):
            global_step = tf.Variable(0, trainable=False)
            batch_size_placeholder = tf.placeholder(tf.int32, name='batch_size')
            phase_train_placeholder = tf.placeholder(tf.bool, name='phase_train')
            images, labels = msdnet.distorted_inputs(args, batch_size_placeholder)
            images = tf.identity(images, 'input')

            # One set of logits per classifier (the original comment gives
            # the shape as [10, 32, 10]).
            logits, end_points = msdnet.inference(images, args, phase_train_placeholder)

            # `loss` is a list of per-classifier losses; it is concatenated
            # with the regularization terms and summed into the objective.
            loss = msdnet.multi_loss(logits, labels)
            regularization_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            total_loss = tf.add_n(loss + regularization_losses, name='total_loss')
            train_op = msdnet.train(total_loss, global_step, args)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=50)
        summary_op = tf.summary.merge_all()
        init = tf.global_variables_initializer()

        sess = tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=args.log_device_placement))
        sess.run(init)
        tf.train.start_queue_runners(sess=sess)
        # Creating the writer records the graph under args.logs.
        summary_writer = tf.summary.FileWriter(args.logs, sess.graph)

        with sess.as_default():
            # `range` works on both Python 2 and 3 (`xrange` is Python 2 only).
            for step in range(args.max_steps):
                start_time = time.time()
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={batch_size_placeholder: args.batch_size,
                               phase_train_placeholder: True})
                # Report the mean of the per-classifier losses.
                loss_value = np.array(loss_value).sum() / float(len(loss_value))
                duration = time.time() - start_time

                if step % log_every == 0:
                    num_examples_per_step = args.batch_size * args.num_gpus
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration / args.num_gpus
                    format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                  'sec/batch)')
                    print(format_str % (datetime.now(), step, loss_value,
                                        examples_per_sec, sec_per_batch))

                # Save the model checkpoint periodically and after the final
                # step, but never at step 0.
                is_last_step = (step + 1) == args.max_steps
                if (step % checkpoint_every == 0 or is_last_step) and step != 0:
                    _save_checkpoint(sess, saver, args.train_dir, step)
def parse_arguments(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: list of argument strings (without the program name). When
            ``None``, argparse falls back to ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` holding one attribute per option. Note that
        ``grFactor`` and ``bnFactor`` are returned as dash-separated strings
        (e.g. ``'1-2-4-4'``).
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    ### ------------ General options --------------------
    add('--data', type=str, default='', help='Path to dataset.')
    add('--dataset', type=str, default='imagenet',
        help='Options: imagenet | cifar10 | cifar100.')
    add('--manualSeed', type=int, default=0, help='Manually set RNG seed.')
    add('--gen', type=str, default='gen', help='path to save generated files.')
    add('--precision', type=str, default='single',
        help='Options: single | double | half.')
    add('--log_device_placement', action='store_true',
        help='whether to log device placement.')
    add('--train_dir', type=str, default='/home/ckp/Msd_tf/checkpoints',
        help='directory to write checkpoints')
    add('--logs', type=str, default='/home/ckp/Msd_tf/logs',
        help='directory to write summary')
    add('--max_steps', type=int, default=220000,
        help='number of training steps to run.')
    add('--num_gpus', type=int, default=1,
        help='number of GPUs (used when reporting throughput).')
    add('--batch_size', type=int, default=64,
        help='number of examples per training batch.')
    add('--nScales', type=int, default=3, help='number of scales.')

    ###------------- Data options ------------------------
    add('--DataAug', action='store_true', help='use data augmentation or not.')

    ###------------- Training options --------------------
    add('--testOnly', action='store_true', help='Run on validation set only.')
    add('--tenCrop', action='store_true', help='Ten-crop testing.')
    add('--reduction', type=float, default=0.5,
        help='dimension reduction ratio at transition layers')

    ###---------- Optimization options ----------------------
    add('--LR', type=float, default=0.1, help='initial learning rate.')
    add('--momentum', type=float, default=0.9, help='momentum.')
    add('--weight_decay', type=float, default=1e-4, help='weight decay.')

    ###---------- MSDNet model options ----------------------------------
    add('--base', type=int, default=4,
        help='the layer to attach the first classifier')
    add('--nBlocks', type=int, default=10, help='number of blocks/classifiers')
    add('--stepmode', type=str, default='even',
        help='pattern of span between two adjacent classifiers |even|lin_grow|')
    add('--step', type=int, default=2,
        help='span between two adjacent classifiers.')
    add('--bottleneck', action='store_true', help='use 1x1 conv layer or not')
    add('--growthRate', type=int, default=6,
        help='number of output channels for each layer (the first scale).')
    add('--grFactor', type=str, default='1-2-4-4',
        help='growth rate factor of each scale')
    add('--prune', type=str, default='max',
        help='specify how to prune the network, min | max')
    add('--joinType', type=str, default='concat',
        help='add or concat for features from different paths')
    add('--bnFactor', type=str, default='1-2-4-4',
        help='bottleneck factor of each scale, 4-4-4-4 | 1-2-4-4')
    add('--initChannels', type=int, default=32,
        help='number of features produced by the initial conv layer')
    add('--model_def', type=str, default='models.msdnet',
        help='Model definition. Points to a module containing the definition of the inference graph.')
    add('--data_dir', type=str, default='/home/ckp/Msd_tf/dataset/cifar10_data',
        help='Path to the CIFAR-10 data directory.')
    add('--eval_dir', type=str, default='/home/ckp/Msd_tf/dataset/cifar10_data')
    return parser.parse_args(argv)
if __name__ == '__main__':
    # Script entry point: parse the options given after the program name and
    # hand the resulting namespace to the training routine.
    cli_args = parse_arguments(sys.argv[1:])
    main(cli_args)