In [1]:
import tensorflow as tf
import tensorflow.contrib.layers as layers
import numpy as np
import math
import pickle
import timeit
import matplotlib.pyplot as plt
%matplotlib inline
import itertools

In [21]:
from cs231n.data_utils import load_CIFAR10

def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
    """
    Load CIFAR-10 from disk and return mean-centred train / validation / test
    subsets.

    The first `num_training` training examples become the training split, the
    following `num_validation` examples become the validation split, and the
    first `num_test` test examples become the test split. The mean over the
    training split (axis 0) is subtracted from all three data arrays.

    Returns X_train, y_train, X_val, y_val, X_test, y_test.
    """
    data_root = 'cs231n/datasets/cifar-10-batches-py'
    X_all, y_all, X_test_all, y_test_all = load_CIFAR10(data_root)

    # Index sequences selecting each subset.
    val_idx = range(num_training, num_training + num_validation)
    train_idx = range(num_training)
    test_idx = range(num_test)

    X_val, y_val = X_all[val_idx], y_all[val_idx]
    X_train, y_train = X_all[train_idx], y_all[train_idx]
    X_test, y_test = X_test_all[test_idx], y_test_all[test_idx]

    # Centre every split on the mean of the training split.
    mean_image = np.mean(X_train, axis=0)
    for split in (X_train, X_val, X_test):
        split -= mean_image

    return X_train, y_train, X_val, y_val, X_test, y_test


# Load the splits once and print the shape of every returned array.
X_train, y_train, X_val, y_val, X_test, y_test = get_CIFAR10_data()
for _label, _array in (
        ('Train data shape: ', X_train),
        ('Train labels shape: ', y_train),
        ('Validation data shape: ', X_val),
        ('Validation labels shape: ', y_val),
        ('Test data shape: ', X_test),
        ('Test labels shape: ', y_test)):
    print(_label, _array.shape)

Train data shape:  (49000, 32, 32, 3)
Train labels shape:  (49000,)
Validation data shape:  (1000, 32, 32, 3)
Validation labels shape:  (1000,)
Test data shape:  (1000, 32, 32, 3)
Test labels shape:  (1000,)


In [11]:
# Action space 
def get_action_space():
    """Return the list of layer descriptions (actions) the sampler can choose from.

    Index layout of the returned list:
      0      -> softmax layer ('SM') with 10 outputs
      1..12  -> stride-1 convolutions ('C'): filter sizes 1, 3, 5 crossed with
                64, 128, 256, 512 filters (filter size varies slowest)
      13..15 -> pooling layers ('P') with (filter_size, stride) equal to
                (5, 3), (3, 2) and (2, 2)
    """
    softmax_actions = [{'type': 'SM', 'num_output': 10}]

    conv_actions = [
        {'type': 'C', 'filter_size': size, 'stride': 1, 'num_filter': count}
        for size in [1, 3, 5]
        for count in [64, 128, 256, 512]
    ]

    pool_actions = [
        {'type': 'P', 'filter_size': size, 'stride': step}
        for size, step in [(5, 3), (3, 2), (2, 2)]
    ]

    return softmax_actions + conv_actions + pool_actions

# Search-space configuration used by the sampling and Q-table cells.
action_space = get_action_space()
NUM_ACTION = len(action_space)
MAX_LAYER = 4
NUM_MODEL = 100

for _label, _value in (('max_layer: ', MAX_LAYER),
                       ('num_action: ', NUM_ACTION),
                       ('total_model: ', NUM_MODEL)):
    print(_label, _value)

max_layer:  4
num_action:  16
total_model:  100


In [12]:
def get_session():
    """Create a tf.Session configured with `gpu_options.allow_growth = True`."""
    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    return tf.Session(config=session_config)

def run_model(session, predict, loss_val, Xd, yd,
              epochs=1, batch_size=128, print_every=100,
              training=None, plot_losses=False, verbose=False):
    """Run `epochs` passes over (Xd, yd), either training or only evaluating.

    Relies on the notebook-level placeholders `X`, `y` and `is_training`
    having been created in the current default graph before the call.

    Args:
        session: tf.Session used to execute the graph.
        predict: logits tensor; its arg-max over axis 1 is compared with `y`.
        loss_val: scalar loss tensor that is fetched and reported per batch.
        Xd: input array, indexed along axis 0.
        yd: label array, indexed along axis 0.
        epochs: number of passes over the data.
        batch_size: number of examples per minibatch.
        print_every: when training with `verbose`, print a progress line every
            this many iterations.
        training: optional train op. When given it is run on every batch and
            `is_training` is fed as True; otherwise the accuracy op is fetched
            and `is_training` is fed as False.
        plot_losses: plot the batch-size-weighted minibatch losses after each
            epoch.
        verbose: print per-iteration (training only) and per-epoch summaries.

    Returns:
        (total_loss, total_correct) of the last epoch: the example-weighted
        mean loss and the fraction of correctly classified examples.
    """
    num_examples = Xd.shape[0]

    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, 1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # shuffle indices
    train_indicies = np.arange(num_examples)
    np.random.shuffle(train_indicies)

    training_now = training is not None

    # Fetch the loss tensor the caller passed in. Previously the global
    # `mean_loss` was fetched here and the `loss_val` argument was ignored.
    variables = [loss_val, correct_prediction, accuracy]
    if training_now:
        # when training, run the train op instead of the accuracy op
        variables[-1] = training

    # counter
    iter_cnt = 0

    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        for i in range(int(math.ceil(num_examples / batch_size))):
            # Batch boundaries are derived from the data actually passed in
            # (Xd), not from the global training set.
            start_idx = (i * batch_size) % num_examples
            idx = train_indicies[start_idx:start_idx + batch_size]

            # create a feed dictionary for this batch
            feed_dict = {X: Xd[idx, :],
                         y: yd[idx],
                         is_training: training_now}
            # The last batch of an epoch may be short; weight it by its real size.
            actual_batch_size = idx.shape[0]

            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            loss, corr, _ = session.run(variables, feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(loss * actual_batch_size)
            correct += np.sum(corr)

            # print every now and then
            if training_now and (iter_cnt % print_every) == 0 and verbose:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"\
                      .format(iter_cnt, loss, np.sum(corr) / actual_batch_size))
            iter_cnt += 1
        total_correct = correct / num_examples
        total_loss = np.sum(losses) / num_examples

        if verbose:
            print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}"\
                  .format(total_loss, total_correct, e + 1))
        if plot_losses:
            plt.plot(losses)
            plt.grid(True)
            plt.title('Epoch {} Loss'.format(e + 1))
            plt.xlabel('minibatch number')
            plt.ylabel('minibatch loss')
            plt.show()
    return total_loss, total_correct

In [33]:
def action_to_hex(action):
    """Return `hex(action)` with its first two characters (the '0x' prefix) removed."""
    prefixed = hex(action)
    return prefixed[len('0x'):]
        
def hex_to_action(hex_str):
    """Decode one state token into an integer index.

    The start marker '$' maps to 16; any other token is parsed as hexadecimal.
    """
    # Compare by value: `is` tests object identity and only happens to work
    # for strings the interpreter has interned.
    if hex_str == '$':
        return 16
    return int('0x' + hex_str, 16)

def state_to_action_sequence(state):
    """Translate a state string such as '$495d0' into a list of layer descriptions.

    Every character other than '$' is decoded with `hex_to_action` and used as
    an index into the module-level `action_space`.
    """
    # `!=` compares by value; the previous `is not` compared object identity.
    return [action_space[hex_to_action(c)] for c in state if c != '$']

def is_pool(i):
    """Return True when `i` lies in the inclusive range 13..15, False otherwise."""
    return bool(13 <= i <= 15)
    
def sample_network(epsilon=1.0, Q=None):
    """Sample one architecture with an epsilon-greedy policy over the Q table.

    At most MAX_LAYER layers are chosen; the last possible layer is always
    action 0, and sampling stops as soon as action 0 is picked. Directly after
    a pooling action only the non-pooling actions (0..12) are eligible.

    Args:
        epsilon: probability of taking a uniformly random action instead of
            the greedy one at each step.
        Q: array indexed as Q[previous_action, action], with row 16 used for
            the start state. It is only read on greedy steps, so it may stay
            None while epsilon is 1.0.

    Returns:
        (state, action_sequence): the '$'-prefixed string of hex-encoded action
        indices and the matching list of entries from `action_space`.
    """
    # Indices 0..12 are the non-pooling actions, so the exclusive bound is 13.
    # The greedy branch previously sliced `:12`, which silently excluded
    # action 12 and disagreed with the random branch (randint(0, 13)).
    num_non_pool = 13

    state = '$'
    action_sequence = []
    prev_action = 16

    for layer in range(MAX_LAYER):
        allowed = num_non_pool if is_pool(prev_action) else NUM_ACTION

        # Last layer softmax
        if layer == MAX_LAYER - 1:
            i = 0
        # e-greedy
        elif np.random.rand() < epsilon:
            i = np.random.randint(0, allowed)
        else:
            i = np.argmax(Q[prev_action, :allowed])

        state += action_to_hex(i)
        action_sequence.append(action_space[i])
        prev_action = i

        if i == 0:
            break

    return state, action_sequence
    
# Draw one architecture with the default arguments, then decode a fixed state string.
sampled_network = sample_network()
print(sampled_network)
state_to_action_sequence('$495d0')

('$6530', [{'type': 'C', 'stride': 1, 'num_filter': 128, 'filter_size': 3}, {'type': 'C', 'stride': 1, 'num_filter': 64, 'filter_size': 3}, {'type': 'C', 'stride': 1, 'num_filter': 256, 'filter_size': 1}, {'type': 'SM', 'num_output': 10}])


[{'filter_size': 1, 'num_filter': 512, 'stride': 1, 'type': 'C'},
 {'filter_size': 5, 'num_filter': 64, 'stride': 1, 'type': 'C'},
 {'filter_size': 3, 'num_filter': 64, 'stride': 1, 'type': 'C'},
 {'filter_size': 5, 'stride': 3, 'type': 'P'},
 {'num_output': 10, 'type': 'SM'}]

In [23]:
def build_model(action_sequence, X, y):
    """Stack the layers described by `action_sequence` on `X` and return logits.

    Convolution ('C') and max-pooling ('P') entries are applied in order and
    the first softmax ('SM') entry ends the stack. The result is flattened and
    passed through a fully connected layer without activation.

    Args:
        action_sequence: list of layer dicts shaped like the entries returned
            by `get_action_space`.
        X: input tensor.
        y: unused; kept so existing calls keep working.

    Returns:
        The output tensor of the final fully connected layer.

    Raises:
        ValueError: if a layer has a 'type' other than 'SM', 'C' or 'P'.
    """
    num_classes = 10  # used when the sequence carries no softmax entry
    inputs = X
    for layer in action_sequence:
        # Compare layer types by value; `is` compared object identity and only
        # worked while both strings happened to be the same interned object.
        layer_type = layer['type']
        if layer_type == 'SM':
            num_classes = layer.get('num_output', num_classes)
            break

        if layer_type == 'C':
            inputs = layers.conv2d(inputs=inputs,
                                   num_outputs=layer['num_filter'],
                                   kernel_size=layer['filter_size'],
                                   stride=layer['stride'],
                                   padding='same',
                                   activation_fn=tf.nn.relu)
        elif layer_type == 'P':
            inputs = layers.max_pool2d(inputs=inputs,
                                       kernel_size=layer['filter_size'],
                                       stride=layer['stride'],
                                       padding='same')
        else:
            # Previously an unknown type reused a stale (or unbound) `outputs`.
            raise ValueError('Unknown layer type: {!r}'.format(layer_type))

    flat = layers.flatten(inputs)
    outputs = layers.fully_connected(inputs=flat,
                                     num_outputs=num_classes,
                                     activation_fn=None)
    return outputs

In [24]:
def update_Q(Q, state, reward, learning_rate, discount):
    """Propagate a final reward backwards through the transitions of `state`.

    `state` is a '$'-prefixed string of hex-encoded action indices. Each pair
    of neighbouring characters (s, a) is one transition. The last transition
    is moved toward `reward`; every earlier one is moved toward
    `discount * max(Q[a, ...])`, where the maximum is restricted to the
    non-pooling actions (0..12) when `a` is itself a pooling action.

    Args:
        Q: 2-D array indexed as Q[s, a]; it is modified in place.
        state: state string, e.g. '$a215b6af10'.
        reward: value the final transition is moved toward.
        learning_rate: interpolation weight between the old value and the target.
        discount: factor applied to the best next-state value.

    Returns:
        The same `Q` array that was passed in.
    """
    # Indices 0..12 are the non-pooling actions, so the exclusive bound is 13.
    # The previous `:12` slice left action 12 out of the maximum.
    num_non_pool = 13

    S = state[:-1]
    A = state[1:]

    # Final transition: target is the observed reward.
    s = hex_to_action(S[-1])
    a = hex_to_action(A[-1])
    Q[s, a] = (1 - learning_rate)*Q[s, a] + learning_rate * reward

    # Earlier transitions, walked from the end of the sequence to the start.
    for i in range(len(state) - 3, -1, -1):
        s = hex_to_action(S[i])
        a = hex_to_action(A[i])
        next_s = a
        if is_pool(next_s):
            best_next = np.max(Q[next_s, :num_non_pool])
        else:
            best_next = np.max(Q[next_s, :])
        Q[s, a] = (1 - learning_rate)*Q[s, a] + learning_rate * discount * best_next
    return Q

# Try update_Q on a hand-written state with learning rate 1 and discount 1.
Q = np.full([NUM_ACTION + 1, NUM_ACTION], 0.5)
state, reward = '$a215b6af10', 100
update_Q(Q, state, reward, 1, 1)

array([[   0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5],
       [ 100. ,    0.5,    0.5,    0.5,    0.5,  100. ,    0.5,    0.5,
           0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5],
       [   0.5,  100. ,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5],
       [   0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5],
       [   0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5],
       [   0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,    0.5,  100. ,    0.5,    0.5,    0.5,    0.5],
       [   0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,    0.5,
           0.5,    0.5,  100. ,    0.5,    0.5,    0.5,   

In [25]:
def save_obj(obj, name ):
    """Pickle `obj` to 'obj/<name>.pkl' with the highest pickle protocol.

    The target directory is created first if it does not exist, so the call
    no longer fails on a fresh checkout without an 'obj' folder.
    """
    import os

    path = 'obj/'+ name + '.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Return the object unpickled from 'obj/<name>.pkl'.

    NOTE: unpickling can execute arbitrary code, so only load files from a
    trusted source.
    """
    pickle_path = 'obj/' + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [None]:
# Architecture-search loop: sample a network, train it, evaluate it, and
# store the scores in `replay` keyed by the state string.
replay = {}
# With epsilon = 1.0 every non-final layer in sample_network is drawn at random
# (np.random.rand() is always < 1.0), so Q is never consulted here.
epsilon = 1.0
Q = 0.5*np.ones([NUM_ACTION + 1, NUM_ACTION])

for i_episode in range(2):
    state, action_sequence = sample_network(epsilon, Q)
    
    # Train the model
    # A fresh graph per episode; X, y and is_training are the notebook-level
    # placeholders that run_model reads as globals.
    tf.reset_default_graph()
    X = tf.placeholder(tf.float32, [None, 32, 32, 3])
    y = tf.placeholder(tf.int64, [None])
    is_training = tf.placeholder(tf.bool)

    y_out = build_model(action_sequence, X, y)

    # Mean softmax cross-entropy over the batch, minimised with Adam.
    loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
    mean_loss = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
    train_step = optimizer.minimize(mean_loss)

    with get_session() as sess:
        # NOTE(review): the model graph was built above, outside this device
        # scope; presumably only ops created inside run_model are affected by
        # it - confirm the intended placement.
        with tf.device("/gpu:0") as dev:
            sess.run(tf.global_variables_initializer())
            print('Training ', state)
            # NOTE: this rebinds `loss` from the loss tensor to the float
            # returned by run_model.
            loss, accuracy = run_model(sess,y_out,mean_loss,X_train,y_train,10,64,100,train_step, verbose=True)
            print('Testing')
            # NOTE(review): the reward is measured on the test split here -
            # confirm that the validation split was not intended.
            _, reward = run_model(sess,y_out,mean_loss,X_test,y_test,1, 64, verbose=True)
            
    # Stored as (accuracy on X_test, training accuracy of the last epoch).
    replay[state] = (reward, accuracy)
    # The Q update is disabled, so Q keeps its initial values in this loop.
    #Q = update_Q(Q, state, 100, 0.01, 1)                  

Training  $3250
Iteration 0: with minibatch training loss = 8.02 and accuracy of 0.17
Iteration 100: with minibatch training loss = 2.25 and accuracy of 0.25
Epoch 1, Overall loss = 3.52 and accuracy of 0.15
Iteration 200: with minibatch training loss = 2.13 and accuracy of 0.27
Iteration 300: with minibatch training loss = 2.02 and accuracy of 0.3
Epoch 2, Overall loss = 2.17 and accuracy of 0.231
Iteration 400: with minibatch training loss = 2.14 and accuracy of 0.25
Epoch 3, Overall loss = 2.02 and accuracy of 0.289
Iteration 500: with minibatch training loss = 1.9 and accuracy of 0.36
Iteration 600: with minibatch training loss = 1.73 and accuracy of 0.34
Epoch 4, Overall loss = 1.88 and accuracy of 0.33
Iteration 700: with minibatch training loss = 1.88 and accuracy of 0.36
Epoch 5, Overall loss = 1.79 and accuracy of 0.377
Iteration 800: with minibatch training loss = 1.98 and accuracy of 0.36
Iteration 900: with minibatch training loss = 1.85 and accuracy of 0.36
Epoch 6, Overal

In [None]:
# Display the collected results: state string -> (reward, accuracy).
replay

In [3]:
# Load four saved replay dictionaries and print how many states each one holds.
replay1, replay2, replay3, replay4 = [
    load_obj('replay_' + str(run_id)) for run_id in (326, 246, 460, 518)
]
for _replay in (replay1, replay2, replay3, replay4):
    print(len(_replay))

93
96
95
99


In [9]:
import operator

# Merge the four dictionaries (later ones win on duplicate states), then rank
# the (state, value) pairs by their value tuple, largest first.
z = {}
for _replay in (replay1, replay2, replay3, replay4):
    z.update(_replay)
by_value = operator.itemgetter(1)
sorted_z = sorted(z.items(), key=by_value, reverse=True)
sorted_z

[('$5d690', (0.51400000000000001, 0.6502)),
 ('$6a1d0', (0.51400000000000001, 0.62539999999999996)),
 ('$495d0', (0.502, 0.69769999999999999)),
 ('$52db0', (0.49399999999999999, 0.65939999999999999)),
 ('$6ec40', (0.49199999999999999, 0.65449999999999997)),
 ('$151d0', (0.48999999999999999, 0.74109999999999998)),
 ('$816d0', (0.48999999999999999, 0.70220000000000005)),
 ('$cdc40', (0.48999999999999999, 0.55679999999999996)),
 ('$6d6a0', (0.48599999999999999, 0.6865)),
 ('$57f40', (0.47999999999999998, 0.90049999999999997)),
 ('$7d390', (0.47599999999999998, 0.54359999999999997)),
 ('$71d10', (0.47399999999999998, 0.77569999999999995)),
 ('$66960', (0.47199999999999998, 0.96330000000000005)),
 ('$68ce0', (0.46800000000000003, 0.89270000000000005)),
 ('$6e1f0', (0.46200000000000002, 0.65549999999999997)),
 ('$97a60', (0.45800000000000002, 0.91469999999999996)),
 ('$910', (0.45400000000000001, 1.0)),
 ('$f5db0', (0.45200000000000001, 0.53369999999999995)),
 ('$b2d50', (0.45000000000000001

In [None]:
# Retrain one fixed architecture for 40 epochs (batch size 256) and score it
# on the validation split.
state = '$66960'
action_sequence = state_to_action_sequence(state)
tf.reset_default_graph()

# Notebook-level placeholders that run_model reads as globals.
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

y_out = build_model(action_sequence, X, y)

# Mean softmax cross-entropy over the batch, minimised with Adam.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
mean_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_step = optimizer.minimize(mean_loss)

print('Training ', state)
reward = None
with get_session() as sess:
    # NOTE(review): the model graph was built above, outside this device
    # scope - confirm the intended placement.
    with tf.device("/gpu:0") as dev:
        sess.run(tf.global_variables_initializer())       
        # NOTE: rebinds `loss` from the loss tensor to the returned float.
        loss, accuracy = run_model(sess,y_out,mean_loss,X_train,y_train,40,256,100,train_step,verbose=True)
        # Evaluation only (no train op): reward is the validation accuracy.
        _, reward = run_model(sess,y_out,mean_loss,X_val,y_val,1, 256)
print('Reward: ', reward)

Training  $66960
Iteration 0: with minibatch training loss = 6.6 and accuracy of 0.09

In [None]:
# Unit test 1: build and train a hand-written conv -> pool -> softmax
# architecture to check build_model and run_model end to end.
action_1 = {'type': 'C', 
            'filter_size': 5, 
            'stride': 1, 
            'num_filter': 64}

action_2 = {'type': 'P', 
            'filter_size': 2, 
            'stride': 2}

action_3 = {'type': 'SM',
            'num_output': 10}
action_sequence = [action_1, action_2, action_3]

# Notebook-level placeholders that run_model reads as globals.
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

y_out = build_model(action_sequence, X, y)

# Mean softmax cross-entropy over the batch, minimised with Adam.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
mean_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_step = optimizer.minimize(mean_loss)

with get_session() as sess:
    # NOTE(review): the model graph was built above, outside this device
    # scope - confirm the intended placement.
    with tf.device("/gpu:0") as dev:
        sess.run(tf.global_variables_initializer())
        print('Training')
        # NOTE: rebinds `loss` from the loss tensor to the returned float.
        loss, accuracy = run_model(sess,y_out,mean_loss,X_train,y_train,10,128,100,train_step, verbose=True)
        print('Testing')
        # Evaluation only (no train op) on the test split.
        _, reward = run_model(sess,y_out,mean_loss,X_test,y_test,1, 128, verbose=True)

In [None]:
# Unit test 2: sample an architecture with the default arguments of
# sample_network, train it for 10 epochs and report its test accuracy.
state, action_sequence = sample_network()
print(state)
print(action_sequence)

tf.reset_default_graph()

# Notebook-level placeholders that run_model reads as globals.
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

y_out = build_model(action_sequence, X, y)

# Mean softmax cross-entropy over the batch, minimised with Adam.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
mean_loss = tf.reduce_mean(loss)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)
train_step = optimizer.minimize(mean_loss)

print('Training ', state)
reward = None
with get_session() as sess:
    # NOTE(review): the model graph was built above, outside this device
    # scope - confirm the intended placement.
    with tf.device("/gpu:0") as dev:
        sess.run(tf.global_variables_initializer())       
        # NOTE: rebinds `loss` from the loss tensor to the returned float.
        loss, accuracy = run_model(sess,y_out,mean_loss,X_train,y_train,10,128,100,train_step,verbose=True)
        # Evaluation only (no train op): reward is the accuracy on X_test.
        _, reward = run_model(sess,y_out,mean_loss,X_test,y_test,1, 128)
print('Reward: ', reward)

In [None]:
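# Actions
# weight decay:  increase (*2), decrease (*0.5), or keep the same
# learning rate: increase (*2), decrease (*0.5), or keep the same
# depth:         increase (+1), decrease (-1), or keep the same
# dropout:       increase (+0.2), decrease (-0.2), or keep the same
# network width: increase (*2) or decrease (/2)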

#State 




In [None]:
def get_session():
    """Return a new tf.Session built from a ConfigProto with GPU `allow_growth` enabled.

    NOTE: a function with the same name and body is defined in an earlier
    cell; this definition replaces it when the cell runs.
    """
    proto = tf.ConfigProto()
    proto.gpu_options.allow_growth = True
    new_session = tf.Session(config=proto)
    return new_session

def get_model(X, y, is_training, feature):
    """Build `depth` conv -> max-pool -> batch-norm stages and a 10-unit linear head.

    Args:
        X: input tensor.
        y: unused.
        is_training: forwarded to every batch-norm layer as `is_training`.
        feature: 5-element sequence laid out as (learning rate, weight decay,
            dropout rate, depth, width). Only depth (number of stages) and
            width (filters per convolution) are used; both are cast to int.

    Returns:
        The output tensor of the final fully connected layer (no activation).
    """
    _unused_lr, _unused_decay, _unused_dropout, raw_depth, raw_width = feature
    num_stages, num_filters = int(raw_depth), int(raw_width)

    net = X
    for _ in range(num_stages):
        net = layers.conv2d(inputs=net,
                            num_outputs=num_filters,
                            kernel_size=5,
                            stride=1,
                            padding='same',
                            activation_fn=tf.nn.relu)
        net = layers.max_pool2d(inputs=net,
                                kernel_size=3,
                                stride=2,
                                padding='same')
        net = layers.batch_norm(inputs=net,
                                center=True,
                                scale=True,
                                is_training=is_training,
                                trainable=True)
        # Dropout is not applied; the dropout rate in `feature` is ignored.

    return layers.fully_connected(inputs=layers.flatten(net),
                                  num_outputs=10,
                                  activation_fn=None)

def get_optimizer(loss_val, feature):
    """Return an Adam train op for `loss_val`, using `feature[0]` as the learning rate.

    The minimize op is created under a control dependency on every op in the
    UPDATE_OPS collection.
    """
    adam = tf.train.AdamOptimizer(learning_rate=feature[0])
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        return adam.minimize(loss_val)
    

In [None]:
# Feature
# Q-learning over hyper-parameters: a linear Q-network maps a 13-value state
# (5 hyper-parameters + 4 loss values + 4 accuracy values) to 3 action values.
learning_rate = 1e-3 
weigth_decay = 0.5
dropout_rate = 0.1
depth = 2
width = 32
loss_history = [0.0, 0.0, 0.0, 0.0]
accuracy_history = [0.0, 0.0, 0.0, 0.0]

feature = [learning_rate, weigth_decay, dropout_rate, depth, width]

# Flatten the three lists into one row vector of shape (1, 13).
state = [feature, loss_history, accuracy_history]
state = list(itertools.chain(*state))
state = np.reshape(state, (1,13))

# NOTE(review): this binds the same array object, not a copy. `feature` is
# later taken as a slice of `state` and modified in place, so presumably
# `state_reset` changes along with it - confirm whether a copy was intended.
state_reset = state
# Prints 1: `state` has shape (1, 13), so len() is the number of rows.
print(len(state))

num_episodes = 20
gamma = .99
e = 0.50

# Linear Q-network: Qout = inputs1 @ W, trained on the squared error to nextQ.
tf.reset_default_graph()
inputs1 = tf.placeholder(shape=[1,13],dtype=tf.float32)
W = tf.Variable(tf.random_uniform([13,3],0,0.01))
Qout = tf.matmul(inputs1,W)
predict = tf.argmax(Qout,1)

nextQ = tf.placeholder(shape=[1, 3],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.AdamOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

#action 0 same, 1 down, 2 up
# NOTE(review): the code below does the opposite of the comment above -
# action 1 increments feature[3] (depth) and action 2 decrements it.
with get_session() as joke:
    joke.run(tf.global_variables_initializer())
    for episode in range(num_episodes):
        episode_reward = 0
        state = state_reset
        # Exploration rate restarts at 0.5 each episode and decays by 0.95 per step.
        e = 0.5
        for i_step in range(100):
            
            # Greedy action and all Q-values for the current state.
            a, allQ = joke.run([predict, Qout], feed_dict={inputs1: state})
            if np.random.rand(1) < e:
                a[0] = np.random.randint(3) 
            print(e)
            print(a[0])
            # `feature` here is still the value from the previous step (or the
            # list defined at the top of the cell on the very first step).
            print('before',feature[3])
            feature = state[0][:5]
            if a[0] == 1:
                feature[3] += 1
            if a[0] == 2:
                feature[3] -= 1
            print('after',feature[3])
            # Depth 0 ends the episode with zero reward.
            if feature[3] == 0:
                print('RESETTT')
                reward = 0.0
                print(reward)
                break

            # NOTE(review): the default graph is reset while the `joke` session
            # is still open - confirm TensorFlow permits this here.
            tf.reset_default_graph()
            X = tf.placeholder(tf.float32, [None, 32, 32, 3])
            y = tf.placeholder(tf.int64, [None])
            is_training = tf.placeholder(tf.bool)

            #print(feature)
            y_out = get_model(X, y, is_training, feature)

            # NOTE: rebinds `loss`; `updateModel` was already built from the
            # Q-network loss above.
            loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
            # NOTE(review): the returned regularizer is discarded, so this call
            # presumably has no effect - confirm.
            layers.l2_regularizer(scale=0.5)
            mean_loss = tf.reduce_mean(loss)
            reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            loss_val = mean_loss + tf.reduce_sum(reg_loss)
            train_step = get_optimizer(loss_val, feature)

            reward = None
            new_state = None
            with get_session() as sess:
                with tf.device("/gpu:0") as dev:
                    sess.run(tf.global_variables_initializer())
                    #print('Training')
                    # NOTE(review): the (1, 13) reshape below needs 4-element
                    # loss/accuracy histories, and `reward[0]` needs an
                    # indexable value. run_model as defined earlier in this
                    # notebook returns two scalars - presumably this cell was
                    # written against a different run_model; confirm.
                    loss_history, accuracy_history = run_model(sess,y_out,loss_val,X_train,y_train,4,64,100,train_step, verbose=False)
                    #print(loss_history, accuracy_history)
                    #print('Validation')
                    _, reward = run_model(sess,y_out,loss_val,X_val,y_val,1,64)
                    reward = reward[0]
                    print('reward',reward)
                    #print(feature[0])
                    new_state = [feature, loss_history, accuracy_history]
                    new_state = list(itertools.chain(*new_state))
                    new_state = np.reshape(new_state, (1,13))

            # Q-values of the successor state.
            Q1 = joke.run(Qout,feed_dict={inputs1: new_state})

            # Target: observed reward plus the discounted best next Q-value,
            # written into the slot of the action taken.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = reward + gamma*maxQ1

            # NOTE(review): these three statements repeat the three above.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = reward + gamma*maxQ1

            # NOTE(review): `targetQ` was derived from `allQ`, which was
            # computed for `state`, yet the update feeds `new_state` - confirm
            # which input was intended.
            _,W1 = joke.run([updateModel,W],feed_dict={inputs1: new_state, nextQ:targetQ})

            episode_reward += reward
            state = new_state
            
            e *= 0.95
                


In [None]:
# Train one fixed hyper-parameter configuration (depth 16, width 32) with
# get_model / get_optimizer and report its validation score.
learning_rate = 1e-3 
weigth_decay = 0.0
dropout_rate = 0.1
depth = 16
width = 32
loss_history = [0.0, 0.0, 0.0, 0.0]
accuracy_history = [0.0, 0.0, 0.0, 0.0]

feature = [learning_rate, weigth_decay, dropout_rate, depth, width]

# Flatten the three lists into one row vector of shape (1, 13).
state = [feature, loss_history, accuracy_history]
state = list(itertools.chain(*state))
state = np.reshape(state, (1,13))

# First five entries of the state row: the hyper-parameters.
feature = state[0][:5]
# feature[0] is the learning rate.
print('after',feature[0])

# Notebook-level placeholders that run_model reads as globals.
tf.reset_default_graph()
X = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)

#print(feature)
y_out = get_model(X, y, is_training, feature)

# Mean softmax cross-entropy over the batch; no regularization term is added.
loss = tf.nn.softmax_cross_entropy_with_logits(labels=tf.one_hot(y,10), logits=y_out)
mean_loss = tf.reduce_mean(loss)
# reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
# loss_val = mean_loss + tf.reduce_sum(reg_loss)
train_step = get_optimizer(mean_loss, feature)

reward = None
new_state = None
with get_session() as sess:
    # NOTE(review): the model graph was built above, outside this device
    # scope - confirm the intended placement.
    with tf.device("/gpu:0") as dev:
        sess.run(tf.global_variables_initializer())
        #print('Training')
        loss_history, accuracy_history = run_model(sess,y_out,mean_loss,X_train,y_train,4,64,100,train_step, verbose=True)
        #print(loss_history, accuracy_history)
        #print('Validation')
        _, reward = run_model(sess,y_out,mean_loss,X_val,y_val,1,64)
        # NOTE(review): run_model as defined earlier in this notebook returns
        # a scalar accuracy, so this indexing presumably expects a different
        # run_model version - confirm.
        reward = reward[0]
        print('reward',reward)