# Q-Network Learning

In [1]:
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

### Make the environment according to simulation sepcified by Eric Neural Net Paper


In [2]:
class FOMS:
    def get_reward(self,state,action):
        normal_coeff=[(1-action)*(2*(state[0]+state[1])-(state[2]+state[3]))+action*(2*(state[2]+state[3])-(state[0]+state[1])),0.01]
        return np.random.normal(loc=normal_coeff[0],scale=normal_coeff[1],size=1)
        
    def get_next_state(self,current_state,action):
        if len(current_state)==0:
            return np.random.normal(loc=0,scale=0.25,size=64)
        else:
            new_state=np.zeros(64)
            for i in range(16):
                s_t=current_state[i]
                #mean and sd for normal distribution of next state vectors 
                # 4i-3 and 4i-2
                norm_coeff_p1=[(1-action)*s_t,0.01*(1-action)+0.25*action]
                #coeffs for 4i-1 and 4i
                norm_coeff_p2=[(1-action)*s_t,0.01*(1-action)+0.25*action]
                
                #Populate the state variables by sampling from the normal 
                #distribution params specified above
                new_state[4*(i+1)-1]=np.random.normal(loc=norm_coeff_p2[0],scale=norm_coeff_p2[1],size=1)
                new_state[4*(i+1)-2]=np.random.normal(loc=norm_coeff_p2[0],scale=norm_coeff_p2[1],size=1)
                new_state[4*(i+1)-4]=np.random.normal(loc=norm_coeff_p1[0],scale=norm_coeff_p1[1],size=1)
                new_state[4*(i+1)-3]=np.random.normal(loc=norm_coeff_p1[0],scale=norm_coeff_p1[1],size=1)
            return new_state
    #Don't need it but putting it just in case
    def get_action(self):
        return np.random.binomial(1,0.5,1)[0]
                

### Testing our environment made

In [3]:
#Create environment object
sim=FOMS()
state=sim.get_next_state([],-1)
action=sim.get_action()

print action

new_state=sim.get_next_state(state,action)

print new_state
#It seems to work!!

0
[ 0.36740096  0.35198255  0.36365887  0.3491325  -0.22659814 -0.22742933
 -0.2229184  -0.1999111   0.27895242  0.28630602  0.27195331  0.28656794
 -0.20551268 -0.22295816 -0.19493619 -0.21282867 -0.01596389  0.00379725
  0.00346627 -0.00227742  0.31883244  0.30924713  0.30730164  0.29933591
 -0.04187928 -0.03635926 -0.04660693 -0.0343696  -0.17134724 -0.19154977
 -0.18476586 -0.18628574 -0.0088024  -0.00718638 -0.01912129 -0.01154355
  0.19411522  0.204038    0.20577921  0.19364711  0.21902144  0.22523132
  0.21579502  0.21899785 -0.06987925 -0.09463085 -0.08655176 -0.07885639
  0.38430935  0.41143997  0.41176515  0.40118561  0.18818825  0.20588541
  0.1895193   0.19164272  0.37608614  0.36771978  0.36678938  0.37832324
 -0.07764933 -0.0994036  -0.09821317 -0.11214831]


## The Q-Network (Highly Simplified for now, will implement experience reply and other refinements)

In [4]:
tf.reset_default_graph()

### Network Architechture


In [5]:
#These lines establish the feed-forward part of the network used to choose actions
inputs1 = tf.placeholder(shape=[1,64],dtype=tf.float32,name="input1")

#Layer 1
W = tf.Variable(tf.random_uniform([64,32],0,0.01))
Qout1 = tf.matmul(inputs1,W)
#Layer 2
W1 = tf.Variable(tf.random_uniform([32,2],0,0.01))
Qout = tf.matmul(Qout1,W1)

predict = tf.argmax(Qout,1,name="op_to_restore")


### Loss Function and Optimizer Specification

In [6]:
#Below we obtain the loss by taking the sum of squares difference between the target and prediction Q values.
nextQ = tf.placeholder(shape=[1,2],dtype=tf.float32)
loss = tf.reduce_sum(tf.square(nextQ - Qout))
trainer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = trainer.minimize(loss)

init = tf.initialize_all_variables()

Instructions for updating:
Use `tf.global_variables_initializer` instead.


### Training the network

In [7]:
init = tf.initialize_all_variables()

# Set learning parameters
y = .99
e = 0.1
num_episodes = 2000
#create lists to contain total rewards and steps per episode
jList = []
rList = []

saver=tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    for i in range(num_episodes):
        #Save the model for every 100th iteration
        if i%100==0:
            save_path=saver.save(sess,"/scratch/nsani/DeepQLearning/model"+str(i)+".ckpt")
            print "Model saved at: "+save_path
        #Reset environment and get first new observation
        state = FOMS()
        s1=state.get_next_state([],1)
        rAll = 0
        d = False
        j = 0
        #The Q-Network
        while j < 91:
            j+=1
            #Choose an action by greedily (with e chance of random action) from the Q-network
            a,allQ = sess.run([predict,Qout],feed_dict={inputs1:np.reshape(s1,(1,64))})
            if np.random.rand(1) < e:
                a[0] = state.get_action()
                
            #Get new state and reward from environment
            
            #s1,r,d,_ = env.step(a[0])
            new_state=state.get_next_state(s1,a[0])
            r=state.get_reward(s1,a[0])
            
            #Obtain the Q' values by feeding the new state through our network
            Q1 = sess.run(Qout,feed_dict={inputs1:np.reshape(new_state,(1,64))})
            #Obtain maxQ' and set our target value for chosen action.
            maxQ1 = np.max(Q1)
            targetQ = allQ
            targetQ[0,a[0]] = r + y*maxQ1
            #Train our network using target and predicted Q values
            _,W1 = sess.run([updateModel,W],feed_dict={inputs1:np.reshape(new_state,(1,64)),nextQ:targetQ})
            rAll += r
            s = s1
#             if d == True:
#                 #Reduce chance of random action as we train the model.
#                 e = 1./((i/50) + 10)
#                 break
        jList.append(j)
        rList.append(rAll)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
Model saved at: /scratch/nsani/DeepQLearning/model0.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model100.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model200.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model300.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model400.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model500.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model600.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model700.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model800.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model900.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model1000.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model1100.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model1200.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model1300.ckpt
Model saved at: /scratch/nsani/DeepQLearning/model1400.ckpt
Model saved at: /scratch/nsani/DeepQLe

### Model trained, now calculate marginal mean outcome from the learned Q-function (According to MDP paper specifications: 500 iterations for 90 actions per ietration

In [8]:
with tf.Session() as sess:
  sess.run(init)
  new_saver = tf.train.import_meta_graph('/scratch/nsani/DeepQLearning/model1900.ckpt.meta')
  new_saver.restore(sess, tf.train.latest_checkpoint('/scratch/nsani/DeepQLearning'))
  graph=tf.get_default_graph()
  op_to_restore=graph.get_tensor_by_name("op_to_restore:0")
  rList=[]
  for i in range(500):
    sim=FOMS()
    s1=sim.get_next_state([],1)
    rList.append(0)
    for j in range(90):
        action=sess.run([op_to_restore],feed_dict={inputs1:np.reshape(s1,(1,64))})
        rList[i]+=sim.get_reward(s1,action[0][0])
        s1=sim.get_next_state(s1,action[0][0])
  print np.mean(rList)

1.92715090476
