In [1]:
# Compared to the simple multi-armed bandits, contextual Bandits introduce the concept of the state.
# In this problem, instead of a single bandit, there can now be multiple bandits. 
# The state of the environment tells us which bandit we are dealing with, and the goal of the agent is to learn the best action not just for a single bandit, but for any number of them.
# We will be building a single-layer neural network in Tensorflow that takes a state and produces an action.
# By using a policy-gradient update method, we can have the network learn to take actions that maximize its reward.

In [2]:
import tensorflow as tf
import numpy as np

## The Contextual Bandits

In [3]:
# Here we define our contextual bandits. 
# In this example, we are using three four-armed bandit. 
class contextual_bandit():
    def __init__(self):
        self.state = 0
        #List out our bandits. Currently arms 4, 2, and 1 (respectively) are the most optimal.
        self.bandits = np.array([[0.2,0,-0.0,-5],[0.1,-5,1,0.25],[-5,5,5,5]])
        self.num_bandits = self.bandits.shape[0]
        self.num_actions = self.bandits.shape[1]
    
    def getBandit(self):
        self.state = np.random.randint(0,len(self.bandits))
        return self.state
    
    def pullArm(self, action):
        bandit = self.bandits[self.state,action]
        result = np.random.randn(1)
        if result > bandit:
            #return a positive reward.
            return 1
        else:
            #return a negative reward.
            return -1

## The Policy-Based Agent

In [6]:
class agent():
    def __init__(self, lr, s_size, a_size):
        # Establish the feed-forward network. The agent takes a state and produces an action
        self.state_in= tf.placeholder(shape=[1],dtype=tf.int32)
        state_in_OH= tf.one_hot(self.state_in, s_size)
        output = tf.contrib.layers.fully_connected(state_in_OH, a_size, 
                                                   activation_fn=tf.nn.sigmoid,
                                                   weights_initializer = tf.ones_initializer())
        self.output = tf.reshape(output, [-1])
        self.chosen_action = tf.argmax(self.output, 0)
        # Establish the training procedure
        self.reward_holder = tf.placeholder(shape=[1],dtype=tf.float32)
        self.action_holder = tf.placeholder(shape=[1],dtype=tf.int32)
        self.responsible_weight = tf.slice(self.output,self.action_holder,[1])
        self.loss = -(tf.log(self.responsible_weight)*self.reward_holder)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
        self.update = optimizer.minimize(self.loss)

## Training the Agent

In [7]:
# We will train our agent by getting a state from the environment, take an action, and recieve a reward. 
tf.reset_default_graph()

cBandit = contextual_bandit()
myAgent = agent(lr=0.001,s_size=cBandit.num_bandits,a_size=cBandit.num_actions)
# The weights we will evaluate to look into the network
weights = tf.trainable_variables()[0]

total_episodes = 10000 
total_reward = np.zeros([cBandit.num_bandits,cBandit.num_actions])
e = 0.1 #Set the chance of taking a random action.

init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)
    for i in range(total_episodes):
        s = cBandit.getBandit()
        
        if np.random.rand(1) < e:
            action = np.random.randint(cBandit.num_actions)
        else:
            action = sess.run(myAgent.chosen_action,feed_dict={myAgent.state_in:[s]})
            
        reward = cBandit.pullArm(action) #Get our reward for taking an action given a bandit.
        
        #Update the network.
        feed_dict={myAgent.reward_holder:[reward],myAgent.action_holder:[action],myAgent.state_in:[s]}
        _,ww = sess.run([myAgent.update,weights], feed_dict=feed_dict)
        
        #Update our running tally of scores.
        total_reward[s,action] += reward
        if i % 500 == 0:
            print("Mean reward for each of the " + str(cBandit.num_bandits) + " bandits: " + str(np.mean(total_reward,axis=1)))

for a in range(cBandit.num_bandits):
    print("The agent thinks action " + str(np.argmax(ww[a])+1) + " for bandit " + str(a+1) + " is the most promising....")
    if np.argmax(ww[a]) == np.argmin(cBandit.bandits[a]):
        print("...and it was right!")
    else:
        print("...and it was wrong!")

Mean reward for each of the 3 bandits: [ 0.    0.    0.25]
Mean reward for each of the 3 bandits: [ -2.    -3.25  35.  ]
Mean reward for each of the 3 bandits: [ -2.    -2.5   70.75]
Mean reward for each of the 3 bandits: [  -7.75  -13.    109.  ]
Mean reward for each of the 3 bandits: [ -15.    -12.25  146.  ]
Mean reward for each of the 3 bandits: [ -22.75  -17.25  181.75]
Mean reward for each of the 3 bandits: [ -26.25  -23.5   220.5 ]
Mean reward for each of the 3 bandits: [ -32.    -26.5   254.75]
Mean reward for each of the 3 bandits: [ -40.25  -32.    295.  ]
Mean reward for each of the 3 bandits: [ -46.    -40.5   325.75]
Mean reward for each of the 3 bandits: [ -52.    -44.5   361.75]
Mean reward for each of the 3 bandits: [ -53.5   -49.    393.25]
Mean reward for each of the 3 bandits: [ -58.75  -53.    429.5 ]
Mean reward for each of the 3 bandits: [ -68.    -57.75  467.5 ]
Mean reward for each of the 3 bandits: [ -74.5   -56.75  500.5 ]
Mean reward for each of the 3 bandits