# Battle Ultra Bot (aka B.U.B.)

### version 0.0.1


Notes:

B.U.B. version 0.0.1 uses the following versions:
* python:  3.7.4
* tensorflow:  2.0.0 (used through the `tensorflow.compat.v1` compatibility API for now)
* gym:  0.15.4
* numpy:  1.18.0

We recommend checking the output of the next cell and comparing it to the versions listed above.

-----

In [2]:
# import packages and check versions

import tensorflow.compat.v1 as tf
import gym
import random
import platform
from collections import deque
import numpy as np
# Print the interpreter and library versions actually in use, so they can be
# compared with the versions listed in the notes at the top of the notebook.
print("python: ", platform.python_version())
print("tensorflow: ", tf.__version__)
print("gym: ", gym.__version__)
print("numpy: ", np.__version__)

python:  3.7.4
tensorflow:  2.0.0
gym:  0.15.4
numpy:  1.18.0


In [4]:
# build custom pokemon environment
# NOTE(review): no registration of "Pokemon-v0" is visible in this notebook;
# presumably the environment is registered with gym elsewhere -- confirm.
env_name = "Pokemon-v0"

env = gym.make(env_name)

# declare constants
# None of these counts is referenced by the other visible cells.
# NOTE(review): presumably the sizes of the game's categorical vocabularies,
# reserved for a future state encoding -- confirm.
numMoves = 821
numNonVolatileStatuses = 7
numVolatileStatuses = 57
numTypes = 18
numItems = 413
numAbilities = 262
numPokemon = 1198

# Show the observation space with its bounds, and the action space.
print("observation space: ", env.observation_space)
print("upper limit: ", env.observation_space.high)
print("lower limit: ", env.observation_space.low)
print("\nAction space: ", env.action_space)

# The QNetwork cell builds its model with tf.placeholder and runs it through a
# tf.Session, so eager execution is switched off before any graph is built.
tf.disable_eager_execution() # for compatibility issues

observation space:  Box(4,)
upper limit:  [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38]
lower limit:  [-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38]

Action space:  Discrete(2)


In [1]:
# declare and initialize the QNetwork

class QNetwork():
    """Q-value approximator with one hidden ReLU layer (TF1 graph API).

    The graph maps a batch of states to one Q-value per action and is trained
    by minimising the mean squared error between the Q-value of the action
    actually taken and a supplied target value.
    """

    def __init__(self, state_dim, action_size, hidden_units=100, learning_rate=0.001):
        """Build the placeholders, the network, the loss and the train op.

        Args:
            state_dim: Shape of a single state (an iterable of ints); the
                state placeholder gets shape ``[None, *state_dim]``.
            action_size: Number of discrete actions (width of the output
                layer and depth of the one-hot action encoding).
            hidden_units: Width of the hidden dense layer.
            learning_rate: Learning rate passed to the Adam optimizer.
        """
        # Inputs: batch of states, the actions taken, and their Q targets.
        self.state_in = tf.placeholder(tf.float32, shape=[None, *state_dim])
        self.action_in = tf.placeholder(tf.int32, shape=[None])
        self.q_target_in = tf.placeholder(tf.float32, shape=[None])
        action_one_hot = tf.one_hot(self.action_in, depth=action_size)

        self.hidden1 = tf.layers.dense(self.state_in, hidden_units, activation=tf.nn.relu)
        # Linear output layer: one unbounded Q-value per action.
        self.q_state = tf.layers.dense(self.hidden1, action_size, activation=None)
        # The one-hot mask keeps only the Q-value of the action that was taken.
        self.q_state_action = tf.reduce_sum(tf.multiply(self.q_state, action_one_hot), axis=1)

        self.loss = tf.reduce_mean(tf.square(self.q_state_action - self.q_target_in))
        self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

    def update_model(self, session, state, action, q_target):
        """Run one optimizer step on a batch.

        Args:
            session: The ``tf.Session`` holding the network variables.
            state: Batch of states fed to ``state_in``.
            action: Batch of action indices fed to ``action_in``.
            q_target: Batch of target Q-values fed to ``q_target_in``.
        """
        feed = {self.state_in: state, self.action_in: action, self.q_target_in: q_target}
        session.run(self.optimizer, feed_dict=feed)

    def get_q_state(self, session, state):
        """Return the predicted Q-values, one row per state in the batch.

        Args:
            session: The ``tf.Session`` holding the network variables.
            state: Batch of states fed to ``state_in``.
        """
        q_state = session.run(self.q_state, feed_dict={self.state_in: state})
        return q_state
    

#### A small, yet important addition: the ReplayBuffer( ) class

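* exists because the agent trains on randomly drawn batches of past experiences rather than only on the most recent step
* stores experiences in a bounded `deque` (once `maxlen` is reached, the oldest entries are dropped), accepts new experiences through `add()`, and returns a random batch regrouped field by field through `sample()`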

In [4]:
class ReplayBuffer():
    """Bounded FIFO store of experiences with random batch sampling."""

    def __init__(self, maxlen):
        """Create an empty buffer holding at most ``maxlen`` experiences."""
        # A deque with maxlen silently drops the oldest entry when full.
        self.buffer = deque(maxlen=maxlen)

    def add(self, experience):
        """Append one experience (a tuple of fields) to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw a random batch and regroup it field by field.

        At most ``batch_size`` experiences are drawn, fewer when the buffer
        holds fewer. Experiences are drawn without replacement, so a batch
        never contains the same stored entry twice.

        Returns:
            A one-shot iterator yielding one list per experience field, e.g.
            ``states, actions, ... = buffer.sample(n)``. It yields nothing
            when the buffer is empty.
        """
        # Capping at the buffer size is what keeps random.sample from raising
        # ValueError while the buffer is still filling up.
        sample_size = min(len(self.buffer), batch_size)
        samples = random.sample(self.buffer, sample_size)
        # zip(*samples) transposes rows of experiences into columns of fields.
        return map(list, zip(*samples))

## The DQNAgent( ) class

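The learning agent, also known as the deep Q-network (DQN) agent, is what trains the QNetwork.
* when acting greedily, the agent selects the action with the highest predicted Q value
* with probability `eps` it picks a uniformly random action instead, so that it keeps exploring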

#### \_\_init\_\_(self, env):
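Builds the agent for a given environment.
* reads the state shape and the number of actions from `env.observation_space` and `env.action_space`, and creates the `QNetwork` and a `ReplayBuffer` of 10,000 experiences
* sets the discount factor `gamma` to 0.97 and the exploration rate `eps` to 1.0, then opens a TensorFlow session and initializes the network variables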

#### get_action(self, state):
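Chooses the next action with an epsilon-greedy policy.
* with probability `eps`, returns a uniformly random action
* otherwise, returns the action with the highest Q value predicted by the QNetwork for `state`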

#### train(self, state, action, next_state, reward, done):
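Stores one transition and performs one training step.
* adds `(state, action, next_state, reward, done)` to the replay buffer, samples a batch of up to 50 experiences, and fits the QNetwork to the targets `reward + gamma * max Q(next_state)`, where the Q values of terminal next states are set to zero
* when `done` is true, multiplies `eps` by 0.99, never letting it fall below 0.1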

#### \_\_del\_\_(self):
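Cleans up when the agent object is destroyed.
* closes the TensorFlow session opened in `__init__`
* takes no arguments and returns nothing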

In [5]:
class DQNAgent():
    """Epsilon-greedy deep Q-learning agent with experience replay.

    The agent owns a QNetwork, a ReplayBuffer and the TensorFlow session the
    network runs in. The exploration rate ``eps`` decays once per finished
    episode until it reaches ``eps_min``.
    """

    def __init__(self, env, gamma=0.97, eps=1.0, eps_min=0.1, eps_decay=0.99,
                 batch_size=50, buffer_size=10000):
        """Build the network and replay buffer, then open the session.

        Args:
            env: Environment exposing ``observation_space.shape`` and
                ``action_space.n``.
            gamma: Discount factor applied to the next state's best Q-value.
            eps: Initial probability of taking a random action.
            eps_min: Lower bound for ``eps``.
            eps_decay: Factor ``eps`` is multiplied by after each episode.
            batch_size: Maximum number of experiences sampled per train step.
            buffer_size: Capacity of the replay buffer.
        """
        self.state_dim = env.observation_space.shape
        self.action_size = env.action_space.n
        self.q_network = QNetwork(self.state_dim, self.action_size)
        self.replay_buffer = ReplayBuffer(maxlen=buffer_size)
        self.gamma = gamma
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        """Return an action for ``state`` using an epsilon-greedy policy.

        With probability ``eps`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        # Decide first, so an exploratory step does not pay for a forward pass
        # whose result would be thrown away.
        if random.random() < self.eps:
            return np.random.randint(self.action_size)
        # The network expects a batch, hence the single-element list.
        q_state = self.q_network.get_q_state(self.sess, [state])
        return np.argmax(q_state)

    def train(self, state, action, next_state, reward, done):
        """Store one transition and run one training step on a sampled batch.

        Args:
            state: State the action was taken in.
            action: Index of the action taken.
            next_state: State observed after the action.
            reward: Reward received for the action.
            done: Whether ``next_state`` ended the episode.
        """
        self.replay_buffer.add((state, action, next_state, reward, done))
        states, actions, next_states, rewards, dones = self.replay_buffer.sample(self.batch_size)
        q_next_states = self.q_network.get_q_state(self.sess, next_states)
        # Force a boolean mask: a plain list of 0/1 integers would otherwise be
        # interpreted as row indices instead of a per-sample terminal flag.
        terminal = np.asarray(dones, dtype=bool)
        # A terminal next state has no future return, so its Q-values are zero.
        q_next_states[terminal] = 0.0
        q_targets = np.asarray(rewards) + self.gamma * np.max(q_next_states, axis=1)
        self.q_network.update_model(self.sess, states, actions, q_targets)

        # Exploration decays once per finished episode, not once per step.
        if done:
            self.eps = max(self.eps_min, self.eps_decay * self.eps)

    def __del__(self):
        """Close the TensorFlow session, if one was ever opened."""
        # __init__ may have failed before self.sess was assigned; a finalizer
        # must not raise in that case.
        sess = getattr(self, "sess", None)
        if sess is not None:
            sess.close()

### Putting it all together, B.U.B. plays and learns

In [None]:
# Create the agent and train it online: the agent acts in the environment and
# is updated after every single step.
agent = DQNAgent(env)
num_episodes = 400

for ep in range(num_episodes):
    # Every episode starts from a freshly reset environment.
    state = env.reset()
    total_reward = 0
    done = False
    while not done:
        action = agent.get_action(state)
        # env.step() is unpacked as (observation, reward, done, info).
        next_state, reward, done, info = env.step(action)
        # Train on the transition before advancing to the next state.
        agent.train(state, action, next_state, reward, done)
        env.render()
        total_reward += reward
        state = next_state
        
    # One summary line per finished episode.
    print("Episode: {}, total_reward: {:.2f}".format(ep, total_reward))

Instructions for updating:
Use keras.layers.Dense instead.
Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Episode: 0, total_reward: 35.00
Episode: 1, total_reward: 13.00
Episode: 2, total_reward: 11.00
Episode: 3, total_reward: 52.00
Episode: 4, total_reward: 39.00
Episode: 5, total_reward: 9.00
Episode: 6, total_reward: 15.00
Episode: 7, total_reward: 12.00
Episode: 8, total_reward: 29.00
Episode: 9, total_reward: 18.00
Episode: 10, total_reward: 16.00
Episode: 11, total_reward: 12.00
Episode: 12, total_reward: 9.00
Episode: 13, total_reward: 56.00
Episode: 14, total_reward: 17.00
Episode: 15, total_reward: 10.00
Episode: 16, total_reward: 31.00
Episode: 17, total_reward: 11.00
Episode: 18, total_reward: 16.00
Episode: 19, total_reward: 13.00
Episode: 20, total_reward: 20.00
Episode: 21, total_reward: 14.00
Episode: 22, total_reward: 18.00
Episode: 23, total_reward: 14.00
Episode: 

Episode: 215, total_reward: 200.00
Episode: 216, total_reward: 200.00
Episode: 217, total_reward: 200.00
Episode: 218, total_reward: 200.00
Episode: 219, total_reward: 200.00
Episode: 220, total_reward: 200.00
Episode: 221, total_reward: 200.00
Episode: 222, total_reward: 200.00
Episode: 223, total_reward: 200.00
Episode: 224, total_reward: 200.00
Episode: 225, total_reward: 200.00
Episode: 226, total_reward: 200.00
Episode: 227, total_reward: 200.00
Episode: 228, total_reward: 200.00
Episode: 229, total_reward: 200.00
Episode: 230, total_reward: 200.00
Episode: 231, total_reward: 200.00
Episode: 232, total_reward: 200.00
Episode: 233, total_reward: 200.00
Episode: 234, total_reward: 200.00
Episode: 235, total_reward: 200.00
Episode: 236, total_reward: 200.00
Episode: 237, total_reward: 200.00
Episode: 238, total_reward: 200.00
Episode: 239, total_reward: 200.00
Episode: 240, total_reward: 200.00
Episode: 241, total_reward: 200.00
Episode: 242, total_reward: 200.00
Episode: 243, total_

In [None]:
# Close the environment once training is finished.
env.close()