In [1]:
# Imports
import gym
import random
import numpy as np
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers.legacy import Adam

In [2]:
# Filter Warnings
import warnings
warnings.filterwarnings('ignore')

  and should_run_async(code)


# **Environment**

In [3]:
# Create the CartPole environment
env = gym.make('CartPole-v1')
# Length of the observation vector (first entry of the observation shape)
observation_shape = env.observation_space.shape
states = observation_shape[0]
# Number of discrete actions the environment accepts
actions = env.action_space.n

In [4]:
# Random-action baseline: play a few episodes with no learning so the
# trained agent's scores have something to be compared against.
episodes = 10
for i in range(1, episodes + 1):
  env.reset()
  done = False
  score = 0

  while not done:
    env.render()
    # Draw from the environment's own action space rather than a
    # hard-coded [0, 1] list, so the loop stays valid if the env changes.
    action = env.action_space.sample()
    # Only the reward and the termination flag are used below.
    observation, reward, done, info = env.step(action)
    score += reward
  print(f'Episode: {i}, Score: {score}')

Episode: 1, Score: 16.0
Episode: 2, Score: 22.0
Episode: 3, Score: 15.0
Episode: 4, Score: 11.0
Episode: 5, Score: 15.0
Episode: 6, Score: 14.0
Episode: 7, Score: 20.0
Episode: 8, Score: 12.0
Episode: 9, Score: 14.0
Episode: 10, Score: 14.0


# **Model**

In [5]:
# Model
def build_model(state, action):
  """Assemble the fully connected network handed to the agent.

  Args:
    state: Length of the observation vector; the network input has shape
      (1, state) and is flattened before the dense layers.
    action: Number of units in the linear output layer.

  Returns:
    An uncompiled Keras Sequential model.
  """
  layers = [
    Flatten(input_shape=(1, state)),
    Dense(16, activation='relu'),
    Dense(32, activation='relu'),
    Dense(action, activation='linear'),
  ]
  return Sequential(layers)

In [6]:
# Instantiate the network for this environment and print its layer table
model = build_model(state=states, action=actions)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 4)                 0         
                                                                 
 dense (Dense)               (None, 16)                80        
                                                                 
 dense_1 (Dense)             (None, 32)                544       
                                                                 
 dense_2 (Dense)             (None, 2)                 66        
                                                                 
Total params: 690
Trainable params: 690
Non-trainable params: 0
_________________________________________________________________


# **Agent**

In [7]:
# Value-based RL: Deep Q-Network (DQN) agent with Boltzmann action selection
def build_agent(model, actions, memory_limit=500, nb_steps_warmup=10,
                target_model_update=1e-2):
  """Wrap a Q-network in a keras-rl DQN agent.

  Args:
    model: Keras model used as the Q-network.
    actions: Number of discrete actions; forwarded as ``nb_actions``.
    memory_limit: ``limit`` of the SequentialMemory replay buffer.
    nb_steps_warmup: Forwarded unchanged to ``DQNAgent``.
    target_model_update: Forwarded unchanged to ``DQNAgent``.

  Returns:
    The DQNAgent; it is not compiled here.
  """
  policy = BoltzmannQPolicy()
  # window_length=1 lines up with the (1, state) input shape that
  # build_model declares on its Flatten layer.
  memory = SequentialMemory(limit=memory_limit, window_length=1)
  dqn = DQNAgent(model=model, memory=memory, policy=policy,
                 nb_actions=actions, nb_steps_warmup=nb_steps_warmup,
                 target_model_update=target_model_update)
  return dqn

agent = build_agent(model, actions)

# **Training**

In [8]:
# Compile the agent with an Adam optimizer, then train it on the environment
optimizer = Adam(learning_rate=1e-3)
agent.compile(optimizer, metrics=['mae'])
agent.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
111 episodes - episode_reward: 89.234 [9.000, 500.000] - loss: 1.879 - mae: 18.194 - mean_q: 36.498

Interval 2 (10000 steps performed)
35 episodes - episode_reward: 275.971 [150.000, 500.000] - loss: 1.412 - mae: 37.342 - mean_q: 75.065

Interval 3 (20000 steps performed)
29 episodes - episode_reward: 352.103 [35.000, 500.000] - loss: 5.249 - mae: 46.224 - mean_q: 91.710

Interval 4 (30000 steps performed)
29 episodes - episode_reward: 336.690 [49.000, 500.000] - loss: 2.453 - mae: 45.886 - mean_q: 91.334

Interval 5 (40000 steps performed)
done, took 248.237 seconds


<keras.callbacks.History at 0x7dfd3caacc40>

In [9]:
# Evaluate the trained agent without rendering and report the mean episode reward
scores = agent.test(env, nb_episodes=2, visualize=False)
episode_rewards = scores.history['episode_reward']
print(np.mean(episode_rewards))

Testing for 2 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
500.0


In [10]:
# Watch the trained agent play with rendering on. The call sits inside the
# try block so its return value is not echoed as cell output, and the
# environment is closed afterwards so the render window is released even
# if the run is interrupted.
try:
  agent.test(env, nb_episodes=2, visualize=True)
finally:
  env.close()

Testing for 2 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
