In [None]:
import gymnasium as gym
import numpy as np

from Lib.DQN import DQNAgent

# Gymnasium environment id; read as a global by the gym.make(...) calls in the cells below.
env_name = 'CartPole-v1'

In [None]:
def single_run(agent, N=500, batch_size=64, max_steps=400):
    """Train ``agent`` on the global ``env_name`` environment for up to ``N`` episodes.

    The agent is used through ``agent.act(state)``, ``agent.remember(...)``,
    ``agent.replay(batch_size)``, ``agent.memory`` (anything with ``len``) and
    ``agent.epsilon`` (only read for logging).

    Args:
        agent: The learner to train; it is mutated in place and also returned.
        N: Maximum number of episodes.
        batch_size: Replay is triggered once ``len(agent.memory)`` exceeds this
            value; it is also the size passed to ``agent.replay``.
        max_steps: Per-episode step cap. An episode that reaches it counts as
            "solved" for the early-stop rule below.

    Returns:
        ``(agent, scores, episode_lengths, mse_per_episode)`` where

        * ``scores`` is a list with one undiscounted environment return per
          episode actually run (sum of the raw rewards, without the -100
          failure penalty that is only fed to the agent);
        * ``episode_lengths`` is a float array of shape ``(N,)`` with the step
          count of each episode;
        * ``mse_per_episode`` is a float array of shape ``(N,)`` holding
          ``np.mean`` of the value returned by the *last* ``agent.replay`` call
          of the episode (presumably a loss -- confirm against the agent), or
          ``-1`` when no replay happened in that episode.

        Training stops early once the number of episodes that reached
        ``max_steps`` is at least ``0.1 * N``. In that case ``scores`` is shorter
        than ``N`` and the remaining array entries stay at their initial ``0``.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    try:
        state_size = env.observation_space.shape[0]

        episode_lengths = np.zeros(shape=(N,))
        mse_per_episode = np.zeros(shape=(N,))
        scores = []

        converge_counter = 0

        for episode in range(N):
            state, _ = env.reset()
            state = np.reshape(state, [1, state_size])        # Add batch axis for the NN
            steps = 0
            episode_return = 0.0
            batch_loss = None
            while steps < max_steps:
                # Select and perform action
                action = agent.act(state)
                next_state, reward, terminated, truncated, _ = env.step(action)
                next_state = np.reshape(next_state, [1, state_size])    # Add batch axis for the NN

                # Track the raw return; the shaped reward below is for the agent only.
                episode_return += reward

                # Penalise only a real failure. A time-limit truncation is not a
                # failure and is not a terminal state for bootstrapping purposes.
                shaped_reward = -100 if terminated else reward

                # Store transition
                agent.remember(state, action, shaped_reward, next_state, terminated)

                # Move to next state
                state = next_state
                steps += 1

                # Perform batch training if enough samples
                if len(agent.memory) > batch_size:
                    batch_loss = agent.replay(batch_size)

                if terminated or truncated:
                    break

            # Store episode results
            episode_lengths[episode] = steps
            mse_per_episode[episode] = -1 if batch_loss is None else np.mean(batch_loss)
            scores.append(episode_return)

            # Logging
            if (episode + 1) % 10 == 0:
                print(f"Episode: {episode + 1:4d} | "
                      f"Score: {steps:4d} | "
                      f"Epsilon: {agent.epsilon:.2f} | "
                      f"Memory: {len(agent.memory):5d} | "
                      f"MSE: {mse_per_episode[episode]:.2f}")

            # Early stop: enough episodes (not necessarily consecutive) hit the step cap.
            if steps >= max_steps:
                converge_counter += 1
                if converge_counter >= N * 0.1:
                    break
    finally:
        # Release the environment even if the agent or env raises mid-training.
        env.close()

    return agent, scores, episode_lengths, mse_per_episode

In [None]:
# Environment used for on-screen evaluation; training itself happens on the
# separate environment that single_run() creates internally.
env = gym.make(env_name, render_mode='human')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n 

n_episodes = 100

# Plain DQN (no double learning) is kept for reference only; the author marked this run as failed.
# agent_dqn = DQNAgent(state_size, action_size, alpha=0.001, use_double_learning=False) # Failed!
# agent_dqn, reward_hist_tot1, ep_length_tot1, mse_per_episode_tot1 = single_run(agent_dqn, N=n_episodes) # Failed!

# Double DQN. Results get their own names so the next run does not overwrite them.
agent_ddqn = DQNAgent(state_size, action_size, alpha=0.001, use_double_learning=True, target_model_update_freq=40)
agent_ddqn, reward_hist_ddqn, ep_length_ddqn, mse_per_episode_ddqn = single_run(agent_ddqn, N=n_episodes)

# Double DQN with the dueling-network option enabled.
agent_d3qn = DQNAgent(state_size, action_size, alpha=0.001, use_double_learning=True, target_model_update_freq=40, use_dueling_net=True)
agent_d3qn, reward_hist_d3qn, ep_length_d3qn, mse_per_episode_d3qn = single_run(agent_d3qn, N=n_episodes)

# Backward-compatible aliases: these names previously ended up holding the results of the last (D3QN) run.
reward_hist_tot1, ep_length_tot1, mse_per_episode_tot1 = reward_hist_d3qn, ep_length_d3qn, mse_per_episode_d3qn

In [None]:
def run_simulation(model, env, max_steps=200):
    """Roll out one episode of ``model`` in ``env`` and print how long it lasted.

    Args:
        model: Object exposing ``act(state)``; it receives the observation
            reshaped to ``(1, observation_size)``.
        env: Environment exposing the Gymnasium-style ``reset()``, ``step()``
            and ``render()`` calls used below.
        max_steps: Upper bound on the number of steps taken in this rollout.

    Prints ``reward = <steps>`` where ``<steps>`` is the number of environment
    steps taken before termination, truncation or the step cap.

    NOTE(review): actions come from ``model.act``; if that method explores
    (e.g. epsilon-greedy) the rollout is not purely greedy -- confirm against
    the agent implementation.
    """
    s, _ = env.reset()
    done = False
    trunc = False
    steps = 0
    while not (done or trunc) and steps < max_steps:
        # Add the batch axis; -1 infers the observation size instead of hard-coding 4.
        s = np.reshape(s, [1, -1])
        a = model.act(s)
        s, _, done, trunc, _ = env.step(a)
        env.render()
        steps += 1

    print("reward =", steps)

# run_simulation(agent_dqn, env) # Failed!
# Show one rendered rollout per trained agent, each preceded by its label.
for label, trained_agent in (("1", agent_ddqn), ("2", agent_d3qn)):
    print(label)
    run_simulation(trained_agent, env)