In [3]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from helper_DQN import running_mean, scale_and_resize, ExperienceMemory, PrioritizedExperienceReplayBuffer
import collections
import random
import gymnasium as gym
from models import MLP_state
import torchvision.transforms as transforms

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

device = torch.device('cpu')


class TrainMountainCar:
    def __init__(self, n_training_episodes=200, gamma=0.99, learning_rate=0.1, epsilon_max=0.5,
                 epsilon_min=0.05, max_steps=10000, batch_size=32, fixed_target=False,
                 copy_target=10000, replay_size=100000, double=False, dueling=False, prioritized=False, debug=False,
                 eval_epsilon=None, eval_episodes=10, eval_every=50, noisy=False, distributional=False, env=None,
                 epsilon_frame=500000):
        self.n_training_episodes = n_training_episodes
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon_max = epsilon_max
        self.epsilon_min = epsilon_min
        self.epsilon_frame = epsilon_frame
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.fixed_target = fixed_target
        self.copy_target = copy_target
        self.replay_size = replay_size
        self.double = double
        self.dueling = dueling
        self.debug = debug
        self.eval_epsilon = eval_epsilon if eval_epsilon is not None else epsilon_min
        self.eval_episodes = eval_episodes
        self.eval_every = eval_every
        self.prioritized = prioritized,

    def epsilon_greedy_policy(self, policy: torch.nn.Module, X, epsilon: float, env: gym.envs):
        """
        Samples a random action with probability epsilon and picks the maximum action under policy network otherwise.
        :param policy: Policy Network under which to take action
        :param X: stacked tensor of shape (4,80,120)
        :param epsilon: float probability of sampling a random action
        :param env: Gymnasium environment
        :return: Randomly sampled action or maximum action under policy network
        """
        if random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        else:
            with torch.no_grad():
                X = np.vstack(X).astype(np.float32)
                X = torch.tensor(X).squeeze(1)
                return policy(X).max(0)[1].view(1, 1).item()


#     def get_action(self, policy, s, eps=0.1):
#         with torch.no_grad():
#             self.seq.pop(0)
#             self.seq.append(s)
#             if np.random.random() >= eps:
#                 X = torch.tensor(np.vstack(self.seq).astype(np.float32), device=device, dtype=torch.float)
#                 a = policy(X.unsqueeze(0))
#                 # a = a[:, -1, :]  # select last element of seq
#                 a = a.max(1)[1]
#                 return a.item()
#             else:
#                 return env.action_space.sample()

    # def initialize_measuring_states(self, env):
    #     """
    #     Randomly samples 200 states by taking random actions
    #     :param env: Gymnasium environment
    #     :return: list of states that were visited by random walk
    #     """
    #     measuring_states = []
    #     env.reset()
    #     for i in range(200):
    #         action = env.action_space.sample()
    #         env.step(action)
    #         img = env.render()
    #         img = transforms.ToTensor()(img)
    #         measuring_states.append(transform(img))
    #     env.reset()
    #     return measuring_states

    def eval(self, policy: torch.nn.Module, env: gym.envs):
        """
        Evaluate a policy and return the average reward over self.eval_episodes trials with maximum 10000 steps each
        :param policy: The policy to be evaluated
        :param env: The Gymnasium environment
        :return: average over rewards collected turing trials
        """
        rewards_list = []
        for episode in range(self.eval_episodes):
            state = env.reset()[0]

            # up to 30 no-op actions
            noop = random.randint(0, 30)
            for i in range(noop):
                action = env.action_space.sample()
                state,_,_,_,_ = env.step(action)

            rewards = 0

            for i in range(0, self.max_steps):      # max episode length 10000
                action = self.epsilon_greedy_policy(policy, state, self.eval_epsilon, env)
#                 action = self.get_action(policy, state, self.eval_epsilon)
                state, reward, terminated, _, _ = env.step(action)
                rewards += reward

                if terminated:
                    break

            rewards_list.append(rewards)

        return np.mean(rewards_list)
    
    def train(self):
        """
        trains DQN using a fixed target network if self.fixed_target == True, otherwise with the policy network.
        :return: list of total rewards, list of steps in each episode, q values over sampled states
        """

        # keep track of total steps and rewards
        total_steps = 0
        total_rewards = []
        total_steps_list = []
        evaluations = []

        # initialize states in which Q value is measured every X episodes to track progress
        # measuring_states = self.initialize_measuring_states(env)
        # q_measures = []

        # Initialize Experience Memory
        if self.prioritized:
            beta_start = 0.5
            beta_frames = 1000
            beta_by_frame = lambda total_steps: min(1.0, beta_start + total_steps * (1.0 - beta_start) / beta_frames)
            experience_memory = PrioritizedExperienceReplayBuffer(alpha=0.7, batch_size=self.batch_size,
                                                                  buffer_size=self.replay_size)
        else:
            experience_memory = ExperienceMemory(self.replay_size)

        # initialize policy (and target) network
        if self.dueling:
            policy = MLP_state(env.action_space.n).to(device)
            if self.fixed_target:
                target = MLP_state(env.action_space.n).to(device)
                target.load_state_dict(policy.state_dict())
                target.eval()
        else:
            policy = MLP_state(env.action_space.n).to(device)
            if self.fixed_target:
                target = MLP_state(env.action_space.n).to(device)
                target.load_state_dict(policy.state_dict())
                target.eval()

        # Best values found during evaluation
        best_reward = - float('inf')
        best_policy = policy.state_dict()

        optimizer = torch.optim.RMSprop(policy.parameters(), lr=self.learning_rate, weight_decay=0.99, momentum=0.95) 

        for episode in range(self.n_training_episodes):
            steps = 0
            rewards = 0

            state = env.reset()[0]

            # up to 30 no-op actions
            noop = random.randint(0, 30)
            for i in range(noop):
                action = env.action_space.sample()
                state, _, _, _, _ = env.step(action)

            while True:
                # linear epsilon decay based on steps
                epsilon = max(self.epsilon_max - ((self.epsilon_max - self.epsilon_min)/self.epsilon_frame) *
                              total_steps, self.epsilon_min)

                # Choose the action At using epsilon greedy policy
                action = self.epsilon_greedy_policy(policy, state, epsilon, env)
#                 action = self.get_action(policy, X, epsilon)
                # take action
                new_state, reward, terminated, _, _ = env.step(action)

                experience_memory.add((state, action, reward, new_state, terminated))

                steps += 1
                total_steps += 1

                if len(experience_memory) > self.batch_size:
                    if self.prioritized:
                        beta = beta_by_frame(total_steps)
                        idxs, experiences, weights = experience_memory.sample(beta)
#                         states, actions, _rewards, next_states, terminations = (i for i in
#                                                                                 zip(*experiences))  # (torch.Tensor(vs).to(device) for vs in
                        # zip(*experiences))
                        weights = torch.tensor(weights).to(device)
                    else:
                        experiences = experience_memory.sample(self.batch_size)
#                     print(list(len(i) for i in zip(*experiences)))
                    states, actions, _rewards, next_states, terminations = (i for i in zip(*experiences))
                    a = torch.tensor(actions).long().unsqueeze(dim=1).to(device)
                    r = torch.tensor(_rewards).unsqueeze(dim=1).to(device)
                    states = np.vstack(states).astype(np.float32)
                    states = torch.tensor(states).to(device)
                    next_states = np.vstack(states).astype(np.float32)
                    next_states = torch.tensor(next_states).to(device)
#                     states = np.vstack(states).astype(np.float32)
#                     states = torch.from_numpy(states)
#                     next_states = np.vstack(next_states).astype(np.float32)
#                     next_states = torch.from_numpy(next_states)
#                     states = torch.reshape(states, (self.batch_size, 1, 84, 84)).to(device)  # 80,120
#                     next_states = torch.reshape(next_states, (self.batch_size, 1, 84, 84)).to(device)
                    mask = [i for i, x in enumerate(terminations) if not x]  # get all non-final states
                    
#                     reward = torch.tensor([reward]).to(device)
#                     action = torch.tensor([action]).unsqueeze(0).to(device)
#                     state = X.unsqueeze(0)
#                     new_state = X_new.unsqueeze(0)
                    
                    steps += 1
                    total_steps += 1
                    
                    state_action_values = policy(states).gather(1, a)
                    
                    next_state_values = torch.zeros(self.batch_size, device=device)
                    
                    # update network
                    if self.double:
                        max_next_action = policy(next_states).max(1)[1].view(-1, 1)
                        next_state_values[mask] = target(next_states[mask]).gather(1, max_next_action[mask]).squeeze(1)
                    elif self.fixed_target:
                        next_state_values[mask] = target(next_states[mask]).max(1)[0].detach()
                    else:
                        next_state_values[mask] = policy(next_states[mask]).max(1)[0].detach()
                    # Compute the expected Q values
                    expected_state_action_values = (next_state_values * self.gamma) + r.squeeze(1)
                    

                    if self.prioritized:
                            diff = expected_state_action_values.unsqueeze(1) - state_action_values
                            experience_memory.update_priority(idxs, diff.cpu().detach().squeeze().abs().numpy().tolist())

                            loss = torch.nn.MSELoss()(state_action_values,
                                                      expected_state_action_values.unsqueeze(1)).squeeze() * weights
                    else:
                    # loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values.unsqueeze(1)).squeeze()
                        loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values)

                    loss = loss.mean()

                    # Optimize the model
                    optimizer.zero_grad()
                    loss.backward()

                    torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.)    # clip gradients
                    optimizer.step()

                # Update total reward
                rewards += reward

                # update current state to be next state
                state = new_state

                # If done, finish the episode
                if terminated or steps >= self.max_steps-1:  # or truncated:
                    # Track rewards
                    total_rewards.append(rewards)
                    total_steps_list.append(steps)

                    # # measure Q values in selected states
                    # Q_states = torch.stack(measuring_states).to(device)
                    # Q_states = torch.unique(Q_states, dim=0, sorted=False)  # eliminate duplicate states
                    # with torch.no_grad():
                    #     q_measures.append(torch.mean(policy(Q_states).max(1)[0]).item())

                    # Evaluate current policy and save optimal policy weights
                    if episode == self.n_training_episodes-1 or (episode > 0 and episode % self.eval_every == 0):
                        eval_reward = self.eval(policy, env)
                        if eval_reward > best_reward:
                            best_reward = eval_reward
                            best_policy = policy.state_dict()
                        print(f"Evaluation: {int(episode/self.eval_every)}\t average reward: {eval_reward}")
                        evaluations.append(eval_reward)

                    # print training information
                    if self.debug:
                        print(f"episode: {episode + 1:03d}\t steps: {steps + 1:05d}\t total steps:"
                              f"{total_steps + 1:06d}\t epsilon: {epsilon:.2f}")#\t average Q: {q_measures[-1]:.3f}")
                    break

                if self.fixed_target:
                    # copy policy network weights to target net every copy_target steps
                    if total_steps % self.copy_target <= 4:
                        target.load_state_dict(policy.state_dict())

        return total_rewards, total_steps_list, best_policy, evaluations




In [None]:
# Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.1
# epsilon_frame = 100000

# replay memory parameters
replay_size = 100000
batch_size = 32

# fixed target network
fixed_target = True
copy_target = 10000

debug = True

epsilon_frame = 500000

transform = scale_and_resize()

env = gym.make('MountainCar-v0', render_mode='rgb_array')

car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, debug=debug, env=env, epsilon_frame=epsilon_frame)

total_rewards, total_steps_list, best_policy, evaluations = car.train()

# save best policy as well as steps and q measures
torch.save(best_policy, 'data/DRQN_final.pth')
np.savetxt(f'data/steps_DRQN_txt', total_steps_list)
# np.savetxt(f'data/q_values_DRQN.txt', q_measures)
np.savetxt(f'data/eval_DRQN.txt', evaluations)

# Plot steps per episode
plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='training')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - DRQN')
plt.savefig('plots/steps_DRQN.png')
plt.close()

# Plot q measures per episode
# plt.plot(np.arange(len(q_measures)) + 1, q_measures)
# plt.xlabel('Episode')
# plt.ylabel('Average Q')
# plt.title('Average Q measure over sampled states')
# plt.savefig('plots/q_measures_DRQN.png')
# plt.close()

## DDQN

In [14]:
#  Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.1
# epsilon_frame = 100000

# replay memory parameters
replay_size = 100000
batch_size = 32

# fixed target network
fixed_target = True
copy_target = 10000

debug = True

double = True

epsilon_frame = 500000

transform = scale_and_resize()

env = gym.make('MountainCar-v0', render_mode='rgb_array')

car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min,double=double,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, debug=debug, env=env, epsilon_frame=epsilon_frame)

total_rewards, total_steps_list, best_policy, evaluations = car.train()

# save best policy as well as steps and q measures
torch.save(best_policy, 'data/DDQN_MLP.pth')
np.savetxt(f'data/steps_DDQN_MLP_txt', total_steps_list)
# np.savetxt(f'data/q_values_DDQN_MLP.txt', q_measures)
np.savetxt(f'data/eval_DDQN_MLP.txt', evaluations)

# Plot steps per episode
plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='training')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - DDQN_MLP')
plt.savefig('plots/steps_DDQN_MLP.png')
plt.close()

# Plot q measures per episode
# plt.plot(np.arange(len(q_measures)) + 1, q_measures)
# plt.xlabel('Episode')
# plt.ylabel('Average Q')
# plt.title('Average Q measure over sampled states')
# plt.savefig('plots/q_measures_DRQN.png')
# plt.close()

episode: 001	 steps: 10001	 total steps:010001	 epsilon: 0.98
episode: 002	 steps: 10001	 total steps:020001	 epsilon: 0.96
episode: 003	 steps: 10001	 total steps:030001	 epsilon: 0.95
episode: 004	 steps: 10001	 total steps:040001	 epsilon: 0.93
episode: 005	 steps: 10001	 total steps:050001	 epsilon: 0.91
episode: 006	 steps: 10001	 total steps:060001	 epsilon: 0.89
episode: 007	 steps: 07721	 total steps:067721	 epsilon: 0.88
episode: 008	 steps: 10001	 total steps:077721	 epsilon: 0.86
episode: 009	 steps: 10001	 total steps:087721	 epsilon: 0.84
episode: 010	 steps: 10001	 total steps:097721	 epsilon: 0.82
episode: 011	 steps: 08955	 total steps:106675	 epsilon: 0.81
episode: 012	 steps: 10001	 total steps:116675	 epsilon: 0.79
episode: 013	 steps: 08125	 total steps:124799	 epsilon: 0.78
episode: 014	 steps: 10001	 total steps:134799	 epsilon: 0.76
episode: 015	 steps: 10001	 total steps:144799	 epsilon: 0.74
episode: 016	 steps: 10001	 total steps:154799	 epsilon: 0.72
episode:

episode: 132	 steps: 04129	 total steps:772109	 epsilon: 0.10
episode: 133	 steps: 05227	 total steps:777335	 epsilon: 0.10
episode: 134	 steps: 02539	 total steps:779873	 epsilon: 0.10
episode: 135	 steps: 03061	 total steps:782933	 epsilon: 0.10
episode: 136	 steps: 01169	 total steps:784101	 epsilon: 0.10
episode: 137	 steps: 01303	 total steps:785403	 epsilon: 0.10
episode: 138	 steps: 04189	 total steps:789591	 epsilon: 0.10
episode: 139	 steps: 09433	 total steps:799023	 epsilon: 0.10
episode: 140	 steps: 02275	 total steps:801297	 epsilon: 0.10
episode: 141	 steps: 09755	 total steps:811051	 epsilon: 0.10
episode: 142	 steps: 04213	 total steps:815263	 epsilon: 0.10
episode: 143	 steps: 02383	 total steps:817645	 epsilon: 0.10
episode: 144	 steps: 05631	 total steps:823275	 epsilon: 0.10
episode: 145	 steps: 05093	 total steps:828367	 epsilon: 0.10
episode: 146	 steps: 01341	 total steps:829707	 epsilon: 0.10
episode: 147	 steps: 00593	 total steps:830299	 epsilon: 0.10
episode:

episode: 262	 steps: 06737	 total steps:1261481	 epsilon: 0.10
episode: 263	 steps: 05087	 total steps:1266567	 epsilon: 0.10
episode: 264	 steps: 02319	 total steps:1268885	 epsilon: 0.10
episode: 265	 steps: 00929	 total steps:1269813	 epsilon: 0.10
episode: 266	 steps: 07767	 total steps:1277579	 epsilon: 0.10
episode: 267	 steps: 06451	 total steps:1284029	 epsilon: 0.10
episode: 268	 steps: 08727	 total steps:1292755	 epsilon: 0.10
episode: 269	 steps: 00505	 total steps:1293259	 epsilon: 0.10
episode: 270	 steps: 09749	 total steps:1303007	 epsilon: 0.10
episode: 271	 steps: 02677	 total steps:1305683	 epsilon: 0.10
episode: 272	 steps: 02223	 total steps:1307905	 epsilon: 0.10
episode: 273	 steps: 05883	 total steps:1313787	 epsilon: 0.10
episode: 274	 steps: 00807	 total steps:1314593	 epsilon: 0.10
episode: 275	 steps: 02269	 total steps:1316861	 epsilon: 0.10
episode: 276	 steps: 01135	 total steps:1317995	 epsilon: 0.10
episode: 277	 steps: 02439	 total steps:1320433	 epsilo

episode: 391	 steps: 05231	 total steps:1794393	 epsilon: 0.10
episode: 392	 steps: 10001	 total steps:1804393	 epsilon: 0.10
episode: 393	 steps: 01057	 total steps:1805449	 epsilon: 0.10
episode: 394	 steps: 04157	 total steps:1809605	 epsilon: 0.10
episode: 395	 steps: 00947	 total steps:1810551	 epsilon: 0.10
episode: 396	 steps: 05375	 total steps:1815925	 epsilon: 0.10
episode: 397	 steps: 01425	 total steps:1817349	 epsilon: 0.10
episode: 398	 steps: 01651	 total steps:1818999	 epsilon: 0.10
episode: 399	 steps: 07941	 total steps:1826939	 epsilon: 0.10
episode: 400	 steps: 07037	 total steps:1833975	 epsilon: 0.10
Evaluation: 8	 average reward: -10000.0
episode: 401	 steps: 01301	 total steps:1835275	 epsilon: 0.10
episode: 402	 steps: 02621	 total steps:1837895	 epsilon: 0.10
episode: 403	 steps: 06421	 total steps:1844315	 epsilon: 0.10
episode: 404	 steps: 01473	 total steps:1845787	 epsilon: 0.10
episode: 405	 steps: 10001	 total steps:1855787	 epsilon: 0.10
episode: 406	 s

episode: 520	 steps: 02055	 total steps:2304859	 epsilon: 0.10
episode: 521	 steps: 01017	 total steps:2305875	 epsilon: 0.10
episode: 522	 steps: 00941	 total steps:2306815	 epsilon: 0.10
episode: 523	 steps: 01543	 total steps:2308357	 epsilon: 0.10
episode: 524	 steps: 01087	 total steps:2309443	 epsilon: 0.10
episode: 525	 steps: 10001	 total steps:2319443	 epsilon: 0.10
episode: 526	 steps: 06613	 total steps:2326055	 epsilon: 0.10
episode: 527	 steps: 00775	 total steps:2326829	 epsilon: 0.10
episode: 528	 steps: 10001	 total steps:2336829	 epsilon: 0.10
episode: 529	 steps: 05075	 total steps:2341903	 epsilon: 0.10
episode: 530	 steps: 03473	 total steps:2345375	 epsilon: 0.10
episode: 531	 steps: 04885	 total steps:2350259	 epsilon: 0.10
episode: 532	 steps: 03561	 total steps:2353819	 epsilon: 0.10
episode: 533	 steps: 02585	 total steps:2356403	 epsilon: 0.10
episode: 534	 steps: 07939	 total steps:2364341	 epsilon: 0.10
episode: 535	 steps: 03221	 total steps:2367561	 epsilo

episode: 649	 steps: 01837	 total steps:2859345	 epsilon: 0.10
episode: 650	 steps: 06765	 total steps:2866109	 epsilon: 0.10
Evaluation: 13	 average reward: -10000.0
episode: 651	 steps: 03191	 total steps:2869299	 epsilon: 0.10
episode: 652	 steps: 05671	 total steps:2874969	 epsilon: 0.10
episode: 653	 steps: 04553	 total steps:2879521	 epsilon: 0.10
episode: 654	 steps: 01323	 total steps:2880843	 epsilon: 0.10
episode: 655	 steps: 03907	 total steps:2884749	 epsilon: 0.10
episode: 656	 steps: 01989	 total steps:2886737	 epsilon: 0.10
episode: 657	 steps: 03041	 total steps:2889777	 epsilon: 0.10
episode: 658	 steps: 08603	 total steps:2898379	 epsilon: 0.10
episode: 659	 steps: 04211	 total steps:2902589	 epsilon: 0.10
episode: 660	 steps: 06269	 total steps:2908857	 epsilon: 0.10
episode: 661	 steps: 07759	 total steps:2916615	 epsilon: 0.10
episode: 662	 steps: 03233	 total steps:2919847	 epsilon: 0.10
episode: 663	 steps: 10001	 total steps:2929847	 epsilon: 0.10
episode: 664	 

episode: 778	 steps: 05977	 total steps:3399095	 epsilon: 0.10
episode: 779	 steps: 01183	 total steps:3400277	 epsilon: 0.10
episode: 780	 steps: 01285	 total steps:3401561	 epsilon: 0.10
episode: 781	 steps: 02913	 total steps:3404473	 epsilon: 0.10
episode: 782	 steps: 07117	 total steps:3411589	 epsilon: 0.10
episode: 783	 steps: 05283	 total steps:3416871	 epsilon: 0.10
episode: 784	 steps: 10001	 total steps:3426871	 epsilon: 0.10
episode: 785	 steps: 00661	 total steps:3427531	 epsilon: 0.10
episode: 786	 steps: 01805	 total steps:3429335	 epsilon: 0.10
episode: 787	 steps: 02277	 total steps:3431611	 epsilon: 0.10
episode: 788	 steps: 00609	 total steps:3432219	 epsilon: 0.10
episode: 789	 steps: 01495	 total steps:3433713	 epsilon: 0.10
episode: 790	 steps: 06141	 total steps:3439853	 epsilon: 0.10
episode: 791	 steps: 04451	 total steps:3444303	 epsilon: 0.10
episode: 792	 steps: 04239	 total steps:3448541	 epsilon: 0.10
episode: 793	 steps: 01683	 total steps:3450223	 epsilo

episode: 907	 steps: 03009	 total steps:3946901	 epsilon: 0.10
episode: 908	 steps: 04097	 total steps:3950997	 epsilon: 0.10
episode: 909	 steps: 08183	 total steps:3959179	 epsilon: 0.10
episode: 910	 steps: 01061	 total steps:3960239	 epsilon: 0.10
episode: 911	 steps: 03881	 total steps:3964119	 epsilon: 0.10
episode: 912	 steps: 01271	 total steps:3965389	 epsilon: 0.10
episode: 913	 steps: 04149	 total steps:3969537	 epsilon: 0.10
episode: 914	 steps: 02107	 total steps:3971643	 epsilon: 0.10
episode: 915	 steps: 02855	 total steps:3974497	 epsilon: 0.10
episode: 916	 steps: 04211	 total steps:3978707	 epsilon: 0.10
episode: 917	 steps: 06105	 total steps:3984811	 epsilon: 0.10
episode: 918	 steps: 03307	 total steps:3988117	 epsilon: 0.10
episode: 919	 steps: 00753	 total steps:3988869	 epsilon: 0.10
episode: 920	 steps: 07621	 total steps:3996489	 epsilon: 0.10
episode: 921	 steps: 03585	 total steps:4000073	 epsilon: 0.10
episode: 922	 steps: 04001	 total steps:4004073	 epsilo

## Prioritized_DDQN

In [5]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

device = torch.device("cpu")

# Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025/4  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.01
epsilon_frame = 500000

# replay memory parameters
replay_size = 100000
batch_size = 32


# fixed target network
fixed_target = True
copy_target = 30000


debug = True
double = True
prioritized = True

env = gym.make('MountainCar-v0', render_mode='rgb_array')


car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min, epsilon_frame=epsilon_frame,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, replay_size=replay_size, double=double, prioritized=prioritized,
                       debug=debug)

total_rewards, total_steps_list, best_policy, evaluations = car.train()

torch.save(best_policy, 'data/Prioritized_DDQN_MLP.pth')
np.savetxt(f'data/steps_Prioritized_DDQN_MLP.txt', total_steps_list)
np.savetxt(f'data/q_values_Prioritized_DDQN_MLP.txt', q_measures)
np.savetxt(f'data/eval_Prioritized_DDQN_MLP.txt', evaluations)

plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='evaluations')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - Prioritized_DDQN_MLP')
plt.savefig('plots/steps_Prioritized_DDQN_MLP.png')
plt.close()


episode: 001	 steps: 10001	 total steps:010001	 epsilon: 0.98
episode: 002	 steps: 10001	 total steps:020001	 epsilon: 0.96
episode: 003	 steps: 10001	 total steps:030001	 epsilon: 0.94
episode: 004	 steps: 10001	 total steps:040001	 epsilon: 0.92
episode: 005	 steps: 10001	 total steps:050001	 epsilon: 0.90
episode: 006	 steps: 10001	 total steps:060001	 epsilon: 0.88
episode: 007	 steps: 10001	 total steps:070001	 epsilon: 0.86
episode: 008	 steps: 10001	 total steps:080001	 epsilon: 0.84
episode: 009	 steps: 10001	 total steps:090001	 epsilon: 0.82
episode: 010	 steps: 10001	 total steps:100001	 epsilon: 0.80
episode: 011	 steps: 10001	 total steps:110001	 epsilon: 0.78
episode: 012	 steps: 10001	 total steps:120001	 epsilon: 0.76
episode: 013	 steps: 09111	 total steps:129111	 epsilon: 0.74
episode: 014	 steps: 05877	 total steps:134987	 epsilon: 0.73
episode: 015	 steps: 10001	 total steps:144987	 epsilon: 0.71
episode: 016	 steps: 02917	 total steps:147903	 epsilon: 0.71
episode:

episode: 132	 steps: 01457	 total steps:444965	 epsilon: 0.12
episode: 133	 steps: 01127	 total steps:446091	 epsilon: 0.12
episode: 134	 steps: 00703	 total steps:446793	 epsilon: 0.12
episode: 135	 steps: 01057	 total steps:447849	 epsilon: 0.11
episode: 136	 steps: 03499	 total steps:451347	 epsilon: 0.11
episode: 137	 steps: 04539	 total steps:455885	 epsilon: 0.10
episode: 138	 steps: 02787	 total steps:458671	 epsilon: 0.09
episode: 139	 steps: 00435	 total steps:459105	 epsilon: 0.09
episode: 140	 steps: 01161	 total steps:460265	 epsilon: 0.09
episode: 141	 steps: 02425	 total steps:462689	 epsilon: 0.08
episode: 142	 steps: 02139	 total steps:464827	 epsilon: 0.08
episode: 143	 steps: 00753	 total steps:465579	 epsilon: 0.08
episode: 144	 steps: 00497	 total steps:466075	 epsilon: 0.08
episode: 145	 steps: 00517	 total steps:466591	 epsilon: 0.08
episode: 146	 steps: 00409	 total steps:466999	 epsilon: 0.08
episode: 147	 steps: 00385	 total steps:467383	 epsilon: 0.07
episode:

episode: 263	 steps: 00255	 total steps:569401	 epsilon: 0.01
episode: 264	 steps: 00493	 total steps:569893	 epsilon: 0.01
episode: 265	 steps: 01145	 total steps:571037	 epsilon: 0.01
episode: 266	 steps: 00247	 total steps:571283	 epsilon: 0.01
episode: 267	 steps: 03585	 total steps:574867	 epsilon: 0.01
episode: 268	 steps: 01443	 total steps:576309	 epsilon: 0.01
episode: 269	 steps: 00939	 total steps:577247	 epsilon: 0.01
episode: 270	 steps: 00877	 total steps:578123	 epsilon: 0.01
episode: 271	 steps: 00733	 total steps:578855	 epsilon: 0.01
episode: 272	 steps: 00873	 total steps:579727	 epsilon: 0.01
episode: 273	 steps: 01077	 total steps:580803	 epsilon: 0.01
episode: 274	 steps: 00781	 total steps:581583	 epsilon: 0.01
episode: 275	 steps: 00325	 total steps:581907	 epsilon: 0.01
episode: 276	 steps: 00509	 total steps:582415	 epsilon: 0.01
episode: 277	 steps: 00517	 total steps:582931	 epsilon: 0.01
episode: 278	 steps: 01097	 total steps:584027	 epsilon: 0.01
episode:

episode: 394	 steps: 01219	 total steps:691535	 epsilon: 0.01
episode: 395	 steps: 00509	 total steps:692043	 epsilon: 0.01
episode: 396	 steps: 00609	 total steps:692651	 epsilon: 0.01
episode: 397	 steps: 00977	 total steps:693627	 epsilon: 0.01
episode: 398	 steps: 01105	 total steps:694731	 epsilon: 0.01
episode: 399	 steps: 02583	 total steps:697313	 epsilon: 0.01
episode: 400	 steps: 02849	 total steps:700161	 epsilon: 0.01
Evaluation: 8	 average reward: -10000.0
episode: 401	 steps: 00333	 total steps:700493	 epsilon: 0.01
episode: 402	 steps: 01063	 total steps:701555	 epsilon: 0.01
episode: 403	 steps: 01045	 total steps:702599	 epsilon: 0.01
episode: 404	 steps: 02317	 total steps:704915	 epsilon: 0.01
episode: 405	 steps: 00637	 total steps:705551	 epsilon: 0.01
episode: 406	 steps: 00803	 total steps:706353	 epsilon: 0.01
episode: 407	 steps: 00781	 total steps:707133	 epsilon: 0.01
episode: 408	 steps: 00733	 total steps:707865	 epsilon: 0.01
episode: 409	 steps: 01329	 to

episode: 525	 steps: 01105	 total steps:824997	 epsilon: 0.01
episode: 526	 steps: 04251	 total steps:829247	 epsilon: 0.01
episode: 527	 steps: 00225	 total steps:829471	 epsilon: 0.01
episode: 528	 steps: 00465	 total steps:829935	 epsilon: 0.01
episode: 529	 steps: 00795	 total steps:830729	 epsilon: 0.01
episode: 530	 steps: 01117	 total steps:831845	 epsilon: 0.01
episode: 531	 steps: 00377	 total steps:832221	 epsilon: 0.01
episode: 532	 steps: 00669	 total steps:832889	 epsilon: 0.01
episode: 533	 steps: 02139	 total steps:835027	 epsilon: 0.01
episode: 534	 steps: 00523	 total steps:835549	 epsilon: 0.01
episode: 535	 steps: 00295	 total steps:835843	 epsilon: 0.01
episode: 536	 steps: 01139	 total steps:836981	 epsilon: 0.01
episode: 537	 steps: 01143	 total steps:838123	 epsilon: 0.01
episode: 538	 steps: 01037	 total steps:839159	 epsilon: 0.01
episode: 539	 steps: 00559	 total steps:839717	 epsilon: 0.01
episode: 540	 steps: 00617	 total steps:840333	 epsilon: 0.01
episode:

episode: 656	 steps: 00547	 total steps:950225	 epsilon: 0.01
episode: 657	 steps: 00835	 total steps:951059	 epsilon: 0.01
episode: 658	 steps: 01695	 total steps:952753	 epsilon: 0.01
episode: 659	 steps: 00515	 total steps:953267	 epsilon: 0.01
episode: 660	 steps: 00723	 total steps:953989	 epsilon: 0.01
episode: 661	 steps: 00783	 total steps:954771	 epsilon: 0.01
episode: 662	 steps: 00939	 total steps:955709	 epsilon: 0.01
episode: 663	 steps: 00321	 total steps:956029	 epsilon: 0.01
episode: 664	 steps: 00481	 total steps:956509	 epsilon: 0.01
episode: 665	 steps: 01047	 total steps:957555	 epsilon: 0.01
episode: 666	 steps: 00491	 total steps:958045	 epsilon: 0.01
episode: 667	 steps: 00381	 total steps:958425	 epsilon: 0.01
episode: 668	 steps: 00259	 total steps:958683	 epsilon: 0.01
episode: 669	 steps: 00431	 total steps:959113	 epsilon: 0.01
episode: 670	 steps: 01709	 total steps:960821	 epsilon: 0.01
episode: 671	 steps: 00205	 total steps:961025	 epsilon: 0.01
episode:

episode: 786	 steps: 02265	 total steps:1080157	 epsilon: 0.01
episode: 787	 steps: 00439	 total steps:1080595	 epsilon: 0.01
episode: 788	 steps: 00821	 total steps:1081415	 epsilon: 0.01
episode: 789	 steps: 01659	 total steps:1083073	 epsilon: 0.01
episode: 790	 steps: 01535	 total steps:1084607	 epsilon: 0.01
episode: 791	 steps: 01427	 total steps:1086033	 epsilon: 0.01
episode: 792	 steps: 01175	 total steps:1087207	 epsilon: 0.01
episode: 793	 steps: 00387	 total steps:1087593	 epsilon: 0.01
episode: 794	 steps: 01417	 total steps:1089009	 epsilon: 0.01
episode: 795	 steps: 02871	 total steps:1091879	 epsilon: 0.01
episode: 796	 steps: 01039	 total steps:1092917	 epsilon: 0.01
episode: 797	 steps: 01221	 total steps:1094137	 epsilon: 0.01
episode: 798	 steps: 00387	 total steps:1094523	 epsilon: 0.01
episode: 799	 steps: 00583	 total steps:1095105	 epsilon: 0.01
episode: 800	 steps: 01203	 total steps:1096307	 epsilon: 0.01
Evaluation: 16	 average reward: -10000.0
episode: 801	 

episode: 915	 steps: 00571	 total steps:1220279	 epsilon: 0.01
episode: 916	 steps: 01343	 total steps:1221621	 epsilon: 0.01
episode: 917	 steps: 00737	 total steps:1222357	 epsilon: 0.01
episode: 918	 steps: 00797	 total steps:1223153	 epsilon: 0.01
episode: 919	 steps: 00649	 total steps:1223801	 epsilon: 0.01
episode: 920	 steps: 00513	 total steps:1224313	 epsilon: 0.01
episode: 921	 steps: 00289	 total steps:1224601	 epsilon: 0.01
episode: 922	 steps: 00523	 total steps:1225123	 epsilon: 0.01
episode: 923	 steps: 01841	 total steps:1226963	 epsilon: 0.01
episode: 924	 steps: 00703	 total steps:1227665	 epsilon: 0.01
episode: 925	 steps: 01087	 total steps:1228751	 epsilon: 0.01
episode: 926	 steps: 01611	 total steps:1230361	 epsilon: 0.01
episode: 927	 steps: 00919	 total steps:1231279	 epsilon: 0.01
episode: 928	 steps: 01689	 total steps:1232967	 epsilon: 0.01
episode: 929	 steps: 02349	 total steps:1235315	 epsilon: 0.01
episode: 930	 steps: 00767	 total steps:1236081	 epsilo

NameError: name 'q_measures' is not defined