In [3]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from helper_DQN import running_mean, scale_and_resize, ExperienceMemory, PrioritizedExperienceReplayBuffer
import collections
import random
import gymnasium as gym
from models import MLP_state
import torchvision.transforms as transforms

import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

device = torch.device('cpu')


class TrainMountainCar:
    def __init__(self, n_training_episodes=200, gamma=0.99, learning_rate=0.1, epsilon_max=0.5,
                 epsilon_min=0.05, max_steps=10000, batch_size=32, fixed_target=False, min_replay=80000,
                 copy_target=10000, replay_size=100000, double=False, dueling=False, prioritized=False, debug=False,
                 eval_epsilon=None, eval_episodes=10, eval_every=50, noisy=False, distributional=False, env=None,
                 epsilon_frame=500000):
        self.n_training_episodes = n_training_episodes
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.epsilon_max = epsilon_max
        self.epsilon_min = epsilon_min
        self.epsilon_frame = epsilon_frame
        self.max_steps = max_steps
        self.batch_size = batch_size
        self.fixed_target = fixed_target
        self.copy_target = copy_target
        self.replay_size = replay_size
        self.double = double
        self.dueling = dueling
        self.debug = debug
        self.eval_epsilon = eval_epsilon if eval_epsilon is not None else epsilon_min
        self.eval_episodes = eval_episodes
        self.eval_every = eval_every
        self.prioritized = prioritized
        self.min_replay = min_replay

    def epsilon_greedy_policy(self, policy: torch.nn.Module, X, epsilon: float, env: gym.envs):
        """
        Samples a random action with probability epsilon and picks the maximum action under policy network otherwise.
        :param policy: Policy Network under which to take action
        :param X: stacked tensor of shape (4,80,120)
        :param epsilon: float probability of sampling a random action
        :param env: Gymnasium environment
        :return: Randomly sampled action or maximum action under policy network
        """
        if random.uniform(0, 1) < epsilon:
            return env.action_space.sample()
        else:
            with torch.no_grad():
                X = np.vstack(X).astype(np.float32)
                X = torch.tensor(X).squeeze(1)
                return policy(X).max(0)[1].view(1, 1).item()


#     def get_action(self, policy, s, eps=0.1):
#         with torch.no_grad():
#             self.seq.pop(0)
#             self.seq.append(s)
#             if np.random.random() >= eps:
#                 X = torch.tensor(np.vstack(self.seq).astype(np.float32), device=device, dtype=torch.float)
#                 a = policy(X.unsqueeze(0))
#                 # a = a[:, -1, :]  # select last element of seq
#                 a = a.max(1)[1]
#                 return a.item()
#             else:
#                 return env.action_space.sample()

    # def initialize_measuring_states(self, env):
    #     """
    #     Randomly samples 200 states by taking random actions
    #     :param env: Gymnasium environment
    #     :return: list of states that were visited by random walk
    #     """
    #     measuring_states = []
    #     env.reset()
    #     for i in range(200):
    #         action = env.action_space.sample()
    #         env.step(action)
    #         img = env.render()
    #         img = transforms.ToTensor()(img)
    #         measuring_states.append(transform(img))
    #     env.reset()
    #     return measuring_states

    def eval(self, policy: torch.nn.Module, env: gym.envs):
        """
        Evaluate a policy and return the average reward over self.eval_episodes trials with maximum 10000 steps each
        :param policy: The policy to be evaluated
        :param env: The Gymnasium environment
        :return: average over rewards collected turing trials
        """
        rewards_list = []
        for episode in range(self.eval_episodes):
            state = env.reset()[0]

            # up to 30 no-op actions
            noop = random.randint(0, 30)
            for i in range(noop):
                action = env.action_space.sample()
                state,_,_,_,_ = env.step(action)

            rewards = 0

            for i in range(0, self.max_steps):      # max episode length 10000
                action = self.epsilon_greedy_policy(policy, state, self.eval_epsilon, env)
#                 action = self.get_action(policy, state, self.eval_epsilon)
                state, reward, terminated, _, _ = env.step(action)
                rewards += reward

                if terminated:
                    break

            rewards_list.append(rewards)

        return np.mean(rewards_list)
    
    def train(self):
        """
        trains DQN using a fixed target network if self.fixed_target == True, otherwise with the policy network.
        :return: list of total rewards, list of steps in each episode, q values over sampled states
        """

        # keep track of total steps and rewards
        total_steps = 0
        total_rewards = []
        total_steps_list = []
        evaluations = []
        td_errors = []

        # initialize states in which Q value is measured every X episodes to track progress
        # measuring_states = self.initialize_measuring_states(env)
        # q_measures = []

        # Initialize Experience Memory
        if self.prioritized:
            beta_start = 0.5
            beta_frames = 1000
            beta_by_frame = lambda total_steps: min(1.0, beta_start + total_steps * (1.0 - beta_start) / beta_frames)
            experience_memory = PrioritizedExperienceReplayBuffer(alpha=0.7, batch_size=self.batch_size,
                                                                  buffer_size=self.replay_size)
        else:
            experience_memory = ExperienceMemory(self.replay_size)

        # initialize policy (and target) network
        if self.dueling:
            policy = MLP_Dueling_state(env.action_space.n).to(device)
            if self.fixed_target:
                target = MLP_Dueling_state(env.action_space.n).to(device)
                target.load_state_dict(policy.state_dict())
                target.eval()
        else:
            policy = MLP_state(env.action_space.n).to(device)
            if self.fixed_target:
                target = MLP_state(env.action_space.n).to(device)
                target.load_state_dict(policy.state_dict())
                target.eval()

        # Best values found during evaluation
        best_reward = - float('inf')
        best_policy = policy.state_dict()

        optimizer = torch.optim.RMSprop(policy.parameters(), lr=self.learning_rate, weight_decay=0.99, momentum=0.95) 

        for episode in range(self.n_training_episodes):
            steps = 0
            total_steps += 1
            rewards = 0

            state = env.reset()[0]

            # up to 30 no-op actions
            noop = random.randint(0, 30)
            for i in range(noop):
                action = env.action_space.sample()
                state, _, _, _, _ = env.step(action)

            while True:
                # linear epsilon decay based on steps
                epsilon = max(self.epsilon_max - ((self.epsilon_max - self.epsilon_min)/self.epsilon_frame) *
                              total_steps, self.epsilon_min)

                # Choose the action At using epsilon greedy policy
                action = self.epsilon_greedy_policy(policy, state, epsilon, env)
#                 action = self.get_action(policy, X, epsilon)
                # take action
                new_state, reward, terminated, _, _ = env.step(action)

                experience_memory.add((state, action, reward, new_state, terminated))

                steps += 1
                total_steps += 1

                if len(experience_memory) >= self.min_replay:  # self.batch_size:
                    if self.prioritized:
                        beta = beta_by_frame(total_steps)
                        idxs, experiences, weights = experience_memory.sample(beta)
#                         states, actions, _rewards, next_states, terminations = (i for i in
#                                                                                 zip(*experiences))  # (torch.Tensor(vs).to(device) for vs in
                        # zip(*experiences))
                        weights = torch.tensor(weights).to(device)
                    else:
                        experiences = experience_memory.sample(self.batch_size)
#                     print(list(len(i) for i in zip(*experiences)))
                    states, actions, _rewards, next_states, terminations = (i for i in zip(*experiences))
                    a = torch.tensor(actions).long().unsqueeze(dim=1).to(device)
                    r = torch.tensor(_rewards).unsqueeze(dim=1).to(device)
                    states = np.vstack(states).astype(np.float32)
                    states = torch.tensor(states).to(device)
                    next_states = np.vstack(states).astype(np.float32)
                    next_states = torch.tensor(next_states).to(device)
#                     states = np.vstack(states).astype(np.float32)
#                     states = torch.from_numpy(states)
#                     next_states = np.vstack(next_states).astype(np.float32)
#                     next_states = torch.from_numpy(next_states)
#                     states = torch.reshape(states, (self.batch_size, 1, 84, 84)).to(device)  # 80,120
#                     next_states = torch.reshape(next_states, (self.batch_size, 1, 84, 84)).to(device)
                    mask = [i for i, x in enumerate(terminations) if not x]  # get all non-final states
                    
#                     reward = torch.tensor([reward]).to(device)
#                     action = torch.tensor([action]).unsqueeze(0).to(device)
#                     state = X.unsqueeze(0)
#                     new_state = X_new.unsqueeze(0)
                    
#                     steps += 1
#                     total_steps += 1
                    
                    state_action_values = policy(states).gather(1, a)
                    
                    next_state_values = torch.zeros(self.batch_size, device=device)
                    
                    # update network
                    if self.double:
                        max_next_action = policy(next_states).max(1)[1].view(-1, 1)
                        next_state_values[mask] = target(next_states[mask]).gather(1, max_next_action[mask]).squeeze(1)
                    elif self.fixed_target:
                        next_state_values[mask] = target(next_states[mask]).max(1)[0].detach()
                    else:
                        next_state_values[mask] = policy(next_states[mask]).max(1)[0].detach()
                    # Compute the expected Q values
                    expected_state_action_values = (next_state_values * self.gamma) + r.squeeze(1)
                    

                    if self.prioritized:
                            diff = expected_state_action_values.unsqueeze(1) - state_action_values
                            experience_memory.update_priority(idxs, diff.cpu().detach().squeeze().abs().numpy().tolist())

                            loss = torch.nn.MSELoss()(state_action_values,
                                                      expected_state_action_values.unsqueeze(1)).squeeze() * weights
                    else:
                    # loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values.unsqueeze(1)).squeeze()
                        loss = torch.nn.MSELoss()(state_action_values, expected_state_action_values.unsqueeze(1))

                    loss = loss.mean()
                    td_errors.append(loss.detach().cpu())  # save td errors

                    # Optimize the model
                    optimizer.zero_grad()
                    loss.backward()

                    torch.nn.utils.clip_grad_norm_(policy.parameters(), 10.)    # clip gradients
                    optimizer.step()

                # Update total reward
                rewards += reward

                # update current state to be next state
                state = new_state

                # If done, finish the episode
                if terminated or steps >= self.max_steps-1:  # or truncated:
                    # Track rewards
                    total_rewards.append(rewards)
                    total_steps_list.append(steps)

                    # # measure Q values in selected states
                    # Q_states = torch.stack(measuring_states).to(device)
                    # Q_states = torch.unique(Q_states, dim=0, sorted=False)  # eliminate duplicate states
                    # with torch.no_grad():
                    #     q_measures.append(torch.mean(policy(Q_states).max(1)[0]).item())

                    # Evaluate current policy and save optimal policy weights
                    if episode == self.n_training_episodes-1 or (episode > 0 and episode % self.eval_every == 0):
                        eval_reward = self.eval(policy, env)
                        if eval_reward > best_reward:
                            best_reward = eval_reward
                            best_policy = policy.state_dict()
                        print(f"Evaluation: {int(episode/self.eval_every)}\t average reward: {eval_reward}")
                        evaluations.append(eval_reward)

                    # print training information
                    if self.debug:
                        print(f"episode: {episode + 1:03d}\t steps: {steps + 1:05d}\t total steps:"
                              f"{total_steps + 1:06d}\t epsilon: {epsilon:.2f}")#\t average Q: {q_measures[-1]:.3f}")
                    break

                if self.fixed_target:
                    # copy policy network weights to target net every copy_target steps
                    if total_steps % self.copy_target <= 4:
                        target.load_state_dict(policy.state_dict())

        return total_rewards, total_steps_list, best_policy, evaluations, td_errors, policy.state_dict()




## DQN

In [2]:
# Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.1
eval_epsilon = 0.05
# epsilon_frame = 100000

# replay memory parameters
replay_size = 200000
batch_size = 32

# fixed target network
fixed_target = True
copy_target = 10000

debug = True

epsilon_frame = 1000000

min_replay = 80000

transform = scale_and_resize()

env = gym.make('MountainCar-v0', render_mode='rgb_array')

car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min, min_replay=min_replay,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, debug=debug, env=env, epsilon_frame=epsilon_frame,
                       eval_epsilon=eval_epsilon)

total_rewards, total_steps_list, best_policy, evaluations, td_error, final_policy = car.train()

# save best policy as well as steps and q measures
torch.save(best_policy, 'data/DQN_MLP_best.pth')
torch.save(final_policy, 'data/DQN_MLP_final.pth')
np.savetxt(f'data/td_error_DQN_MLP.txt', td_error)
np.savetxt(f'data/steps_DQN_MLP_txt', total_steps_list)
# np.savetxt(f'data/q_values_DQN.txt', q_measures)
np.savetxt(f'data/eval_DQN_<MLP.txt', evaluations)

# Plot steps per episode
plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='training')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - DQN_MLP')
plt.savefig('plots/steps_DQN_MLP.png')
plt.close()

# Plot q measures per episode
# plt.plot(np.arange(len(q_measures)) + 1, q_measures)
# plt.xlabel('Episode')
# plt.ylabel('Average Q')
# plt.title('Average Q measure over sampled states')
# plt.savefig('plots/q_measures_DQN.png')
# plt.close()

episode: 001	 steps: 10000	 total steps:010001	 epsilon: 0.99
episode: 002	 steps: 10000	 total steps:020001	 epsilon: 0.98
episode: 003	 steps: 03565	 total steps:023566	 epsilon: 0.98
episode: 004	 steps: 10000	 total steps:033566	 epsilon: 0.97
episode: 005	 steps: 10000	 total steps:043566	 epsilon: 0.96
episode: 006	 steps: 10000	 total steps:053566	 epsilon: 0.95
episode: 007	 steps: 10000	 total steps:063566	 epsilon: 0.94
episode: 008	 steps: 10000	 total steps:073566	 epsilon: 0.93
episode: 009	 steps: 10000	 total steps:083566	 epsilon: 0.92
episode: 010	 steps: 10000	 total steps:093566	 epsilon: 0.92
episode: 011	 steps: 10000	 total steps:103566	 epsilon: 0.91
episode: 012	 steps: 10000	 total steps:113566	 epsilon: 0.90
episode: 013	 steps: 10000	 total steps:123566	 epsilon: 0.89
episode: 014	 steps: 10000	 total steps:133566	 epsilon: 0.88
episode: 015	 steps: 10000	 total steps:143566	 epsilon: 0.87
episode: 016	 steps: 10000	 total steps:153566	 epsilon: 0.86
episode:

episode: 132	 steps: 03303	 total steps:966274	 epsilon: 0.13
episode: 133	 steps: 05670	 total steps:971944	 epsilon: 0.13
episode: 134	 steps: 04898	 total steps:976842	 epsilon: 0.12
episode: 135	 steps: 01614	 total steps:978456	 epsilon: 0.12
episode: 136	 steps: 03358	 total steps:981814	 epsilon: 0.12
episode: 137	 steps: 04152	 total steps:985966	 epsilon: 0.11
episode: 138	 steps: 04935	 total steps:990901	 epsilon: 0.11
episode: 139	 steps: 07687	 total steps:998588	 epsilon: 0.10
episode: 140	 steps: 01952	 total steps:1000540	 epsilon: 0.10
episode: 141	 steps: 00927	 total steps:1001467	 epsilon: 0.10
episode: 142	 steps: 01887	 total steps:1003354	 epsilon: 0.10
episode: 143	 steps: 01941	 total steps:1005295	 epsilon: 0.10
episode: 144	 steps: 01120	 total steps:1006415	 epsilon: 0.10
episode: 145	 steps: 02433	 total steps:1008848	 epsilon: 0.10
episode: 146	 steps: 01411	 total steps:1010259	 epsilon: 0.10
episode: 147	 steps: 02130	 total steps:1012389	 epsilon: 0.10


episode: 261	 steps: 01510	 total steps:1434201	 epsilon: 0.10
episode: 262	 steps: 08929	 total steps:1443130	 epsilon: 0.10
episode: 263	 steps: 10000	 total steps:1453130	 epsilon: 0.10
episode: 264	 steps: 03848	 total steps:1456978	 epsilon: 0.10
episode: 265	 steps: 01828	 total steps:1458806	 epsilon: 0.10
episode: 266	 steps: 01673	 total steps:1460479	 epsilon: 0.10
episode: 267	 steps: 01166	 total steps:1461645	 epsilon: 0.10
episode: 268	 steps: 02522	 total steps:1464167	 epsilon: 0.10
episode: 269	 steps: 05327	 total steps:1469494	 epsilon: 0.10
episode: 270	 steps: 03743	 total steps:1473237	 epsilon: 0.10
episode: 271	 steps: 04273	 total steps:1477510	 epsilon: 0.10
episode: 272	 steps: 02544	 total steps:1480054	 epsilon: 0.10
episode: 273	 steps: 02169	 total steps:1482223	 epsilon: 0.10
episode: 274	 steps: 01739	 total steps:1483962	 epsilon: 0.10
episode: 275	 steps: 00705	 total steps:1484667	 epsilon: 0.10
episode: 276	 steps: 00746	 total steps:1485413	 epsilo

episode: 390	 steps: 03503	 total steps:1913019	 epsilon: 0.10
episode: 391	 steps: 01282	 total steps:1914301	 epsilon: 0.10
episode: 392	 steps: 03317	 total steps:1917618	 epsilon: 0.10
episode: 393	 steps: 03039	 total steps:1920657	 epsilon: 0.10
episode: 394	 steps: 02504	 total steps:1923161	 epsilon: 0.10
episode: 395	 steps: 10000	 total steps:1933161	 epsilon: 0.10
episode: 396	 steps: 07456	 total steps:1940617	 epsilon: 0.10
episode: 397	 steps: 01463	 total steps:1942080	 epsilon: 0.10
episode: 398	 steps: 10000	 total steps:1952080	 epsilon: 0.10
episode: 399	 steps: 03089	 total steps:1955169	 epsilon: 0.10
episode: 400	 steps: 00331	 total steps:1955500	 epsilon: 0.10
Evaluation: 8	 average reward: -8658.9
episode: 401	 steps: 03526	 total steps:1959026	 epsilon: 0.10
episode: 402	 steps: 04098	 total steps:1963124	 epsilon: 0.10
episode: 403	 steps: 09095	 total steps:1972219	 epsilon: 0.10
episode: 404	 steps: 04900	 total steps:1977119	 epsilon: 0.10
episode: 405	 st

episode: 519	 steps: 00562	 total steps:2424271	 epsilon: 0.10
episode: 520	 steps: 05605	 total steps:2429876	 epsilon: 0.10
episode: 521	 steps: 01423	 total steps:2431299	 epsilon: 0.10
episode: 522	 steps: 02414	 total steps:2433713	 epsilon: 0.10
episode: 523	 steps: 02121	 total steps:2435834	 epsilon: 0.10
episode: 524	 steps: 02273	 total steps:2438107	 epsilon: 0.10
episode: 525	 steps: 01230	 total steps:2439337	 epsilon: 0.10
episode: 526	 steps: 04476	 total steps:2443813	 epsilon: 0.10
episode: 527	 steps: 01052	 total steps:2444865	 epsilon: 0.10
episode: 528	 steps: 02570	 total steps:2447435	 epsilon: 0.10
episode: 529	 steps: 01743	 total steps:2449178	 epsilon: 0.10
episode: 530	 steps: 01843	 total steps:2451021	 epsilon: 0.10
episode: 531	 steps: 00630	 total steps:2451651	 epsilon: 0.10
episode: 532	 steps: 01053	 total steps:2452704	 epsilon: 0.10
episode: 533	 steps: 02034	 total steps:2454738	 epsilon: 0.10
episode: 534	 steps: 01493	 total steps:2456231	 epsilo

episode: 648	 steps: 01618	 total steps:2906534	 epsilon: 0.10
episode: 649	 steps: 01753	 total steps:2908287	 epsilon: 0.10
episode: 650	 steps: 02714	 total steps:2911001	 epsilon: 0.10
Evaluation: 13	 average reward: -10000.0
episode: 651	 steps: 01315	 total steps:2912316	 epsilon: 0.10
episode: 652	 steps: 02319	 total steps:2914635	 epsilon: 0.10
episode: 653	 steps: 07026	 total steps:2921661	 epsilon: 0.10
episode: 654	 steps: 01020	 total steps:2922681	 epsilon: 0.10
episode: 655	 steps: 01240	 total steps:2923921	 epsilon: 0.10
episode: 656	 steps: 04483	 total steps:2928404	 epsilon: 0.10
episode: 657	 steps: 01736	 total steps:2930140	 epsilon: 0.10
episode: 658	 steps: 02838	 total steps:2932978	 epsilon: 0.10
episode: 659	 steps: 00938	 total steps:2933916	 epsilon: 0.10
episode: 660	 steps: 04243	 total steps:2938159	 epsilon: 0.10
episode: 661	 steps: 05974	 total steps:2944133	 epsilon: 0.10
episode: 662	 steps: 01243	 total steps:2945376	 epsilon: 0.10
episode: 663	 

episode: 777	 steps: 01584	 total steps:3397356	 epsilon: 0.10
episode: 778	 steps: 07283	 total steps:3404639	 epsilon: 0.10
episode: 779	 steps: 01308	 total steps:3405947	 epsilon: 0.10
episode: 780	 steps: 01134	 total steps:3407081	 epsilon: 0.10
episode: 781	 steps: 03068	 total steps:3410149	 epsilon: 0.10
episode: 782	 steps: 02044	 total steps:3412193	 epsilon: 0.10
episode: 783	 steps: 00922	 total steps:3413115	 epsilon: 0.10
episode: 784	 steps: 03131	 total steps:3416246	 epsilon: 0.10
episode: 785	 steps: 10000	 total steps:3426246	 epsilon: 0.10
episode: 786	 steps: 01539	 total steps:3427785	 epsilon: 0.10
episode: 787	 steps: 02546	 total steps:3430331	 epsilon: 0.10
episode: 788	 steps: 05730	 total steps:3436061	 epsilon: 0.10
episode: 789	 steps: 09037	 total steps:3445098	 epsilon: 0.10
episode: 790	 steps: 03773	 total steps:3448871	 epsilon: 0.10
episode: 791	 steps: 00484	 total steps:3449355	 epsilon: 0.10
episode: 792	 steps: 08108	 total steps:3457463	 epsilo

episode: 906	 steps: 03240	 total steps:3993686	 epsilon: 0.10
episode: 907	 steps: 10000	 total steps:4003686	 epsilon: 0.10
episode: 908	 steps: 01666	 total steps:4005352	 epsilon: 0.10
episode: 909	 steps: 05222	 total steps:4010574	 epsilon: 0.10
episode: 910	 steps: 10000	 total steps:4020574	 epsilon: 0.10
episode: 911	 steps: 08171	 total steps:4028745	 epsilon: 0.10
episode: 912	 steps: 01249	 total steps:4029994	 epsilon: 0.10
episode: 913	 steps: 10000	 total steps:4039994	 epsilon: 0.10
episode: 914	 steps: 03121	 total steps:4043115	 epsilon: 0.10
episode: 915	 steps: 05187	 total steps:4048302	 epsilon: 0.10
episode: 916	 steps: 06531	 total steps:4054833	 epsilon: 0.10
episode: 917	 steps: 10000	 total steps:4064833	 epsilon: 0.10
episode: 918	 steps: 03337	 total steps:4068170	 epsilon: 0.10
episode: 919	 steps: 09449	 total steps:4077619	 epsilon: 0.10
episode: 920	 steps: 04573	 total steps:4082192	 epsilon: 0.10
episode: 921	 steps: 01007	 total steps:4083199	 epsilo

RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

## DDQN

In [None]:
#  Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.1
eval_epsilon = 0.05
# epsilon_frame = 100000

# replay memory parameters
replay_size = 200000
batch_size = 32

# fixed target network
fixed_target = True
copy_target = 10000

debug = True

double = True

epsilon_frame = 500000
min_replay = 80000


transform = scale_and_resize()

env = gym.make('MountainCar-v0', render_mode='rgb_array')

car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min,double=double, min_replay=min_replay,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, debug=debug, env=env, epsilon_frame=epsilon_frame,
                       eval_epsilon=eval_epsilon)

total_rewards, total_steps_list, best_policy, evaluations, td_error, final_policy = car.train()

# save best policy as well as steps and q measures
torch.save(best_policy, 'data/DDQN_MLP_best.pth')
torch.save(final_policy, 'data/DDQN_MLP_final.pth')
np.savetxt(f'data/td_error_DDQN_MLP.txt', td_error)
np.savetxt(f'data/steps_DDQN_MLP_txt', total_steps_list)
# np.savetxt(f'data/q_values_DDQN_MLP.txt', q_measures)
np.savetxt(f'data/eval_DDQN_MLP.txt', evaluations)

# Plot steps per episode
plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='training')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - DDQN_MLP')
plt.savefig('plots/steps_DDQN_MLP.png')
plt.close()

# Plot q measures per episode
# plt.plot(np.arange(len(q_measures)) + 1, q_measures)
# plt.xlabel('Episode')
# plt.ylabel('Average Q')
# plt.title('Average Q measure over sampled states')
# plt.savefig('plots/q_measures_DRQN.png')
# plt.close()

episode: 001	 steps: 02307	 total steps:002308	 epsilon: 1.00
episode: 002	 steps: 10000	 total steps:012308	 epsilon: 0.98
episode: 003	 steps: 04551	 total steps:016859	 epsilon: 0.97
episode: 004	 steps: 10000	 total steps:026859	 epsilon: 0.95
episode: 005	 steps: 10000	 total steps:036859	 epsilon: 0.93
episode: 006	 steps: 10000	 total steps:046859	 epsilon: 0.92
episode: 007	 steps: 10000	 total steps:056859	 epsilon: 0.90
episode: 008	 steps: 10000	 total steps:066859	 epsilon: 0.88
episode: 009	 steps: 10000	 total steps:076859	 epsilon: 0.86
episode: 010	 steps: 10000	 total steps:086859	 epsilon: 0.84
episode: 011	 steps: 04503	 total steps:091362	 epsilon: 0.84
episode: 012	 steps: 06360	 total steps:097722	 epsilon: 0.82
episode: 013	 steps: 07937	 total steps:105659	 epsilon: 0.81
episode: 014	 steps: 01681	 total steps:107340	 epsilon: 0.81
episode: 015	 steps: 10000	 total steps:117340	 epsilon: 0.79
episode: 016	 steps: 04707	 total steps:122047	 epsilon: 0.78
episode:

episode: 132	 steps: 03067	 total steps:597770	 epsilon: 0.10
episode: 133	 steps: 00475	 total steps:598245	 epsilon: 0.10
episode: 134	 steps: 03576	 total steps:601821	 epsilon: 0.10
episode: 135	 steps: 03558	 total steps:605379	 epsilon: 0.10
episode: 136	 steps: 00904	 total steps:606283	 epsilon: 0.10
episode: 137	 steps: 06388	 total steps:612671	 epsilon: 0.10
episode: 138	 steps: 03143	 total steps:615814	 epsilon: 0.10
episode: 139	 steps: 01442	 total steps:617256	 epsilon: 0.10
episode: 140	 steps: 06444	 total steps:623700	 epsilon: 0.10
episode: 141	 steps: 02344	 total steps:626044	 epsilon: 0.10
episode: 142	 steps: 02239	 total steps:628283	 epsilon: 0.10
episode: 143	 steps: 01177	 total steps:629460	 epsilon: 0.10
episode: 144	 steps: 00746	 total steps:630206	 epsilon: 0.10
episode: 145	 steps: 05541	 total steps:635747	 epsilon: 0.10
episode: 146	 steps: 02937	 total steps:638684	 epsilon: 0.10
episode: 147	 steps: 04574	 total steps:643258	 epsilon: 0.10
episode:

## Prioritized_DDQN

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

device = torch.device("cpu")

# Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025/4  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.01
epsilon_frame = 500000

# replay memory parameters
replay_size = 200000
batch_size = 32
min_replay = 80000


# fixed target network
fixed_target = True
copy_target = 30000


debug = True
double = True
prioritized = True

env = gym.make('MountainCar-v0', render_mode='rgb_array')


car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min, epsilon_frame=epsilon_frame,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, replay_size=replay_size, double=double, prioritized=prioritized,
                       debug=debug, min_replay=min_replay)

total_rewards, total_steps_list, best_policy, evaluations, td_error, final_policy = car.train()

torch.save(best_policy, 'data/Prioritized_DDQN_MLP_best.pth')
torch.save(final_policy, 'data/Prioritized_DDQN_MLP_final.pth')
np.savetxt(f'data/td_error_Prioritized_DDQN_MLP.txt', td_error)
np.savetxt(f'data/steps_Prioritized_DDQN_MLP.txt', total_steps_list)
# np.savetxt(f'data/q_values_Prioritized_DDQN_MLP.txt', q_measures)
np.savetxt(f'data/eval_Prioritized_DDQN_MLP.txt', evaluations)

plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='evaluations')
x = np.arange(50, n_training_episodes+1, 50)
plt.scatter(x, [-e for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - Prioritized_DDQN_MLP')
plt.savefig('plots/steps_Prioritized_DDQN_MLP.png')
plt.close()


## Dueling DDQN

In [None]:
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

device = torch.device("cuda")

# Hyperparameters
n_training_episodes = 1000
gamma = 0.99
learning_rate = 0.00025  # 0.1
max_training_steps = 10000

# Exploration parameters
epsilon_max = 1
epsilon_min = 0.1
eval_epsilon = 0.05

# replay memory parameters
replay_size = 200000
batch_size = 32
min_memory = 80000


# fixed target network
fixed_target = True
copy_target = 30000


debug = True
double = True       # DDQN
dueling = True      # Dueling Network

car = TrainMountainCar(n_training_episodes=n_training_episodes, gamma=gamma, learning_rate=learning_rate,
                       epsilon_max=epsilon_max, epsilon_min=epsilon_min, min_memory=min_memory,
                       max_steps=max_training_steps, batch_size=batch_size, fixed_target=fixed_target,
                       copy_target=copy_target, replay_size=replay_size, double=double, dueling=dueling, debug=debug,
                       eval_epsilon=eval_epsilon)

total_rewards, total_steps_list, best_policy, evaluations, td_error, final_policy = car.train()

torch.save(best_policy, 'data/Dueling_DDQN_MLP_best.pth')
torch.save(final_policy, 'data/Dueling_DDQN_MLP_final.pth')
np.savetxt(f'data/steps_Dueling_DDQN_MLP.txt', total_steps_list)
# np.savetxt(f'data/q_values_Dueling_DDQN_MLP.txt', q_measures)
np.savetxt(f'data/eval_Dueling_DDQN_MLP.txt', evaluations)
np.savetxt(f'data/td_error_Dueling_DDQN_MLP.txt', td_error)

plt.plot(np.arange(len(total_steps_list)) + 1, total_steps_list, zorder=0, label='training')
x = np.arange(50, n_training_episodes, 50)
plt.scatter(x, [-e*4 for e in evaluations], color='r', marker='x', zorder=1, label='evaluations')
N = 10
steps_mean = running_mean(total_steps_list, N)
plt.plot(np.arange(len(steps_mean)) + 1, steps_mean, zorder=0, label='running average')
plt.legend()
plt.xlabel('Episode')
plt.ylabel('Steps')
plt.title('Steps per Episode - Dueling_DDQN_MLP')
plt.savefig('plots/steps_Dueling_DDQN_MLP.png')
plt.close()