In [1]:
%matplotlib inline
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from itertools import count
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from collections import deque
import copy

### Opprett en simulator

In [2]:
env = gym.make('CartPole-v0')
env._max_episode_steps = 500

In [3]:
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython: from IPython import display

In [4]:
class Agent():
    def __init__(
        self,
        num_features,
        num_actions,
        hidden_layers=[64],
        bias=True,
        learning_rate=0.001,
        experience_capacity=1000
    ):
        torch.manual_seed(1234)
        self.num_features = num_features
        self.num_actions = num_actions
        
        self.q_net = self.create_model(hidden_layers, bias)
        self.target_net = copy.deepcopy(self.q_net)
        self.target_net.eval()
        
        self.loss_func = torch.nn.MSELoss()
        self.optimizer = torch.optim.AdamW(
            self.q_net.parameters(),
            lr=learning_rate,
            weight_decay=0
        )
        
        self.experiences = deque(maxlen=experience_capacity)
        
        self.sync_counter = 0
        
    def create_model(self, hidden_layers, bias=True):
        layer_dims = [self.num_features] + [
            layer for layer in hidden_layers if layer > 0
        ] + [self.num_actions]
        layers = []
        for index in range(len(layer_dims) - 1):
            layers.append(nn.Linear(layer_dims[index], layer_dims[index + 1], bias=bias))
            layers.append(nn.Identity() if index == len(layer_dims) - 2 else nn.Tanh())
        return nn.Sequential(*layers)
        
    def get_action(self, state, epsilon=0):
        with torch.no_grad():
            Qp = self.q_net(state)
        Q, A = torch.max(Qp, axis=0)
        A = A if torch.rand(1, ).item() > epsilon else torch.randint(0, self.num_actions, (1,))
        return A
        
    def add_experience(self, experience):
        self.experiences.append(experience)
        
    def get_experience(self, batch_size):
        if len(self.experiences) < batch_size:
            batch_size = len(self.experiences)
        sample = random.sample(self.experiences, batch_size)
        states = torch.stack([exp[0] for exp in sample]).float()
        actions = torch.tensor([exp[1] for exp in sample]).float()
        rewards = torch.tensor([exp[2] for exp in sample]).float()
        next_states = torch.tensor([exp[3] for exp in sample]).float()
        return states, actions, rewards, next_states

    def get_q_next(self, state):
        with torch.no_grad():
            qp = self.target_net(state)
        q, _ = torch.max(qp, axis=1)
        return q
    
    def fit(self, batch_size, gamma=0.95):
        states, actions, rewards, next_states = self.get_experience(batch_size)
        
        if self.sync_counter == 1:
            self.target_net.load_state_dict(self.q_net.state_dict())
            self.target_net.eval()
            self.sync_counter = 0
        
        q_pred = self.q_net(states)
        pred_return, _ = torch.max(q_pred, axis=1)

        # get target return using target network
        q_next = self.get_q_next(next_states)
        target_return = rewards + gamma * q_next

        loss = self.loss_func(pred_return, target_return)
        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        nn.utils.clip_grad_value_(self.q_net.parameters(), clip_value=0.75)
        #nn.utils.clip_grad_norm_(self.q_net.parameters(), max_norm=2.0, norm_type=2)
        self.optimizer.step()

        self.sync_counter += 1

In [5]:
batch_size = 256
gamma = 0.999
epsilon = 1
epsilon_decay = 1 / 5000

### Test forskjellige learning rates

In [107]:
for learning_rate in [0.01, 0.001, 0.00075, 0.0005, 0.00025, 0.0001]:
    agent = Agent(
        num_features = env.observation_space.shape[0],
        num_actions = env.action_space.n,
        hidden_layers = [64],
        bias = False,
        learning_rate = learning_rate,
        experience_capacity = 10000
    )

    epsilon = 1
    episode_durations = []
    print(f'\n\nLearning rate = {learning_rate}')

    for episode in range(7501):
        state, done = env.reset(), False

        for timestep in count():
            state = torch.tensor(state)

            action = agent.get_action(state, epsilon)
            next_state, reward, done, _ = env.step(action.item())
            agent.add_experience([state, action.item(), reward, next_state])

            state = next_state

            if done:
                #for _ in range(2 + round(timestep / 50)):
                agent.fit(batch_size, gamma)
                episode_durations.append(timestep)
                break


        if epsilon > 0.05 :
            epsilon -= epsilon_decay

        avg = np.mean(episode_durations[-100:])

        if episode % 500 == 0:
            print(f'Episode {episode}: {avg}')

        if avg > 197.5:
            break

    print(f'Finished at episode {episode}: {avg}')



Learning rate = 0.01
Episode 0: 18.0
Episode 500: 27.6
Episode 1000: 31.16
Episode 1500: 39.5
Episode 2000: 47.99
Episode 2500: 72.17
Episode 3000: 81.39
Episode 3500: 112.29
Episode 4000: 163.73
Episode 4500: 188.03
Finished at episode 4856: 197.91


Learning rate = 0.001
Episode 0: 18.0
Episode 500: 24.87
Episode 1000: 33.53
Episode 1500: 45.52
Episode 2000: 58.96
Episode 2500: 81.57
Episode 3000: 117.48
Episode 3500: 140.93
Episode 4000: 185.26
Finished at episode 4135: 197.62


Learning rate = 0.00075
Episode 0: 17.0
Episode 500: 25.55
Episode 1000: 32.7
Episode 1500: 39.42
Episode 2000: 58.27
Episode 2500: 69.93
Episode 3000: 88.55
Episode 3500: 100.82
Episode 4000: 112.07
Episode 4500: 143.94
Finished at episode 4794: 197.61


Learning rate = 0.0005
Episode 0: 20.0
Episode 500: 24.85
Episode 1000: 33.21
Episode 1500: 42.65
Episode 2000: 53.33
Episode 2500: 63.36
Episode 3000: 81.68
Episode 3500: 96.64
Episode 4000: 109.89
Episode 4500: 147.34
Finished at episode 4968: 197.62




### Se på utviklingen til en agent mens den trener

In [6]:
agent = Agent(
    num_features = env.observation_space.shape[0],
    num_actions = env.action_space.n,
    hidden_layers = [64],
    bias = False,
    learning_rate = 0.01,
    experience_capacity = 10000
)

epsilon = 1
episode_durations = []

for episode in range(10001):
    state, done = env.reset(), False

    for timestep in count():
        state = torch.tensor(state)

        action = agent.get_action(state, epsilon)
        next_state, reward, done, _ = env.step(action.item())
        agent.add_experience([state, action.item(), reward, next_state])

        state = next_state

        if done:
            #for _ in range(2 + round(timestep / 50)):
            agent.fit(batch_size, gamma)
            episode_durations.append(timestep)
            break


    if epsilon > 0.05 :
        epsilon -= epsilon_decay

    avg = np.mean(episode_durations[-100:])

    if episode % 500 == 0:
        print(f'Episode {episode}: {avg}')
        state, done = env.reset(), False
        while not done:
            state = torch.tensor(state)

            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action.item())

            state = next_state

            env.render("human")

  next_states = torch.tensor([exp[3] for exp in sample]).float()


Episode 0: 19.0


2022-02-11 08:57:38.656 Python[99961:6490018] ApplePersistenceIgnoreState: Existing state will not be touched. New state will be written to /var/folders/0l/drf1dc212g3dgfhqpz6vpxww0000gn/T/org.python.python.savedState


Episode 500: 24.87
Episode 1000: 34.25
Episode 1500: 44.75
Episode 2000: 60.91
Episode 2500: 77.14
Episode 3000: 119.82
Episode 3500: 175.68
Episode 4000: 322.89
Episode 4500: 416.83
Episode 5000: 494.82
Episode 5500: 486.02
Episode 6000: 492.3
Episode 6500: 493.15
Episode 7000: 490.05
Episode 7500: 492.96
Episode 8000: 494.63
Episode 8500: 497.23
Episode 9000: 496.75
Episode 9500: 492.09
Episode 10000: 495.35


In [None]:
env._max_episode_steps = 5000

while True:
    state, done = env.reset(), False
    while not done:
        state = torch.tensor(state)
        
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action.item())
        
        state = next_state
        
        env.render("human")

2022-02-11 08:51:58.394 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.396 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.398 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.399 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.401 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.403 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.405 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.406 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.408 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.410 Python[95760:6219866] _mthid_copyDeviceInfo(288230376672660929) failed
2022-02-11 08:51:58.412 Python[95760:6219866] _mth