<a href="https://colab.research.google.com/github/maxmatical/Reinforcement-Learning/blob/master/Reinforcement_Learning_Experiments_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y
!pip install pyvirtualdisplay
!pip install piglet

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1400, 900))
display.start()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.2).
The following package was automatically installed and is no longer required:
  libnvidia-common-410
Use 'apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.


<Display cmd_param=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] cmd=['Xvfb', '-br', '-nolisten', 'tcp', '-screen', '0', '1400x900x24', ':1001'] oserror=None return_code=None stdout="None" stderr="None" timeout_happened=False>


**OpenAI Gym**

We're gonna spend several next weeks learning algorithms that solve decision processes. We are then in need of some interesting decision problems to test our algorithms.

That's where OpenAI gym comes into play. It's a python library that wraps many classical decision problems including robot control, videogames and board games.

So here's how it works:


In [0]:
import gym
env = gym.make("MountainCar-v0")
env = gym.make('CartPole-v1')

plt.imshow(env.render('rgb_array'))
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(4,)
Action space: Discrete(2)


reset() - reset environment to initial state, return first observation

render() - show current environment state (a more colorful version :) )

step(a) - commit action a and return (new observation, reward, is done, info)


In [0]:
obs0 = env.reset()
print("initial observation code:", obs0)

# Note: in MountainCar, observation is just two numbers: car position and velocity

initial observation code: [-0.41729141  0.        ]


In [0]:
print("taking action 2 (right)")
new_obs, reward, is_done, _ = env.step(2)

print("new observation code:", new_obs)
print("reward:", reward)
print("is game over?:", is_done)

# Note: as you can see, the car has moved to the riht slightly (around 0.0005)

taking action 2 (right)
new observation code: [-4.17075265e-01  2.16141991e-04]
reward: -1.0
is game over?: False


# Time to play

Try tweaking t or action strategy

**Remember**: Target is to reach to flag

In [0]:

# create env manually to set time limit. Please don't change this.
TIME_LIMIT = 250
env = gym.wrappers.TimeLimit(gym.envs.classic_control.MountainCarEnv(),
                             max_episode_steps=TIME_LIMIT + 1)
s = env.reset()
actions = {'left': 0, 'stop': 1, 'right': 2}

# prepare "display"
%matplotlib notebook
fig = plt.figure()
ax = fig.add_subplot(111)
fig.show()

def policy(t):
    if t>20 and t<60:
        return actions['left']
    else:
        return actions['right']


for t in range(TIME_LIMIT):
    
    # change the line below to reach the flag
    s, r, done, _ = env.step(policy(t))
    
    #draw game image on display
    ax.clear()
    ax.imshow(env.render('rgb_array'))
    fig.canvas.draw()
    
    if done:
        print("Well done!")
        break
else:    
    print("Time limit exceeded. Try again.")

<IPython.core.display.Javascript object>

Well done!


<IPython.core.display.Javascript object>

# Cartpole with Deep Q-Learning

In [0]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

Using TensorFlow backend.


In [0]:
import gym
# env = gym.make("MountainCar-v0")
env = gym.make('CartPole-v1')

# plt.imshow(env.render('rgb_array'))
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(4,)
Action space: Discrete(2)


In [0]:
ENV_NAME = "CartPole-v1"

### Building DQN

In [0]:

GAMMA = 0.95
LEARNING_RATE = 0.001

MEMORY_SIZE = 1000000
BATCH_SIZE = 20

EXPLORATION_MAX = 1.0
EXPLORATION_MIN = 0.01
EXPLORATION_DECAY = 0.995

In [0]:

class DQNSolver:

    def __init__(self, observation_space, action_space):
        self.exploration_rate = EXPLORATION_MAX

        self.action_space = action_space # 2 actions, left or right
        self.memory = deque(maxlen=MEMORY_SIZE)

        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(observation_space,), activation="relu")) # 4 obs states
        self.model.add(Dense(24, activation="relu"))
        self.model.add(Dense(self.action_space, activation="linear"))
        
        
        self.model.compile(loss="mse", optimizer=Adam(lr=LEARNING_RATE))

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() < self.exploration_rate:
            return random.randrange(self.action_space)
        q_values = self.model.predict(state)
        return np.argmax(q_values[0])

    def experience_replay(self):
        if len(self.memory) < BATCH_SIZE:
            return
        batch = random.sample(self.memory, BATCH_SIZE)
        for state, action, reward, state_next, terminal in batch:
            q_update = reward
            if not terminal:
                q_update = (reward + GAMMA * np.amax(self.model.predict(state_next)[0]))
            q_values = self.model.predict(state)
            q_values[0][action] = q_update
            self.model.fit(state, q_values, verbose=0)
        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

In [0]:
n_runs = 500

def cartpole():
    env = gym.make(ENV_NAME)
    observation_space = env.observation_space.shape[0]
    action_space = env.action_space.n
    dqn_solver = DQNSolver(observation_space, action_space)
    run = 0
    while run <=n_runs:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, observation_space])
        step = 0
        while True:
            step += 1
            #env.render()
            action = dqn_solver.act(state)
            state_next, reward, terminal, info = env.step(action)
            reward = reward if not terminal else -reward
            state_next = np.reshape(state_next, [1, observation_space])
            dqn_solver.remember(state, action, reward, state_next, terminal)
            state = state_next
            if terminal:
                print( "Run: " + str(run) + ", exploration: " + str(dqn_solver.exploration_rate) + ", score: " + str(step))
                break
            dqn_solver.experience_replay()

In [0]:
if __name__ == "__main__":
    cartpole()

# Cartpole with policy gradients




https://medium.com/@ts1829/policy-gradient-reinforcement-learning-in-pytorch-df1383ea0baf

In [0]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline

In [3]:
import gym
# env = gym.make("MountainCar-v0")
env = gym.make('CartPole-v1')

# plt.imshow(env.render('rgb_array'))
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Observation space: Box(4,)
Action space: Discrete(2)


Setting hyperparameters

In [0]:
lr = 0.01
gamma = 0.99 #discount factor

In [5]:
print(env.observation_space.shape[0])
print(env.action_space.n)

4
2


## Policy Network

In [0]:
class Policy_Network(nn.Module):
    def __init__(self):
        super(Policy_Network, self).__init__()
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.linear1 = nn.Linear(self.state_space, 128, bias=False)
        self.linear2 = nn.Linear(128, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.policy_history = Variable(torch.Tensor()) 
        self.reward_episode = []
        
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.linear1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.linear2,
            nn.Softmax(dim=-1)
        )
        return model(x)
    


In [0]:
policy = Policy_Network()
optimizer = optim.Adam(net.parameters(), lr=lr)

eps = np.finfo(np.float32).eps.item()



## Select Action


In [0]:
print(policy.policy_history.dim())
print(policy.policy_history)


In [0]:
def select_action(state):
    #Select an action (0 or 1) by running policy model and choosing based on the probabilities in state
    state = torch.from_numpy(state).type(torch.FloatTensor)
    state = policy(Variable(state))
    c = Categorical(state) # turns the 
    action = c.sample()
    
    # Add log probability of our chosen action to our history    
    if policy.policy_history.dim() != 0:
#         policy.policy_history = torch.cat([policy.policy_history, c.log_prob(action)])
#         policy.policy_history = torch.stack([policy.policy_history, c.log_prob(action)])
        policy.policy_history = torch.cat([policy.policy_history, c.log_prob(action).view(1)])

    else:
        policy.policy_history = (c.log_prob(action))
    return action




## Update Policy

In [0]:
def update_policy():
    R = 0
    rewards = []
    
    # Discount future rewards back to the present using gamma
    for r in policy.reward_episode[::-1]:
        R = r + net.gamma * R
        rewards.insert(0,R)
        
    # Scale rewards
    rewards = torch.FloatTensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    
    # Calculate loss
    loss = (torch.sum(torch.mul(policy.policy_history, Variable(rewards)).mul(-1), -1))
    
    # Update network weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    #Save and intialize episode history counters
    policy.loss_history.append(loss.data[0])
    policy.reward_history.append(np.sum(policy.reward_episode))
    policy.policy_history = Variable(torch.Tensor())
    policy.reward_episode= []

## Training Model

In [0]:

def main(episodes):
    running_reward = 10
    for episode in range(episodes):
        state = env.reset() # Reset environment and record the starting state
        done = False       
    
        for time in range(1000):
            action = select_action(state)
            # Step through environment using chosen action
            state, reward, done, _ = env.step(action.data[0])

            # Save reward
            policy.reward_episode.append(reward)
            if done:
                break
        
        # Used to determine when the environment is solved.
        running_reward = (running_reward * 0.99) + (time * 0.01)

        update_policy()

        if episode % 50 == 0:
            print('Episode {}\tLast length: {:5d}\tAverage length: {:.2f}'.format(episode, time, running_reward))

        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and the last episode runs to {} time steps!".format(running_reward, time))
            break

In [0]:
episodes = 1000
main(episodes)

## View Results

In [0]:

window = int(episodes/20)

fig, ((ax1), (ax2)) = plt.subplots(2, 1, sharey=True, figsize=[9,9]);
rolling_mean = pd.Series(policy.reward_history).rolling(window).mean()
std = pd.Series(policy.reward_history).rolling(window).std()
ax1.plot(rolling_mean)
ax1.fill_between(range(len(policy.reward_history)),rolling_mean-std, rolling_mean+std, color='orange', alpha=0.2)
ax1.set_title('Episode Length Moving Average ({}-episode window)'.format(window))
ax1.set_xlabel('Episode'); ax1.set_ylabel('Episode Length')

ax2.plot(policy.reward_history)
ax2.set_title('Episode Length')
ax2.set_xlabel('Episode'); ax2.set_ylabel('Episode Length')

fig.tight_layout(pad=2)
plt.show()
#fig.savefig('results.png')

# Cartpole with policy gradients (Pytorch example)




https://github.com/pytorch/examples/blob/master/reinforcement_learning/reinforce.py

In [0]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline
from itertools import count


In [0]:
env = gym.make('CartPole-v1')
env.seed(1)
torch.manual_seed(1)

Setting hyperparameters

In [0]:
lr = 0.01
gamma = 0.99 #discount factor

In [0]:
print(env.observation_space.shape[0])
print(env.action_space.n)

4
2


## Policy Network

In [0]:
class Policy_Network(nn.Module):
    def __init__(self):
        super(Policy_Network, self).__init__()
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        
        self.linear1 = nn.Linear(self.state_space, 128, bias=False)
        self.linear2 = nn.Linear(128, self.action_space, bias=False)
        
        self.gamma = gamma
        
        # Episode policy and reward history 
        self.saved_log_probs = []
        self.rewards = []

        
        # Overall reward and loss history
        self.reward_history = []
        self.loss_history = []

    def forward(self, x):    
        model = torch.nn.Sequential(
            self.linear1,
            nn.Dropout(p=0.6),
            nn.ReLU(),
            self.linear2,
            nn.Softmax(dim=-1)
        )
        return model(x)
    


In [0]:
policy = Policy_Network()
optimizer = optim.Adam(net.parameters(), lr=lr)

eps = np.finfo(np.float32).eps.item()



## Select Action


In [0]:
def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()




## Update Policy

In [0]:
def finish_episode():
    R = 0
    policy_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    
    
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    
    
    del policy.rewards[:]
    del policy.saved_log_probs[:]

## Training Model

In [0]:
def main():
    running_reward = 10
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            state, reward, done, _ = env.step(action)
#             if args.render:
#                 env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % 50 == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break


In [0]:
if __name__ == '__main__':
    main()

# Cartpole with Actor Critic (A2C)

https://github.com/pytorch/examples/blob/master/reinforcement_learning/actor_critic.py


https://github.com/yc930401/Actor-Critic-pytorch

In [0]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm, trange
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from torch.distributions import Categorical
%matplotlib inline
from itertools import count
from collections import namedtuple



In [7]:
env = gym.make('CartPole-v1')
env.seed(1)
torch.manual_seed(1)

<torch._C.Generator at 0x7f9cb593ec70>

Setting hyperparameters

In [0]:
lr = 0.01
gamma = 0.99 #discount factor

In [9]:
print(env.observation_space.shape[0])
print(env.action_space.n)
print(env.action_space)

4
2
Discrete(2)


In [0]:
SavedAction = namedtuple('SavedAction', ['log_prob', 'value'])

## Policy Network

In [0]:
# Initial A2C network

class AC_Network(nn.Module):
    def __init__(self):
        super(AC_Network, self).__init__()
        self.linear1 = nn.Linear(4, 128)
        self.action_head = nn.Linear(128, 2) # actor (policy gradient)
        self.value_head = nn.Linear(128, 1) # critic (DQN)

        self.saved_actions = []
        self.rewards = []

    def forward(self, x):
        x = F.relu(self.linear1(x),)
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values

    
    
# class AC_Network(nn.Module):
#     def __init__(self):
#         super(AC_Network, self).__init__()
#         self.linear1 = nn.Linear(env.observation_space.shape[0], 1256)
#         self.linear2 = nn.Linear(1256, 1256)
#         self.action_head = nn.Linear(1256, env.action_space.n) # actor (policy gradient)
#         self.value_head = nn.Linear(1256, 1) # critic (DQN)

#         self.saved_actions = []
#         self.rewards = []

#     def forward(self, x):
#         x = F.relu(self.linear1(x),)
#         x = F.dropout(x, p = 0.0)
#         x = F.relu(self.linear2(x),)
#         x = F.dropout(x, p = 0.0)
#         action_scores = self.action_head(x)
#         state_values = self.value_head(x)
#         return F.softmax(action_scores, dim=-1), state_values

In [0]:
model = AC_Network()
optimizer = optim.Adam(model.parameters(), lr=lr)

eps = np.finfo(np.float32).eps.item()



## Action

In [0]:
# selecting an action from a state
# state is from the env

def select_action(state): 
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs, state_value = model(state) #probs is prob of action_values (which action to take) from the actor, 2D (since 2 actions)
    # state_value is from the critic, 1D
    m = Categorical(probs) 
    action = m.sample()
    model.saved_actions.append(SavedAction(m.log_prob(action), state_value))
    return action.item()

## Update Policy

In [0]:

def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []
    value_losses = []
    returns = []
    for r in model.rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + eps)
    for (log_prob, value), R in zip(saved_actions, returns):
        advantage = R - value.item()
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([R])))
    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

## Train

In [0]:
def main():
    running_reward = 10
    for i_episode in count(1):
        state, ep_reward = env.reset(), 0
        for t in range(1, 10000):  # Don't infinite loop while learning
            action = select_action(state)
            state, reward, done, _ = env.step(action)
#             if args.render:
#                 env.render()
            model.rewards.append(reward)
            ep_reward += reward
            if done:
                break

        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % 25 == 0:
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
        if running_reward > env.spec.reward_threshold:
            print("Solved! Running reward is now {} and "
                  "the last episode runs to {} time steps!".format(running_reward, t))
            break




In [0]:
if __name__ == '__main__':
    main()

Episode 25	Last reward: 48.00	Average reward: 48.52
Episode 50	Last reward: 113.00	Average reward: 53.53
Episode 75	Last reward: 135.00	Average reward: 55.94
Episode 100	Last reward: 211.00	Average reward: 145.05
Episode 125	Last reward: 265.00	Average reward: 302.47
Episode 150	Last reward: 233.00	Average reward: 298.81
Episode 175	Last reward: 500.00	Average reward: 420.15
Episode 200	Last reward: 199.00	Average reward: 219.41
Episode 225	Last reward: 278.00	Average reward: 203.50
Episode 250	Last reward: 214.00	Average reward: 295.85
Episode 275	Last reward: 381.00	Average reward: 381.33
Episode 300	Last reward: 500.00	Average reward: 460.44
Solved! Running reward is now 475.0647101963014 and the last episode runs to 500 time steps!
