## MountainCar-v0 toy problem

Made by Zhakshylyk Nurlanov, 2020,

with the help of the [PyTorch DQN tutorial](https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html)

In [1]:
import math
import os
import random
from collections import namedtuple
from itertools import count

import gym
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print("device: ", device)

# set up matplotlib: detect whether figures are rendered inline (notebook)
backend_name = matplotlib.get_backend()
is_ipython = 'inline' in backend_name
print("Is ipython: ", is_ipython)
if is_ipython:
    from IPython import display

# Interactive mode: figures update without blocking the kernel.
plt.ion()

device:  cuda
Is ipython:  True


## Environment

In [3]:
# Seed every random number generator the notebook draws from, not only the
# environment dynamics, so that Restart & Run All reproduces the same run.
SEED = 505
random.seed(SEED)        # epsilon-greedy draws and replay-memory sampling
np.random.seed(SEED)     # legacy global NumPy generator
torch.manual_seed(SEED)  # network weight initialisation

env = gym.make('MountainCar-v0')
env.seed(SEED);
# The spaces sample from their own generators, independent of env.seed().
env.action_space.seed(SEED)
env.observation_space.seed(SEED)
# Dimensionality of an observation; used as the Q-network input width.
state_size = env.observation_space.shape[0]

In [4]:
# Baseline: play a single episode (at most 200 steps) with uniformly random
# actions and report the accumulated reward.
score = 0
state = env.reset()
for t in range(200):
    env.render()
    action = env.action_space.sample()
    state, reward, done, _ = env.step(action)
    score = score + reward
    if done:
        break
print('Final score:', score)
env.close()

Final score: -200.0


In [5]:
# Explore state (observation) space
print("State space:", env.observation_space)
print("- low:", env.observation_space.low)
print("- high:", env.observation_space.high)

State space: Box(2,)
- low: [-1.2  -0.07]
- high: [0.6  0.07]


In [6]:
# Generate some samples from the state space 
print("State space samples:")
print(np.array([env.observation_space.sample() for i in range(10)]))

State space samples:
[[-0.69460154 -0.03406936]
 [ 0.56517607  0.04102291]
 [ 0.24912098 -0.01930538]
 [-0.8728045   0.02078829]
 [ 0.19375084  0.0398021 ]
 [ 0.00863032  0.05810598]
 [ 0.535222    0.01968305]
 [ 0.19391622 -0.03819394]
 [-0.5036222   0.04459077]
 [-0.7227649  -0.03934928]]


In [7]:
# Explore the action space
print("Action space:", env.action_space)

# Generate some samples from the action space
print("Action space samples:")
print(np.array([env.action_space.sample() for i in range(10)]))

Action space: Discrete(3)
Action space samples:
[2 2 1 2 0 2 1 2 2 1]


## Replay Memory

In [8]:
# Record type for one stored step: the state, the action taken in it,
# the state that followed, and the reward received.
Transition = namedtuple(
    'Transition',
    ['state', 'action', 'next_state', 'reward'],
)

In [9]:
class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    Once ``capacity`` transitions are stored, each new push overwrites the
    oldest stored transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: maximum number of transitions kept; must be positive.

        Raises:
            ValueError: if ``capacity`` is not positive.
        """
        if capacity <= 0:
            raise ValueError(
                "capacity must be a positive integer, got {!r}".format(capacity))
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next push will write to.
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the ``Transition`` fields, in order
                (state, action, next_state, reward).
        """
        # Build the record first: if the arguments are invalid the buffer is
        # left untouched instead of gaining a None placeholder.
        entry = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(entry)
        else:
            self.memory[self.position] = entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored transitions chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

## DQN

In [10]:
class DQN(nn.Module):
    """Fully connected Q-network with two ReLU hidden layers of equal width.

    Maps a batch of inputs of width ``input_size`` to ``output_size`` values
    per row (no activation on the output layer).
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Attribute names are part of the state_dict keys used by saved
        # checkpoints, so they are kept as-is.
        self.linear_1 = nn.Linear(input_size, hidden_size)
        self.linear_2 = nn.Linear(hidden_size, hidden_size)
        self.linear_3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the output-layer activations for input batch ``x``."""
        hidden = F.relu(self.linear_1(x))
        hidden = F.relu(self.linear_2(hidden))
        return self.linear_3(hidden)

## Initializing training procedures

In [11]:
# Hyperparameters of the training procedure.
# Number of transitions sampled from replay memory per optimisation step;
# no optimisation happens until the memory holds at least this many.
BATCH_SIZE = 128
# Discount factor applied to next-state values when building Q targets.
GAMMA = 0.9
# Epsilon-greedy schedule used by select_action: epsilon starts at EPS_START
# and decays exponentially towards EPS_END with time constant EPS_DECAY,
# measured in calls to select_action.
EPS_START = 1.0
EPS_END = 0.01
EPS_DECAY = 200
# The target network is synchronised with the policy network every
# TARGET_UPDATE episodes.
TARGET_UPDATE = 5

# Gate for the training cell: when False the main training loop is skipped.
TRAIN_NOW = False

In [12]:
# Number of discrete actions; used as the Q-network output width and as the
# range for random action draws in select_action.
n_actions = env.action_space.n

In [13]:
# Two networks with identical architecture (hidden width 20):
# policy_net is the one optimised by optimize_model, target_net supplies the
# next-state values for the Q targets.
policy_net = DQN(state_size, 20, n_actions).to(device)
target_net = DQN(state_size, 20, n_actions).to(device)
# Start the target network as an exact copy of the policy network.
target_net.load_state_dict(policy_net.state_dict())
# The target network is never optimised directly; keep it in eval mode.
target_net.eval()

DQN(
  (linear_1): Linear(in_features=2, out_features=20, bias=True)
  (linear_2): Linear(in_features=20, out_features=20, bias=True)
  (linear_3): Linear(in_features=20, out_features=3, bias=True)
)

In [14]:
# Adam optimiser over the policy network's parameters only.
optimizer = optim.Adam(policy_net.parameters(), lr=0.0001, weight_decay=1e-7)
# Replay buffer holding up to 10000 transitions.
memory = ReplayMemory(10000)


# Global count of select_action calls; drives the epsilon decay schedule.
steps_done = 0

In [15]:
def select_action(state):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Epsilon decays exponentially from EPS_START towards EPS_END as the global
    ``steps_done`` counter grows; the counter is incremented on every call.

    Args:
        state: input batch passed to ``policy_net`` (a single row is expected,
            since the result is viewed as 1x1).

    Returns:
        A 1x1 tensor holding the chosen action index.
    """
    global steps_done
    roll = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    # Explore: uniformly random action.
    if roll <= eps_threshold:
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: max(1) gives (values, indices) per row; the index of the
    # largest Q-value is the greedy action.
    with torch.no_grad():
        q_values = policy_net(state)
        return q_values.max(1)[1].view(1, 1)


In [16]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a replay-memory batch.

    Does nothing until ``memory`` holds at least BATCH_SIZE transitions.
    Otherwise samples a batch, builds the one-step Q targets
    ``reward + GAMMA * max_a target_net(next_state)`` (the second term is 0
    for transitions whose ``next_state`` is None), minimises the Huber loss
    between those targets and ``policy_net``'s values for the taken actions,
    and applies one optimizer step with gradients clipped to [-3, 3].
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Mask of transitions that have a next state (a final state is stored as
    # None) and the list of those next states.
    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a): evaluate all actions with policy_net, then pick the column
    # of the action that was actually taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the "older" target_net; stays 0 for final states.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the target-network pass when
    # every sampled transition is final.
    if non_final_next_states:
        # Targets are constants for the loss: no gradient tracking needed.
        with torch.no_grad():
            next_state_values[non_final_mask] = \
                target_net(torch.cat(non_final_next_states)).max(1)[0]
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-3, 3] before the update.
    torch.nn.utils.clip_grad_value_(policy_net.parameters(), 3)
    optimizer.step()

## Main training loop

In [17]:
if TRAIN_NOW:
    num_episodes = 551

    # torch.save does not create missing directories, so make sure the
    # checkpoint folder exists before the first evaluation writes to it.
    os.makedirs('gym_results', exist_ok=True)

    avg_reward = 0
    for i_episode in range(num_episodes):
        # Progress log. At this point avg_reward still holds the summed shaped
        # reward of the previous episode (it is reset just below).
        if i_episode % 20 == 0: print("episode: ", i_episode+1, ", reward: ", avg_reward)
        # Initialize the environment and state
        state = env.reset()
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        total_reward = 0
        avg_reward =  0
        for t in range(200):
            # Select and perform an action
            action = select_action(state)
            next_state, reward, done, info = env.step(action.item())
            total_reward += reward

            # Trick for this task: replace the environment reward with a
            # shaped one. While the episode runs, reward the distance moved
            # along the first state component; on the last step, give a bonus
            # that is larger the fewer steps the episode took.
            if done:
                reward = 290 + total_reward
            else:
                reward = abs(next_state[0] - state[0][0].item())

            avg_reward += reward

            next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
            reward = torch.tensor([reward], device=device).float()

            # Store the transition in memory
            # NOTE(review): next_state is stored even when done is True, so
            # optimize_model always bootstraps from it — confirm intended.
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model()
            if done:
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        # Evaluate the greedy target-network policy every 50 episodes
        if i_episode % 50 == 0 and i_episode > 10:
            total_reward = 0
            for i in range(10):
                state = env.reset()
                for j in range(200):
                    env.render()
                    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
                    # Inference only: no autograd bookkeeping needed.
                    with torch.no_grad():
                        action = target_net(state).max(1)[1].view(1, 1)
                    state,reward,done,_ = env.step(action.item())
                    total_reward += reward
                    if done:
                        break
            # Close the render window once, after all evaluation episodes.
            env.close()
            ave_reward = total_reward/10
            print("___________________________________")
            print('episode: ',i_episode+1,'Evaluation Average Reward:',ave_reward)
            print("___________________________________")

            # save resulting model (weights are moved to the CPU for saving,
            # then the network is moved back to the training device)
            torch.save(target_net.cpu().eval().state_dict(), f'gym_results/model_{i_episode}.pth')
            target_net.to(device)
            # Stop early once the evaluation average is good enough.
            if ave_reward > -110:
                break


    print('Complete')

## Load best model

In [35]:
# Restore the weights saved at the chosen training episode into target_net.
i_episode = 500
checkpoint_path = f'gym_results/model_{i_episode}.pth'
target_net.cpu()
target_net.eval()
target_net.load_state_dict(torch.load(checkpoint_path))
target_net.eval()
# Move the restored network back to the compute device.
target_net.to(device)

DQN(
  (linear_1): Linear(in_features=2, out_features=20, bias=True)
  (linear_2): Linear(in_features=20, out_features=20, bias=True)
  (linear_3): Linear(in_features=20, out_features=3, bias=True)
)

## Watch a smart agent

In [36]:
# Roll out one rendered episode with the greedy policy of target_net and
# report the accumulated environment reward.
state = env.reset()
score = 0
for t in range(400):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    # Inference only: skip autograd bookkeeping, as select_action does.
    with torch.no_grad():
        action = target_net(state).max(1)[1].view(1, 1)
    env.render()
    state, reward, done, _ = env.step(action.item())
    score += reward
    if done:
        break
print('Final score:', score)
env.close()

Final score: -119.0


## Write a video with results

In [37]:
from gym.wrappers.monitoring.video_recorder import VideoRecorder

In [38]:
# Record the greedy target_net policy over several episodes into one video
# file and report the average environment reward per episode.
target_net.cpu()

vr = VideoRecorder(env, 'gym_results/video_result_550.mp4', enabled=True)

score = 0
# Single source of truth for both the loop bound and the average below.
num_episodes = 15
try:
    for i in range(num_episodes):
        state = env.reset()
        for t in range(200):
            env.unwrapped.render()
            vr.capture_frame()
            # Inference only (the network lives on the CPU here).
            with torch.no_grad():
                action = target_net(torch.from_numpy(state).float().unsqueeze(0)).max(1)[1].view(1, 1)
            state, reward, done, _ = env.step(action.item())
            score += reward
            if done:
                break

    print('Final average score:', score / num_episodes)
finally:
    # Always finalise the video file and release the render window, even if
    # a rollout step raises.
    vr.close()
    env.close()

Final average score: -120.26666666666667
