### ACROBOT ###


In [1]:
# Libraries will not be installed if running on ifi-europa.uibk.ac.at

# Make sure that the required libraries are installed on your local system
# If you are using Google Colab, remember to upload the requirements file before 
# running this cell
# If you are running this notebook locally, the requirements file needs to be in 
# the same location as this notebook
import os
running_local = True if os.getenv('JUPYTERHUB_USER') is None else False
    
if running_local:
    import sys
    !{sys.executable} -m pip install -r requirements_acrobot.txt;

Collecting matplotlib==3.3.1
  Downloading matplotlib-3.3.1-cp38-cp38-macosx_10_9_x86_64.whl (8.5 MB)
[K     |████████████████████████████████| 8.5 MB 2.4 MB/s eta 0:00:01
[?25hCollecting torch
  Downloading torch-1.7.1-cp38-none-macosx_10_9_x86_64.whl (108.9 MB)
[K     |████████████████████████████████| 108.9 MB 4.8 MB/s eta 0:00:01    |████████▍                       | 28.4 MB 441 kB/s eta 0:03:03     |███████████████████████████     | 91.8 MB 2.5 MB/s eta 0:00:07
[?25hCollecting gym==0.17.3
  Downloading gym-0.17.3.tar.gz (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.5 MB/s eta 0:00:01
[?25hCollecting tqdm==4.50.2
  Downloading tqdm-4.50.2-py2.py3-none-any.whl (70 kB)
[K     |████████████████████████████████| 70 kB 6.3 MB/s eta 0:00:01
[?25hCollecting numpy==1.19.1
  Downloading numpy-1.19.1-cp38-cp38-macosx_10_9_x86_64.whl (15.3 MB)
[K     |████████████████████████████████| 15.3 MB 5.3 MB/s eta 0:00:01
Collecting cycler>=0.10
  Downloading cycler-0.10.0-py2.p

In [3]:
import os
import numpy as np
import itertools
from itertools import count
import matplotlib.pyplot as plt
from tqdm import trange
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.autograd import Variable
from torchsummary import summary
from importlib import reload

# from networks import NNetwork, NeuralNetworkPolicy
from agents import Q_Agent, Q_DQN_Agent, SARSA_Agent, SARSA_DQN_Agent
from agents_nnp import AAC_Agent, MC_PolGrad_Agent

%load_ext autoreload
%autoreload 2

ModuleNotFoundError: No module named 'torchsummary'

Acrobot is a 2-link pendulum with only the second joint actuated.
Initially, both links point downwards. The goal is to swing the
end-effector to a height of at least the length of one link above the base.
Both links can swing freely and can pass by each other, i.e., they don't
collide when they have the same angle.

**STATE:**
The state consists of the sin() and cos() of the two rotational joint
angles and the joint angular velocities:
[cos(theta1), sin(theta1), cos(theta2), sin(theta2), thetaDot1, thetaDot2].
For the first link, an angle of 0 corresponds to the link pointing downwards.
The angle of the second link is relative to the angle of the first link.
An angle of 0 for the second link means that it is aligned with the first link.
A state of [1, 0, 1, 0, ..., ...] means that both links point downwards.

**ACTIONS:**
The action is either applying +1, 0 or -1 torque on the joint between
the two pendulum links.

In [None]:
# Define the environment
env = gym.make('Acrobot-v1')

# Seed every source of randomness used below so that results can be reproduced:
# NumPy, PyTorch, the environment dynamics and the action-space sampler.
# Change `random_seed` to obtain a different (but still repeatable) run.
random_seed = 1234
np.random.seed(random_seed)
torch.manual_seed(random_seed)
env.seed(random_seed)
env.action_space.seed(random_seed);  # trailing ';' hides the returned value

In [None]:
# Quick look at the environment's action and observation spaces.

# What is the type and size of the action space
print("Action space: {}".format(env.action_space))  # 3 discrete actions

# What does an action look like
sample_action = env.action_space.sample()  # Action can be one of these: 0, 1, 2
print("Sample action: {}".format(sample_action))  # Execute multiple times to see different actions
print("Type of action: {}".format(type(sample_action)))

# What is the type and size of the observation (state) space
# (label fixed: it previously printed "Observationtate space")
print("Observation space: {}".format(env.observation_space))  # continuous states

# Which state does the agent start in?
initial_state = env.reset()
print("Initial state: {}".format(initial_state))

# What is an observation
sample_observation = env.observation_space.sample()
print("Sample observation: {}".format(sample_observation))
print("Type of observation: {}".format(type(sample_observation)))

In [None]:
### TRAINING ###
# Number of training episodes passed to every agent below
max_episodes = 1000
# Filled by the training cells with (hyperparam_dict, ep_rewards, running_rewards) tuples
training_results = []

In [None]:
### Train AAC-Agent (neural network policy - agent) ###
# The recorded hyperparameters are the single source of truth: the agent is
# built from this dict, so what is stored with the results is what was trained.
# The dict previously recorded learning_rate=0.01 while the agent was trained
# with 0.001; the value actually used for training is kept.
hyperparam_dict = {'name': 'AAC-Agent', 'learning_rate': 0.001, 'gamma': 0.99}

aac_agent = AAC_Agent(env, num_episodes=max_episodes, num_steps=500,
                      learning_rate=hyperparam_dict['learning_rate'],
                      gamma=hyperparam_dict['gamma'],
                      hidden_dim=100, dropout=0.6, log_interval=5)
ep_rewards, running_rewards = aac_agent.train()
# Keep the curves together with their hyperparameters for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
### Train Monte Carlo policy gradient Agent (REINFORCE - Agent) ###
# Hyperparameters stored alongside the results; the agent reads them from here.
hyperparam_dict = {'name': 'MC_PolGrad-Agent', 'learning_rate': 0.001, 'gamma': 0.99}
mc_polGrad_agent = MC_PolGrad_Agent(
    env,
    num_episodes=max_episodes,
    num_steps=500,
    learning_rate=hyperparam_dict['learning_rate'],
    gamma=hyperparam_dict['gamma'],
    hidden_dim=100,
    dropout=0.6,
    log_interval=5,
)
ep_rewards, running_rewards = mc_polGrad_agent.train()
# Record this run for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
### Train SARSA-Agent (semi-gradient) ###
# The recorded hyperparameters are the single source of truth: the agent is
# built from this dict, so what is stored with the results is what was trained.
# The dict previously recorded learning_rate=0.0002 and epsilon=0.1 while the
# agent was trained with 0.0001 and 0.2; the values actually used are kept.
hyperparam_dict = {'name': 'SARSA-Agent', 'learning_rate': 0.0001, 'gamma': 0.99, 'epsilon': 0.2}
sarsa_agent = SARSA_Agent(env, num_episodes=max_episodes, num_steps=500,
                          learning_rate=hyperparam_dict['learning_rate'],
                          gamma=hyperparam_dict['gamma'],
                          epsilon=hyperparam_dict['epsilon'],
                          n_hidden_neurons=16, log_interval=5)
ep_rewards, running_rewards = sarsa_agent.train()
# Keep the curves together with their hyperparameters for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
### Train Q-Agent (semi-gradient) ###
# Hyperparameters stored alongside the results; the agent reads them from here.
hyperparam_dict = {'name': 'Q-Agent', 'learning_rate': 0.0001, 'gamma': 0.99, 'epsilon': 0.1}
q_agent = Q_Agent(
    env,
    num_episodes=max_episodes,
    num_steps=500,
    learning_rate=hyperparam_dict['learning_rate'],
    gamma=hyperparam_dict['gamma'],
    epsilon=hyperparam_dict['epsilon'],
    n_hidden_neurons=200,
    log_interval=100,
)
ep_rewards, running_rewards = q_agent.train()
# Record this run for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
### Train Q_DQN-Agent (semi-gradient) ###
# neurons=16 and epsilon=0.2
# The recorded hyperparameters are the single source of truth: the agent is
# built from this dict. The dict previously recorded epsilon=0.1 while the
# agent was trained with 0.2; the value actually used is kept.
hyperparam_dict = {'name': 'Q-DQN-Agent', 'learning_rate': 0.0001, 'gamma': 0.99, 'epsilon': 0.2}
q_dqn_agent = Q_DQN_Agent(env, num_episodes=max_episodes, num_steps=500,
                          learning_rate=hyperparam_dict['learning_rate'],
                          gamma=hyperparam_dict['gamma'],
                          epsilon=hyperparam_dict['epsilon'],
                          n_hidden_neurons=16, log_interval=5)
ep_rewards, running_rewards = q_dqn_agent.train()
# Keep the curves together with their hyperparameters for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
### Train SARSA_DQN-Agent (semi-gradient) ###
# neurons=16 and epsilon=0.2
# The recorded hyperparameters are the single source of truth: the agent is
# built from this dict. The dict previously recorded epsilon=0.1 while the
# agent was trained with 0.2; the value actually used is kept.
hyperparam_dict = {'name': 'SARSA-DQN-Agent', 'learning_rate': 0.0001, 'gamma': 0.99, 'epsilon': 0.2}
sarsa_dqn_agent = SARSA_DQN_Agent(env, num_episodes=max_episodes, num_steps=500,
                                  learning_rate=hyperparam_dict['learning_rate'],
                                  gamma=hyperparam_dict['gamma'],
                                  epsilon=hyperparam_dict['epsilon'],
                                  n_hidden_neurons=16, log_interval=5)
ep_rewards, running_rewards = sarsa_dqn_agent.train()
# Keep the curves together with their hyperparameters for the comparison plot
training_results.append((hyperparam_dict, ep_rewards, running_rewards))

In [None]:
plt.rcParams.update({'font.size': 18})

# Compare the agents: one running-average reward curve per recorded training run
fig = plt.figure(1, figsize=(20,8))

for hp, _ep_rewards, running_rewards in training_results:
    plt.plot(range(len(running_rewards)), running_rewards, lw=2, label=hp['name'])

# Raw string: "\g" in a normal string literal is an invalid escape sequence.
# The title does not depend on the loop, so it is set once.
# NOTE(review): the title hard-codes n_hidden = 16, but the training cells also
# use other sizes (e.g. n_hidden_neurons=200, hidden_dim=100) - confirm the label.
plt.title(r"Acrobot-v1 ($n_{hidden}$: 16, $\gamma$: 0.99)")

plt.grid()
plt.xlabel('Episodes')
plt.ylabel('Running average of Rewards')
if training_results:
    # legend() emits a warning when there are no labelled curves
    plt.legend() # ncol=1
plt.show()

In [None]:
# Run the SARSA agent for 10 evaluation episodes (requires the SARSA training cell).
# NOTE(review): rebinds `ep_rewards`, which the training cells also assign;
# presumably the per-episode rewards are returned - confirm in agents.py.
ep_rewards = sarsa_agent.evaluation(n_episodes=10)

In [None]:
# Run the Q-DQN agent for 10 evaluation episodes (requires the Q-DQN training cell).
# NOTE(review): rebinds `ep_rewards`, which the training cells also assign;
# presumably the per-episode rewards are returned - confirm in agents.py.
ep_rewards = q_dqn_agent.evaluation(n_episodes=10)

In [None]:
#### VIDEO ####
# Record evaluation episodes of the AAC agent into 'videos/AAC'.
# The recording environment gets its own name so that the global `env` used by
# the training cells is not replaced by a Monitor that is closed below.
env_to_wrap = gym.make('Acrobot-v1')
video_env = gym.wrappers.Monitor(env_to_wrap, 'videos/AAC', force = True)
try:
    ep_rewards = aac_agent.polGrad_evaluation(n_episodes=10, vid_env=video_env)
finally:
    # Always release the recorder and the wrapped environment, even on errors
    video_env.close()
    env_to_wrap.close()

In [None]:
# Record evaluation episodes of the SARSA agent into 'videos/SARSA'.
# The recording environment gets its own name so that the global `env` used by
# the training cells is not replaced by a Monitor that is closed below.
env_to_wrap = gym.make('Acrobot-v1')
video_env = gym.wrappers.Monitor(env_to_wrap, 'videos/SARSA', force = True)
try:
    ep_rewards = sarsa_agent.evaluation(n_episodes=3, vid_env=video_env)
finally:
    # Always release the recorder and the wrapped environment, even on errors
    video_env.close()
    env_to_wrap.close()