In [1]:
# Always execute this cell

# Libraries will not be installed if running on ifi-europa.uibk.ac.at

# Make sure that the required libraries are installed on your local system
# If you are using Google Colab, remember to upload the requirements file before 
# running this cell
# If you are running this notebook locally, the requirements file needs to be in 
# the same location as this notebook
import os
running_local = True if os.getenv('JUPYTERHUB_USER') is None else False
    
if running_local:
    import sys
    !{sys.executable} -m pip install -r requirements_04_rl_pol_grad.txt

#     !{sys.executable} -m pip install -r requirements_04_rl_qlearn.txt

You should consider upgrading via the '/home/c7031297/FNN/bin/python3 -m pip install --upgrade pip' command.[0m


In [2]:
import os
import numpy as np
import itertools
from itertools import count
import matplotlib.pyplot as plt
from tqdm import trange
import gym

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import time

from torch.autograd import Variable
from torchsummary import summary

# Set a seed for reproducing results
# The same integer is reused below to seed the gym environment.
random_seed = 1234
# Seed NumPy's global random number generator
np.random.seed(random_seed)
# Seed PyTorch's default generator. As the last expression of the cell, the
# returned torch Generator object is echoed as the cell output.
torch.manual_seed(random_seed)


<torch._C.Generator at 0x7fadf459ef48>

In [3]:
# Define the environment
# env = gym.make('CartPole-v1')
env = gym.make('Acrobot-v1')
# env = gym.make('MountainCar-v0')
# Set the seed for gym
env.seed(random_seed)

# What is the type and size of the action space
# (a Discrete(n) space; its actions are the integers 0 .. n-1)
print("Action space: {}".format(env.action_space))

# What does an action look like
sample_action = env.action_space.sample()  # One integer drawn from the action space
print("Sample action: {}".format(sample_action))  # Execute multiple times to see different actions
print("Type of action: {}".format(type(sample_action)))

# What is the type and size of the observation (state) space
print("Observation space: {}".format(env.observation_space))  # continuous states

# Which state does the agent start in?
initial_state = env.reset()
print("Initial state: {}".format(initial_state))

# What is an observation
sample_observation = env.observation_space.sample()
print("Sample observation: {}".format(sample_observation))
print("Type of observation: {}".format(type(sample_observation)))

Action space: Discrete(3)
Sample action: 0
Type of action: <class 'int'>
Observationtate space: Box(-28.274333953857422, 28.274333953857422, (6,), float32)
Initial state: [ 0.99965389 -0.02630794  0.99678118 -0.08017036  0.0960371  -0.00889859]
Sample observation: [ -0.6874997    0.5421291   -0.3066365    0.07097009   6.284189
 -13.656844  ]
Type of observation: <class 'numpy.ndarray'>


In [4]:
class Critic_Network(nn.Module):
    """
    State-value critic: a small fully connected network that maps an
    observation to a single scalar estimate V(s).

    Args:
        env: Environment object. Only ``env.observation_space.shape[0]`` and
            ``env.action_space.n`` are read, to size the network.
    """

    def __init__(self,env):
        super().__init__()

        self.env = env
        self.Num_states = env.observation_space.shape[0]
        self.Num_actions = env.action_space.n
        print("States and actions=", self.Num_states, self.Num_actions)

        # Layer definitions: two hidden layers and one scalar output
        NumNeurons = 64
        self.affine1 = nn.Linear(self.Num_states, NumNeurons)
        self.affine2 = nn.Linear(NumNeurons, NumNeurons)
        self.affine5 = nn.Linear(NumNeurons, 1)

        # Bookkeeping lists. They are not used by this class itself and are
        # kept so that existing code accessing these attributes keeps working.
        self.saved_log_probs = []
        # List to save critic values
        self.critic_value_history = []
        # Used for tracking the rewards the agent receives in an episode.
        self.rewards = []

    def forward(self, x):
        """
        Defines the forward pass of the critic network.

        Args:
            x (Tensor): The current state as observed by the agent; the last
                dimension must have ``Num_states`` entries.
        Returns:
            (Tensor): Estimated state value, with a last dimension of size 1.
        """
        out = F.relu(self.affine1(x))
        out = F.relu(self.affine2(out))
        # No activation on the output: a state value is an unbounded real number
        return self.affine5(out)

    def Calulcate_value(self,state):
        """
        Estimates the value of a single state.

        Args:
            state (numpy array): The current state as observed by the agent.
                Anything accepted by ``torch.as_tensor`` (array, list, tensor)
                works as well.

        Returns:
            (Tensor): Value estimate of shape (1, 1), still attached to the
            autograd graph so it can be used in a loss.
        """
        # Convert the state to a float32 tensor and add a batch dimension
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)
        return self(state)

    def save(self, state_file='Critic_network.pt', save_dir='models'):
        """
        Saves the weights of the critic network.

        Args:
            state_file (str): File name of the checkpoint.
            save_dir (str): Directory to write to; created if it is missing.
        """
        os.makedirs(save_dir, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_dir, state_file))

    @staticmethod
    def load(state_file='models/Critic_network.pt', env=None):
        """
        Loads a trained Critic network.

        Args:
            state_file (str): Path of the checkpoint. The default matches the
                file written by ``save()`` with its default arguments.
            env: Environment used to size the network; it must describe the
                same observation space the checkpoint was trained with.

        Returns:
            (Critic_Network): The restored network, in evaluation mode.

        Raises:
            ValueError: If ``env`` is not given. The constructor needs it to
                size the layers, so the network cannot be rebuilt without it.
        """
        if env is None:
            raise ValueError(
                "Critic_Network.load() needs the environment to rebuild the "
                "network: call Critic_Network.load(state_file, env=env)")
        # Create a network object with the constructor parameters
        Critic = Critic_Network(env)
        # Load the weights
        Critic.load_state_dict(torch.load(state_file))
        # Set the network to evaluation mode
        Critic.eval()
        return Critic

In [5]:
class NeuralNetworkPolicy(nn.Module):
    """
    Stochastic policy (actor): a small fully connected network that maps an
    observation to a probability distribution over the discrete actions.

    Args:
        env: Environment object. Only ``env.observation_space.shape[0]`` and
            ``env.action_space.n`` are read, to size the network.
    """

    def __init__(self,env):
        super().__init__()

        self.env = env
        self.Num_states = env.observation_space.shape[0]
        self.Num_actions = env.action_space.n
        print("States and actions=", self.Num_states, self.Num_actions)

        # Layer definitions: two hidden layers and one output per action
        NumNeurons = 64
        self.affine1 = nn.Linear(self.Num_states, NumNeurons)
        self.affine2 = nn.Linear(NumNeurons, NumNeurons)
        self.affine5 = nn.Linear(NumNeurons, self.Num_actions)

        # Used for storing the log probabilities of the actions
        # which is required to compute the loss (and hence needed for the parameter update step)
        self.saved_log_probs = []
        # Used for tracking the rewards the agent receives in an episode.
        self.rewards = []

    def forward(self, x):
        """
        Defines the forward pass of the policy network.

        Args:
            x (Tensor): The current state as observed by the agent; the last
                dimension must have ``Num_states`` entries.
        Returns:
            (Tensor): Action probabilities (softmax over the last dimension).
        """
        out = F.relu(self.affine1(x))
        out = F.relu(self.affine2(out))
        # The logits are passed to softmax without a ReLU. Clamping them at
        # zero would cut the gradient of every negative logit, so the policy
        # could get stuck on a uniform distribution.
        logits = self.affine5(out)
        return F.softmax(logits, dim=-1)

    def select_action(self,state):
        """
        Samples an action from the action probabilities the network produces
        for the given state, and records the log probability of that action
        in ``self.saved_log_probs``.

        Args:
            state (numpy array): The current state as observed by the agent.
                Anything accepted by ``torch.as_tensor`` works as well.

        Returns:
            (int, Tensor): The action to perform and the log probability of
            that action (shape (1,), attached to the autograd graph).
        """
        # Convert the state to a float32 tensor and add a batch dimension
        state = torch.as_tensor(state, dtype=torch.float32).unsqueeze(0)

        # Get the predicted probabilities from the policy network
        probs = self(state)
        # Sample the action according to the respective probabilities
        m = Categorical(probs)
        action = m.sample()
        # Log probability of the selected action, computed once and shared
        # between the stored history and the return value
        logProb = m.log_prob(action)
        self.saved_log_probs.append(logProb)

        # Return the chosen action
        return action.item(), logProb

    def save(self, state_file='policy_network.pt', save_dir='models'):
        """
        Saves the weights of the policy network.

        Args:
            state_file (str): File name of the checkpoint.
            save_dir (str): Directory to write to; created if it is missing.
        """
        os.makedirs(save_dir, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_dir, state_file))

    @staticmethod
    def load(state_file='models/policy_network.pt', env=None):
        """
        Loads a trained policy network.

        Args:
            state_file (str): Path of the checkpoint.
            env: Environment used to size the network; it must describe the
                same observation and action spaces the checkpoint was
                trained with.

        Returns:
            (NeuralNetworkPolicy): The restored network, in evaluation mode.

        Raises:
            ValueError: If ``env`` is not given. The constructor needs it to
                size the layers, so the network cannot be rebuilt without it.
        """
        if env is None:
            raise ValueError(
                "NeuralNetworkPolicy.load() needs the environment to rebuild "
                "the network: call NeuralNetworkPolicy.load(state_file, env=env)")
        # Create a network object with the constructor parameters
        policy = NeuralNetworkPolicy(env)
        # Load the weights
        policy.load_state_dict(torch.load(state_file))
        # Set the network to evaluation mode
        policy.eval()
        return policy

In [6]:
class Actor_Critic:
    """
    One-step actor-critic agent built from a policy network (actor) and a
    separate state-value network (critic), each with its own Adam optimizer.
    """

    def __init__(self,env=env,log_interval=5, max_episodes=1000, T=1000, save=False, **hyperparam_dict):
        """
        Builds the networks and optimizers and loads the hyperparameters.

        Args:
            env: Environment to train and test on. It is stored on the agent
                and used by ``Reinforce`` and ``Test_Environment``.
            log_interval (int): Prints the progress after this many episodes.
            max_episodes (int): Number of episodes to train for.
            T (int): Maximum number of steps per training episode.
            save (bool): Whether to save the trained policy network.
            **hyperparam_dict: Must contain ``gamma`` (discount factor) and
                ``learning_rate`` (shared by both optimizers). Other keys are
                ignored.

        Raises:
            KeyError: If ``gamma`` or ``learning_rate`` is missing.
        """
        # Fetch the hyperparameters
        self.gamma = hyperparam_dict['gamma']
        self.learning_rate = hyperparam_dict['learning_rate']

        # Keep the environment that was passed in, so the methods below do not
        # silently fall back to a module-level variable
        self.env = env

        # Create the policy function and set the training mode
        self.policy = NeuralNetworkPolicy(env=env)
        self.policy.train()

        # Define the optimizer and set the learning rate
        self.optimizer = optim.Adam(self.policy.parameters(), lr=self.learning_rate)
        self.T = T
        self.log_interval = log_interval
        self.max_episodes = max_episodes
        self.save = save

        # Critic Network
        self.Value_Function = Critic_Network(env=env)
        self.Value_Function.train()
        self.optimizer_Critic = optim.Adam(self.Value_Function.parameters(), lr=self.learning_rate)
        self.Critic_loss = nn.MSELoss()

        # Lists to store the episodic and running rewards for plotting
        self.ep_rewards = list()
        self.running_rewards = list()
        self.Bandera = False

    def Reinforce(self):
        """
        Trains the agent with the one-step ACTOR-CRITIC algorithm.

        After every environment step the critic is regressed towards the TD
        target ``r + gamma * V(s')``, and the actor is updated with the policy
        gradient weighted by the TD error ``delta = r + gamma * V(s') - V(s)``
        and by the accumulated discount ``I = gamma ** t``.

        Training runs for ``self.max_episodes`` episodes of at most ``self.T``
        steps each. Nothing is returned: the per-episode rewards are appended
        to ``self.ep_rewards`` and their exponentially smoothed version to
        ``self.running_rewards``. If ``self.save`` is true the policy network
        is saved at the end.
        """
        env = self.env
        smoothing = 0.05
        running_reward = None

        # Start executing an episode
        for i_episode in count(1):
            # 1. Reset the environment
            current_state = env.reset()
            # 2. Initialize `ep_reward` (the total reward for this episode)
            ep_reward = 0
            # The policy's buffers track a single episode. Emptying them here
            # also keeps them from growing for the whole training run.
            del self.policy.saved_log_probs[:]
            del self.policy.rewards[:]
            # Accumulated discount gamma**t: 1 for the first step of an episode
            I = 1.0

            # 3. For each step of the episode (at most T steps)
            for _ in range(self.T):
                # 3.1 Select an action using the policy network
                action, logProb = self.policy.select_action(current_state)
                # 3.2 Perform the action and note the next state and reward and if the episode is done
                next_state, reward, done, info = env.step(action)

                # Value of the current state (this is what the critic learns)
                V_current = self.Value_Function.Calulcate_value(current_state)

                # gym's TimeLimit wrapper flags episodes cut off by the step
                # limit. Such a state is not terminal, so it is still
                # bootstrapped from.
                truncated = isinstance(info, dict) and bool(info.get('TimeLimit.truncated', False))
                if done and not truncated:
                    # V_next is 0 if the state is terminal
                    V_next = torch.zeros_like(V_current)
                else:
                    # The TD target is treated as a constant (semi-gradient)
                    with torch.no_grad():
                        V_next = self.Value_Function.Calulcate_value(next_state)

                # TD target and signed TD error
                y_t = reward + self.gamma * V_next
                delta_t = (y_t - V_current).detach()

                # Reset the gradients of the parameters
                self.optimizer.zero_grad()
                self.optimizer_Critic.zero_grad()

                # Critic: minimize the squared TD error
                Critic_loss_ = self.Critic_loss(V_current, y_t)
                # Actor: the signed, detached TD error weights the log
                # probability, so this loss sends no gradient into the critic
                policy_loss = -(I * delta_t * logProb).sum()

                # Backpropagate the losses through their own networks
                policy_loss.backward()
                Critic_loss_.backward()

                # Perform a parameter update step
                self.optimizer_Critic.step()
                self.optimizer.step()

                # Discount accumulates multiplicatively: I <- gamma * I
                I *= self.gamma

                # Update state
                current_state = next_state

                # 3.3 Store the current reward in `policy.rewards`
                self.policy.rewards.append(reward)

                # 3.4 Increment the total reward in this episode
                ep_reward += reward

                # 3.5 Stop as soon as the environment reports the episode is over
                if done:
                    break

            if running_reward is None:
                # To track the reward across consecutive episodes (smoothed)
                running_reward = ep_reward

            # Update the running reward
            running_reward = smoothing * ep_reward + (1 - smoothing) * running_reward

            # Store the rewards for plotting
            self.ep_rewards.append(ep_reward)
            self.running_rewards.append(running_reward)

            if i_episode % self.log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                      i_episode, ep_reward, running_reward))

            if i_episode >= self.max_episodes:
                print('Max episodes exceeded, quitting.')
                break

        # Save the trained policy network
        if self.save:
            self.policy.save()

    def Plot(self):
        """
        Plot the Episode Rewards and Running Rewards
        """
        plt.rcParams.update({'font.size': 18})

        # Plot the results
        plt.figure(1, figsize=(20,8))
        # Raw string: '\g' is not a valid escape sequence in a normal literal
        title_str = "Neural_Network" + r'($\gamma$:' + str(self.gamma) + ',lr:' + str(self.learning_rate) + ')'
        plt.plot(range(len(self.ep_rewards)), self.ep_rewards, lw=2, color="red", label="episode rewards")
        plt.plot(range(len(self.running_rewards)), self.running_rewards, lw=2, color="blue", label="running rewards")
        plt.title(title_str)

        plt.grid()
        plt.xlabel('Episodes')
        plt.ylabel('Running average of Rewards')
        plt.legend(ncol=1)
        plt.show()

    def Test_Environment(self):
        """
        Test the trained Agent in the environment.

        Runs one rendered episode with actions sampled from the policy and
        prints the index of the step at which the episode finished. The
        environment window is closed afterwards, also when a step or the
        rendering fails.
        """
        env = self.env
        # 1. Reset the environment
        current_state = env.reset()
        try:
            # NOTE(review): the step budget of this rollout is `max_episodes`,
            # as in the original code; `T` may have been intended - confirm.
            for i in range(self.max_episodes):
                # Select an action using the policy network; no gradients are
                # needed while testing
                with torch.no_grad():
                    action, _ = self.policy.select_action(current_state)
                # Perform the action and note the next state and if the episode is done
                next_state, reward, done, _ = env.step(action)
                env.render()
                current_state = next_state
                # Slow the rollout down so the rendering can be followed
                time.sleep(0.05)
                # Check if the episode is finished using the `done` variable and break if yes
                if done:
                    print("i=",i)
                    break

            time.sleep(2)
        finally:
            env.close()

    def Delete(self):
        """
        Removes the networks and optimizers from the agent. The agent cannot
        train or act afterwards.
        """
        del self.optimizer
        del self.optimizer_Critic
        del self.policy
        del self.Value_Function
        

In [17]:
# Test cell: Here we will just test that all the functions execute without error
# Agent.Delete()
# Hyperparameters of the actor-critic agent
hyperparam_dict = {'name': 'neural_network', 'gamma':0.99, 'learning_rate':0.001}

# hyperparam_dict = {'name': 'neural_network', 'gamma':0.1, 'learning_rate':0.002}

Agent=Actor_Critic(env=env,log_interval=5,max_episodes=200,T=1000, **hyperparam_dict)

# Check the structure of the actor and critic networks.
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (that only added a stray "None" line to the output).
print("SUMMARY ACTOR NETWORK")
summary(Agent.policy, input_size=env.observation_space.shape)
print("SUMMARY CRITIC NETWORK")
summary(Agent.Value_Function, input_size=env.observation_space.shape)




States and actions= 6 3
States and actions= 6 3
SUMMARY ACTOR NETWORK
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]             448
            Linear-2                   [-1, 64]           4,160
            Linear-3                    [-1, 3]             195
Total params: 4,803
Trainable params: 4,803
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.02
Estimated Total Size (MB): 0.02
----------------------------------------------------------------
None
SUMMARY CRITIC NETWORK
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 64]             448
            Linear-2                   [-1, 64]           4,160
           

In [None]:
# Train the agent; progress is printed every `log_interval` episodes
Agent.Reinforce()


Episode 5	Last reward: -500.00	Average reward: -500.00
Episode 10	Last reward: -500.00	Average reward: -500.00
Episode 15	Last reward: -500.00	Average reward: -500.00
Episode 20	Last reward: -500.00	Average reward: -500.00
Episode 25	Last reward: -500.00	Average reward: -500.00
Episode 30	Last reward: -500.00	Average reward: -500.00
Episode 35	Last reward: -500.00	Average reward: -500.00
Episode 40	Last reward: -500.00	Average reward: -500.00
Episode 45	Last reward: -500.00	Average reward: -500.00
Episode 50	Last reward: -500.00	Average reward: -500.00
Episode 55	Last reward: -500.00	Average reward: -500.00
Episode 60	Last reward: -500.00	Average reward: -500.00
Episode 65	Last reward: -500.00	Average reward: -500.00
Episode 70	Last reward: -500.00	Average reward: -500.00
Episode 75	Last reward: -500.00	Average reward: -500.00
Episode 80	Last reward: -500.00	Average reward: -500.00
Episode 85	Last reward: -500.00	Average reward: -500.00
Episode 90	Last reward: -500.00	Average reward: -

In [None]:
# Plot the per-episode rewards and their running average collected during training
Agent.Plot()



In [None]:
# Render one rollout of the current policy in the environment
Agent.Test_Environment()

In [None]:
# Drop the agent's networks and optimizers (the agent is unusable afterwards)
Agent.Delete()