<a href="https://colab.research.google.com/github/nsattiraju/RL-Simple-implementation-of-AWAC-algorithm/blob/main/AWAC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade gspread, the Google Sheets client imported later to read the offline dataset sheet.
!pip install --upgrade gspread

Requirement already up-to-date: gspread in /usr/local/lib/python3.6/dist-packages (3.6.0)


In [None]:
# Install gym plus the system packages for off-screen rendering.
# presumably xvfb is the X server that pyvirtualdisplay drives further down - confirm.
!pip install gym
!apt-get install python-opengl -y
!apt install xvfb -y

Reading package lists... Done
Building dependency tree       
Reading state information... Done
python-opengl is already the newest version (3.1.0+dfsg-1).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
xvfb is already the newest version (2:1.19.6-1ubuntu4.8).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.


In [None]:
# Install gym's Box2D extra; presumably required by the LunarLander-v2 environment created below.
!pip install gym[Box2D]



For rendering the environment, I need pyvirtualdisplay

In [None]:
# pyvirtualdisplay provides the Display class used in the next code cell.
!pip install pyvirtualdisplay



In [None]:
!pip install piglet



Start the virtual display once per session, before training an agent:

In [None]:
# Start a 1400x900 display with visible=0 so rendering has a display to attach to.
from pyvirtualdisplay import Display

# NOTE(review): the global name `display` shadows IPython's `display()` helper for the
# rest of the session - confirm nothing below relies on the IPython one.
display=Display(visible=0,size=(1400,900))
display.start()

<pyvirtualdisplay.display.Display at 0x7fe9f3c79588>

Create the LunarLander environment and check the size of its action space.

In [None]:
# Smoke test: build LunarLander-v2, reset it once and print the number of discrete actions.
import gym
import matplotlib.pyplot as plt

env=gym.make('LunarLander-v2')
env.reset()
# `actions` is the action-space object; `.n` is its action count (printed as 4 below).
actions=env.action_space
print(actions.n)

4


In [None]:
# Mount Google Drive at /content/drive; the dataset CSV and checkpoints are written under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from collections import deque
import random
import csv

class ReplayBuffer(object):
    """Bounded FIFO store of transitions that also logs every transition to a CSV file.

    Each transition is the tuple ``(state, action, reward, state_, done)``.
    At most ``buffer_size`` transitions are kept in memory; the oldest one is
    evicted first. Every call to :meth:`add` additionally appends one row to
    ``csv_path`` so the collected experience can be reused as an offline dataset.

    Args:
        buffer_size: maximum number of transitions kept in memory.
        csv_path: file the transitions are appended to. Defaults to the
            Google Drive location this notebook has always used.
    """

    DEFAULT_CSV_PATH='/content/drive/My Drive/AWAC_RL_ECE6504/AWAC_Implementation/dataset_AC.csv'
    CSV_HEADER=['State','Action','Reward','Next_State','Done']

    def __init__(self,buffer_size,csv_path=None):
        self.buffer_size=buffer_size
        self.num_experiences=0
        # maxlen makes the deque drop its oldest entry by itself once it is full.
        self.buffer=deque(maxlen=buffer_size)
        self.csv_path=self.DEFAULT_CSV_PATH if csv_path is None else csv_path

    def getBatch(self,batch_size):
        """Return up to ``batch_size`` transitions sampled uniformly without replacement.

        When fewer than ``batch_size`` transitions are stored, all of them are
        returned (in random order).
        """
        sample_size=min(batch_size,self.num_experiences)
        # Sampling from a list avoids the deque's linear-time indexing.
        return random.sample(list(self.buffer),sample_size)

    def size(self):
        """Return the capacity of the buffer (not the number of stored items)."""
        return self.buffer_size

    def add(self,state,action,reward,state_,done):
        """Log one transition to the CSV file and store it in memory."""
        experience=(state,action,reward,state_,done)
        self._log_to_csv(experience)
        self.buffer.append(experience)
        self.num_experiences=len(self.buffer)

    def _log_to_csv(self,experience):
        """Append ``experience`` as one CSV row, writing the header only once."""
        # newline='' is what the csv module expects from the file object.
        with open(self.csv_path,'a+',newline='') as f:
            writer=csv.DictWriter(f,fieldnames=self.CSV_HEADER)
            # The header used to be repeated before every row, which corrupted
            # the dataset; emit it only when the file is still empty.
            f.seek(0,2)
            if f.tell()==0:
                writer.writeheader()
            writer.writerow(dict(zip(self.CSV_HEADER,experience)))

    def count(self):
        """Return the number of transitions currently stored in memory."""
        return self.num_experiences

    def erase(self):
        """Drop every in-memory transition (the CSV log is left untouched)."""
        self.buffer=deque(maxlen=self.buffer_size)
        self.num_experiences=0

To plot curves

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file, window=100):
    """Plot the trailing running average of ``scores`` and save the figure.

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-episode scores.
        figure_file: path of the image file to write.
        window: number of most recent scores averaged at each point.
    """
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # i - window + 1 keeps exactly `window` scores in the slice; the former
        # i - 100 bound averaged 101 scores while the title promised 100.
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous %d scores' % window)
    # figure_file is a file name, so save to it directly instead of treating it
    # as a directory ("<figure_file>/abc.png" could never be created).
    plt.savefig(figure_file)

New Actor Critic Method

In [None]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class ActorCriticNetwork(nn.Module):
    """Shared-trunk network with a policy head (``pi``) and a value head (``v``).

    Two ReLU hidden layers feed both heads. The module owns its Adam optimizer
    and moves itself to ``cuda:0`` when CUDA is available, otherwise to the CPU.
    """

    def __init__(self, lr, input_dims, n_actions, fc1_dims=256, fc2_dims=256):
        super().__init__()
        # Trunk.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        # Heads: one output per action, and a single state-value output.
        self.pi = nn.Linear(fc2_dims, n_actions)
        self.v = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self, state):
        """Return ``(pi, v)``: the raw policy-head output and the value-head output."""
        hidden = F.relu(self.fc2(F.relu(self.fc1(state))))
        return (self.pi(hidden), self.v(hidden))

class Agent():
    """One-step actor-critic agent built on a single ``ActorCriticNetwork``.

    ``choose_action`` samples an action and caches its log-probability;
    ``learn`` then performs one gradient step from the TD error of that same
    transition, so the two must be called alternately.

    Args:
        lr: learning rate handed to the network's optimizer.
        input_dims: observation shape as a sequence, e.g. ``[8]``.
        fc1_dims, fc2_dims: hidden layer widths.
        n_actions: number of discrete actions.
        gamma: discount factor.
    """

    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions, 
                 gamma=0.99):
        self.gamma = gamma
        self.lr = lr
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.actor_critic = ActorCriticNetwork(lr, input_dims, n_actions, 
                                               fc1_dims, fc2_dims)
        # Log-probability of the most recently sampled action (set by choose_action).
        self.log_prob = None

    def _to_batch(self, observation):
        """Convert one observation into a float tensor of shape (1, *obs_shape)."""
        # Going through a single ndarray avoids the very slow (and warned about)
        # tensor-from-list-of-arrays path that T.tensor([observation]) takes.
        array = np.asarray(observation, dtype=np.float32)[None, ...]
        return T.tensor(array, dtype=T.float).to(self.actor_critic.device)

    def choose_action(self, observation):
        """Sample an action for ``observation`` and cache its log-probability.

        Returns:
            The sampled action as a Python int.
        """
        state = self._to_batch(observation)
        probabilities, _ = self.actor_critic.forward(state)
        probabilities = F.softmax(probabilities, dim=1)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        self.log_prob = action_probs.log_prob(action)

        return action.item()

    def learn(self, state, reward, state_, done):
        """Take one optimizer step from the transition ``(state, reward, state_, done)``.

        Raises:
            RuntimeError: if no action has been sampled yet, i.e. there is no
                cached log-probability to weight with the TD error.
        """
        if self.log_prob is None:
            raise RuntimeError('learn() called before choose_action(); '
                               'no log-probability is cached')

        self.actor_critic.optimizer.zero_grad()

        state = self._to_batch(state)
        state_ = self._to_batch(state_)
        reward = T.tensor(reward, dtype=T.float).to(self.actor_critic.device)

        _, critic_value = self.actor_critic.forward(state)
        _, critic_value_ = self.actor_critic.forward(state_)

        # TD error; the bootstrap term is zeroed on terminal transitions.
        delta = reward + self.gamma*critic_value_*(1-int(done)) - critic_value

        actor_loss = -self.log_prob*delta
        critic_loss = delta**2

        (actor_loss + critic_loss).backward()
        self.actor_critic.optimizer.step()



New main method

In [None]:
import gym
import numpy as np
#from actor_critic_torch import Agent
#from utils import plot_learning_curve

if __name__ == '__main__':
    # Online actor-critic training on LunarLander. Every transition is also
    # pushed into `buff`, which appends it to the CSV dataset on Drive.
    env = gym.make('LunarLander-v2')
    buff = ReplayBuffer(10000)
    agent = Agent(lr=5e-6, input_dims=[8], fc1_dims=2048, fc2_dims=1536,
                  n_actions=4, gamma=0.99)
    n_games = 3000

    # Encode the hyper-parameters in the output file name.
    fname = ''.join(['ACTOR_CRITIC_', 'lunar_lander_', str(agent.fc1_dims),
                     '_fc1_dims_', str(agent.fc2_dims), '_fc2_dims_lr',
                     str(agent.lr), '_', str(n_games), 'games'])
    figure_file = fname + '.png'

    scores = []
    for i in range(n_games):
        observation, score, done = env.reset(), 0, False
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            # Record the transition before learning from it.
            buff.add(observation, action, reward, observation_, done)
            score += reward
            agent.learn(observation, reward, observation_, done)
            observation = observation_
        scores.append(score)

        # Mean over (at most) the last 100 episodes.
        avg_score = np.mean(scores[-100:])
        print('episode ', i, 'score %.1f' % score,
                'average score %.1f' % avg_score)

    x = list(range(1, n_games + 1))
    plot_learning_curve(x, scores, figure_file)

episode  0 score -256.1 average score -256.1


KeyboardInterrupt: ignored

Rough work

In [None]:
# Same Drive mount as earlier in the notebook; repeated so this section can run on its own.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Authenticate the Colab user; presumably this supplies the default credentials that
# offline_dataset() later obtains through GoogleCredentials - confirm.
from google.colab import auth
auth.authenticate_user()

To read back from the Dataset

Offline dataset

In [None]:
# Drive mount repeated for the offline-dataset section (identical to the earlier mount cells).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Authentication repeated for the offline-dataset section (identical to the earlier auth cell).
from google.colab import auth
auth.authenticate_user()

In [None]:
from keras.utils.np_utils import to_categorical
def one_hot(a, num_classes):
    """Return ``to_categorical(a, num_classes)``, i.e. the Keras one-hot encoding of ``a``."""
    return to_categorical(a, num_classes)

In [None]:
import gspread
from oauth2client.client import GoogleCredentials
import pandas as pd




def _parse_vector_column(df, column):
    """Parse a column of stringified vectors into a 2-D float array.

    Each cell is expected to look like ``"[ 0.1 -0.2\\n 0.3]"``: the brackets
    are stripped and the rest is split on any whitespace. The first row of
    ``df`` is skipped because it holds the header. Tokens that are not valid
    floats are ignored (best effort, as before).
    """
    rows = []
    for cell in df[column].iloc[1:].to_numpy():
        values = []
        # split() already treats newlines and tabs as separators. Deleting them
        # first (as the old code did) glued "1.0\n2.0" into the single token
        # "1.02.0", which was then silently dropped.
        for token in cell.strip('[]').split():
            try:
                values.append(float(token))
            except ValueError:
                continue
        rows.append(values)
    return np.asarray(rows, dtype=float)


def getStates(df):
    """Return the ``State`` column as a float array with one row per transition."""
    return _parse_vector_column(df, 'State')


def getNextStates(df):
    """Return the ``Next_State`` column as a float array with one row per transition."""
    return _parse_vector_column(df, 'Next_State')

def getRewards(df):
    """Return the ``Reward`` column (header row skipped) as a float array."""
    raw_rewards = df['Reward'].iloc[1:].to_numpy()
    return np.asarray(list(map(float, raw_rewards)), dtype=float)

def getActions(df):
    """Return the ``Action`` column (header row skipped) as an integer array."""
    raw_actions = df['Action'].iloc[1:].to_numpy()
    return np.asarray(list(map(int, raw_actions)), dtype=int)

def getTerminals(df):
    """Return the ``Done`` column (header row skipped) as a boolean array.

    A cell counts as terminal when its text is ``TRUE`` in any letter case,
    which covers both the ``TRUE`` spelling this function originally matched
    and the ``True`` that Python's csv writer emits. Everything else is
    ``False``.
    """
    # The previous body iterated over an undefined name and never returned.
    current_array=(df['Done'].iloc[1:]).to_numpy()
    flags=[str(value).strip().upper()=='TRUE' for value in current_array]
    return np.asarray(flags,dtype=bool)

def offline_dataset():
    """Download the ``dataset_AC`` Google Sheet and split it into arrays.

    The first sheet is read in full and its first row becomes the column
    labels. That row is *kept* in the frame: each column parser skips it on
    its own, and the returned length still counts it.

    Returns:
        ``(states, actions, rewards, next_states, done, length)`` where
        ``length`` is the number of sheet rows including the header row.
    """
    credentials = GoogleCredentials.get_application_default()
    sheet = gspread.authorize(credentials).open('dataset_AC').sheet1

    df = pd.DataFrame.from_records(sheet.get_all_values())
    df.rename(columns={0: 'State', 1: 'Action',2:'Reward',3:'Next_State',4:'Done'}, inplace=True)
    # Relabel the columns with whatever the first sheet row contains.
    df.columns = df.iloc[0]

    D_states = getStates(df)
    D_actions = getActions(df)
    D_rewards = getRewards(df)
    D_nextStates = getNextStates(df)
    D_done = getTerminals(df)

    return D_states, D_actions, D_rewards, D_nextStates, D_done, len(df)
    







Replay Buffer AWAC


In [None]:
import numpy as np

class ReplayBuffer_AWAC():
    """Replay memory seeded with the offline dataset and extended with online data.

    Args:
        input_shape: observation shape (kept for interface compatibility; unused).
        n_actions: number of actions (kept for interface compatibility; unused).
        dataset: optional ``(states, actions, rewards, next_states, done, length)``
            tuple in the layout returned by ``offline_dataset()``, where
            ``length`` counts the header row (number of transitions + 1).
            When omitted, ``offline_dataset()`` is called.
    """

    def __init__(self, input_shape, n_actions, dataset=None):
        if dataset is None:
            dataset = offline_dataset()
        D_states,D_actions,D_rewards,D_nextStates,D_done,D_length=dataset
        # D_length includes the header row, hence the -1.
        self.mem_size = D_length-1
        self.state_memory = D_states
        self.new_state_memory = D_nextStates
        self.action_memory = D_actions
        self.reward_memory = D_rewards
        self.terminal_memory = D_done

    @staticmethod
    def _append(memory, item):
        """Return ``memory`` with ``item`` appended as a new last row."""
        memory = np.asarray(memory)
        row = np.asarray(item, dtype=memory.dtype)[None, ...]
        if memory.size == 0:
            return row
        return np.concatenate([memory, row], axis=0)

    def store_transition(self, state, action, reward, state_, done):
        """Append one transition after the data already in the buffer.

        The previous implementation indexed with an undefined name and, had it
        run, would only ever have overwritten the last offline row.
        """
        self.state_memory = self._append(self.state_memory, state)
        self.action_memory = self._append(self.action_memory, action)
        self.reward_memory = self._append(self.reward_memory, reward)
        self.new_state_memory = self._append(self.new_state_memory, state_)
        self.terminal_memory = self._append(self.terminal_memory, done)
        self.mem_size += 1

    def sample_buffer(self, batch_size):
        """Sample ``batch_size`` transitions uniformly, with replacement.

        Returns:
            ``(states, actions, rewards, next_states, dones)`` arrays.
        """
        batch = np.random.choice(self.mem_size, batch_size)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones

Define the critic network ($Q_\phi$) and the actor network ($\pi$).


In [None]:
import os
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

class QNetwork_Q_phi(nn.Module):
    """Critic with a shared two-layer ReLU trunk and two heads.

    ``Q`` outputs one value per action and ``v`` a single state value. The
    module owns its Adam optimizer (learning rate ``beta``) and can save or
    restore its weights at ``<checkpoint_dir>/<name>_AWAC``.
    """

    def __init__(self,beta,input_dims,fc1_dims,fc2_dims,n_actions,name,checkpoint_dir='/content/drive/My Drive/AWAC_RL_ECE6504/AWAC_Implementation/'):# have to provide the directory link here
        super().__init__()
        # Hyper-parameters, kept as attributes.
        self.input_dims=input_dims
        self.fc1_dims=fc1_dims
        self.fc2_dims=fc2_dims
        self.n_actions=n_actions
        self.name=name
        self.checkpoint_dir=checkpoint_dir
        self.checkpoint_file=os.path.join(checkpoint_dir,name+'_AWAC')

        # Trunk followed by the two heads.
        self.fc1 = nn.Linear(*input_dims, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.Q = nn.Linear(fc2_dims, n_actions)
        self.v = nn.Linear(fc2_dims, 1)

        self.optimizer = optim.Adam(self.parameters(), lr=beta)
        device_name = 'cuda:0' if T.cuda.is_available() else 'cpu'
        self.device = T.device(device_name)
        self.to(self.device)

    def forward(self,state,action):
        """Return ``(Q, V)`` for ``state``.

        ``action`` is accepted for interface reasons but is not used: both
        heads are computed from the state features alone.
        """
        features = F.relu(self.fc2(F.relu(self.fc1(state))))
        return self.Q(features), self.v(features)

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``."""
        print('...Saving checkpoint...')
        weights = self.state_dict()
        T.save(weights,self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict stored at ``self.checkpoint_file``."""
        print('...loading checkpoint...')
        weights = T.load(self.checkpoint_file)
        self.load_state_dict(weights)

class PolicyNetwork_pi(nn.Module):
    """Actor: two ReLU hidden layers followed by a tanh-squashed output layer.

    The output has one entry per action, each in ``[-1, 1]``. The module owns
    its Adam optimizer (learning rate ``alpha``) and can save or restore its
    weights at ``<checkpoint_dir>/<name>_AWAC``.
    """

    def __init__(self,alpha,input_dims,fc1_dims,fc2_dims,n_actions,name,checkpoint_dir='/content/drive/My Drive/AWAC_RL_ECE6504/AWAC_Implementation/'):
        super().__init__()
        # Hyper-parameters, kept as attributes.
        self.input_dims=input_dims
        self.fc1_dims=fc1_dims
        self.fc2_dims=fc2_dims
        self.n_actions=n_actions
        self.name=name
        self.checkpoint_dir=checkpoint_dir
        self.checkpoint_file=os.path.join(checkpoint_dir,name+'_AWAC')

        self.fc1=nn.Linear(*input_dims,fc1_dims)
        self.fc2=nn.Linear(fc1_dims,fc2_dims)
        self.pi=nn.Linear(fc2_dims,n_actions)

        self.optimizer=optim.Adam(self.parameters(),lr=alpha)
        device_name='cuda:0' if T.cuda.is_available() else 'cpu'
        self.device=T.device(device_name)
        self.to(self.device)

    def forward(self,state):
        """Return the tanh-squashed output of the policy head for ``state``."""
        hidden=F.relu(self.fc1(state))
        hidden=F.relu(self.fc2(hidden))
        # if action is > +/- 1 then multiply by max action
        return T.tanh(self.pi(hidden))

    def save_checkpoint(self):
        """Write the state dict to ``self.checkpoint_file``."""
        print('...saving checkpoint...')
        weights=self.state_dict()
        T.save(weights,self.checkpoint_file)

    def load_checkpoint(self):
        """Load the state dict stored at ``self.checkpoint_file``."""
        print('...loading checkpoint...')
        weights=T.load(self.checkpoint_file)
        self.load_state_dict(weights)







Agent AWAC

In [None]:
import os
import torch as T
import torch.nn.functional as F
import numpy as np

class Agent_AWAC():
    """Advantage-weighted actor-critic agent for a discrete action space.

    The agent owns a replay memory seeded with the offline dataset, two actors
    (``actor_beta``, trained while the flag is ``'OFFLINE'``, and
    ``actor_theta``, trained otherwise), a critic and a target critic.

    Each ``learn`` call performs, on one sampled batch:

    * a critic step that regresses the critic's ``V`` head onto the one-step
      target ``r + gamma * V_target(s') * (1 - done)``;
    * an actor step that maximises the log-likelihood of the stored actions,
      weighted by ``softmax(advantage / lambda_)`` over the batch, with the TD
      error used as the advantage;
    * a Polyak update of the target critic.

    Args:
        alpha: actor learning rate.
        beta: critic learning rate.
        input_dims: observation shape as a sequence, e.g. ``[8]``.
        n_actions: number of discrete actions.
        gamma: discount factor.
        layer1_size, layer2_size: hidden layer widths.
        batch_size: number of transitions sampled per ``learn`` call.
        tau: Polyak coefficient for the target critic update.

    NOTE(review): ``choose_action`` keeps sampling from ``actor_beta`` (as the
    original code did) while the online phase trains ``actor_theta`` - confirm
    which policy is meant to act online.
    """

    def __init__(self,alpha,beta,input_dims,n_actions,gamma,layer1_size,layer2_size,batch_size,tau=0.005):
        self.alpha=alpha
        self.beta=beta
        self.input_dims=input_dims
        self.n_actions=n_actions
        self.gamma=gamma
        self.layer1_size=layer1_size
        self.layer2_size=layer2_size
        self.memory=ReplayBuffer_AWAC(input_dims, n_actions)
        self.batch_size=batch_size
        self.tau=tau
        # Log-probability of the most recently sampled action (set by choose_action).
        self.log_prob = None

        self.actor_beta=PolicyNetwork_pi(alpha,input_dims,layer1_size,layer2_size,n_actions=n_actions, name='Actor Pi Beta')
        self.actor_theta=PolicyNetwork_pi(alpha,input_dims,layer1_size,layer2_size,n_actions=n_actions, name='Actor Pi Theta')

        self.critic=QNetwork_Q_phi(beta, input_dims,layer1_size,layer2_size,n_actions=n_actions,name='Critic Q')

        # Kept so the attribute still exists; the update below does not need it
        # because the critic's forward pass ignores its action argument.
        self.target_actor_beta=PolicyNetwork_pi(alpha,input_dims,layer1_size,layer2_size,n_actions=n_actions,name='Target Pi Beta')

        self.target_critic=QNetwork_Q_phi(beta,input_dims,layer1_size,layer2_size,n_actions=n_actions,name='Target Critic')
        # Start the target critic from the online critic's weights; it is then
        # tracked with Polyak averaging after every learn() call.
        self.target_critic.load_state_dict(self.critic.state_dict())

    def choose_action(self,observation):
        """Sample an action from ``actor_beta`` and cache its log-probability.

        Returns:
            The sampled action as a Python int.
        """
        array = np.asarray(observation, dtype=np.float32)[None, ...]
        state = T.tensor(array, dtype=T.float).to(self.actor_beta.device)
        probabilities = F.softmax(self.actor_beta.forward(state), dim=1)
        distribution = T.distributions.Categorical(probabilities)

        action = distribution.sample()
        self.log_prob = distribution.log_prob(action)

        return action.item()

    def remember(self,state,action,reward,state_,done):
        """Store one transition in the agent's replay memory."""
        self.memory.store_transition(state,action,reward,state_,done)

    @staticmethod
    def integrand(lambda_,delta):
        """Return ``exp(delta / lambda_)``, the unnormalised advantage weight."""
        return np.exp((1/lambda_)*np.asarray(delta, dtype=float))

    @staticmethod
    def Z(actions_beta,n_actions,lambda_,delta):
        """Return the partition function ``sum_a pi(a) * exp(delta_a / lambda_)``.

        For a discrete action space the normaliser is a finite sum, so no
        numerical integration is needed.

        Args:
            actions_beta: action probabilities; the first ``n_actions`` are used.
            n_actions: number of discrete actions.
            lambda_: temperature.
            delta: advantage, either a scalar shared by all actions or one
                value per action.
        """
        probabilities = np.asarray(actions_beta, dtype=float)[:n_actions]
        return float(np.sum(probabilities*Agent_AWAC.integrand(lambda_,delta)))

    def _soft_update_target_critic(self):
        """Move the target critic a fraction ``tau`` towards the online critic."""
        with T.no_grad():
            pairs = zip(self.target_critic.parameters(), self.critic.parameters())
            for target_param, param in pairs:
                target_param.mul_(1.0 - self.tau).add_(param, alpha=self.tau)

    def learn(self,n_actions,lambda_,batch_size,flag):
        """Run one critic step and one advantage-weighted actor step.

        Args:
            n_actions: unused; kept for interface compatibility.
            lambda_: temperature of the advantage weights; must be positive.
                Very small values make the weights collapse onto the single
                best sample of the batch.
            batch_size: unused; the size given at construction
                (``self.batch_size``) is what gets sampled, as before.
            flag: ``'OFFLINE'`` trains ``actor_beta``; anything else trains
                ``actor_theta``.

        Returns:
            A list with one float per sampled transition: its one-step
            bootstrapped return ``r + gamma * V_target(s') * (1 - done)``.

        Raises:
            ValueError: if ``lambda_`` is not positive.
        """
        if lambda_ <= 0:
            raise ValueError('lambda_ must be positive')

        states,actions,rewards,next_states,dones=self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        states=T.as_tensor(states,dtype=T.float,device=device)
        next_states=T.as_tensor(next_states,dtype=T.float,device=device)
        rewards=T.as_tensor(rewards,dtype=T.float,device=device)
        # int64 because the actions are used as gather indices below.
        actions=T.as_tensor(actions,dtype=T.long,device=device)
        not_done=1.0-T.as_tensor(dones,dtype=T.bool,device=device).float()

        actor = self.actor_beta if flag=='OFFLINE' else self.actor_theta

        # ---- critic: TD(0) regression of the V head on a fixed target ----
        with T.no_grad():
            # The critic's forward pass ignores its action argument.
            _,next_values=self.target_critic.forward(next_states,None)
            targets=rewards+self.gamma*next_values.squeeze(-1)*not_done
        _,values=self.critic.forward(states,actions)
        delta=targets-values.squeeze(-1)
        critic_loss=delta.pow(2).mean()

        self.critic.optimizer.zero_grad()
        critic_loss.backward()
        self.critic.optimizer.step()

        # ---- actor: advantage-weighted log-likelihood of the stored actions ----
        # Normalising with a softmax over the batch plays the role of 1/Z and
        # stays finite where exp(delta / lambda_) would overflow.
        weights=F.softmax(delta.detach()/lambda_,dim=0)
        log_pi=F.log_softmax(actor.forward(states),dim=1)
        log_pi_taken=log_pi.gather(1,actions.unsqueeze(1)).squeeze(1)
        actor_loss=-(weights*log_pi_taken).sum()

        actor.optimizer.zero_grad()
        actor_loss.backward()
        actor.optimizer.step()

        self._soft_update_target_critic()

        return targets.detach().cpu().tolist()

        
   




        

      
        




In [None]:
# Scratch cell: inspect LunarLander's spaces and the result of a single step.
# NOTE(review): everything assigned here is a notebook global (`n_actions`, `action`,
# `done`, ...) and stays visible to later cells, so it can hide undefined-name bugs
# in them - confirm this cell is still needed.
import gym
env = gym.make('LunarLander-v2')
input_dims=env.observation_space.shape
# Despite its name this holds the action-space object, not a count.
n_actions=env.action_space
print(n_actions)
print(input_dims)
observation=env.reset()
action=env.action_space.sample()
print('action',action)
# The sampled `action` is only printed; the step itself always uses action 3.
state_, reward, done, info =env.step(3)
print(input_dims)
print(state_)
print(reward)
print(info)

Plotting curve

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_learning_curve(x, scores, figure_file, window=100):
    """Plot the trailing running average of ``scores`` and save it to ``figure_file``.

    Args:
        x: x-axis values, one per score.
        scores: sequence of per-iteration scores.
        figure_file: path of the image file to write.
        window: number of most recent scores averaged at each point.
    """
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        # i - window + 1 keeps exactly `window` scores in the slice; the former
        # i - 100 bound averaged 101 scores while the title promised 100.
        running_avg[i] = np.mean(scores[max(0, i - window + 1):(i + 1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous %d scores' % window)
    plt.savefig(figure_file)

Main Function

In [None]:
import gym
import numpy as np

if __name__ == '__main__':
    # AWAC driver: `number_offline_steps` updates on the offline dataset first,
    # then one environment step per update while training online.
    env = gym.make('LunarLander-v2')
    number_offline_steps=3000
    n_games=5
    # NOTE(review): this temperature equals the actor learning rate; such a small
    # value concentrates the advantage weights on very few samples - confirm.
    lambda_=5e-6
    input_dims=[8]
    n_actions=4
    gamma=0.99
    layer1_size=2048
    layer2_size=1536
    batch_size=100
    alpha=5e-6
    beta=5e-5
    agent = Agent_AWAC(alpha,beta,input_dims, n_actions,gamma,layer1_size, layer2_size,batch_size)
    # Reuse the agent's own replay memory. A second ReplayBuffer_AWAC would
    # download the dataset again and would never be sampled by agent.learn().
    buff=agent.memory

    fname = 'AWAC' + 'lunar_lander_' + str(agent.layer1_size) + \
            ':layer1_size' + str(agent.layer2_size) + ':layer2_size'+ str(agent.alpha)+'alpha for actor' + str(agent.beta)+' :beta for critic' +\
            '_' + str(n_games) + 'games'
    figure_file =  fname + '.png'

    scores_history = []
    offline_step=0
    # Online episode state lives outside the loop so that consecutive online
    # steps continue one episode instead of restarting it every iteration.
    observation = env.reset()
    score = 0
    for i in range(n_games):
        if (offline_step>number_offline_steps):
            Flag='ONLINE'
        else:
            Flag='OFFLINE'

        scores=agent.learn(n_actions,lambda_,batch_size,Flag)
        offline_step+=1

        if (Flag=='ONLINE'):
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            # Transitions go through the agent so that they land in the memory
            # that learn() samples from.
            agent.remember(observation,action,reward, observation_,done)
            score+=reward
            if done:
                observation = env.reset()
                score = 0
            else:
                observation = observation_

        print(np.mean(scores))
        scores_history.append(np.mean(scores))

    #x = [i+1 for i in range(n_games)]
    #plot_learning_curve(x, scores_history, figure_file)

TypeError: ignored