Starting notebook for this project. Instantiate and run Policy Gradient on CartPole environment. 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym 
#from gym import wrappers
import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'matplotlib'

In [None]:


class PolicyNetwork(nn.Module):
    def __init__(self,lr,input_dims,fc1_dims,fc2_dims,n_actions):
        super(PolicyNetwork,self).__init__()
        self.input_dims = input_dims
        self.lr = lr
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        
        self.fc1 = nn.Linear(*self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims,self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims,self.n_actions)
        
        self.optimizer = torch.optim.Adam(self.parameters(),lr=lr)
        
        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        print("Device: ", self.device)
        self.to(self.device)
        
    def forward(self,observation):
        state = torch.Tensor(observation).to(self.device)
        
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x) #no activation, handled later
        
        return x


class Agent(object):
    #contains policy network and more!
    def __init__(self, lr, input_dims, gamma = 0.99, n_actions = 4, l1_size = 256, l2_size = 256,batch_size = 32):
    
        self.gamma = gamma
        self.reward_memory = [] #way of keeping track of rewards 
        self.action_memory = [] # and actions the agent took
        self.batch_action_memory = [] #way to keep track or rewards and action lists inside a list of length batch size
        self.batch_reward_memory = []
        self.batch_size = batch_size
        self.policy = PolicyNetwork(lr, input_dims, l1_size, l2_size, n_actions) #probability distribution used by the agent to select actions, given an observation/state. 
    
    def choose_action(self, observation):
        probabilities = F.softmax(self.policy.forward(observation))
        
        #now calculate an actual distribution from this. 
        
        action_probs = torch.distributions.Categorical(probabilities) # probability distribuition dictated by policy network
        
        # now we pick an action using sample method.
        
        action = action_probs.sample()
        
        #keep track of action log probability... log pi!  You know this!
        
        log_probs = action_probs.log_prob(action) #clearly this is a Pytorch specific method of how to select. 
        #looks like it takes distribution, and calculates log probability of that specific action. cool. 
        
        
        #now save. 
        
        self.action_memory.append(log_probs) #save a step in an episode. 
        
        #return an action
        
        return action.item() #this item is an integer, now able to get fed into gym env. 
    
    def store_rewards(self,reward):
        self.reward_memory.append(reward) #why not just do this in main? idk, it's a little more convenient. 
        
        
    def learn(self):
        #heart of the problem. Now for doing it with an entire batch..
        
        
        self.policy.optimizer.zero_grad()
        
        
        
        #It learns at the end of the episode, which is not a good thing! Would want an average of future returns, not per ep. 
        Gs = []
        for episode in self.batch_reward_memory:
            
            G = np.zeros_like(self.reward_memory,dtype = np.float64) #For mc Reinforce
            for t in range(len(self.reward_memory)):
                G_sum = 0
                discount = 1 
                for k in range(t,len(self.reward_memory)): #why from t? Rewards to go! 
                    G_sum += self.reward_memory[k] * discount
                    discount *= self.gamma  # decreases for future time steps. 
                
                G[t] = G_sum  # at the end of episode, store sum of returns at timestep t
            
            #standardize to reduce variance. Free lunch! 
            mean = np.mean(G)
            std = np.std(G) if np.std(G) > 0 else 1

            G = (G - mean)/std
            G = torch.Tensor(G).to(self.policy.device) #for some reason, worked like this. Phil had to use a specific data type. 
            Gs.append(G)
        
        
        loss = 0
        for G, action_memory in zip(Gs, self.batch_action_memory):
            for g, logprob in zip(G, action_memory):
                loss += -g * logprob #weight each probbility by future + current reward at that timestpe

                    #it's objective is to maximize this prbabil8ty 
            #spoilers. To do this for multiple trajectories, need to iterate over N episodes, and average these losses. 
            
            #another spoiler. Baseline should be subjtracted from here, perhaps just as mean G? 
            
            # another spoiler. Critic also goes here too! 
        #backprop!
        loss = loss / self.batch_size # so it's an average. 
        
        loss.backward()
        self.policy.optimizer.step()

        #zero out and repeat. This is a Sample inefficient MC, and a future improvement 
        self.action_memory = []
        self.reward_memory = []
        self.batch_action_memory = []
        self.batch_reward_memory = []

            #now for the main!

In [8]:


#Main 
score_history = []
score = 0
n_episodes = 800
batch_size = 1
#env = gym.make('LunarLander-v2')
env = gym.make('CartPole-v0')

agent = Agent(lr = 0.001, input_dims=[4],gamma=0.99,n_actions=env.action_space.n
             ,l1_size=128,l2_size=128,batch_size=batch_size)



#env = wrappers.Monitor(env, 'tmp/lunar-lander',
 #                      video_callable=lambda episodeid: True, force=True)

for i in range(n_episodes):

    for _ in range(batch_size):
        done = False
        score = 0
        observation = env.reset()
        while not done:
           # env.render()
            action = agent.choose_action(observation)
            observation_,reward, done, info = env.step(action)
            agent.store_rewards(reward)
            observation = observation_ #set the old obvs to the new one
            score += reward
        agent.batch_action_memory.append(agent.action_memory)
        agent.batch_reward_memory.append(agent.reward_memory)
    
    score_history.append(score)
    agent.learn()
    print('episode : ', i, 'score ', score)



Device:  cuda:0




episode :  0 score  15.0
episode :  1 score  47.0
episode :  2 score  13.0
episode :  3 score  25.0
episode :  4 score  17.0
episode :  5 score  14.0
episode :  6 score  8.0
episode :  7 score  16.0
episode :  8 score  18.0
episode :  9 score  26.0
episode :  10 score  75.0
episode :  11 score  41.0
episode :  12 score  32.0
episode :  13 score  34.0
episode :  14 score  75.0
episode :  15 score  38.0
episode :  16 score  19.0
episode :  17 score  43.0
episode :  18 score  42.0
episode :  19 score  64.0
episode :  20 score  27.0
episode :  21 score  88.0
episode :  22 score  19.0
episode :  23 score  45.0
episode :  24 score  52.0
episode :  25 score  17.0
episode :  26 score  18.0
episode :  27 score  35.0
episode :  28 score  19.0
episode :  29 score  14.0
episode :  30 score  35.0
episode :  31 score  24.0
episode :  32 score  14.0
episode :  33 score  28.0
episode :  34 score  20.0
episode :  35 score  12.0
episode :  36 score  60.0
episode :  37 score  27.0
episode :  38 score  34

KeyboardInterrupt: 

In [2]:
env = gym.make('CartPole-v0')

In [4]:
env.render()

NoSuchDisplayException: Cannot connect to "None"

In [9]:


def show_video():
    mp4list = glob.glob('video/*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else: 
        print("Could not find video")


def wrap_env(env):
    env = Monitor(env, './video', force=True)
    return env



In [5]:
env = gym.make('CartPole-v0')
env.monitor.start('/tmp/cartpole-experiment-1', force=True)
observation = env.reset()
for t in range(100):
#    env.render()
    print(observation)
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    if done:
        print("Episode finished after {} timesteps".format(t+1))
        break

env.monitor.close()

AttributeError: 'CartPoleEnv' object has no attribute 'monitor'

In [6]:
!pip install matplotlib

Collecting matplotlib
  Downloading matplotlib-3.3.4.tar.gz (37.9 MB)
     |################################| 37.9 MB 3.5 kB/s             
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting cycler>=0.10
  Downloading cycler-0.10.0-py2.py3-none-any.whl (6.5 kB)
Collecting kiwisolver>=1.0.1
  Downloading kiwisolver-1.3.1-cp36-cp36m-manylinux2014_aarch64.whl (77 kB)
     |################################| 77 kB 2.7 MB/s             
Building wheels for collected packages: matplotlib
  Building wheel for matplotlib (setup.py) ... [?25ldone
[?25h  Created wheel for matplotlib: filename=matplotlib-3.3.4-cp36-cp36m-linux_aarch64.whl size=9965820 sha256=dec106dbb41f5bee6534eb3cb9c0b65e96f52178ac5678eafa29ff06ee522b4c
  Stored in directory: /root/.cache/pip/wheels/09/f4/84/841a4c463638ce3204dcb7137673efe82f22fba82544946b1f
Successfully built matplotlib
Installing collected packages: kiwisolver, cycler, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.3.1 matplo

Solution from https://stackoverflow.com/questions/40195740/how-to-run-openai-gym-render-over-a-server.

Idea from this comment is to just capture renders as video by using OpenAI Gym wrappers.Monitor and then display it within the Notebook.

In [7]:
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()


Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package python-opengl
Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package ffmpeg
Reading package lists... Done
Building dependency tree       
Reading state information... Done
E: Unable to locate package xvfb
Collecting pyvirtualdisplay
  Downloading PyVirtualDisplay-2.2-py3-none-any.whl (15 kB)
Collecting EasyProcess
  Downloading EasyProcess-0.3-py2.py3-none-any.whl (7.9 kB)
Installing collected packages: EasyProcess, pyvirtualdisplay
Successfully installed EasyProcess-0.3 pyvirtualdisplay-2.2


EasyProcessError: start error <EasyProcess cmd_param=['Xvfb', '-help'] cmd=['Xvfb', '-help'] oserror=[Errno 2] No such file or directory: 'Xvfb': 'Xvfb' return_code=None stdout="None" stderr="None" timeout_happened=False>

In [8]:
import gym
from gym import wrappers

env = gym.make("CartPole-v0")
env = wrappers.Monitor(env, "/tmp/CartPole-v0")

for episode in range(2):
    observation = env.reset()
    step = 0
    total_reward = 0

    while True:
        step += 1
        env.render()
        action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        total_reward += reward
        if done:
            print("Episode: {0},\tSteps: {1},\tscore: {2}"
                  .format(episode, step, total_reward)
            )
            break
env.close()

NoSuchDisplayException: Cannot connect to "None"