In [1]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import obstacle_env
import time
import pickle

# Constants
# Discount factor applied to future rewards when computing returns.
GAMMA = 0.99
# True: start from freshly initialised weights. False: load the weights from
# model_path, skip optimizer steps, and tag the results file as 'pretrained'.
init = True
# File the policy weights are loaded from (init=False) or saved to (save=True).
model_path = 'reinforce_weights.pt'
# When True, the policy weights are written to model_path after training.
save = False

class PolicyNetwork(nn.Module):
    """Two-layer perceptron mapping a state vector to action probabilities.

    Args:
        num_inputs: Number of features in a state vector.
        num_actions: Number of discrete actions (size of the output).
        hidden_size: Width of the single hidden layer.
    """

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super().__init__()
        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        """Return the action distribution for ``state``.

        Args:
            state: Float tensor whose last dimension has ``num_inputs``
                entries; any leading (batch) dimensions are preserved.

        Returns:
            Tensor with the same leading dimensions as ``state`` and a last
            dimension of ``num_actions`` whose entries sum to 1.
        """
        hidden = F.relu(self.linear1(state))
        # Normalise over the last (action) axis so that both a single
        # un-batched state and a batch of states are handled; for the usual
        # 2-D (batch, features) input this is the same axis as dim=1.
        return F.softmax(self.linear2(hidden), dim=-1)
    
class Agent:
    """REINFORCE (Monte-Carlo policy gradient) agent for a discrete-action env.

    Args:
        env: Environment exposing ``reset()``, ``step(action)``,
            ``action_space.n`` and ``observation_space.shape``.
        init: If true, the policy starts from freshly initialised weights and
            is trained. If false, the weights are loaded from ``path`` and
            optimizer steps are skipped.
        path: File holding a saved ``state_dict``; read only when ``init`` is
            false.
        learning_rate: Learning rate of the Adam optimizer.
    """

    def __init__(self, env, init, path, learning_rate=0.01):
        self.env = env
        # Remember the flag on the instance so update_policy does not depend
        # on a module-level global of the same name.
        self.init = init
        self.num_actions = self.env.action_space.n
        self.policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.num_actions)
        if not init:
            self.policy_network.load_state_dict(torch.load(path))
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)

    def get_action(self, state):
        """Sample an action from the current policy.

        Args:
            state: Observation as a NumPy array.

        Returns:
            Tuple ``(action, log_prob)``: the sampled action index and the
            log-probability of that action as a tensor attached to the
            policy network's graph.
        """
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_network(state).squeeze(0)
        # The action is sampled from the distribution, not taken greedily.
        action = np.random.choice(self.num_actions, p=probs.detach().numpy())
        return action, torch.log(probs[action])

    @staticmethod
    def _discounted_returns(rewards, gamma):
        """Return the discounted return G_t for every step, in one pass."""
        returns = []
        running = 0.0
        # Walk backwards so each return reuses the one after it:
        # G_t = r_t + gamma * G_{t+1}.
        for reward in reversed(rewards):
            running = reward + gamma * running
            returns.append(running)
        returns.reverse()
        return returns

    def update_policy(self, rewards, log_probs):
        """Run one policy-gradient update from a finished episode.

        Args:
            rewards: Per-step rewards of the episode.
            log_probs: Per-step log-probability tensors returned by
                ``get_action``, aligned with ``rewards``.
        """
        if not rewards:
            # An empty episode carries no learning signal (and torch.stack
            # would fail on an empty list).
            return

        returns = torch.FloatTensor(self._discounted_returns(rewards, GAMMA))
        if len(rewards) > 1:
            # Normalise the returns to reduce gradient variance.
            returns = (returns - returns.mean()) / (returns.std() + 1e-9)
        # With a single step the sample std is NaN, which would poison the
        # weights; the raw return is used unnormalised in that case.

        loss = torch.stack([-log_prob * Gt for log_prob, Gt in zip(log_probs, returns)]).sum()

        self.optimizer.zero_grad()
        loss.backward()
        if self.init:
            self.optimizer.step()

    def train(self, max_episode=3000, max_step=200):
        """Run training episodes and collect running reward statistics.

        The policy is updated once per episode, whether the episode ended
        because the environment reported ``done`` or because ``max_step`` was
        reached.

        Args:
            max_episode: Number of episodes to run.
            max_step: Maximum number of steps per episode.

        Returns:
            Tuple ``(time_start, timestamps, mean_trial_rewards,
            std_trial_rewards)``: the wall-clock start time, one timestamp per
            episode, and the mean / standard deviation of the episode reward
            over the most recent 25 episodes, one entry per episode.
        """
        N_TRIALS = 25
        mean_trial_rewards = []
        std_trial_rewards = []
        timestamps = []
        episode_rewards = []
        time_start = time.time()

        for episode in range(max_episode):
            state = self.env.reset()
            log_probs = []
            rewards = []

            for _step in range(max_step):
                action, log_prob = self.get_action(state)
                state, reward, done, _info = self.env.step(action)
                log_probs.append(log_prob)
                rewards.append(reward)
                if done:
                    break

            episode_reward = sum(rewards)
            self.update_policy(rewards, log_probs)
            if episode % 10 == 0:
                print("episode " + str(episode) + ": " + str(episode_reward))

            episode_rewards.append(episode_reward)
            recent = episode_rewards[-N_TRIALS:]
            mean_trial_rewards.append(np.mean(recent))
            std_trial_rewards.append(np.std(recent))
            timestamps.append(time.time())

        return time_start, timestamps, mean_trial_rewards, std_trial_rewards
        


def save_results(mean_return, std_return, timestamps, seed, time_start, env_name, init, name='reinforce'):
    """Pickle the statistics of one training run under ``../run_files``.

    The output directory is created if it does not exist yet, so a finished
    run is not lost to a missing folder.

    Args:
        mean_return: Running mean of the episode reward, one entry per episode.
        std_return: Running standard deviation of the episode reward.
        timestamps: Wall-clock time recorded at the end of every episode.
        seed: Random seed of the run; becomes part of the file name.
        time_start: Wall-clock time at which training started.
        env_name: Name of the environment the run was performed on.
        init: If false, the file name gets a ``_pretrained`` suffix.
        name: Algorithm label stored in the record and used in the file name.
    """
    import os  # local import so the block does not rely on a file-level one

    run_dict = {'name': name,
                'avg_ret': mean_return,
                'std_dev': std_return,
                'time_start': time_start,
                'timestamps': timestamps,
                'seed': seed,
                'env_name': env_name}
    if not init:
        filename = '../run_files/run_time_%s_%s_%s.pickle' % (name, seed, 'pretrained')
    else:
        filename = '../run_files/run_time_%s_%s.pickle' % (name, seed)
    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as handle:
        pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
if __name__ == '__main__':
    # Script entry point: train a REINFORCE agent on CartPole with a fixed
    # seed, store the run statistics and optionally the trained weights.
    SEED = 69
    env_name = 'CartPole-v0'
    env = gym.make(env_name)

    # Seed every source of randomness before the agent is built.
    env.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    agent = Agent(env, init, model_path)
    run_stats = agent.train(500, 200)
    time_start, timestamps, mean_trial_rewards, std_trial_rewards = run_stats

    save_results(mean_trial_rewards, std_trial_rewards, timestamps, SEED, time_start, env_name, init)

    if save:
        torch.save(agent.policy_network.state_dict(), model_path)

  result = entry_point.load(False)


episode 0: 47.0
episode 10: 13.0
episode 20: 11.0
episode 30: 13.0
episode 40: 9.0
episode 50: 11.0
episode 60: 9.0
episode 70: 9.0
episode 80: 8.0
episode 90: 10.0
episode 100: 9.0
episode 110: 40.0
episode 120: 31.0
episode 130: 62.0
episode 140: 43.0
episode 150: 44.0
episode 160: 41.0
episode 170: 42.0
episode 180: 66.0
episode 190: 43.0
episode 200: 41.0
episode 210: 35.0
episode 220: 41.0
episode 230: 53.0
episode 240: 57.0
episode 250: 71.0
episode 260: 47.0
episode 270: 76.0
episode 280: 44.0
episode 290: 61.0
episode 300: 46.0
episode 310: 156.0
episode 320: 57.0
episode 330: 44.0
episode 340: 120.0
episode 350: 90.0
episode 360: 61.0
episode 370: 64.0
episode 380: 49.0
episode 390: 52.0
episode 400: 91.0
episode 410: 89.0
episode 420: 48.0
episode 430: 54.0
episode 440: 45.0
episode 450: 190.0
episode 460: 97.0
episode 470: 56.0
episode 480: 94.0
episode 490: 52.0


In [2]:
# def main():
#     env = gym.make('obstacle-v0')
#     policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)
    
#     max_episode_num = 5000
#     max_steps = 10000
#     numsteps = []
#     avg_numsteps = []
#     all_rewards = []

#     for episode in range(max_episode_num):
#         state = env.reset()
#         log_probs = []
#         rewards = []

#         for steps in range(max_steps):
#             env.render()
#             action, log_prob = policy_net.get_action(state)
#             new_state, reward, done, _ = env.step(action)
#             log_probs.append(log_prob)
#             rewards.append(reward)

#             if done:
#                 update_policy(policy_net, rewards, log_probs)
#                 numsteps.append(steps)
#                 avg_numsteps.append(np.mean(numsteps[-10:]))
#                 all_rewards.append(np.sum(rewards))
#                 if episode % 1 == 0:
#                     sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))

#                 break
            
#             state = new_state
        
#     plt.plot(numsteps)
#     plt.plot(avg_numsteps)
#     plt.xlabel('Episode')
#     plt.show()