In [12]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import obstacle_env
import time
import pickle

# Constants
GAMMA = 0.99
init = True
model_path = 'reinforce_weights.pt'
save = False

class PolicyNetwork(nn.Module):

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super(PolicyNetwork, self).__init__()
        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 
    
class Agent:

    def __init__(self, env, init, path, learning_rate=0.01):
        self.env = env
        self.num_actions = self.env.action_space.n
        self.policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.env.action_space.n)
        if not init:
            self.policy_network.load_state_dict(torch.load(path))
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_network.forward(Variable(state))
        highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy()))
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob

    def update_policy(self, rewards, log_probs):
        discounted_rewards = []

        for t in range(len(rewards)):
            Gt = 0 
            pw = 0
            for r in rewards[t:]:
                Gt = Gt + GAMMA**pw * r
                pw = pw + 1
            discounted_rewards.append(Gt)
            
        discounted_rewards = torch.FloatTensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

        policy_gradient = []
        for log_prob, Gt in zip(log_probs, discounted_rewards):
            policy_gradient.append(-log_prob * Gt)
        
        self.optimizer.zero_grad()
        policy_gradient = torch.stack(policy_gradient).sum()
        policy_gradient.backward()
        if init:
            self.optimizer.step()

    def train(self, max_episode=3000, max_step=200):
        N_TRIALS = 25
        mean_trial_rewards = []
        std_trial_rewards = []
        timestamps = []
        time_start = time.time()
        episode_rewards = []
        for episode in range(max_episode):
            state = env.reset()
            log_probs = []
            rewards = []
            episode_reward = 0

            for steps in range(max_step):
                action, log_prob = self.get_action(state)
                new_state, reward, done, _ = self.env.step(action)
                
                log_probs.append(log_prob)
                rewards.append(reward)
                episode_reward += reward
                
                if done:
                    print(done, new_state, reward)
                    self.update_policy(rewards, log_probs)
                    if episode % 10 == 0:
                        print("episode " + str(episode) + ": " + str(episode_reward))

                    break
                
                state = new_state
            episode_rewards.append(episode_reward)
            mean_trial_rewards.append(np.mean(episode_rewards[-N_TRIALS:]))
            std_trial_rewards.append(np.std(episode_rewards[-N_TRIALS:]))
            timestamps.append(time.time())
        return time_start, timestamps, mean_trial_rewards, std_trial_rewards
        


def save_results(mean_return, std_return, timestamps, seed, time_start, env_name, init, name='reinforce'):
    run_dict = {'name': name, 
                'avg_ret': mean_return,
                'std_dev': std_return,
                'time_start': time_start, 
                'timestamps': timestamps,
                'seed': seed,
                'env_name': env_name}
    if not init:
        filename = '../run_files/run_time_%s_%s_%s.pickle' % (name, seed, 'pretrained')
    else:
        filename = '../run_files/run_time_%s_%s.pickle' % (name, seed)
    #with open(filename, 'wb') as handle:
    #    pass#pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
if __name__ == '__main__':
    env_name = 'CartPole-v0'
    #env_name = 'MountainCar-v0'
    env = gym.make(env_name)
    SEED = 1234
    env.seed(SEED);
    np.random.seed(SEED);
    #env.render()
    torch.manual_seed(SEED);
    agent = Agent(env, init, model_path)
    time_start, timestamps, mean_trial_rewards, std_trial_rewards = agent.train(500,300)
    save_results(mean_trial_rewards, std_trial_rewards, timestamps, SEED, time_start, env_name, init)
    if save:
        torch.save(agent.policy_network.state_dict(), model_path)

True [0.36719117 0.08074458 0.2194969  1.36973359] 1.0
episode 0: 69.0
True [ 0.07122658  0.77459367 -0.22127263 -1.70040789] 1.0
True [ 0.10505218  1.21829256 -0.22349921 -2.04431334] 1.0
True [ 0.06488043  0.18985897 -0.21440691 -0.99718976] 1.0
True [-0.05309283 -0.16781267 -0.21080176 -0.6808287 ] 1.0
True [-1.94743236 -2.93712361  0.23027384  1.73228313] 1.0
True [-0.43562271 -0.89585787 -0.22651089 -0.65617565] 1.0
True [-0.10541313 -0.29491654 -0.22044357 -0.73294291] 1.0
True [ 0.11165356  0.04765813 -0.22004849 -0.78070678] 1.0
True [-0.55648532 -0.89074769 -0.22004459 -0.61460657] 1.0
True [ 2.41966006  2.58145836 -0.16321557 -0.58225845] 1.0
episode 10: 104.0
True [-0.04149668 -0.51873263 -0.21322947 -0.2526288 ] 1.0
True [-0.65336738 -1.11642921 -0.21783067 -0.37508981] 1.0
True [0.41895074 0.91619705 0.21336921 0.33635478] 1.0
True [-0.33328076 -0.84578098 -0.21177714 -0.50659281] 1.0
True [-0.54216855 -0.93260524 -0.21816503 -0.28798094] 1.0
True [0.1923691  0.87508247 0.

KeyboardInterrupt: 

In [2]:
# def main():
#     env = gym.make('obstacle-v0')
#     policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)
    
#     max_episode_num = 5000
#     max_steps = 10000
#     numsteps = []
#     avg_numsteps = []
#     all_rewards = []

#     for episode in range(max_episode_num):
#         state = env.reset()
#         log_probs = []
#         rewards = []

#         for steps in range(max_steps):
#             env.render()
#             action, log_prob = policy_net.get_action(state)
#             new_state, reward, done, _ = env.step(action)
#             log_probs.append(log_prob)
#             rewards.append(reward)

#             if done:
#                 update_policy(policy_net, rewards, log_probs)
#                 numsteps.append(steps)
#                 avg_numsteps.append(np.mean(numsteps[-10:]))
#                 all_rewards.append(np.sum(rewards))
#                 if episode % 1 == 0:
#                     sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))

#                 break
            
#             state = new_state
        
#     plt.plot(numsteps)
#     plt.plot(avg_numsteps)
#     plt.xlabel('Episode')
#     plt.show()