In [6]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import obstacle_env
import time
import pickle
import os
from sklearn import pipeline, preprocessing
from sklearn.kernel_approximation import RBFSampler
import continuous_cartpole

# Constants
GAMMA = 0.99
init = True
model_path = 'reinforce_weights.pt'
save = False

class PolicyNetwork(nn.Module):

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super(PolicyNetwork, self).__init__()
        #self.mu = nn.Linear(num_inputs, 1)
        #self.sigma = nn.Linear(num_inputs, 1)
        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        #x = torch.distrbution.Normal(self.mu, self.sigma).sample(torch.Tensor(1).size())
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 
    
class Agent:

    def __init__(self, env, init, path, learning_rate=0.01):
        self.env = env
        self.num_actions = self.env.action_space.n # for discrete actions env
        self.policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.num_actions)
        if not init:
            self.policy_network.load_state_dict(torch.load(path))
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)
        
        

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        #state = process_state(state)
        
        action = self.policy_network.forward(Variable(state))
        #print(probs)
        #print(state)
        highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy())) # for discrete actions env
        #highest_prob_action = [0]
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob

    def update_policy(self, rewards, log_probs):
        discounted_rewards = []

        for t in range(len(rewards)):
            Gt = 0 
            pw = 0
            for r in rewards[t:]:
                Gt = Gt + GAMMA**pw * r
                pw = pw + 1
            discounted_rewards.append(Gt)
            
        discounted_rewards = torch.FloatTensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

        policy_gradient = []
        for log_prob, Gt in zip(log_probs, discounted_rewards):
            policy_gradient.append(-log_prob * Gt)
        
        self.optimizer.zero_grad()
        policy_gradient = torch.stack(policy_gradient).sum()
        policy_gradient.backward()
        if init:
            self.optimizer.step()

    def train(self, max_episode=3000, max_step=1000):
        N_TRIALS = 25
        mean_trial_rewards = []
        std_trial_rewards = []
        timestamps = []
        time_start = time.time()
        episode_rewards = []
        for episode in range(max_episode):
            state = env.reset()
            log_probs = []
            rewards = []
            episode_reward = 0
            for steps in range(max_step):
                action, log_prob = self.get_action(state)
                print(action)
                new_state, reward, done, _ = self.env.step(action)
                
                log_probs.append(log_prob)
                rewards.append(reward)
                episode_reward += reward

                if steps == max_step - 1:#done:
                    self.update_policy(rewards, log_probs)
                    if episode % 10 == 0:
                        print("episode " + str(episode) + ": " + str(episode_reward))

                    break
                
                state = new_state
            episode_rewards.append(episode_reward)
            mean_trial_rewards.append(np.mean(episode_rewards[-N_TRIALS:]))
            std_trial_rewards.append(np.std(episode_rewards[-N_TRIALS:]))
            timestamps.append(time.time())
        return time_start, timestamps, mean_trial_rewards, std_trial_rewards
        


def save_results(mean_return, std_return, timestamps, seed, time_start, env_name, init, name='reinforce'):
    run_dict = {'name': name, 
                'avg_ret': mean_return,
                'std_dev': std_return,
                'time_start': time_start, 
                'timestamps': timestamps,
                'seed': seed,
                'env_name': env_name}
    if not init:
        filename = '../run_files/run_time_%s_%s_%s.pickle' % (name, seed, 'pretrained')
    else:
        filename = '../run_files/run_time_%s_%s.pickle' % (name, seed)
    with open(filename, 'wb') as handle:
        pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
if __name__ == '__main__':
    env_name = 'CartPole-v0'
    #env_name = 'MountainCarContinuous-v0'
    
    
    
    
    
    env = gym.envs.make(env_name)
    #env = continuous_cartpole.ContinuousCartPoleEnv()

    '''
    video_dir = os.path.abspath("./videos")
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    env = gym.wrappers.Monitor(env, video_dir, force=True)
    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    scaler = preprocessing.StandardScaler()
    scaler.fit(observation_examples)
    # Used to convert a state to a featurizes represenation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))
    
    def process_state(state):
        scaled = scaler.transform([state])
        print(state)
        featurized = featurizer.transform(scaled)
        return featurized[0]

    
    '''
    
    #env = gym.make(env_name)
    SEED = 69
    env.seed(SEED);
    np.random.seed(SEED);
    torch.manual_seed(SEED);
    agent = Agent(env, init, model_path)
    time_start, timestamps, mean_trial_rewards, std_trial_rewards = agent.train(500,10)
    save_results(mean_trial_rewards, std_trial_rewards, timestamps, SEED, time_start, env_name, init)
    if save:
        torch.save(agent.policy_network.state_dict(), model_path)

NameError: name 'probs' is not defined

In [None]:
# def main():
#     env = gym.make('obstacle-v0')
#     policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)
    
#     max_episode_num = 5000
#     max_steps = 10000
#     numsteps = []
#     avg_numsteps = []
#     all_rewards = []

#     for episode in range(max_episode_num):
#         state = env.reset()
#         log_probs = []
#         rewards = []

#         for steps in range(max_steps):
#             env.render()
#             action, log_prob = policy_net.get_action(state)
#             new_state, reward, done, _ = env.step(action)
#             log_probs.append(log_prob)
#             rewards.append(reward)

#             if done:
#                 update_policy(policy_net, rewards, log_probs)
#                 numsteps.append(steps)
#                 avg_numsteps.append(np.mean(numsteps[-10:]))
#                 all_rewards.append(np.sum(rewards))
#                 if episode % 1 == 0:
#                     sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))

#                 break
            
#             state = new_state
        
#     plt.plot(numsteps)
#     plt.plot(avg_numsteps)
#     plt.xlabel('Episode')
#     plt.show()