In [32]:
import sys
import torch  
import gym
import numpy as np  
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import matplotlib.pyplot as plt
import obstacle_env
import time
import pickle

# Constants
GAMMA = 0.99
init = True
model_path = 'reinforce_weights.pt'
save = False

class PolicyNetwork(nn.Module):

    def __init__(self, num_inputs, num_actions, hidden_size=256):
        super(PolicyNetwork, self).__init__()
        self.num_actions = num_actions
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, num_actions)

    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.softmax(self.linear2(x), dim=1)
        return x 
    
class Agent:

    def __init__(self, env, init, path, learning_rate=0.01):
        self.env = env
        self.num_actions = 1 #
        #self.num_actions = self.env.action_space.n # for discrete actions env
        self.policy_network = PolicyNetwork(self.env.observation_space.shape[0], self.num_actions)
        if not init:
            self.policy_network.load_state_dict(torch.load(path))
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)

    def get_action(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_network.forward(Variable(state))
        #print(probs)
        print(state)
        #highest_prob_action = np.random.choice(self.num_actions, p=np.squeeze(probs.detach().numpy())) # for discrete actions env
        highest_prob_action = [0]
        log_prob = torch.log(probs.squeeze(0)[highest_prob_action])
        return highest_prob_action, log_prob

    def update_policy(self, rewards, log_probs):
        discounted_rewards = []

        for t in range(len(rewards)):
            Gt = 0 
            pw = 0
            for r in rewards[t:]:
                Gt = Gt + GAMMA**pw * r
                pw = pw + 1
            discounted_rewards.append(Gt)
            
        discounted_rewards = torch.FloatTensor(discounted_rewards)
        discounted_rewards = (discounted_rewards - discounted_rewards.mean()) / (discounted_rewards.std() + 1e-9) # normalize discounted rewards

        policy_gradient = []
        for log_prob, Gt in zip(log_probs, discounted_rewards):
            policy_gradient.append(-log_prob * Gt)
        
        self.optimizer.zero_grad()
        policy_gradient = torch.stack(policy_gradient).sum()
        policy_gradient.backward()
        if init:
            self.optimizer.step()

    def train(self, max_episode=3000, max_step=1000):
        N_TRIALS = 25
        mean_trial_rewards = []
        std_trial_rewards = []
        timestamps = []
        time_start = time.time()
        episode_rewards = []
        for episode in range(max_episode):
            state = env.reset()
            log_probs = []
            rewards = []
            episode_reward = 0
            for steps in range(max_step):
                action, log_prob = self.get_action(state)
                print(action)
                new_state, reward, done, _ = self.env.step(action)
                
                log_probs.append(log_prob)
                rewards.append(reward)
                episode_reward += reward

                if steps == max_step - 1:#done:
                    self.update_policy(rewards, log_probs)
                    if episode % 10 == 0:
                        print("episode " + str(episode) + ": " + str(episode_reward))

                    break
                
                state = new_state
            episode_rewards.append(episode_reward)
            mean_trial_rewards.append(np.mean(episode_rewards[-N_TRIALS:]))
            std_trial_rewards.append(np.std(episode_rewards[-N_TRIALS:]))
            timestamps.append(time.time())
        return time_start, timestamps, mean_trial_rewards, std_trial_rewards
        


def save_results(mean_return, std_return, timestamps, seed, time_start, env_name, init, name='reinforce'):
    run_dict = {'name': name, 
                'avg_ret': mean_return,
                'std_dev': std_return,
                'time_start': time_start, 
                'timestamps': timestamps,
                'seed': seed,
                'env_name': env_name}
    if not init:
        filename = '../run_files/run_time_%s_%s_%s.pickle' % (name, seed, 'pretrained')
    else:
        filename = '../run_files/run_time_%s_%s.pickle' % (name, seed)
    with open(filename, 'wb') as handle:
        pickle.dump(run_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
if __name__ == '__main__':
    #env_name = 'CartPole-v0'
    env_name = 'MountainCarContinuous-v0'
    
    
    
    
    
    
    env = gym.envs.make("MountainCarContinuous-v0")
    video_dir = os.path.abspath("./videos")
    if not os.path.exists(video_dir):
        os.makedirs(video_dir)
    env = gym.wrappers.Monitor(env, video_dir, force=True)
    # Feature Preprocessing: Normalize to zero mean and unit variance
    # We use a few samples from the observation space to do this
    observation_examples = np.array([env.observation_space.sample() for x in range(10000)])
    scaler = sklearn.preprocessing.StandardScaler()
    scaler.fit(observation_examples)
    # Used to convert a state to a featurizes represenation.
    # We use RBF kernels with different variances to cover different parts of the space
    featurizer = sklearn.pipeline.FeatureUnion([
        ("rbf1", RBFSampler(gamma=5.0, n_components=100)),
        ("rbf2", RBFSampler(gamma=2.0, n_components=100)),
        ("rbf3", RBFSampler(gamma=1.0, n_components=100)),
        ("rbf4", RBFSampler(gamma=0.5, n_components=100))
    ])
    featurizer.fit(scaler.transform(observation_examples))
    def process_state(state):
        scaled = scaler.transform([state])
        featurized = featurizer.transform(scaled)
    return featurized[0]

    
    
    
    #env = gym.make(env_name)
    SEED = 69
    env.seed(SEED);
    np.random.seed(SEED);
    torch.manual_seed(SEED);
    agent = Agent(env, init, model_path)
    time_start, timestamps, mean_trial_rewards, std_trial_rewards = agent.train(500,10)
    save_results(mean_trial_rewards, std_trial_rewards, timestamps, SEED, time_start, env_name, init)
    if save:
        torch.save(agent.policy_network.state_dict(), model_path)

tensor([[-0.5951,  0.0000]])
[0]
tensor([[-5.9454e-01,  5.3198e-04]])
[0]
tensor([[-0.5935,  0.0011]])
[0]
tensor([[-0.5919,  0.0016]])
[0]
tensor([[-0.5898,  0.0021]])
[0]
tensor([[-0.5872,  0.0026]])
[0]
tensor([[-0.5842,  0.0031]])
[0]
tensor([[-0.5807,  0.0035]])
[0]
tensor([[-0.5767,  0.0039]])
[0]
tensor([[-0.5724,  0.0043]])
[0]
episode 0: 0.0
tensor([[-0.5963,  0.0000]])
[0]
tensor([[-5.9572e-01,  5.4067e-04]])
[0]
tensor([[-0.5946,  0.0011]])
[0]
tensor([[-0.5930,  0.0016]])
[0]
tensor([[-0.5909,  0.0021]])
[0]
tensor([[-0.5883,  0.0026]])
[0]
tensor([[-0.5852,  0.0031]])
[0]
tensor([[-0.5816,  0.0036]])
[0]
tensor([[-0.5776,  0.0040]])
[0]
tensor([[-0.5732,  0.0044]])
[0]
tensor([[-0.5656,  0.0000]])
[0]
tensor([[-5.6525e-01,  3.1393e-04]])
[0]
tensor([[-0.5646,  0.0006]])
[0]
tensor([[-0.5637,  0.0009]])
[0]
tensor([[-0.5625,  0.0012]])
[0]
tensor([[-0.5609,  0.0015]])
[0]
tensor([[-0.5591,  0.0018]])
[0]
tensor([[-0.5571,  0.0021]])
[0]
tensor([[-0.5547,  0.0023]])
[0]
tens

tensor([[-0.4487, -0.0024]])
[0]
tensor([[-0.4516, -0.0029]])
[0]
tensor([[-0.4551, -0.0034]])
[0]
tensor([[-0.4590, -0.0040]])
[0]
tensor([[-0.4635, -0.0044]])
[0]
tensor([[-0.4684, -0.0049]])
[0]
tensor([[-0.5095,  0.0000]])
[0]
tensor([[-5.0960e-01, -1.0578e-04]])
[0]
tensor([[-5.0981e-01, -2.1076e-04]])
[0]
tensor([[-5.1012e-01, -3.1416e-04]])
[0]
tensor([[-5.1054e-01, -4.1521e-04]])
[0]
tensor([[-0.5111, -0.0005]])
[0]
tensor([[-0.5117, -0.0006]])
[0]
tensor([[-0.5124, -0.0007]])
[0]
tensor([[-0.5131, -0.0008]])
[0]
tensor([[-0.5140, -0.0009]])
[0]
tensor([[-0.4922,  0.0000]])
[0]
tensor([[-4.9245e-01, -2.3501e-04]])
[0]
tensor([[-4.9292e-01, -4.6827e-04]])
[0]
tensor([[-0.4936, -0.0007]])
[0]
tensor([[-0.4945, -0.0009]])
[0]
tensor([[-0.4957, -0.0011]])
[0]
tensor([[-0.4970, -0.0013]])
[0]
tensor([[-0.4986, -0.0015]])
[0]
tensor([[-0.5003, -0.0017]])
[0]
tensor([[-0.5022, -0.0019]])
[0]
tensor([[-0.5005,  0.0000]])
[0]
tensor([[-5.0072e-01, -1.7274e-04]])
[0]
tensor([[-5.0107e-01

[0]
tensor([[-5.3059e-01,  2.8592e-04]])
[0]
tensor([[-5.3025e-01,  3.3835e-04]])
[0]
tensor([[-5.2986e-01,  3.8824e-04]])
[0]
tensor([[-5.2943e-01,  4.3521e-04]])
[0]
tensor([[-5.2895e-01,  4.7892e-04]])
[0]
tensor([[-0.5233,  0.0000]])
[0]
tensor([[-5.2326e-01, -2.5647e-06]])
[0]
tensor([[-5.2326e-01, -5.1101e-06]])
[0]
tensor([[-5.2327e-01, -7.6172e-06]])
[0]
tensor([[-5.2328e-01, -1.0067e-05]])
[0]
tensor([[-5.2329e-01, -1.2442e-05]])
[0]
tensor([[-5.2331e-01, -1.4723e-05]])
[0]
tensor([[-5.2333e-01, -1.6894e-05]])
[0]
tensor([[-5.2335e-01, -1.8938e-05]])
[0]
tensor([[-5.2337e-01, -2.0840e-05]])
[0]
tensor([[-0.5306,  0.0000]])
[0]
tensor([[-5.3051e-01,  5.2230e-05]])
[0]
tensor([[-5.3041e-01,  1.0407e-04]])
[0]
tensor([[-5.3025e-01,  1.5513e-04]])
[0]
tensor([[-5.3005e-01,  2.0502e-04]])
[0]
tensor([[-5.2979e-01,  2.5338e-04]])
[0]
tensor([[-5.2949e-01,  2.9984e-04]])
[0]
tensor([[-5.2915e-01,  3.4405e-04]])
[0]
tensor([[-5.2876e-01,  3.8567e-04]])
[0]
tensor([[-5.2834e-01,  4.244

tensor([[-0.4921, -0.0013]])
[0]
tensor([[-0.4937, -0.0015]])
[0]
tensor([[-0.4954, -0.0017]])
[0]
tensor([[-0.4974, -0.0020]])
[0]
tensor([[-0.4995, -0.0022]])
[0]
episode 110: 0.0
tensor([[-0.5158,  0.0000]])
[0]
tensor([[-5.1588e-01, -5.8328e-05]])
[0]
tensor([[-5.1600e-01, -1.1622e-04]])
[0]
tensor([[-5.1617e-01, -1.7324e-04]])
[0]
tensor([[-5.1640e-01, -2.2896e-04]])
[0]
tensor([[-5.1668e-01, -2.8296e-04]])
[0]
tensor([[-5.1702e-01, -3.3484e-04]])
[0]
tensor([[-5.1740e-01, -3.8422e-04]])
[0]
tensor([[-5.1783e-01, -4.3071e-04]])
[0]
tensor([[-5.1830e-01, -4.7397e-04]])
[0]
tensor([[-0.5102,  0.0000]])
[0]
tensor([[-5.1027e-01, -1.0070e-04]])
[0]
tensor([[-5.1047e-01, -2.0065e-04]])
[0]
tensor([[-5.1077e-01, -2.9910e-04]])
[0]
tensor([[-5.1116e-01, -3.9530e-04]])
[0]
tensor([[-5.1165e-01, -4.8854e-04]])
[0]
tensor([[-0.5122, -0.0006]])
[0]
tensor([[-0.5129, -0.0007]])
[0]
tensor([[-0.5136, -0.0007]])
[0]
tensor([[-0.5145, -0.0008]])
[0]
tensor([[-0.5134,  0.0000]])
[0]
tensor([[-5.1

tensor([[-0.4843, -0.0024]])
[0]
tensor([[-0.4871, -0.0027]])
[0]
tensor([[-0.4900, -0.0030]])
[0]
tensor([[-0.5085,  0.0000]])
[0]
tensor([[-5.0858e-01, -1.1343e-04]])
[0]
tensor([[-5.0881e-01, -2.2600e-04]])
[0]
tensor([[-5.0915e-01, -3.3688e-04]])
[0]
tensor([[-5.0959e-01, -4.4524e-04]])
[0]
tensor([[-0.5101, -0.0006]])
[0]
tensor([[-0.5108, -0.0007]])
[0]
tensor([[-0.5115, -0.0007]])
[0]
tensor([[-0.5124, -0.0008]])
[0]
tensor([[-0.5133, -0.0009]])
[0]
episode 150: 0.0
tensor([[-0.5516,  0.0000]])
[0]
tensor([[-5.5141e-01,  2.0993e-04]])
[0]
tensor([[-5.5099e-01,  4.1829e-04]])
[0]
tensor([[-0.5504,  0.0006]])
[0]
tensor([[-0.5495,  0.0008]])
[0]
tensor([[-0.5485,  0.0010]])
[0]
tensor([[-0.5473,  0.0012]])
[0]
tensor([[-0.5459,  0.0014]])
[0]
tensor([[-0.5444,  0.0016]])
[0]
tensor([[-0.5427,  0.0017]])
[0]
tensor([[-0.4240,  0.0000]])
[0]
tensor([[-0.4248, -0.0007]])
[0]
tensor([[-0.4262, -0.0015]])
[0]
tensor([[-0.4284, -0.0022]])
[0]
tensor([[-0.4313, -0.0029]])
[0]
tensor([[-0

tensor([[-5.6370e-01,  3.0225e-04]])
[0]
tensor([[-0.5631,  0.0006]])
[0]
tensor([[-0.5622,  0.0009]])
[0]
tensor([[-0.5610,  0.0012]])
[0]
tensor([[-0.5595,  0.0015]])
[0]
tensor([[-0.5578,  0.0017]])
[0]
tensor([[-0.5558,  0.0020]])
[0]
tensor([[-0.5536,  0.0022]])
[0]
tensor([[-0.5511,  0.0025]])
[0]
tensor([[-0.5049,  0.0000]])
[0]
tensor([[-5.0502e-01, -1.4030e-04]])
[0]
tensor([[-5.0530e-01, -2.7955e-04]])
[0]
tensor([[-5.0572e-01, -4.1670e-04]])
[0]
tensor([[-0.5063, -0.0006]])
[0]
tensor([[-0.5070, -0.0007]])
[0]
tensor([[-0.5078, -0.0008]])
[0]
tensor([[-0.5087, -0.0009]])
[0]
tensor([[-0.5097, -0.0010]])
[0]
tensor([[-0.5109, -0.0011]])
[0]
tensor([[-0.5834,  0.0000]])
[0]
tensor([[-5.8295e-01,  4.4609e-04]])
[0]
tensor([[-0.5821,  0.0009]])
[0]
tensor([[-0.5807,  0.0013]])
[0]
tensor([[-0.5790,  0.0018]])
[0]
tensor([[-0.5768,  0.0022]])
[0]
tensor([[-0.5743,  0.0026]])
[0]
tensor([[-0.5713,  0.0029]])
[0]
tensor([[-0.5680,  0.0033]])
[0]
tensor([[-0.5644,  0.0036]])
[0]
ten

tensor([[-0.5478,  0.0006]])
[0]
tensor([[-0.5470,  0.0007]])
[0]
tensor([[-0.5461,  0.0009]])
[0]
tensor([[-0.5450,  0.0011]])
[0]
tensor([[-0.5438,  0.0012]])
[0]
tensor([[-0.5424,  0.0014]])
[0]
tensor([[-0.5408,  0.0015]])
[0]
tensor([[-0.4375,  0.0000]])
[0]
tensor([[-0.4382, -0.0006]])
[0]
tensor([[-0.4394, -0.0013]])
[0]
tensor([[-0.4413, -0.0019]])
[0]
tensor([[-0.4438, -0.0025]])
[0]
tensor([[-0.4469, -0.0031]])
[0]
tensor([[-0.4506, -0.0037]])
[0]
tensor([[-0.4548, -0.0042]])
[0]
tensor([[-0.4596, -0.0047]])
[0]
tensor([[-0.4648, -0.0052]])
[0]
tensor([[-0.5666,  0.0000]])
[0]
tensor([[-5.6624e-01,  3.2130e-04]])
[0]
tensor([[-0.5656,  0.0006]])
[0]
tensor([[-0.5646,  0.0010]])
[0]
tensor([[-0.5634,  0.0013]])
[0]
tensor([[-0.5618,  0.0016]])
[0]
tensor([[-0.5600,  0.0018]])
[0]
tensor([[-0.5579,  0.0021]])
[0]
tensor([[-0.5555,  0.0024]])
[0]
tensor([[-0.5529,  0.0026]])
[0]
tensor([[-0.4332,  0.0000]])
[0]
tensor([[-0.4339, -0.0007]])
[0]
tensor([[-0.4352, -0.0013]])
[0]
te

tensor([[-5.3121e-01,  3.8713e-04]])
[0]
tensor([[-5.3077e-01,  4.4422e-04]])
[0]
tensor([[-5.3027e-01,  4.9797e-04]])
[0]
tensor([[-0.5297,  0.0005]])
[0]
tensor([[-0.4297,  0.0000]])
[0]
tensor([[-0.4304, -0.0007]])
[0]
tensor([[-0.4317, -0.0014]])
[0]
tensor([[-0.4338, -0.0021]])
[0]
tensor([[-0.4365, -0.0027]])
[0]
tensor([[-0.4399, -0.0034]])
[0]
tensor([[-0.4439, -0.0040]])
[0]
tensor([[-0.4485, -0.0046]])
[0]
tensor([[-0.4536, -0.0051]])
[0]
tensor([[-0.4593, -0.0057]])
[0]
tensor([[-0.5009,  0.0000]])
[0]
tensor([[-5.0109e-01, -1.6997e-04]])
[0]
tensor([[-5.0143e-01, -3.3867e-04]])
[0]
tensor([[-0.5019, -0.0005]])
[0]
tensor([[-0.5026, -0.0007]])
[0]
tensor([[-0.5034, -0.0008]])
[0]
tensor([[-0.5044, -0.0010]])
[0]
tensor([[-0.5055, -0.0011]])
[0]
tensor([[-0.5068, -0.0013]])
[0]
tensor([[-0.5082, -0.0014]])
[0]
tensor([[-0.5773,  0.0000]])
[0]
tensor([[-5.7692e-01,  4.0119e-04]])
[0]
tensor([[-0.5761,  0.0008]])
[0]
tensor([[-0.5749,  0.0012]])
[0]
tensor([[-0.5734,  0.0016]])

KeyboardInterrupt: 

In [None]:
# def main():
#     env = gym.make('obstacle-v0')
#     policy_net = PolicyNetwork(env.observation_space.shape[0], env.action_space.n, 128)
    
#     max_episode_num = 5000
#     max_steps = 10000
#     numsteps = []
#     avg_numsteps = []
#     all_rewards = []

#     for episode in range(max_episode_num):
#         state = env.reset()
#         log_probs = []
#         rewards = []

#         for steps in range(max_steps):
#             env.render()
#             action, log_prob = policy_net.get_action(state)
#             new_state, reward, done, _ = env.step(action)
#             log_probs.append(log_prob)
#             rewards.append(reward)

#             if done:
#                 update_policy(policy_net, rewards, log_probs)
#                 numsteps.append(steps)
#                 avg_numsteps.append(np.mean(numsteps[-10:]))
#                 all_rewards.append(np.sum(rewards))
#                 if episode % 1 == 0:
#                     sys.stdout.write("episode: {}, total reward: {}, average_reward: {}, length: {}\n".format(episode, np.round(np.sum(rewards), decimals = 3),  np.round(np.mean(all_rewards[-10:]), decimals = 3), steps))

#                 break
            
#             state = new_state
        
#     plt.plot(numsteps)
#     plt.plot(avg_numsteps)
#     plt.xlabel('Episode')
#     plt.show()