In [4]:
import gym
import math
import torch
import numpy as np
import collections
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical

In [3]:
# Environments to inspect; uncomment an entry to include it.
env_names = [
    # 'Acrobot-v1',
    'CartPole-v1',
    # 'MountainCar-v0',
    # 'Pendulum-v0',
]
envs = list(map(gym.make, env_names))

# One summary line per environment: its string form, step limit,
# observation space, action space and the reward threshold from its spec.
for env in envs:
    print(str(env), env._max_episode_steps, env.observation_space,
          env.action_space, env.spec.reward_threshold)

NameError: name 'gym' is not defined

In [5]:
def in_out_length(env):
    """Return ``(input_length, output_length)`` for a policy network on ``env``.

    ``input_length`` is the first dimension of the observation space shape.
    ``output_length`` is the first dimension of the action space shape for a
    ``Box`` action space, or the number of actions ``n`` for a ``Discrete``
    action space.

    Raises:
        TypeError: if the action space is neither ``Box`` nor ``Discrete``.
            Previously this case failed with an unhelpful ``UnboundLocalError``.
    """
    input_length = env.observation_space.shape[0]
    action_space = env.action_space
    if isinstance(action_space, gym.spaces.box.Box):
        output_length = action_space.shape[0]
    elif isinstance(action_space, gym.spaces.discrete.Discrete):
        output_length = action_space.n
    else:
        raise TypeError(
            'Unsupported action space %r: expected Box or Discrete'
            % (action_space,)
        )
    return input_length, output_length

# class PolicyNN(nn.Module):
#     def __init__(self, input_length, output_length, is_distribution=True, output_quantization_levels=11):
#         super(PolicyNN, self).__init__()
#         self.is_distribution = is_distribution
#         if not self.is_distribution:
#             output_length *= output_quantization_levels
#         self.fc = nn.Linear(input_length, output_length)
        
#     def forward(self, x):
#         x = self.fc(x)
#         return F.softmax(x, dim=1)
# #         if self.is_distribution:
# #             return F.softmax(x, dim=1)
# #         else:
# #             return x

class PolicyNN(nn.Module):
    """Fully connected policy network with two hidden layers of width 16.

    The forward pass applies ``fc1 -> ReLU -> fc2 -> ReLU -> fc3`` and then a
    softmax over dimension 1, so the input needs a leading batch dimension.
    """

    def __init__(self, input_length, output_length):
        super().__init__()
        hidden_width = 16
        self.fc1 = nn.Linear(input_length, hidden_width)
        self.fc2 = nn.Linear(hidden_width, hidden_width)
        self.fc3 = nn.Linear(hidden_width, output_length)

    def forward(self, x):
        """Return the softmax (over dim 1) of the network output for ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return F.softmax(self.fc3(hidden), dim=1)

def select_action_from_policy(model, state):
    """Sample an action from the distribution produced by ``model``.

    ``state`` (a NumPy array) is converted to a float tensor with a leading
    dimension of size one and passed through ``model``; the output is used as
    the probabilities of a ``Categorical`` distribution.

    Returns:
        ``(action, log_prob)``: the sampled action as a Python number, and the
        tensor holding the log-probability of that action.
    """
    batch = torch.from_numpy(state).float().unsqueeze(0)
    distribution = Categorical(model(batch))
    sampled = distribution.sample()
    log_prob = distribution.log_prob(sampled)
    return sampled.item(), log_prob

def standard_normal(x):
    """Standardise ``x`` into a float tensor with zero mean and unit spread.

    The values are centred on their mean and divided by their standard
    deviation (as computed by ``Tensor.std``) plus the float32 machine
    epsilon, which keeps the division finite for constant input.
    """
    values = torch.FloatTensor(x)
    centred = values - values.mean()
    spread = values.std() + np.finfo(np.float32).eps
    return centred / spread

def compute_loss(probs, rewards, states, reward_func, gamma=0.95):
    """Return the negated sum of ``probs[i] * discounted_return[i]``.

    Args:
        probs: per-step values to weight (the training loop passes the
            log-probabilities of the sampled actions).
        rewards: per-step rewards of one episode.
        states: per-step states, only forwarded to ``reward_func``.
        reward_func: callable ``(rewards, states) -> rewards`` applied before
            discounting; ``None`` leaves the rewards untouched.
        gamma: discount factor for the backward accumulation of rewards.

    Returns:
        ``-sum(p * g)`` over the paired steps, where ``g`` is the discounted
        return from that step onwards. The integer ``0`` for an empty episode.
    """
    if reward_func is not None:
        rewards = reward_func(rewards, states)

    # Accumulate discounted returns back to front, then restore step order.
    # Appending and reversing once avoids the quadratic cost of insert(0, ...).
    discounted = []
    running = 0
    for reward in rewards[::-1]:
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    loss = 0
    for prob, ret in zip(probs, discounted):
        loss += prob * ret
    return loss * -1

def is_solved(env, episode_rewards, env_desc=None):
    """Tell whether the mean of the last 10 episode rewards reaches the threshold.

    The threshold is ``env.spec.reward_threshold``; when that is ``None`` the
    ``reward_threshold`` attribute of ``env_desc`` is used instead.

    Args:
        env: environment whose ``spec.reward_threshold`` is consulted first.
        episode_rewards: total reward of every episode played so far.
        env_desc: optional object with a ``reward_threshold`` attribute used
            as a fallback; may be omitted.

    Returns:
        ``False`` while fewer than 10 episodes were played or when no
        threshold is available from either source (instead of crashing on a
        comparison with ``None``); otherwise whether the 10-episode average
        is at least the threshold.
    """
    window = 10
    if len(episode_rewards) < window:
        return False
    threshold = env.spec.reward_threshold
    if threshold is None and env_desc is not None:
        threshold = env_desc.reward_threshold
    if threshold is None:
        # Without a threshold "solved" is undefined; report not solved.
        return False
    return bool(sum(episode_rewards[-window:]) / float(window) >= threshold)

def train_solve(env, reward_func, num_episodes=10*1000,
                output_quantization_levels=11, action_step=0.2, env_desc=None):
    """Train a ``PolicyNN`` on ``env`` until ``is_solved`` reports success.

    Each episode samples actions with ``select_action_from_policy`` and adds
    the value returned by ``compute_loss`` to a running loss; an Adam step
    (lr=0.01) is applied whenever the episode index is a positive multiple of
    10. The environment is rendered during every 200th episode.

    Fixes over the previous version, which could not run: ``PolicyNN`` is
    built with the two arguments its constructor accepts, the quantization
    level count is a parameter instead of a non-existent model attribute,
    discrete actions are passed to the environment unchanged, and
    ``is_solved`` receives its third argument.

    Args:
        env: Gym environment (old API: ``reset()`` returns the state and
            ``step()`` returns a 4-tuple).
        reward_func: callable forwarded to ``compute_loss``.
        num_episodes: maximum number of episodes to play.
        output_quantization_levels: number of discrete levels used for a
            ``Box`` action space; ignored for other action spaces.
        action_step: spacing between adjacent quantized action values.
        env_desc: forwarded unchanged to ``is_solved`` as its third argument.

    Returns:
        The model, whether or not the environment was solved.
    """
    input_length, output_length = in_out_length(env)
    # A Box action space is discretised: the network emits one probability per
    # level. NOTE(review): this assumes a single action component - confirm
    # before using it with multi-dimensional Box spaces.
    quantized = isinstance(env.action_space, gym.spaces.box.Box)
    if quantized:
        output_length *= output_quantization_levels
    model = PolicyNN(input_length, output_length)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    episode_rewards = []
    loss = 0
    for episode in range(num_episodes):
        probs = []
        rewards = []
        states = []
        state = env.reset()
        for t in range(env._max_episode_steps):
            action, prob = select_action_from_policy(model, state)
            if quantized:
                # Centre the level index on zero and scale it to an action value.
                centre = output_quantization_levels // 2
                action = np.array([(action - centre) * action_step])
            state, reward, done, _ = env.step(action)
            if episode > 0 and episode % 200 == 0:
                env.render()
            probs.append(prob)
            rewards.append(reward)
            states.append(state.copy())
            if done:
                break
        # Sum before compute_loss: reward_func may replace the rewards.
        episode_reward = sum(rewards)
        episode_rewards.append(episode_reward)
        if is_solved(env, episode_rewards, env_desc):
            print('Solved in %s episodes!' % (episode+1))
            return model
        loss += compute_loss(probs, rewards, states, reward_func)
        if episode > 0 and episode % 10 == 0:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print('%d \t %0.3f \t %d' % (episode, loss.item(), episode_reward))
            loss = 0
    print('Failed!')
    return model
    
def mountaincar_reward_func(rewards, states):
    """Replace the rewards with bonuses at the episode's peak state values.

    Every state is rescaled to ``(state[0] + 0.5, state[1] / 0.07)``
    (presumably position and velocity - confirm against the environment).
    A step's reward is 0, plus the maximum rescaled first component if the
    step attains it, plus the maximum rescaled second component if the step
    attains it. The maximum first component is printed with two decimals.

    Unlike the previous version this builds a new list and leaves the
    caller's ``rewards`` untouched, and it returns all-zero rewards for empty
    ``states`` instead of raising from ``max()``.

    Args:
        rewards: per-step rewards; only its length is used.
        states: per-step states, indexable with 0 and 1.

    Returns:
        A new list with one shaped reward per entry of ``rewards``.
    """
    if not states:
        return [0 for _ in rewards]
    scaled = [(state[0] + 0.5, state[1] / 0.07) for state in states]  # normalize
    mx = max(first for first, _ in scaled)
    mv = max(second for _, second in scaled)
    print('%.2f' % mx)
    shaped = []
    for i in range(len(rewards)):
        first, second = scaled[i]
        bonus = 0
        if first == mx:
            bonus += mx
        if second == mv:
            bonus += mv
        shaped.append(bonus)
    return shaped

def pendelum_reward_func(rewards, states, tail_length=100):
    """Use ``state[0]`` as the reward for the final steps and 0 before them.

    The previous version always produced 100 zeros followed by the last 100
    values, which only lines up with the per-step log-probabilities when an
    episode has exactly 200 steps. The zero prefix now adapts to the episode
    length, so the result always has one entry per state; for 200-step
    episodes the output is unchanged.

    Args:
        rewards: ignored; the environment rewards are replaced.
        states: per-step states, indexable with 0.
        tail_length: number of trailing steps that keep a non-zero reward.

    Returns:
        A list of ``len(states)`` rewards.
    """
    shaped = [state[0] for state in states]
    cutoff = max(len(shaped) - tail_length, 0)
    return [0] * cutoff + shaped[cutoff:]

# envs = [
# #     ('Acrobot-v1',     lambda rewards, states: rewards),
# #     ('CartPole-v1',    lambda rewards, states: rewards),
# #    ('MountainCar-v0', mountaincar_reward_func)
# #     'MountainCarContinuous-v0',
#     ('Pendulum-v0',   pendelum_reward_func),
# ]
# models = []
# for env_name, reward_func in envs:
#     env = gym.make(env_name)
#     print('Doing %s: %s' % (env_name, (
#         str(env),
#         env._max_episode_steps,
#         env.observation_space,
#         env.action_space,
#         env.spec.reward_threshold,
#     )))
#     model = train_solve(env, reward_func)
#     models.append((env_name, model))

In [8]:
def mountaincar_reward_func(rewards, states, tail_length=50):
    """Use ``state[0]`` as the reward for the final steps and 0 before them.

    The previous version always produced 150 zeros followed by the last 50
    values. That only lines up with the per-step log-probabilities for a
    200-step episode; an episode that ended early got nothing but zeros once
    paired with its shorter list of log-probabilities. The zero prefix now
    adapts to the episode length; for 200-step episodes the output is
    unchanged.

    Args:
        rewards: ignored; the environment rewards are replaced.
        states: per-step states, indexable with 0.
        tail_length: number of trailing steps that keep a non-zero reward.

    Returns:
        A list of ``len(states)`` rewards.
    """
    shaped = [state[0] for state in states]
    cutoff = max(len(shaped) - tail_length, 0)
    return [0] * cutoff + shaped[cutoff:]

def pendelum_reward_func(rewards, states, tail_length=100):
    """Use ``state[0]`` as the reward for the final steps and 0 before them.

    The previous version always produced 100 zeros followed by the last 100
    values, which only lines up with the per-step log-probabilities when an
    episode has exactly 200 steps. The zero prefix now adapts to the episode
    length, so the result always has one entry per state; for 200-step
    episodes the output is unchanged.

    Args:
        rewards: ignored; the environment rewards are replaced.
        states: per-step states, indexable with 0.
        tail_length: number of trailing steps that keep a non-zero reward.

    Returns:
        A list of ``len(states)`` rewards.
    """
    shaped = [state[0] for state in states]
    cutoff = max(len(shaped) - tail_length, 0)
    return [0] * cutoff + shaped[cutoff:]

class PolicyShallowNN(nn.Module):
    """Single linear layer followed by a softmax over dimension 1."""

    def __init__(self, input_length, output_length):
        super().__init__()
        self.fc = nn.Linear(input_length, output_length)

    def forward(self, x):
        """Return the softmax (over dim 1) of the linear layer's output."""
        logits = self.fc(x)
        return logits.softmax(dim=1)

class PolicyDeepNN(nn.Module):
    """Policy network with two ReLU hidden layers of width 16.

    The output of the last linear layer is turned into probabilities with a
    softmax over dimension 1.
    """

    def __init__(self, input_length, output_length):
        super().__init__()
        width = 16
        self.fc1 = nn.Linear(input_length, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, output_length)

    def forward(self, x):
        """Run ``x`` through both hidden layers and the softmax output layer."""
        for hidden_layer in (self.fc1, self.fc2):
            x = F.relu(hidden_layer(x))
        return F.softmax(self.fc3(x), dim=1)

# Field names of an environment description, in positional order.
fields = [
    # filled in for every environment
    'name',
    'model_class',
    # optional per-environment overrides
    'reward_func',
    'output_quantization_levels',
    'output_range',
    'reward_threshold',
]
# Every field defaults to None, so an override is only given where needed.
EnvDescription = collections.namedtuple(
    'EnvDescription',
    fields,
    defaults=(None,) * len(fields),
)

# Environments to train, in order. Fields that are not given stay None.
env_descriptions = [
    EnvDescription('CartPole-v1', PolicyShallowNN),
    EnvDescription('Acrobot-v1', PolicyShallowNN),
    # Disabled: continuous actions, quantized into 101 levels over [-2, 2].
    # EnvDescription(name='Pendulum-v0',
    #                model_class=PolicyDeepNN,
    #                reward_func=pendelum_reward_func,
    #                output_quantization_levels=101,
    #                output_range=[-2, 2],
    #                reward_threshold=-150,
    #               ),
    # Disabled: uses a custom reward function.
    # EnvDescription(name='MountainCar-v0',
    #                model_class=PolicyDeepNN,
    #                reward_func=mountaincar_reward_func
    #               ),
]

def reward_threshold(env, env_desc):
    """Return the threshold from ``env_desc``, else the one from the env spec.

    ``env.spec.reward_threshold`` is only read when
    ``env_desc.reward_threshold`` is ``None``.
    """
    override = env_desc.reward_threshold
    return env.spec.reward_threshold if override is None else override

def is_solved(env, episode_rewards, reward_threshold):
    """Tell whether the mean of the last 10 episode rewards reaches the threshold.

    Returns ``False`` while fewer than 10 episodes were recorded. ``env`` is
    not used.
    """
    window = episode_rewards[-10:]
    enough_episodes = not len(episode_rewards) < 10
    return bool(enough_episodes and sum(window) / 10.0 >= reward_threshold)

def compute_loss(probs, rewards, states=None, reward_func=None, gamma=0.99):
    """Return the negated sum of ``probs[i] * discounted_return[i]``.

    When ``reward_func`` is given, the rewards are first replaced by
    ``reward_func(rewards, states)``. They are then accumulated from the last
    step backwards with discount ``gamma``, each result is paired with the
    matching entry of ``probs``, and the products are summed and multiplied
    by -1. An empty episode yields the integer ``0``.
    """
    if reward_func is not None:
        rewards = reward_func(rewards, states)

    # Walk the episode back to front to build the discounted returns, then
    # put them back into step order.
    discounted = []
    running = 0
    for reward in rewards[::-1]:
        running = reward + gamma * running
        discounted.append(running)
    discounted.reverse()

    total = sum(prob * ret for prob, ret in zip(probs, discounted))
    return total * -1

def train_solve(env_desc, num_episodes=100*1000):
    """Create the environment named in ``env_desc`` and train a policy on it.

    The model is built from ``env_desc.model_class`` with the sizes reported
    by ``in_out_length``; when ``env_desc.output_quantization_levels`` is set,
    the output size is multiplied by it and every sampled action index is
    mapped linearly onto ``env_desc.output_range`` before being passed to the
    environment as a one-element array.

    Each episode adds the value of ``compute_loss`` to a running loss; an Adam
    step (lr=0.01) is taken whenever the episode index is a positive multiple
    of 10. A dot is printed per episode after the first, with a line break
    every 100 episodes. Training stops early once ``is_solved`` returns true.

    Returns:
        ``(env, model)``, both when solved and when ``num_episodes`` run out.
    """
    print('Doing %s' % env_desc.name)
    env = gym.make(env_desc.name)
    n_inputs, n_outputs = in_out_length(env)
    levels = env_desc.output_quantization_levels
    quantized = levels is not None
    if quantized:
        n_outputs *= levels
    model = env_desc.model_class(n_inputs, n_outputs)
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    history = []  # total reward of every finished episode
    batch_loss = 0
    for episode in range(num_episodes):
        if episode > 0:
            print('.', end='\n' if episode % 100 == 0 else '')
        log_probs, rewards, states = [], [], []
        state = env.reset()
        for _step in range(env._max_episode_steps):
            action, log_prob = select_action_from_policy(model, state)
            if quantized:
                # Map the level index linearly onto [low, high].
                low = env_desc.output_range[0]
                high = env_desc.output_range[1]
                action = np.array([low + action * float(high - low) / (levels - 1)])
            state, reward, done, _info = env.step(action)
            log_probs.append(log_prob)
            rewards.append(reward)
            states.append(state.copy())
            if done:
                break
        history.append(sum(rewards))
        if is_solved(env, history, reward_threshold(env, env_desc)):
            print('\nSolved in %s episodes!' % (episode+1))
            return env, model
        batch_loss += compute_loss(log_probs, rewards, states, env_desc.reward_func)
        if episode > 0 and episode % 10 == 0:
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            batch_loss = 0
    print('\nFailed!')
    return env, model

# Train one model per described environment; envs[i] and models[i] belong to
# env_descriptions[i].
envs = []
models = []
for env_desc in env_descriptions:
    env, model = train_solve(env_desc)
    envs += [env]
    models += [model]

Doing CartPole-v1
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
....................................................................................................
.........................................................................

In [21]:
def select_action_from_policy_best(model, state):
    """Return the index of the most probable action for ``state``.

    The previous version only compared the first two outputs, so it could
    never choose a third or later action. This version scans every output of
    the first row. Ties go to the higher index, which keeps the old
    two-action behaviour (0 only when it is strictly more probable than 1).

    Args:
        model: callable mapping a float tensor with a leading dimension of
            one to a 2-D tensor of action probabilities.
        state: NumPy array holding the observation.

    Returns:
        The chosen action index as a Python ``int``.
    """
    state = torch.from_numpy(state).float().unsqueeze(0)
    # Greedy evaluation needs no gradients.
    with torch.no_grad():
        probs = model(state)[0].tolist()
    best = 0
    for index in range(1, len(probs)):
        # "not >" rather than "<=" mirrors the original else-branch exactly.
        if not probs[best] > probs[index]:
            best = index
    return best

def play(env_desc, env, model):
    """Play and render one episode of ``env`` with actions sampled from ``model``.

    Changes from the previous version:
    * the loop runs for ``env._max_episode_steps`` steps, the same limit the
      training loop uses, instead of one step fewer;
    * when ``env_desc.output_quantization_levels`` is set, the sampled level
      index is mapped linearly onto ``env_desc.output_range`` and passed as a
      one-element array, exactly as during training. Previously ``env_desc``
      was ignored and the raw index was sent to the environment.

    Args:
        env_desc: description with ``output_quantization_levels`` and
            ``output_range`` attributes.
        env: Gym environment (old API: ``step()`` returns a 4-tuple).
        model: policy passed to ``select_action_from_policy``.
    """
    levels = env_desc.output_quantization_levels
    state = env.reset()
    for _step in range(env._max_episode_steps):
        action, _log_prob = select_action_from_policy(model, state)
        if levels is not None:
            low = env_desc.output_range[0]
            high = env_desc.output_range[1]
            action = np.array([low + action * float(high - low) / (levels - 1)])
        state, _reward, done, _info = env.step(action)
        env.render()
        if done:
            break

# Index into env_descriptions / envs / models of the environment to replay.
which = 1
print(env_descriptions, envs, models, sep=' \n\n ')
play(env_descriptions[which], envs[which], models[which])

[EnvDescription(name='CartPole-v1', model_class=<class '__main__.PolicyShallowNN'>, reward_func=None, output_quantization_levels=None, output_range=None, reward_threshold=None), EnvDescription(name='Acrobot-v1', model_class=<class '__main__.PolicyShallowNN'>, reward_func=None, output_quantization_levels=None, output_range=None, reward_threshold=None)] 

 [<TimeLimit<CartPoleEnv<CartPole-v1>>>, <TimeLimit<AcrobotEnv<Acrobot-v1>>>] 

 [PolicyShallowNN(
  (fc): Linear(in_features=4, out_features=2, bias=True)
), PolicyShallowNN(
  (fc): Linear(in_features=6, out_features=3, bias=True)
)]
