In [1]:
import torch
import gym
import numpy as np

In [3]:
from tensorboardX import SummaryWriter

In [4]:
## Cross-entropy method: sample episodes, keep the high-reward ones, and train the policy network on them

# 1 nn  构造

In [5]:
## Network structure: number of units in the hidden layer
HIDDEN = 128
## Training parameters
## BATCH_SIZE: number of episodes collected per training batch
BATCH_SIZE = 16
## PERCENTILE: reward percentile used as the cut-off when selecting episodes to train on
PERCENTILE = 70

In [6]:
class Net(torch.nn.Module):
    """Small feed-forward policy network: observation in, one raw score per action out.

    The architecture is Linear -> ReLU -> Linear. The final layer is linear, so
    ``forward`` returns unnormalised scores (logits); any softmax has to be
    applied by the caller.

    Args:
        obs_size: number of input features of the first linear layer.
        hidden_size: number of units in the hidden layer.
        n_actions: number of outputs of the last linear layer.
    """

    def __init__(self, obs_size, hidden_size, n_actions):
        # Initialise torch.nn.Module before registering any sub-module.
        super().__init__()
        layers = [
            torch.nn.Linear(obs_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, n_actions),
        ]
        self.net = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the stacked layers and return the resulting scores."""
        scores = self.net(x)
        return scores

In [7]:
from collections import namedtuple

In [10]:
## namedtuple builds lightweight record types with named fields; it is used for the
## episode records defined in the following cells.
## (The interactive `namedtuple?` help query was removed: it is IPython-only syntax
## and pops up the pager whenever the notebook is re-run top to bottom.)

In [11]:
## Record for one finished episode: `reward` and the list of `steps` it consisted of
Episode = namedtuple('Episode', ['reward', 'steps'])

In [12]:
## Record for a single step inside an episode: the `observation` seen and the `action` taken
EpisodeStep = namedtuple('EpisodeStep', ['observation', 'action'])

# 2 交互过程

In [89]:
def iterate_batch(env,net,batch_size):
    """Endlessly play episodes with the current policy and yield them in batches.

    This is an on-policy sampler: at every step the observation is fed through
    ``net``, the scores are turned into probabilities with a softmax, and the
    action is drawn at random from that distribution. ``net`` is queried on
    every step, so weight updates made by the caller between two yielded
    batches are picked up by the following episodes.

    Args:
        env: environment exposing ``reset() -> obs`` and
            ``step(action) -> (next_obs, reward, is_done, info)``.
        net: callable mapping a float32 tensor holding one observation (with a
            leading batch axis of size 1) to one score per action.
        batch_size: number of finished episodes per yielded batch (>= 1).

    Yields:
        A list of ``batch_size`` ``Episode`` records. Each holds the summed
        reward of the episode and its list of ``EpisodeStep`` records. The
        generator never terminates on its own.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised on the first
            iteration, since this is a generator). Without this check the loop
            would run forever without ever yielding.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1, got %r" % (batch_size,))

    batch = []
    episode_reward = 0.0
    episode_steps = []
    obs = env.reset()
    op_sm = torch.nn.Softmax(dim=1)
    while True:
        # Convert through numpy and add the batch axis afterwards; wrapping the
        # observation in a Python list first forces torch onto a slow path.
        obs_value = torch.as_tensor(np.asarray(obs, dtype=np.float32)).unsqueeze(0)
        # Sampling needs no gradients, so skip building the autograd graph.
        with torch.no_grad():
            action_prob = op_sm(net(obs_value)).numpy()[0]
        # Draw the action index according to the policy's probabilities.
        action = np.random.choice(len(action_prob), p=action_prob)
        # Advance the environment by one step.
        next_obs, reward, is_done, _ = env.step(action)
        episode_reward += reward
        # Store the observation the action was chosen FROM, not the resulting one.
        episode_steps.append(EpisodeStep(observation=obs, action=action))
        if is_done:
            batch.append(Episode(reward=episode_reward, steps=episode_steps))
            episode_reward = 0.0
            episode_steps = []
            next_obs = env.reset()
            if len(batch) == batch_size:
                yield batch
                batch = []
        obs = next_obs

# 3 学习过程

In [90]:
def filter_batch(batch,percentile):
    """Select the best episodes of a batch and flatten them into training tensors.

    The reward at the given percentile of the batch is used as a cut-off:
    episodes whose reward is strictly below it are dropped, while episodes with
    a reward equal to or above it are kept. All steps of the kept episodes are
    concatenated in batch order.

    Args:
        batch: sequence of records with ``reward`` and ``steps`` attributes,
            where every step has ``observation`` and ``action`` attributes.
        percentile: percentile (0-100) passed to ``np.percentile``.

    Returns:
        A tuple ``(train_obs_value, train_act_value, reward_bound, reward_mean)``:
        a float32 tensor of the kept observations stacked along a new leading
        axis, an int64 tensor of the matching actions, the reward cut-off, and
        the mean reward over the whole (unfiltered) batch.
    """
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    reward_mean = np.mean(rewards)

    train_obs = []
    train_act = []
    for episode in batch:
        if episode.reward < reward_bound:
            continue
        train_obs.extend(step.observation for step in episode.steps)
        train_act.extend(step.action for step in episode.steps)

    # Stack into a single ndarray before handing over to torch: constructing a
    # tensor directly from a list of ndarrays is a known slow path.
    train_obs_value = torch.as_tensor(np.asarray(train_obs), dtype=torch.float32)
    train_act_value = torch.as_tensor(np.asarray(train_act), dtype=torch.int64)
    return train_obs_value,train_act_value,reward_bound,reward_mean

In [76]:
## Environment the policy interacts with, created by gym from the id 'CartPole-v0'
env = gym.make('CartPole-v0')

In [77]:
## Length of the first axis of the observation space; used as the network's input size
obs_size = env.observation_space.shape[0]

In [78]:
## Value of action_space.n; used as the number of network outputs (one score per action)
action_size = env.action_space.n

# 3.1 TensorBoard visualization

In [107]:
## Index of this experiment run; it is interpolated into the log directory name below
num_exp = 4
## Summary writer that logs this run's scalars under ./runs/.../exp<num_exp>
writer = SummaryWriter('./runs/cross_entroy_rl/exp%s'%(num_exp))

In [108]:
## Policy network, loss and optimiser.
## CrossEntropyLoss takes the raw scores produced by the network together with
## the integer action indices of the selected episodes as targets.
net = Net(obs_size=obs_size,hidden_size=HIDDEN,n_actions=action_size)
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=net.parameters(),lr=1e-2)

## Index of the last training iteration; the loop runs iterations 0..LAST_ITER_NO inclusive.
LAST_ITER_NO = 50

## Training: sample a batch of episodes with the current policy, keep the
## episodes at or above the reward percentile, and fit the network to reproduce
## the actions taken in them.
try:
    for iter_no,batch in enumerate(iterate_batch(env,net,BATCH_SIZE)):
        obs_v,act_v,reward_b,reward_m = filter_batch(batch,PERCENTILE)
        optimizer.zero_grad()
        action_score_v = net(obs_v)
        loss_v = loss(action_score_v,act_v)
        loss_v.backward()
        optimizer.step()

        # Read the scalar once and reuse it for printing and logging.
        loss_value = loss_v.item()
        print("%d: loss=%.3f, reward_mean = %.1f reward_bound = %.1f"%(iter_no,loss_value,reward_m,reward_b))
        writer.add_scalar("loss",loss_value,global_step=iter_no)
        writer.add_scalar('reward_bound',reward_b,global_step=iter_no)
        writer.add_scalar('reward_mean',reward_m,global_step=iter_no)
        if iter_no >= LAST_ITER_NO:
            break
finally:
    # Close the writer even when training is interrupted or raises, so the
    # scalars logged so far are flushed to disk.
    writer.close()
        

0: loss=0.679, reward_mean = 25.1 reward_bound = 28.5
1: loss=0.673, reward_mean = 22.2 reward_bound = 24.5
2: loss=0.667, reward_mean = 31.0 reward_bound = 37.0
3: loss=0.651, reward_mean = 50.6 reward_bound = 64.5
4: loss=0.620, reward_mean = 47.3 reward_bound = 61.0
5: loss=0.639, reward_mean = 36.7 reward_bound = 40.5
6: loss=0.614, reward_mean = 54.9 reward_bound = 56.5
7: loss=0.629, reward_mean = 55.6 reward_bound = 69.5
8: loss=0.598, reward_mean = 52.4 reward_bound = 56.5
9: loss=0.599, reward_mean = 52.7 reward_bound = 59.0
10: loss=0.598, reward_mean = 61.2 reward_bound = 76.5
11: loss=0.583, reward_mean = 62.2 reward_bound = 68.5
12: loss=0.581, reward_mean = 66.6 reward_bound = 69.0
13: loss=0.559, reward_mean = 45.9 reward_bound = 47.5
14: loss=0.553, reward_mean = 62.6 reward_bound = 72.0
15: loss=0.559, reward_mean = 86.5 reward_bound = 100.5
16: loss=0.585, reward_mean = 61.2 reward_bound = 73.5
17: loss=0.554, reward_mean = 68.0 reward_bound = 81.5
18: loss=0.559, rew