If you run this notebook in a local Jupyter environment instead of Google Colab, set

```
colab = False
```

In [1]:
colab = True
if colab:
    !pip install gym pyvirtualdisplay > /dev/null 2>&1
    !apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
    !apt-get update > /dev/null 2>&1
    !apt-get install cmake > /dev/null 2>&1
    !pip install --upgrade setuptools 2>&1
    !pip install ez_setup > /dev/null 2>&1



In [2]:
# Colab only: mount Google Drive and change into the project folder so that the
# local modules imported by the next cell (my_pendulum, reward_ftns, memory) resolve.
if colab:
    from google.colab import drive
    drive.mount('/content/drive')

    # NOTE(review): this path is specific to one Drive layout — adjust it to where the project lives.
    %cd /content/drive/MyDrive/Colab\ Notebooks/rl-master/day5/mbrl
    # List the folder contents as a quick check that the working directory is the expected one.
    !ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Colab Notebooks/rl-master/day5/mbrl
clockwise.png  memory.py  my_pendulum.py  reward_ftns.py
mb_agent.py    mpc.ipynb  __pycache__	  video


In [3]:
import numpy as np
from scipy.stats import truncnorm
import warnings
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from torch.nn import MSELoss
from my_pendulum import MyPendulumEnv
from reward_ftns import pendulum_reward
from memory import TransitionMemory

In [4]:
class TransitionModel(nn.Module):
    """Residual MLP approximating one-step dynamics.

    The network maps the concatenated (state, action) pair to a state
    increment, and the prediction is the input state plus that increment:
    \hat{s}_{t+1} = s_t + f(s_t, a_t ; \theta).
    """

    def __init__(self, state_dim, act_dim, hidden1, hidden2):
        """
        :param state_dim: number of state features
        :param act_dim: number of action features
        :param hidden1: width of the first hidden layer
        :param hidden2: width of the second hidden layer
        """
        super().__init__()
        self.state_dim, self.act_dim = state_dim, act_dim

        # Layers are created in the order fc1, fc2, fc3 so that seeded
        # initialisation and state-dict keys do not change.
        self.fc1 = nn.Linear(state_dim + act_dim, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, state_dim)

    def forward(self, state, act):
        """
        Predict the next state for a batch of (state, action) pairs.

        :param state: tensor of states; concatenated with ``act`` along dim 1
        :param act: tensor of actions with the same leading dimension as ``state``
        :return: predicted next states, same shape as ``state``
        """
        hidden = torch.cat((state, act), dim=1)
        for layer in (self.fc1, self.fc2):
            hidden = F.relu(layer(hidden))

        # the last layer outputs the increment, not the next state itself
        return state + self.fc3(hidden)

In [5]:
class ModelBasedAgent:
    """Model-based control agent: a learned dynamics model plus a sampling-based MPC planner.

    The agent fits ``TransitionModel`` to (state, action, next_state) tuples held in
    ``self.memory`` and selects actions by simulating candidate action sequences
    through that model, scoring them with a user-supplied vectorized reward function.
    Two planners are available: uniform random shooting and cross-entropy optimization.
    """

    def __init__(self,
                 state_dim,
                 act_dim,
                 ctrl_range,
                 reward_ftn,
                 hidden1=400,
                 hidden2=400,
                 lr=0.001,
                 mem_sz=20000
                 ):
        """
        :param state_dim: dimension of the state vector
        :param act_dim: dimension of the action vector
        :param ctrl_range: action magnitude bound; planners sample within [-ctrl_range, ctrl_range]
        :param reward_ftn: callable ``(states, actions) -> rewards`` operating on numpy arrays
                           whose leading dimension indexes parallel roll-outs
        :param hidden1: width of the first hidden layer of the dynamics model
        :param hidden2: width of the second hidden layer of the dynamics model
        :param lr: Adam learning rate for the dynamics model
        :param mem_sz: ``maxlen`` passed to the transition memory
        """
        self.dimS = state_dim
        self.dimA = act_dim
        self.ctrl_range = ctrl_range
        self.model = TransitionModel(state_dim, act_dim, hidden1, hidden2)
        self.memory = TransitionMemory(state_dim, act_dim, maxlen=mem_sz)

        self.reward_model = reward_ftn

        self.optimizer = Adam(self.model.parameters(), lr=lr)

        self.alpha = 0.5    # smoothing parameter of cross-entropy optimization
        # Not read by the planners in this class, which derive the elite count
        # from rho * N; kept so existing code that accesses the attribute still works.
        self.Ne = 20

    def train(self, batch_size):
        """
        Run one gradient step of supervised dynamics learning on a sampled mini-batch.

        Training of the dynamics does not depend on any reward information.

        :param batch_size: number of transitions requested from the memory
        :return: mean-squared-error loss of this step (0-d numpy array)
        """
        self.model.train()
        (state_batch, act_batch, next_state_batch) = self.memory.sample_batch(batch_size)

        state_batch = torch.tensor(state_batch).float()
        act_batch = torch.tensor(act_batch).float()
        next_state_batch = torch.tensor(next_state_batch).float()

        # one-step prediction of the next state, regressed onto the observed next state
        prediction = self.model(state_batch, act_batch)

        loss_ftn = MSELoss()
        loss = loss_ftn(prediction, next_state_batch)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        loss_val = loss.detach().numpy()

        return loss_val

    def act(self, state, H, N, cross_entropy=False):
        """
        Select an action for ``state`` by model-predictive control.

        :param state: current state (numpy array)
        :param H: length of the planning horizon
        :param N: number of candidate action sequences per planning round
        :param cross_entropy: use cross-entropy optimization if True, random shooting otherwise
        :return: action to be executed (numpy array of shape (dim A,))
        """
        if cross_entropy:
            return self.solve_cross_entropy_opt(state, H, N)
        else:
            return self.solve_random_shooting_opt(state, H, N)

    def _rollout_scores(self, state, action_sequences):
        """Simulate every action sequence from ``state`` through the model and return cumulative rewards.

        :param state: start state shared by all roll-outs (numpy array)
        :param action_sequences: numpy array of shape (H, N, dim A)
        :return: numpy array of shape (N,) with the summed reward of each roll-out
        """
        H, N = action_sequences.shape[:2]

        # every roll-out starts from the same current state; all N run in parallel as one batch
        states = np.tile(state, (N, 1))     # shape = (N, dim S)
        scores = np.zeros(N)

        # planning only needs forward passes, so skip building autograd graphs
        with torch.no_grad():
            for t in range(H):
                actions = action_sequences[t]    # shape = (N, dim A)
                scores += self.reward_model(states, actions)

                s = torch.tensor(states).float()
                a = torch.tensor(actions).float()

                # back to numpy: the reward function takes numpy arrays as its inputs
                states = self.model(s, a).detach().numpy()

        return scores

    def solve_random_shooting_opt(self, state, H, N):
        """
        Random-shooting MPC: sample N action sequences uniformly, roll all of them out
        through the learned model simultaneously, and return the first action of the
        sequence with the highest cumulative reward.

        :param state: current state (numpy array)
        :param H: length of time horizon
        :param N: number of candidate action sequences to generate
        :return: action to be executed (numpy array)
        """
        assert N > 0 and H > 0

        self.model.eval()

        # N random action sequences of length H, uniform in [-ctrl_range, ctrl_range)
        action_sequences = self.ctrl_range * (2. * np.random.rand(H, N, self.dimA) - 1.)

        scores = self._rollout_scores(state, action_sequences)
        best_seq = np.argmax(scores)

        return action_sequences[0, best_seq]

    def solve_cross_entropy_opt(self, state, H, N, rho=0.2, tol=1e-2, max_it=100):
        """
        Cross-entropy-method MPC: iteratively refit a per-time-step truncated Gaussian
        over actions to the best-scoring fraction of sampled sequences.

        Every iteration evaluates its samples from the SAME current state with fresh
        scores, so iterations are comparable with each other.

        :param state: current state (numpy array)
        :param H: length of time horizon
        :param N: number of action sequences sampled per iteration
        :param rho: fraction of samples kept as elites (at least one elite is always kept)
        :param tol: stop once the largest standard deviation falls below this value
        :param max_it: maximum number of refinement iterations
        :return: first action of the best sequence seen over all iterations (numpy array)
        """
        assert N > 0 and H > 0

        # int(rho * N) == 0 would make the slice [-0:] select every sample as "elite"
        Ne = max(1, int(rho * N))
        dimA = self.dimA

        self.model.eval()

        mu = np.zeros((H, 1, dimA))
        sigma = (.5 * self.ctrl_range) * np.ones((H, 1, dimA))

        assert np.max(sigma) >= tol
        best_action_so_far = None
        best_score_so_far = -np.inf
        trial = 0
        while np.max(sigma) >= tol and trial < max_it:
            # truncnorm takes its bounds in standardized units: (bound - loc) / scale
            action_sequences = truncnorm.rvs((-self.ctrl_range - mu) / sigma,
                                             (self.ctrl_range - mu) / sigma,
                                             loc=mu,
                                             scale=sigma,
                                             size=(H, N, dimA))

            # restart all roll-outs from the current state with zeroed scores
            scores = self._rollout_scores(state, action_sequences)

            # determine the elite samples of the group (argsort is ascending: best is last)
            indices = np.argsort(scores)[-Ne:]
            idx = indices[-1]
            if scores[idx] > best_score_so_far:
                best_score_so_far = scores[idx]
                best_action_so_far = action_sequences[0, idx, :]
            elite_samples = action_sequences[:, indices, :]

            # exponentially smoothed update of the sampling distribution
            mu += self.alpha * (np.mean(elite_samples, axis=1, keepdims=True) - mu)
            sigma += self.alpha * (np.std(elite_samples, axis=1, keepdims=True) - sigma)
            trial += 1

        # warn only if the loop stopped on the iteration budget rather than on convergence
        if np.max(sigma) >= tol:
            warnings.warn('maximum iteration exceeded', RuntimeWarning)
        return best_action_so_far

In [6]:
def collect_rand_trajectories(env, transition_memory, num_trajectories, max_steps=200):
    """
    Fill ``transition_memory`` with transitions generated by randomly sampled actions.

    Each trajectory starts from ``env.reset()`` and runs for exactly ``max_steps``
    steps; the reward, done flag and info returned by ``env.step`` are ignored.
    Progress is printed after every 10th trajectory.

    :param env: environment exposing ``reset()``, ``step(action)`` (4-tuple return)
                and ``action_space.sample()``
    :param transition_memory: buffer exposing ``append(state, action, next_state)``
    :param num_trajectories: number of trajectories to collect
    :param max_steps: number of steps per trajectory (default 200, the previously fixed value)
    :return: None
    """
    print('collecting random trajectories...')
    for i in range(num_trajectories):
        # trajectories are independent of each other, so this loop could be parallelised
        state = env.reset()
        for _ in range(max_steps):
            action = env.action_space.sample()
            next_state, _, _, _ = env.step(action)
            transition_memory.append(state, action, next_state)
            state = next_state

        if i % 10 == 9:
            print('{} trajectories collected'.format(i + 1))
    print('done')

In [7]:
# Build the pendulum environment and read the problem dimensions from its spaces.
# NOTE(review): g is presumably the gravitational constant of the pendulum — confirm in my_pendulum.py.
env = MyPendulumEnv(g=10.0)
state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]
# Upper bound of the first action component; the agent's planners sample
# actions within [-ctrl_range, ctrl_range].
ctrl_range = env.action_space.high[0]

In [8]:
# Create the agent with a 64-64 hidden-layer dynamics model, then seed its
# transition memory with randomly sampled actions: 150 trajectories x 200 steps
# = 30000 transitions, which equals mem_sz.
agent = ModelBasedAgent(state_dim, act_dim, ctrl_range, pendulum_reward, hidden1=64, hidden2=64, mem_sz=30000)
collect_rand_trajectories(env, agent.memory, num_trajectories=150)

collecting random trajectories...
10 trajectories collected
20 trajectories collected
30 trajectories collected
40 trajectories collected
50 trajectories collected
60 trajectories collected
70 trajectories collected
80 trajectories collected
90 trajectories collected
100 trajectories collected
110 trajectories collected
120 trajectories collected
130 trajectories collected
140 trajectories collected
150 trajectories collected
done


In [9]:
# Hyper-parameters read by the training loop in the next cell.
batch_size = 256     # transitions requested per call to agent.train
max_iter = 10        # number of MPC iterations run after pre-training
num_ep_per_it = 5    # episodes rolled out (and gradient steps taken) per MPC iteration

In [10]:
# Two-phase schedule driven by a single iteration counter i:
#   i in [0, num_epochs)                     -> pre-train the dynamics model on the randomly collected data
#   i in [num_epochs, num_epochs + max_iter) -> MPC roll-outs (on-policy data), then a few more gradient steps
num_epochs = 10
epoch_size = len(agent.memory) // batch_size    # gradient steps per pre-training epoch

for i in range(max_iter + num_epochs):
    if i == 0:
        print('pre-training...')

    if i == num_epochs:
        print('start MPC control...')

    if i < num_epochs:
        # first train the network only using randomly collected data
        for epoch in range(epoch_size):
            loss = agent.train(batch_size=batch_size)
        # only the loss of the last gradient step of the epoch is reported
        print('[iter {}] loss val : {:.4f}'.format(i, loss))

    else:
        scores = np.zeros(num_ep_per_it)

        for ep in range(num_ep_per_it):
            state = env.reset()
            score = 0.
            for _ in range(200):
                # environment roll-out
                # at each step, select an action using MPC (on-policy data)
                action = agent.act(state, H=60, N=400, cross_entropy=False)
                next_state, rew, _, _ = env.step(action)
                # the reward is used for reporting only; the memory stores the transition itself
                agent.memory.append(state, action, next_state)
                score += rew
                state = next_state
            scores[ep] = score
        avg = np.mean(scores)
        std = np.std(scores)
        env.close()
        # '{:.4f}' (four decimals) to match the loss prints; '{:4f}' only set a minimum width of 4
        print('score (over {} episodes) = {:.4f}'.format(num_ep_per_it, avg), u'\u00B1', '{:.4f}'.format(std))
        # refine the model on the memory, which now also holds the new on-policy transitions
        for _ in range(num_ep_per_it):
            loss = agent.train(batch_size=batch_size)
        print('[iter {}] loss val : {:.4f}'.format(i, loss), end='\n')
        

pre-training...
[iter 0] loss val : 0.1051
[iter 1] loss val : 0.0918
[iter 2] loss val : 0.0476
[iter 3] loss val : 0.0431
[iter 4] loss val : 0.0279
[iter 5] loss val : 0.0312
[iter 6] loss val : 0.0271
[iter 7] loss val : 0.0232
[iter 8] loss val : 0.0278
[iter 9] loss val : 0.0181


In [11]:
# Colab only: start a virtual display for headless rendering and define helpers
# to record an environment to ./video and play the recording inline.
if colab:
    import gym
    from gym.wrappers import Monitor
    import glob
    import io
    import base64
    from IPython.display import HTML
    from pyvirtualdisplay import Display
    from IPython import display as ipythondisplay

    # virtual display so env.render() has something to draw on without a physical screen
    display = Display(visible=0, size=(1400, 900))
    display.start()

    def show_video():
        """Embed the first .mp4 found in ./video into the notebook output as an inline video.

        Prints "Could not find video" when the folder holds no .mp4 file.
        """
        mp4list = glob.glob('video/*.mp4')
        if len(mp4list) > 0:
            mp4 = mp4list[0]
            # read-only binary mode, and the handle is closed as soon as the bytes are read
            with io.open(mp4, 'rb') as video_file:
                video = video_file.read()
            # inline the video as a base64 data URI so no file needs to be served
            encoded = base64.b64encode(video)
            ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
        else:
            print("Could not find video")

    def wrap_env(env):
        """Return ``env`` wrapped in a gym ``Monitor`` that records into ./video."""
        # NOTE(review): force=True presumably lets Monitor overwrite earlier recordings — confirm for the installed gym version
        env = Monitor(env, './video', force=True)
        return env

    env = wrap_env(env)



In [17]:
# Final evaluation: roll out one 200-step episode with the agent on a new environment instance.
env = MyPendulumEnv(g=10.0)
if colab:
  env = wrap_env(env)
obs = env.reset()

# NOTE(review): `done` is overwritten on every step but never checked; the loop always runs all 200 steps.
done = False
score = 0.

for _ in range(200):
    env.render()
    # re-plan from the current observation at every step with random shooting (H=60, N=400)
    obs, rew, done, _ = env.step(agent.act(obs, H=60, N=400, cross_entropy=False))
    score += rew
    
env.close()
print('score : ', score)

# Play back the recording; wrap_env and show_video are only defined when colab is true.
if colab:
  show_video()



score :  -246.6100610752792
