In [1]:
import numpy as np

import matplotlib.pyplot as plt

In [2]:
!pip install typing_extensions==4.4.0
import typing_extensions



In [3]:
import torch
import time
import torch.nn as nn

In [4]:
n = 100
array = torch.arange(n)

In [5]:
!pip install gym==0.24.0



In [6]:
!pip install gym[classic_control]==0.24.0



In [7]:
import gym



In [8]:
gym.envs.registry.keys()

dict_keys(['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v1', 'CarRacingDomainRandomize-v1', 'CarRacingDiscrete-v1', 'CarRacingDomainRandomizeDiscrete-v1', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'Reacher-v2', 'Reacher-v4', 'Pusher-v2', 'Pusher-v4', 'InvertedPendulum-v2', 'InvertedPendulum-v4', 'InvertedDoublePendulum-v2', 'InvertedDoublePendulum-v4', 'HalfCheetah-v2', 'HalfCheetah-v3', 'HalfCheetah-v4', 'Hopper-v2', 'Hopper-v3', 'Hopper-v4', 'Swimmer-v2', 'Swimmer-v3', 'Swimmer-v4', 'Walker2d-v2', 'Walker2d-v3', 'Walker2d-v4', 'Ant-v2', 'Ant-v3', 'Ant-v4', 'Humanoid-v2', 'Humanoid-v3', 'Humanoid-v4', 'HumanoidStandup-v2', 'HumanoidStandup-v4'])

In [9]:
env = gym.make('CartPole-v1')
state_dim = 4
action_n = 2

In [10]:

def get_trajectory(env, agent, max_len = 1000, visualise = False):
    trajectory = {'states': [], 'actions' : [], 'rewards' : []}
    
    state = env.reset()

    #create traectory
    for _ in range(max_len):
        trajectory['states'].append(state)
        
        action = agent.get_action(state)
        trajectory['actions'].append(action)
        
        state, reward, done, _ = env.step(action)
        trajectory['rewards'].append(reward)

        
        if visualise:
            time.sleep(.1)
            env.render()
            
        if done:
            break
    
    return trajectory


In [None]:
class RandomAgent():
    def __init__(self, action_n):
        self.action_n = action_n
        
    def get_action(self, state):
        action = np.random.randint(self.action_n)
        return action

In [None]:
agent = RandomAgent(action_n)
get_trajectory(env, agent, visualise = True)

In [11]:
class CEM(nn.Module):
    def __init__(self, state_n, action_n):
        super().__init__()
        self.action_n = action_n
        self.state_n = state_n
        self.network = nn.Sequential(nn.Linear(self.state_n, 128), nn.ReLU(), nn.Linear(128, self.action_n))
        self.softmax = nn.Softmax()
        self.optimizer = torch.optim.Adam(self.parameters(), lr = .01)
        self.loss = nn.CrossEntropyLoss()
        
    def forward(self, _input):
        return self.network(_input)
     
    def get_action(self, state):
        state = torch.FloatTensor(state)
        probs = self.softmax(self.forward(state)).data.numpy()
        action = np.random.choice(self.action_n, p = probs)
        return action
    
    def fit(self, elite_tr):
        elite_states = []
        elite_actions = []
        
        for tr in elite_tr:
            for state, action in zip(tr['states'], tr['actions']):
                elite_states.append(state)
                elite_actions.append(action)
        elite_states = torch.FloatTensor(elite_states)
        elite_actions = torch.LongTensor(elite_actions)
        
        pred_acts = self.forward(elite_states)
        loss = self.loss(pred_acts, elite_actions)
        loss.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()

In [12]:

agent = CEM(state_dim, action_n)
trajectory_len = 500
trajectory_n = 50
iteration_n = 20
q_param = .9

for it in range(iteration_n):
    
    #policy evaluation
    trajectories = [get_trajectory(env, agent) for _ in range(trajectory_n)]
    total_reward = [np.sum(i['rewards']) for i in trajectories]
    print('iter', it, 'mean total reward', np.mean(total_reward))
    
    #policy improvement
    quantile = np.quantile(total_reward, q_param)
    elite_tr = []
    for tr in trajectories:
        r = np.sum(tr['rewards'])
        if r > quantile:
            elite_tr.append(tr)
            
    agent.fit(elite_tr)
    
get_trajectory(env, agent, visualise = True)

  return self._call_impl(*args, **kwargs)


iter 0 mean total reward 22.32


  elite_states = torch.FloatTensor(elite_states)


iter 1 mean total reward 31.54
iter 2 mean total reward 33.82
iter 3 mean total reward 35.98
iter 4 mean total reward 38.68
iter 5 mean total reward 47.62
iter 6 mean total reward 53.8
iter 7 mean total reward 48.0
iter 8 mean total reward 64.84
iter 9 mean total reward 58.02
iter 10 mean total reward 71.78
iter 11 mean total reward 69.96
iter 12 mean total reward 88.04
iter 13 mean total reward 75.46
iter 14 mean total reward 70.96
iter 15 mean total reward 80.64
iter 16 mean total reward 83.88
iter 17 mean total reward 88.4
iter 18 mean total reward 116.74
iter 19 mean total reward 130.96


{'states': [array([-0.03111615,  0.04042187, -0.03767826, -0.01783415], dtype=float32),
  array([-0.03030772, -0.15414004, -0.03803494,  0.26272678], dtype=float32),
  array([-0.03339051, -0.34869903, -0.03278041,  0.5431746 ], dtype=float32),
  array([-0.0403645 , -0.1531321 , -0.02191691,  0.2403461 ], dtype=float32),
  array([-0.04342714, -0.34793422, -0.01710999,  0.526036  ], dtype=float32),
  array([-0.05038582, -0.15257573, -0.00658927,  0.22801113], dtype=float32),
  array([-0.05343734,  0.04263976, -0.00202905, -0.06674299], dtype=float32),
  array([-0.05258454, -0.15245304, -0.00336391,  0.22529908], dtype=float32),
  array([-0.0556336 ,  0.04271682,  0.00114208, -0.06844305], dtype=float32),
  array([-0.05477927, -0.15242147, -0.00022679,  0.22459999], dtype=float32),
  array([-0.0578277 , -0.3475402 ,  0.00426521,  0.5172114 ], dtype=float32),
  array([-0.0647785 , -0.15247856,  0.01460944,  0.22587554], dtype=float32),
  array([-0.06782807, -0.34780622,  0.01912695,  0.523