In [1]:
from model import DQN
import os
import minerl
import gym
import torch
import torch.nn.functional as F
import torch.optim as optim
import wandb
import ray



In [2]:
from _collections import deque
from utils import *
import random

In [3]:
from subprocess import call
call(["wandb", "login", "e694c5143ff8b3ba1e2b275f0ddff63443464b98"])
wandb.init(group="pre-train", project='apex_dqfd', entity='neverparadise')

[34m[1mwandb[0m: Currently logged in as: [33mneverparadise[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.30 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [4]:
#하이퍼 파라미터
learning_rate = 0.0003
gamma = 0.999
buffer_limit = 50000
L1 = 0.9
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
def margin_loss(q_value, action, demo, weigths):
    ae = F.one_hot(action, num_classes=19)
    zero_indices = (ae == 0)
    one_indices = (ae == 1)
    ae[zero_indices] = 1
    ae[one_indices] = 0
    ae = ae.to(float)
    max_value = torch.max(q_value + ae, axis=1)

    ae = F.one_hot(action, num_classes=19)
    ae = ae.to(float)

    J_e = torch.abs(torch.sum(q_value * ae,axis=1) - max_value.values)
    J_e = torch.mean(J_e * weigths * demo)
    return J_e

In [6]:
def train_dqn(policy_net, target_net, demos, batch_size, demo_prob, optimizer):
    demo_batch, idxs, is_weights = demos.sample.remote(batch_size)
    # demo_batch = (batch_size, state, action, reward, next_state, done, n_rewards)
    #print(len(demo_batch[0])) # 0번째 배치이므로 0이 나옴
    state_list = []
    action_list = []
    reward_list =[]
    next_state_list = []
    done_mask_list = []
    n_rewards_list = []

    for transition in demo_batch:
        s, a, r, s_prime, done_mask, n_rewards = transition
        state_list.append(s)
        action_list.append([a])
        reward_list.append([r])
        next_state_list.append(s_prime)
        done_mask_list.append([done_mask])
        n_rewards_list.append([n_rewards])

    #a = state_list
    #b = torch.tensor(action_list, dtype=torch.int64)
    #c = torch.tensor(reward_list)
    #d = next_state_list
    #e = torch.tensor(done_mask_list)
    #f = torch.tensor(n_rewards_list)

    s = torch.stack(state_list).float().to(device)
    a = torch.tensor(action_list, dtype=torch.int64).to(device)
    r =  torch.tensor(reward_list).to(device)
    s_prime = torch.stack(next_state_list).float().to(device)
    done_mask = torch.tensor(done_mask_list).float().to(device)
    nr =  torch.tensor(n_rewards_list).to(device)

    q_vals = policy_net(s)
    state_action_values = q_vals.gather(1, a)

    # comparing the q values to the values expected using the next states and reward
    next_state_values = target_net(s_prime).max(1)[0].unsqueeze(1)
    target = r + (next_state_values * gamma)

    # calculating the q loss, n-step return lossm supervised_loss
    is_weights = torch.FloatTensor(is_weights).to(device)
    q_loss = (is_weights * F.mse_loss(state_action_values, target)).mean()
    n_step_loss = (state_action_values.max(1)[0] + nr).mean()
    supervised_loss = margin_loss(q_vals, a, 1, 1)

    loss = q_loss + supervised_loss + n_step_loss
    wandb.log({"Q-loss" : q_loss.item()})
    wandb.log({"n-step loss" : n_step_loss.item()})
    wandb.log({"super_vised loss" : supervised_loss.item()})
    wandb.log({"total loss" : loss.item()})
    
    errors = torch.abs(state_action_values - target).data.cpu()
    errors = errors.numpy()
    # update priority
    for i in range(batch_size):
        idx = idxs[i]
        memory.update(idx, errors[i])

    # optimization step and logging
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm(policy_net.parameters(), 100)
    optimizer.step()
    return loss

In [7]:
from st import SumTree
@ray.remote
class Memory:  # stored as ( s, a, r, s_, n_rewards ) in SumTree
    e = 0.01
    a = 0.6
    beta = 0.4
    beta_increment_per_sampling = 0.001

    def __init__(self, capacity):
        self.tree = SumTree(capacity)
        self.capacity = capacity

    def _get_priority(self, error):
        return (np.abs(error) + self.e) ** self.a

    def add(self, error, sample):
        p = self._get_priority(error)
        self.tree.add(p, sample)

    def size(self):
        return self.tree.n_entries

    def sample(self, n):
        batch = []
        idxs = []
        segment = self.tree.total() / n
        priorities = []

        self.beta = np.min([1., self.beta + self.beta_increment_per_sampling])

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            priorities.append(p)
            batch.append(data)
            idxs.append(idx)

        sampling_probabilities = priorities / self.tree.total()
        is_weight = np.power(self.tree.n_entries * sampling_probabilities, -self.beta)
        is_weight /= is_weight.max() + 1e-5

        return batch, idxs, is_weight

    def update(self, idx, error):
        p = self._get_priority(error)
        self.tree.update(idx, p)

In [8]:
def append_sample(memory, model, target_model, state, action, reward, next_state, done, n_rewards):
    # Caluclating Priority (TD Error)
    target = model(state.float()).data
    old_val = target[0][action].cpu()
    target_val = target_model(next_state.float()).data.cpu()
    if done:
        target[0][action] = reward
    else:
        target[0][action] = reward + 0.99 * torch.max(target_val)

    error = abs(old_val - target[0][action])
    error = error.cpu() 
    memory.add.remote(error, [state, action, reward, next_state, done, n_rewards])

In [9]:
def pre_train(env_name, rep_buffer, policy_net, target_net, optimizer,threshold=10, num_epochs=1, batch_size=16, seq_len=10, gamma=0.99):
    data = minerl.data.make(env_name)
    demo_num = 0
    for s_batch, a_batch, r_batch, ns_batch, d_batch in data.batch_iter(num_epochs=num_epochs, batch_size=batch_size,
                                                                        seq_len=seq_len):
        demo_num += 1
        print(demo_num)
        if r_batch.sum() < threshold:
            continue
        """
        state_batch : (batch_size, seq_len, 64, 64, 3)
        action_batch : (batch_size, seq_len, action['property'].shape) ex camera = 2 otherwise 1

        reward_batch : (batch_size, seq_len)
        next_state_batch : (batch_size, seq_len, 64, 64, 3)
        done_batch : (batch_size, seq_len)

    
        reward, _ = stats.mode(r_batch, axis=1)
        reward = np.squeeze(reward)
        done, _ = stats.mode(d_batch, axis=1)
        done = np.squeeze(done)
        """
        parse_ts = 0

        # 각 state에 대한 action discretize를 위해 반복문을 사용
        batch_length = (s_batch['pov'].shape)[0]  # (batch, seq, 64, 64, 3)[0]
        for i in range(0, batch_length):
            episode_start_ts = 0

            n_step = 10
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [gamma ** i for i in range(n_step)]

            for j in range(0, seq_len):
                av = a_batch['attack'][i][j]  # attack value
                aj = a_batch['jump'][i][j]  # jump value
                af = a_batch['forward'][i][j]  # forward value
                ab = a_batch['back'][i][j]  # back value
                al = a_batch['left'][i][j]  # left value
                ar = a_batch['right'][i][j]  # right value
                va = a_batch['camera'][i][j][0]  # vertical angle and
                ha = a_batch['camera'][i][j][1]  # horizontal angle

                camera_thresholds = (abs(va) + abs(ha)) / 2.0
                # 카메라를 움직이는 경우
                if (camera_thresholds > 2.5):
                    # camera = [0, -5]
                    if abs(va) < abs(ha) and ha < 0:
                        if av == 0:
                            action_index = 0
                        else:
                            action_index = 1
                    # camera = [0, 5]
                    elif abs(va) < abs(ha) and ha > 0:
                        if av == 0:
                            action_index = 2
                        else:
                            action_index = 3
                    # camera = [-5, 0]
                    elif abs(va) > abs(ha) and ha < 0:
                        if av == 0:
                            action_index = 4
                        else:
                            action_index = 5
                    # camera = [5, 0]
                    elif abs(va) > abs(ha) and ha > 0:
                        if av == 0:
                            action_index = 6
                        else:
                            action_index = 7

                            # 카메라를 안움직이는 경우
                # 점프하는 경우
                elif (aj == 1):
                    if (af == 0):
                        action_index = 8
                    else:
                        action_index = 9

                # 앞으로 가는 경우
                elif (af == 1):
                    if (av == 0):
                        action_index = 10
                    else:
                        action_index = 11

                # 뒤로 가는 경우
                elif (ab == 1):
                    if (av == 0):
                        action_index = 12
                    else:
                        action_index = 13

                # 왼쪽으로 가는 경우
                elif (al == 1):
                    if (av == 0):
                        action_index = 14
                    else:
                        action_index = 15

                # 오른쪽으로 가는 경우
                elif (ar == 1):
                    if (av == 0):
                        action_index = 16
                    else:
                        action_index = 17

                # 카메라, 움직임이 다 0이고 공격만 하는 것
                else:
                    if (av == 0):
                        continue
                    else:
                        action_index = 18

                a_index = torch.LongTensor([action_index]).cpu()
                curr_obs = converter2(s_batch['pov'][i][j]).float().cpu()
                _obs = converter2(ns_batch['pov'][i][j]).float().cpu()
                _reward = torch.FloatTensor([r_batch[i][j]]).cpu()
                _done = d_batch[i][j]  # .astype(int)

                n_step_state_buffer.append(curr_obs)
                n_step_action_buffer.append(a_index)
                n_step_reward_buffer.append(_reward)
                n_step_next_state_buffer.append(_obs)
                n_step_done_buffer.append(_done)
                n_rewards = sum([gamma * reward for gamma, reward in zip(gamma_list, n_step_reward_buffer)])
                n_step_n_rewards_buffer.append(n_rewards)
                

                append_sample(rep_buffer, policy_net, target_net, n_step_state_buffer[j], \
                              n_step_action_buffer[j], n_step_reward_buffer[j], \
                              n_step_next_state_buffer[j], \
                              n_step_done_buffer[j], \
                              n_step_n_rewards_buffer[j])
                episode_start_ts += 1
                parse_ts += 1
                # if episode done we reset
                if _done:
                    break

        # replay is over emptying the deques
        #if rep_buffer.size() > rep_buffer.buffer_limit:
        #    rep_buffer.buffer.popleft()
        print('Parse finished. {} expert samples added.'.format(parse_ts))
        train_dqn(policy_net, target_net, rep_buffer, batch_size, 1, optimizer)
        torch.save(policy_net.state_dict(), model_path + 'pre_trained.pth')        
        if demo_num % 5 == 0 and demo_num != 0:
        # 특정 반복 수가 되면 타겟 네트워크도 업데이트
            print("target network updated")
            target_net.load_state_dict(policy_net.state_dict())
        print("train {} step finished".format(demo_num))
    print('pre_train finished')
    return rep_buffer


In [10]:
total_episodes = 1000
startEpsilon = 1.0
endEpsilon = 0.05
epsilon = startEpsilon

root_path = os.curdir
model_path = root_path + '/dqn_model/'

stepDrop = (startEpsilon - endEpsilon) / total_episodes

In [25]:
@ray.remote
class Actor:
    def __init__(self, shared_network, shared_memory, actor_idx, epsilon):
        # environment initialization
        self.env = gym.make("MineRLTreechop-v0")
        self.port_number = int("12340")+index
        self.env.make_interactive(port=self.port_number, realtime=False)
        
        # network initalization
        self.shared_network = shared_network
        self.shared_memory = shared_memory
        self.actor_network = DQN(19).cpu()
        self.actor_target_network = DQN(19).cpu()
        
        self.actor_network.load_state_dict(self.shared_network.state_dict(), map_location='cpu')
        self.actor_target_network.load_state_dict(self.actor_network.load_state_dict(), map_location='cuda:0')
        
        # exploring info
        self.actor_idx = actor_idx
        self.epsilon = epsilon
        self.max_step = 10
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)
        
    # 1. 네트워크 파라미터 복사
    # 2. 환경 탐험 (초기화, 행동)
    # 3. 로컬버퍼에 저장
    # 4. priority 계산
    # 5. 글로벌 버퍼에 저장
    # 6. 주기적으로 네트워크 업데이트 
    
    # 각 환경 인스턴스에서 각 엡실론에 따라 탐험을 진행한다.
    # 탐험 과정에서 local buffer에 transition들을 저장한다.
    # local buffer의 개수가 특정 개수 이상이면 global buffer에 추가해준다. 
    
    def explore(self):
        for num_epi in range(self.max_step):
            obs = self.env.reset()
            state = converter(obs).cpu()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            self.epsilon = 0.5
            if(self.epsilon > endEpsilon):
                self.epsilon -= stepDrop / (self.actor_idx + 1)
            
            n_step = 10
            n_step_state_buffer = deque(maxlen=n_step)
            n_step_action_buffer = deque(maxlen=n_step)
            n_step_reward_buffer = deque(maxlen=n_step)
            n_step_n_rewards_buffer = deque(maxlen=n_step)
            n_step_next_state_buffer = deque(maxlen=n_step)
            n_step_done_buffer = deque(maxlen=n_step)
            gamma_list = [gamma ** i for i in range(n_step)]
            
            while not done:
                steps += 1
                total_steps += 1
                a_out = self.actor_network.sample_action(state,self.epsilon)
                action_index = a_out
                action = make_action(env, action_index)
                obs_prime, reward, done, info = env.step(action)
                total_reward += reward
                state_prime = converter(obs_prime)
                
                # local buffer add
                n_step_state_buffer.append(state)
                n_step_action_buffer.append(action_index)
                n_step_reward_buffer.append(reward)
                n_step_next_state_buffer.append(state_prime)
                n_step_done_buffer.append(done)
                n_rewards = sum([gamma * reward for gamma, reward in zip(gamma_list, n_step_reward_buffer)])
                n_step_n_rewards_buffer.append(n_rewards)
                
                
                if (len(n_step_state_buffer) >= n_step):
                    # LocalBuffer Get
                    # Compute Priorities
                    for i in range(n_step):
                        append_sample(shared_memory, self.actor_network , self.actor_target_network, \
                              n_step_state_buffer[i], \
                              n_step_action_buffer[i], n_step_reward_buffer[i], \
                              n_step_next_state_buffer[i], \
                              n_step_done_buffer[i], \
                              n_step_n_rewards_buffer[i])
                        if(n_step_done_buffer[i]):
                            break
                    
                    
                state = state_prime.float().cpu()
                if done:
                    break

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                break
            
            if (num_epi % 5 == 0 and num_epi != 0):
                self.update_params()

        env.close()
    
    def update_params(self):
        self.actor_network.load_state_dict(self.shared_network.state_dict(), map_location='cpu')



        

In [26]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


In [27]:
print(device)

cuda:0


In [28]:
print(torch.cuda.is_available())

True


In [30]:
class Learner:
    def __init__(self, network, batch_size):
        self.learner_network = DQN(19).to(device)
        self.learner_target_network = DQN(19).to(device)
        
        self.learner_network.load_state_dict(network.state_dict(), map_location='cuda:0')
        self.learner_target_network.load_state_dict(network.state_dict(), map_location='cuda:0')
        
        self.batch_size = batch_size
    # 1. sampling
    # 2. calculate gradient
    # 3. weight update
    # 4. compute priorities
    # 5. priorities of buffer update
    # 6. remove old memory 
    def get_network(self):
        return self.learner_network

    def update_network(memory, demos, batch_size, demo_prob, optimizer):
        agent_batch, agent_idxs, agent_weights = memory.sample.remote(batch_size)
        demo_batch, demo_idxs, demo_weights = demos.sample.remote(batch_size)
        
        # demo_batch = (batch_size, state, action, reward, next_state, done, n_rewards)
        #print(len(demo_batch[0])) # 0번째 배치이므로 0이 나옴
        state_list = []
        action_list = []
        reward_list =[]
        next_state_list = []
        done_mask_list = []
        n_rewards_list = []

        for agent_transition, expert_transition in zip(agent_batch, demo_batch):
            s, a, r, s_prime, done_mask, n_rewards = agent_transition
            state_list.append(s)
            action_list.append([a])
            reward_list.append([r])
            next_state_list.append(s_prime)
            done_mask_list.append([done_mask])
            n_rewards_list.append([n_rewards])
            
            s, a, r, s_prime, done_mask, n_rewards = expert_transition
            state_list.append(s)
            action_list.append([a])
            reward_list.append([r])
            next_state_list.append(s_prime)
            done_mask_list.append([done_mask])
            n_rewards_list.append([n_rewards])
            

        s = torch.stack(state_list).float().to(device)
        a = torch.tensor(action_list, dtype=torch.int64).to(device)
        r =  torch.tensor(reward_list).to(device)
        s_prime = torch.stack(next_state_list).float().to(device)
        done_mask = torch.tensor(done_mask_list).float().to(device)
        nr =  torch.tensor(n_rewards_list).to(device)
        
        q_vals = policy_net(s)
        state_action_values = q_vals.gather(1, a)

        # comparing the q values to the values expected using the next states and reward
        next_state_values = target_net(s_prime).max(1)[0].unsqueeze(1)
        target = r + (next_state_values * gamma)

        # calculating the q loss, n-step return lossm supervised_loss
        is_weights = torch.FloatTensor(is_weights).to(device)
        q_loss = (is_weights * F.mse_loss(state_action_values, target)).mean()
        n_step_loss = (state_action_values.max(1)[0] + nr).mean()
        supervised_loss = margin_loss(q_vals, a, 1, 1)

        loss = q_loss + supervised_loss + n_step_loss
        wandb.log({"Q-loss" : q_loss.item()})
        wandb.log({"n-step loss" : n_step_loss.item()})
        wandb.log({"super_vised loss" : supervised_loss.item()})
        wandb.log({"total loss" : loss.item()})

        errors = torch.abs(state_action_values - target).data.cpu()
        errors = errors.numpy()
        # update priority
        for i in range(batch_size):
            idx = idxs[i]
            memory.remote.update(idx, errors[i])

        # optimization step and logging
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm(policy_net.parameters(), 100)
        optimizer.step()
        return loss


In [16]:
ray.init()

2021-05-20 04:31:18,052	INFO services.py:1269 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.0.22',
 'raylet_ip_address': '192.168.0.22',
 'redis_address': '192.168.0.22:45744',
 'object_store_address': '/tmp/ray/session_2021-05-20_04-31-17_273839_75039/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-05-20_04-31-17_273839_75039/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-05-20_04-31-17_273839_75039',
 'metrics_export_port': 60439,
 'node_id': '294b738b23e47bb7b3c16e5e7e92476904787d3fccf4d334557331dc'}

In [17]:
policy_net = DQN(19).to(device=device)
target_net = DQN(19).to(device=device)
target_net.load_state_dict(policy_net.state_dict())
memory = Memory.remote(50000)
demos = Memory.remote(50000)
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate, weight_decay=1e-5)

In [15]:
print("pre_train start")
demos = pre_train("MineRLTreechop-v0", demos, policy_net, target_net, optimizer, threshold=1536, num_epochs=10, batch_size=512, seq_len=20, gamma=0.99)
print("pre_train finished")
print(demos.size.remote())

pre_train start
pre_train finished
pre_train finished


TypeError: Actor methods cannot be called directly. Instead of running 'object.size()', try 'object.size.remote()'.

In [22]:
# Copy network params from pretrained Agent
model_path = './dqn_model/pre_trained.pth'
policy_net.load_state_dict(torch.load(model_path, map_location='cuda:0'))

<All keys matched successfully>

In [23]:
# Generating each own instances
# main()
num_actors = 2
epsilon = 0.5



# learner network initialzation
learner = Learner(policy_net, 256)

ray.put(memory)
ray.put(demos)
ray.put(learner)

# actor network, environments initialization
actor_list = [Actor.remote(memory, learner.get_network(), i, 0.5) for i in range(num_actors)]



[2m[36m(pid=75711)[0m 2021-05-20 04:34:39,308	ERROR serialization.py:248 -- Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
[2m[36m(pid=75711)[0m Traceback (most recent call last):
[2m[36m(pid=75711)[0m   File "/home/kukjin/anaconda3/envs/minerl/lib/python3.7/site-packages/ray/serialization.py", line 246, in deserialize_objects
[2m[36m(pid=75711)[0m     obj = self._deserialize_object(data, metadata, object_ref)
[2m[36m(pid=75711)[0m   File "/home/kukjin/anaconda3/envs/minerl/lib/python3.7/site-packages/ray/serialization.py", line 188, in _deserialize_object
[2m[36m(pid=75711)[0m     return self._deserialize_msgpack_data(data, metadata_fields)
[2m[36m(pid=75711)[0m   File "/home/kukjin/anaconda3/envs/minerl/lib/python3.7/site-packages/ray/serialization.py", line 166, in _deserialize_msgpack_d

2021-05-20 04:34:44,324	ERROR worker.py:1056 -- Possible unhandled error from worker: [36mray::Actor.__init__()[39m (pid=75711, ip=192.168.0.22)
  File "python/ray/_raylet.pyx", line 458, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 479, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 349, in ray._raylet.raise_if_dependency_failed
ray.exceptions.RaySystemError: System error: Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.
traceback: Traceback (most recent call last):
  File "/home/kukjin/anaconda3/envs/minerl/lib/python3.7/site-packages/ray/serialization.py", line 246, in deserialize_objects
    obj = self._deserialize_object(data, metadata, object_ref)
  File "/home/kukjin/anaconda3/envs/minerl/lib/python3.7/site-packages/ray/serialization.py", line 188, in _deserialize_

In [None]:
explore = [actor.explore.remote() for actor in actor_list]
ray.get(explore)
for i in range(1000):
    if(memory.size() > 1000):
        leaner.update_network(memory, demos, batch_size, demo_prob, optimizer)

In [None]:
env0 = gym.make("MineRLTreechop-v0")
env1 = gym.make("MineRLTreechop-v0")
env2 = gym.make("MineRLTreechop-v0")

In [None]:
from unittest.mock import Mock, MagicMock, call

In [None]:
import ray

In [None]:
ray.shutdown()

In [None]:
ray.init()

In [None]:
@ray.remote
def make_env(index):
    env = Mock()
    env.return_value = "Environment %d is created" %index
    port_number = int("12340")+index
    return env()
num_envs = 4


In [None]:
env_list = [make_env.remote(i) for i in range(num_envs)]
ray.get(env_list)

In [None]:
import unittest

In [None]:
def main():
    class EnvTests(unittest.TestCase):
        def test(self):
            make_env()

if __name__=='__main__':
    unittest.main()

In [None]:
def main():
    class EnvTests2(unittest.TestCase):
        def test(self):
            num_envs = 4
            env_list = [make_env.remote(i) for i in range(num_envs)]
            print(ray.get(env_list))

if __name__=='__main__':
    unittest.main()