In [None]:
import gym
import minerl
from stable_baselines.common.policies import MlpPolicy, CnnPolicy, CnnLnLstmPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import A2C
from Wrappers import ActionWrapper, ObsWrapper

In [None]:
import logging
# Configure the root logger at INFO level before the environment is built,
# so anything logged during environment creation shows up in the notebook.
logging.basicConfig(level=logging.INFO)

# Shared base environment: the evaluation cells below either wrap this
# instance (trained agents) or step it directly (random agent).
minerl_task_id = 'MineRLNavigateDense-v0'
env = gym.make(minerl_task_id)

# A2C-C

In [None]:
import numpy as np
import time

# --- Evaluate the saved A2C model on compass-only observations ("A2C-C") ---
model_c = A2C.load("a2c_mineRL_MLP_2")

# Keep only the "compassAngle" observation entry, flatten it, then apply the
# project's ActionWrapper.
# NOTE(review): ActionWrapper presumably maps the model's actions onto the
# MineRL action space - confirm in Wrappers.py.
compass_only_env = gym.wrappers.FilterObservation(env, ["compassAngle"])
env_c1 = gym.wrappers.FlattenObservation(compass_only_env)
env_c2 = ActionWrapper(env_c1)

nr_steps = 1000
nr_iter = 10
# One row per episode, one column per step. Steps that are never reached
# (episode finished early) keep their initial reward of 0.
reward_list_m_c = np.zeros(shape=(nr_iter, nr_steps))

for episode in range(nr_iter):
    obs = env_c2.reset()
    done = False
    step = 0
    # Run until the step budget is used up or the environment reports done.
    while step < nr_steps and not done:
        action, _states = model_c.predict(obs)
        obs, reward, done, info = env_c2.step(action)
        reward_list_m_c[episode, step] = reward
        # Rendering goes through the base env that the wrappers share.
        env.render()
        step += 1
    if done:
        print("Done", done)
    print("Iter {0} done!".format(episode))

In [None]:
import matplotlib.pyplot as plt
# Draw the cumulative reward of every A2C-C evaluation episode as its own curve.
for episode in range(nr_iter):
    episode_rewards = reward_list_m_c[episode]
    plt.plot(episode_rewards.cumsum())

plt.title("A2C-C")
plt.show()

# A2C-CI

In [None]:
import numpy as np

# --- Evaluate the saved A2C model on ObsWrapper observations ("A2C-CI") ---
# NOTE(review): ObsWrapper((64, 64), env) presumably combines the compass with
# a 64x64 image observation - confirm in Wrappers.py.
env_ci = ObsWrapper((64, 64), env)
env_ci = ActionWrapper(env_ci)
model_ci = A2C.load("a2c_mineRL_CnnMLP_2")

nr_steps = 1000
nr_iter = 10
# One row per episode, one column per step. Steps that are never reached
# (episode finished early) keep their initial reward of 0.
reward_list_m_ci = np.zeros((nr_iter, nr_steps))
# Defined up front so the name exists even if nr_iter is 0.
tot_rew = 0

for j in range(nr_iter):
    obs = env_ci.reset()
    done = False
    # Reset at the start of every episode: the value printed below is this
    # episode's return, not a running total over all episodes so far.
    tot_rew = 0
    for i in range(nr_steps):
        action, _states = model_ci.predict(obs)
        obs, reward, done, info = env_ci.step(action)
        reward_list_m_ci[j,i] = reward
        tot_rew += reward
        # Rendering goes through the base env that the wrappers share.
        env.render()
        if done:
            break
    print("Iter {0}, {1} done!".format(j, tot_rew))

In [None]:
import matplotlib.pyplot as plt
# Draw the cumulative reward of every A2C-CI evaluation episode as its own curve.
for episode in range(nr_iter):
    episode_rewards = reward_list_m_ci[episode]
    plt.plot(episode_rewards.cumsum())

plt.title("A2C-CI")
plt.show()

# Random

In [None]:
import numpy as np

# --- Baseline: uniformly sampled actions on the unwrapped environment ---
nr_steps = 1000
nr_iter = 10
# One row per episode, one column per step. Steps that are never reached
# (episode finished early) keep their initial reward of 0.
reward_list_rand = np.zeros(shape=(nr_iter, nr_steps))

for episode in range(nr_iter):
    obs = env.reset()
    done = False
    net_reward = 0  # never read in this cell; kept so the notebook namespace is unchanged
    step = 0
    # Run until the step budget is used up or the environment reports done.
    while step < nr_steps and not done:
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        reward_list_rand[episode, step] = reward
        step += 1
    print("Iter {0} done!".format(episode))

In [None]:
import matplotlib.pyplot as plt
# Draw the cumulative reward of every random-agent episode as its own curve.
for episode in range(nr_iter):
    episode_rewards = reward_list_rand[episode]
    plt.plot(episode_rewards.cumsum())

plt.title("Random")
plt.show()

In [None]:
import matplotlib.pyplot as plt
# Summary figure: mean cumulative reward per step with a +/- 1 standard
# deviation band, for each of the three agents.
#
# For every reward matrix (rows = episodes, columns = steps) the cumulative
# sum is taken along the step axis, then mean and std are taken across
# episodes (axis 0), giving one value per step.
cum_vec_m_c = np.cumsum(reward_list_m_c, axis=1)
mean_vec_m_c = np.mean(cum_vec_m_c, axis=0)
std_vec_m_c = np.std(cum_vec_m_c, axis=0)

cum_vec_m_ci = np.cumsum(reward_list_m_ci, axis=1)
mean_vec_m_ci = np.mean(cum_vec_m_ci, axis=0)
std_vec_m_ci = np.std(cum_vec_m_ci, axis=0)

cum_vec_rand = np.cumsum(reward_list_rand, axis=1)
mean_vec_rand = np.mean(cum_vec_rand, axis=0)
std_vec_rand = np.std(cum_vec_rand, axis=0)

# (mean, std, legend label) per agent. Labels use "A2C", matching the
# algorithm name and the titles of the per-agent plots.
summary_curves = (
    (mean_vec_m_c, std_vec_m_c, 'Cumulative reward A2C-C agent'),
    (mean_vec_m_ci, std_vec_m_ci, 'Cumulative reward A2C-CI agent'),
    (mean_vec_rand, std_vec_rand, 'Cumulative reward random agent'),
)

for mean_vec, std_vec, curve_label in summary_curves:
    # Derive the x-axis from the data instead of hard-coding 1000, so the
    # band still lines up with the curve if nr_steps is changed.
    step_axis = np.arange(len(mean_vec))
    plt.plot(step_axis, mean_vec, label=curve_label)
    plt.fill_between(step_axis, mean_vec - std_vec, mean_vec + std_vec, alpha=0.5)

plt.legend()
plt.xlabel('time')
plt.ylabel('Reward')
# Consistent with the other plotting cells; also suppresses the Text repr.
plt.show()