In [1]:
import gym
import numpy as np
from pybullet_envs.bullet.kuka_diverse_object_gym_env import KukaDiverseObjectEnv
from gym import spaces
import pybullet as p
from collections import defaultdict

current_dir=C:\Users\ecl23226\AppData\Local\Continuum\anaconda3\lib\site-packages\pybullet_envs\bullet


In [2]:
# Kuka grasping environment configured with continuous actions
# (isDiscrete=False) and maxSteps=20.
env = KukaDiverseObjectEnv(
    renders=True,
    isDiscrete=False,
    removeHeightHack=True,
    maxSteps=20,
)

# NOTE(review): this overwrites the client id stored on the env with a fresh
# DIRECT-mode connection even though renders=True was requested above --
# confirm this is intentional.
env.cid = p.connect(p.DIRECT)

# Box from which exploratory actions are sampled: shape (5, 1), each
# component bounded to [-1, 1].
action_space = spaces.Box(low=-1, high=1, shape=(5, 1))

In [11]:
def generate_episode_from_Q(env, Q, epsilon, nA, height_hack_prob=0.9):
    """Roll out one episode with an epsilon-greedy policy derived from Q.

    When the current state already has entries in ``Q``, the stored action
    with the highest value estimate is replayed with probability
    ``1 - epsilon``.  In every other case (unseen state, or an exploration
    step) an action is drawn from the module-level ``action_space``.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()``, ``render(mode=...)`` and
        ``step(action)`` returning ``(next_state, reward, done, info)``.
    Q : mapping
        ``state_key -> {action_key: value}`` where both keys are tuples of
        rounded numbers (the layout ``update_Q`` writes).  ``value`` may be a
        scalar or an array; its maximum is used for ranking.
    epsilon : float
        Probability of exploring when the state is already known.
    nA : int
        Unused; retained so existing callers keep working.
    height_hack_prob : float
        Per-step probability of overriding ``action[2]`` with ``-1``.

    Returns
    -------
    list of tuple
        ``(state_key, action, reward)`` for every step taken, where
        ``action`` is the rounded array that was passed to ``env.step``.
    """
    episode = []
    state = env.reset()
    while True:
        env.render(mode='human')
        state_m = tuple(np.round(state.flat))

        # Membership test first so an unseen state is never inserted into a
        # defaultdict merely by looking it up.
        known_actions = Q[state_m] if state_m in Q else None

        if known_actions and np.random.random() >= epsilon:
            print("*******************From Q!*************************")
            # Q is keyed by action tuples rather than integer indices, so the
            # greedy action is the stored key with the best value estimate.
            best_key = max(
                known_actions,
                key=lambda a: float(np.max(known_actions[a])),
            )
            # Rebuild a mutable array in the shape the sampled actions have,
            # so the height hack and env.step treat both sources alike.
            action = np.array(best_key, dtype=float).reshape(action_space.shape)
        else:
            action = action_space.sample()

        if np.random.random() < height_hack_prob:
            action[2] = -1

        action_r = np.round(action)
        next_state, reward, done, info = env.step(action_r)
        episode.append((state_m, action_r, reward))
        state = next_state
        if done:
            break
    return episode

In [12]:
def update_Q(episode, Q, alpha, gamma):
    """Fold one finished episode into the action-value table ``Q``.

    Every step of the episode (repeat visits included) moves the stored value
    for its (state, rounded action) pair towards the discounted return
    observed from that step onward, using step size ``alpha``.

    Parameters
    ----------
    episode : sequence of ``(state, action, reward)`` tuples; ``action`` must
        expose ``.flat`` (e.g. a NumPy array).
    Q : nested mapping ``state -> action_key -> value``; updated in place.
    alpha : step size of the incremental update.
    gamma : discount factor.

    Returns
    -------
    The same ``Q`` object that was passed in.
    """
    states, actions, rewards = zip(*episode)
    n_steps = len(rewards)
    # gamma**0 .. gamma**n_steps; the slice below takes as many leading
    # entries as there are rewards remaining from the current step.
    discounts = np.array([gamma**power for power in range(n_steps + 1)])
    for step, (state, action) in enumerate(zip(states, actions)):
        action_key = tuple(np.round(action.flat))
        action_values = Q[state]
        previous = action_values[action_key]
        discounted_return = sum(rewards[step:] * discounts[:n_steps - step])
        action_values[action_key] = previous + alpha * (discounted_return - previous)
    return Q

In [13]:
def mc_control(env, num_episodes, alpha, gamma=0.95, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    """Run constant-alpha Monte Carlo control and return the greedy policy.

    Parameters
    ----------
    env : environment handed to ``generate_episode_from_Q``.
    num_episodes : int
        Number of episodes to generate and learn from.
    alpha : float
        Step size passed to ``update_Q``.
    gamma : float
        Discount factor passed to ``update_Q``.
    eps_start, eps_decay, eps_min : float
        Epsilon starts at ``eps_start``, is multiplied by ``eps_decay``
        before every episode and is never allowed below ``eps_min``.

    Returns
    -------
    policy : dict
        ``state_key -> action_key`` of the best-valued action recorded for
        that state (``None`` if a state has no recorded actions).
    Q : defaultdict
        Nested table ``state_key -> action_key -> value``.
    """
    # First dimension of the module-level action_space shape.
    nA = action_space.shape[0]
    # NOTE(review): each new state-action entry starts as np.zeros(nA); a
    # scalar default looks sufficient, but the layout is kept so the returned
    # Q is unchanged for existing callers.
    Q = defaultdict(lambda: defaultdict(lambda: np.zeros(nA)))
    epsilon = eps_start
    for _ in range(1, num_episodes + 1):
        # Decay exploration, clamped at eps_min.
        epsilon = max(epsilon * eps_decay, eps_min)
        # Generate an episode by following the epsilon-greedy policy.
        episode = generate_episode_from_Q(env, Q, epsilon, nA)
        # Update the action-value estimate using that episode.
        Q = update_Q(episode, Q, alpha, gamma)

    # Q[state] is a dict keyed by action tuples, so np.argmax over it cannot
    # rank actions; pick the key whose value estimate is largest instead.
    policy = {
        state: max(
            action_values,
            key=lambda a, av=action_values: float(np.max(av[a])),
            default=None,
        )
        for state, action_values in Q.items()
    }
    return policy, Q

In [16]:
# Train for 3 episodes with step size alpha=0.1; the remaining
# hyperparameters fall back to mc_control's defaults.
policy, Q = mc_control(env, num_episodes=3, alpha=0.1)

In [24]:
# Number of state keys that have a policy entry.
len(policy)

23

In [25]:
# Number of state keys stored in the action-value table.
len(Q)

23

In [28]:
# Inspect the observation space exposed by the environment object.
print(env.observation_space)

None
