In [2]:
import gym
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
#Make the MountainCar-v0 environment and reset it
#The value returned by reset() is the cell output: a 2-element array that is
#unpacked later in this notebook as (position, velocity)
env = gym.make('MountainCar-v0')
env.reset()

array([-0.40723108,  0.        ])

In [8]:
#Inspect both spaces: first the actions we can take, then the observations we get back
for space in (env.action_space, env.observation_space):
    print(space)

Discrete(3)
Box(2,)


In [12]:
#After exploration, try rendering and experimenting with the environment
#Start from a fresh episode so this cell does not depend on whatever state
#earlier cells left the environment in
env.reset()
try:
    for step_idx in range(500):
        #Keep applying action 2; element 2 of the step result is the done flag
        done = env.step(2)[2]
        env.render()
        #Stop once the episode is over instead of stepping a finished episode
        if done:
            break
finally:
    #Always release the render window, even if stepping or rendering raises
    env.close()
#Leave the environment freshly reset for the next cells (this is the cell output)
env.reset()

array([-0.54440131,  0.        ])

In [13]:
#Now we come to the original problem, we have to get the car to the top
#env.step(action) returns a 4-tuple, unpacked later in this notebook as
#(observation, reward, done, info); print one to see what it looks like
print(env.step(1))

(array([-5.44245393e-01,  1.55917755e-04]), -1.0, False, {})


In [14]:
#Element 0 of the step result is the observation: the position and the velocity of the car
#(this call also advances the environment by one more step)
env.step(1)[0]

array([-5.43934724e-01,  3.10668387e-04])

In [15]:
#Element 1 of the step result is the reward for the move we just made
#(this call also advances the environment by one more step)
env.step(1)[1]

-1.0

In [16]:
#Element 2 of the step result is a bool meant to tell us whether we reached our
#destination; later cells use it as the `done` flag that ends the episode loop
#NOTE(review): whether it can also become True for other reasons is not visible here - confirm
#(this call also advances the environment by one more step)
env.step(1)[2]

False

In [20]:
#To optimise our behaviour we will need a q_table, and to build one we first
#need the lower and upper bounds of the observation space
bounds = env.observation_space
for limit in (bounds.low, bounds.high):
    print(limit)

[-1.2  -0.07]
[0.6  0.07]


In [23]:
#Using the above bounds, build 20 evenly spaced points for the position and
#20 for the velocity of our car; these act as the edges of the discrete bins
pos_arr, vel_arr = (
    np.linspace(lower, upper, 20)
    for lower, upper in ((-1.2, 0.6), (-0.07, 0.07))
)
print(pos_arr , vel_arr)

[-1.2        -1.10526316 -1.01052632 -0.91578947 -0.82105263 -0.72631579
 -0.63157895 -0.53684211 -0.44210526 -0.34736842 -0.25263158 -0.15789474
 -0.06315789  0.03157895  0.12631579  0.22105263  0.31578947  0.41052632
  0.50526316  0.6       ] [-0.07       -0.06263158 -0.05526316 -0.04789474 -0.04052632 -0.03315789
 -0.02578947 -0.01842105 -0.01105263 -0.00368421  0.00368421  0.01105263
  0.01842105  0.02578947  0.03315789  0.04052632  0.04789474  0.05526316
  0.06263158  0.07      ]


In [67]:
#Now let's say we take one step with action 2 and look at the tuple it returns
env.step(2)
#We notice that the position and velocity in the returned observation must each
#be mapped to an index in the discretised arrays above

(array([-0.41968951,  0.02449702]), -1.0, False, {})

In [75]:
#Hence we define a function to map an observation to its discrete state
def get_state(obs, pos_bins=None, vel_bins=None):
    """Map a continuous observation to a discrete (position, velocity) state.

    Parameters
    ----------
    obs : sequence of two numbers
        The observation, unpacked as ``(position, velocity)``.
    pos_bins, vel_bins : array-like, optional
        Increasing bin edges for position and velocity. When omitted, the
        notebook-level ``pos_arr`` and ``vel_arr`` are used, so existing
        ``get_state(obs)`` calls keep working unchanged.

    Returns
    -------
    tuple of int
        ``(pos_idx, vel_idx)`` as given by ``np.digitize``: 0 for a value
        below the first edge, ``len(bins)`` for a value at or above the last
        edge. Plain Python ints are returned so the tuple is a clean dict key.
    """
    pos, vel = obs
    #Fall back to the notebook globals only when no edges were passed in
    if pos_bins is None:
        pos_bins = pos_arr
    if vel_bins is None:
        vel_bins = vel_arr
    pos_idx = int(np.digitize(pos, pos_bins))
    vel_idx = int(np.digitize(vel, vel_bins))
    return (pos_idx, vel_idx)

#Sanity check: the discrete state of a freshly reset environment
get_state(env.reset())

(8, 10)

In [114]:
#Now, let's make a q_table holding one value per (state, action) pair
#The loops below enumerate indices 0..20 in each dimension:
#Position indices = 21
#Velocity indices = 21
#Total number of states = 441, each combined with 3 actions
state = [(pos_idx, vel_idx) for pos_idx in range(21) for vel_idx in range(21)]

#Keys are (state, action) tuples and every entry starts at 0
Q = {(s, a): 0 for s in state for a in range(3)}

#Print the dict to double check:
#print(Q)
        

In [115]:
#Now, we make a function that picks the greedy action for a state, i.e. the
#index of the largest q_value among Q[state, 0], Q[state, 1] and Q[state, 2]
def get_max(Q , obs):
    """Return the action (0, 1 or 2) whose Q[obs, action] is largest.

    Ties are resolved by np.argmax, which returns the first maximal index.
    """
    action_values = [Q[obs, action] for action in range(3)]
    return np.argmax(action_values)
#Check (left commented out; uncomment to try get_max on one state)
#state = (8,10)
#get_max(Q , state)
#Reset the environment; the returned observation is the cell output
env.reset()

array([-0.52407863,  0.        ])

In [None]:
#Now we start optimising our q_table with tabular Q-learning
#NOTE(review): presumably this raises the step cap of an episode; it sets a
#private (underscore) attribute of env - confirm against the installed gym version
env._max_episode_steps = 1000
#Set learning rate, discount factor and ephsilon (exploration rate) for the MDP
alpha = 0.1
gamma = 0.99
ephsilon = 1.0
#Set the number of episodes for which you want to repeat this
n = 50000
total_rewards = np.zeros(n)
#NOTE(review): mean_rewards is allocated here but never written in this cell
mean_rewards = np.zeros(n)
for i in range(n):
    done = False
    #Every episode starts from a fresh reset, so no reset is needed before the loop
    obs = env.reset()
    state = get_state(obs)
    score = 0
    while not done:
        #Ephsilon-greedy: explore with probability ephsilon, otherwise act greedily
        if np.random.uniform(0,1) < ephsilon:
            action = np.random.choice([0,1,2])
        else:
            action = get_max(Q,state)
        obs_new , reward , done , info = env.step(action)
        score = score + reward
        state_new = get_state(obs_new)
        #Greedy action in the next state, so Q[state_new, action_new] is max over a' of Q[s', a']
        action_new = get_max(Q,state_new)
        #Q-learning update: move the value of the pair we just PLAYED, Q[state, action],
        #towards reward + gamma * max Q[next state]. Writing to Q[state_new, action_new]
        #instead would never improve the estimate of the action actually taken.
        Q[state , action] = Q[state , action] + alpha*(reward + gamma*Q[state_new , action_new] - Q[state , action])
        state = state_new
    total_rewards[i] = score
    #Update ephsilon: linear decay of 2/n per episode, never going below 0.01
    ephsilon = max(0.01, ephsilon - (2/n))
        

    
        

In [153]:
#Play one episode greedily with the learned q_table and render it
obs = env.reset()
state = get_state(obs)
done = False
score = 0
try:
    while not done:
        action = get_max(Q,state)
        #Step exactly once per chosen action; an extra env.step(action) here would
        #apply the action twice and throw away the first reward and done flag
        obs , reward , done , info = env.step(action)
        score += reward
        state = get_state(obs)
        env.render()
finally:
    #Always release the render window, even if the loop raises
    env.close()
    