In [1]:
from gridworld import *
from tqdm import trange
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#SARSA Q-learning

goals = [(0, 5), (4, 1)]
error_states = [(o, 0) for o in range(3, 6)] + [(5, o) for o in range(1, 3)]
gw = gridworld((6, 6), goals, error_states, 0.79, 0, 1, 0)
y = 0.6

In [2]:
#Empirically estimate risk probability
num_sims = 10000
def estimate_rr(pi):
    risky = 0
    rewards = []
    discounted = []
    for i in trange(num_sims):
        s = gw.reset()
        reward = 0
        d = False
        j = 0
        while(j < 1000):
            a = pi[s[0],s[1]]
            s1,r,d = gw.step(a)
            reward += r
            if d == True:
                if s1 in gw.error_states:
                    risky += 1
                    discounted_reward = 0
                else:
                    discounted_reward = pow(y, j) * r
                break
            s = s1
            j += 1
        rewards.append(reward)
        discounted.append(discounted_reward)
    return sum(discounted)/float(num_sims), sum(rewards)/float(num_sims), float(risky)/num_sims    

In [3]:
trials = 10
Q_list = []
reward_trial_list = []

np.random.seed(1)

# Set learning parameters
lr = 0.7
num_episodes = 10000

for k in range(trials):
    #Initialize table with all zeros
    Q = np.zeros((gw.M,gw.N,4))
    jList = []
    rList = []
    for i in trange(num_episodes):
        #print('Episode', i)
        #Reset environment and get first new observation
        s = gw.reset()
        rAll = 0
        d = False
        j = 0
        #The Q-Table learning algorithm
        a = np.argmax(Q[s[0],s[1],:] + np.random.randn(1,4)*(1./(i+1)))
        while j < 1000:
            j+=1
            #Get new state and reward from environment
            s1,r,d = gw.step(a)
            #New action
            a1 = np.argmax(Q[s[0],s[1],:] + np.random.randn(1,4)*(1./(i+1)))
            #Update Q-Table with new knowledge - sarsa step yo
            Q[s[0], s[1],a] = Q[s[0], s[1],a] + lr*(r + y * Q[s1[0], s1[1], a1] - Q[s[0], s[1],a])

            rAll += r
            s = s1
            a = a1
            if d == True:
                break
        jList.append(j)    
        rList.append(rAll)
        
    Q_list.append(Q)
#     v = np.mean(Q, axis = 2)
#     gw.print_v(v)

    pi = np.argmax(Q, axis = 2)
    gw.print_policy(pi)
#     reward_trial_list.append(sum(rList)/float(num_episodes))
#     print "Score over time: " +  str(sum(rList)/float(num_episodes))

#     window = int(num_episodes/10)

#     plt.figure(figsize=[9,16])
#     plt.subplot(411)
#     plt.plot(pd.Series(jList).rolling(window).mean())
#     plt.title('Step Moving Average ({}-episode window)'.format(window))
#     plt.ylabel('Moves')
#     plt.xlabel('Episode')

#     plt.subplot(412)
#     plt.plot(pd.Series(rList).rolling(window).mean())
#     plt.title('Reward Moving Average ({}-episode window)'.format(window))
#     plt.ylabel('Reward')
#     plt.xlabel('Episode')

#     plt.tight_layout(pad=2)
#     plt.show()

gw.print_gw()
meanQ = np.mean(np.array(Q_list), axis = 0)
meanv = np.mean(meanQ, axis = 2)
gw.print_v(meanv)
meanpi = np.argmax(meanQ, axis = 2)
gw.print_policy(meanpi)
E_dis_reward, E_reward, E_risk = estimate_rr(meanpi)
print "Mean reward (empirical)", E_dis_reward
print "Successful Episodes ", E_reward
print "Risk probab ", E_risk

100%|██████████| 10000/10000 [00:01<00:00, 8218.83it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,⬆,➡,⬆,➡,G
1,➡,⬇,➡,⬆,➡,⬆
2,➡,➡,➡,⬆,➡,⬆
3,E,⬅,⬇,➡,➡,➡
4,E,G,⬅,⬅,➡,➡
5,E,E,E,⬆,➡,⬆


100%|██████████| 10000/10000 [00:01<00:00, 6378.22it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,⬇,⬇,➡,➡,⬆,⬆
2,⬆,⬇,⬇,⬆,⬆,⬆
3,E,⬇,⬅,⬆,⬆,⬆
4,E,G,⬅,⬆,➡,➡
5,E,E,E,⬆,⬆,⬆


100%|██████████| 10000/10000 [00:01<00:00, 6359.74it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,➡,⬇,➡,⬆,➡,⬆
2,➡,⬆,⬇,➡,➡,⬆
3,E,⬇,➡,➡,⬆,⬆
4,E,G,⬅,⬅,⬆,⬆
5,E,E,E,⬆,⬆,⬆


100%|██████████| 10000/10000 [00:01<00:00, 8213.99it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,⬆,➡,➡,➡,⬆,⬆
2,⬆,⬇,➡,➡,⬆,⬆
3,E,⬇,➡,➡,⬆,⬆
4,E,G,⬅,⬅,⬅,➡
5,E,E,E,⬆,⬆,⬆


100%|██████████| 10000/10000 [00:01<00:00, 7898.66it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,⬆,➡,➡,G
1,➡,➡,➡,⬆,⬆,⬆
2,➡,⬇,⬆,⬆,➡,⬆
3,E,⬇,⬇,⬆,⬆,⬆
4,E,G,⬅,⬅,⬆,⬆
5,E,E,E,⬆,➡,⬆


100%|██████████| 10000/10000 [00:01<00:00, 7593.53it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,⬆,➡,G
1,➡,➡,➡,➡,➡,⬆
2,➡,⬇,⬆,➡,➡,⬆
3,E,⬇,⬅,⬅,➡,⬆
4,E,G,⬅,⬅,⬅,⬆
5,E,E,E,⬆,⬅,⬆


100%|██████████| 10000/10000 [00:01<00:00, 8222.56it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,➡,➡,➡,➡,➡,⬆
2,➡,⬇,⬇,⬇,⬆,⬆
3,E,⬇,⬅,⬅,⬆,⬆
4,E,G,⬆,⬅,⬆,⬆
5,E,E,E,⬆,⬆,⬆


100%|██████████| 10000/10000 [00:01<00:00, 8223.55it/s]


Unnamed: 0,0,1,2,3,4,5
0,⬆,➡,➡,➡,⬆,G
1,➡,⬇,➡,➡,➡,⬆
2,➡,⬆,⬇,⬇,⬆,⬆
3,E,⬇,⬅,⬅,➡,➡
4,E,G,⬅,⬅,⬅,⬆
5,E,E,E,⬆,➡,⬆


100%|██████████| 10000/10000 [00:01<00:00, 7893.93it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,➡,➡,➡,➡,➡,⬆
2,➡,⬆,⬅,➡,⬆,⬆
3,E,⬇,⬆,➡,➡,⬆
4,E,G,⬅,⬅,⬅,➡
5,E,E,E,⬅,⬆,⬆


100%|██████████| 10000/10000 [00:01<00:00, 7894.31it/s]


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,⬆,➡,⬆,⬆,➡,⬆
2,➡,➡,⬆,⬆,⬆,➡
3,E,⬇,⬇,➡,⬆,➡
4,E,G,⬅,⬅,➡,⬆
5,E,E,E,⬆,➡,⬆


Unnamed: 0,0,1,2,3,4,5
0,-,-,-,-,-,G
1,-,-,-,-,-,-
2,-,-,-,-,-,-
3,E,-,-,-,-,-
4,E,G,-,-,-,-
5,E,E,E,-,-,-


Unnamed: 0,0,1,2,3,4,5
0,0.018,0.034,0.078,0.189,0.412,G
1,0.01,0.027,0.045,0.0895,0.21,0.43
2,0.0047,0.076,0.032,0.0492,0.112,0.19
3,E,0.27,0.098,0.032,0.0584,0.094
4,E,G,0.31,0.0974,0.0324,0.034
5,E,E,E,0.00913,0.00995,0.02


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,➡,⬇,➡,➡,➡,⬆
2,➡,⬇,⬇,➡,⬆,⬆
3,E,⬇,⬅,⬅,⬆,⬆
4,E,G,⬅,⬅,⬅,⬆
5,E,E,E,⬆,⬆,⬆


100%|██████████| 10000/10000 [00:00<00:00, 37705.57it/s]

Mean reward (empirical) 0.326994538963
Successful Episodes  0.9564
Risk probab  0.0436



