In [20]:
from gridworld import *
from tqdm import trange
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#Normal Q-Learning

# Discount factor shared by the Q-learning update and the evaluation rollouts.
y = 0.6

# Terminal cells of the 6x6 grid. The printed grid in this notebook shows these
# coordinates as (row, column): goals appear as "G", error states as "E".
goals = [(0, 5), (4, 1)]
error_states = [(3, 0), (4, 0), (5, 0), (5, 1), (5, 2)]

# NOTE(review): the trailing positional arguments (0.79, 0, 1, 0) are passed
# through unchanged; their meaning is defined by gridworld -- TODO confirm.
gw = gridworld((6, 6), goals, error_states, 0.79, 0, 1, 0)

In [22]:
#Empirically estimate the expected reward and the risk probability of a policy
def estimate_rr(pi, num_sims=10000, max_steps=100):
    """Estimate reward and risk of a policy by Monte-Carlo rollouts.

    For every start state in ``gw.enum_good_states`` the policy is rolled out
    ``num_sims`` times on the module-level environment ``gw``. Each rollout
    stops when ``gw.step`` reports the episode as done or after ``max_steps``
    transitions, whichever comes first.

    Args:
        pi: Policy table indexed as ``pi[s[0], s[1]]``; the looked-up value is
            passed to ``gw.step`` as the action.
        num_sims: Number of rollouts per start state (default 10000).
        max_steps: Maximum number of transitions per rollout (default 100).

    Returns:
        A 3-tuple, each entry averaged first over rollouts and then over start
        states:
        ``(mean discounted reward, mean undiscounted reward, mean risk)``.
        The discounted reward of a rollout is ``y**j * r`` for the terminating
        step ``j`` (0-based) when it does not end in ``gw.error_states``, and 0
        otherwise. Risk is the fraction of rollouts that end in
        ``gw.error_states``.

    Raises:
        ValueError: If ``num_sims`` is smaller than 1.
    """
    if num_sims < 1:
        raise ValueError("num_sims must be at least 1")

    all_dis_rewards = []
    all_rewards = []
    all_risks = []
    for st in gw.enum_good_states:
        risky = 0
        discounted_rewards = []
        rewards = []
        for _ in trange(num_sims):
            s = st
            gw.set_state(s)
            reward = 0
            # Reset per rollout: an episode that reaches the step cap without
            # terminating must contribute 0, not raise NameError or silently
            # reuse the value left over from the previous rollout.
            discounted_reward = 0
            for j in range(max_steps):
                a = pi[s[0], s[1]]
                s1, r, d = gw.step(a)
                reward += r
                if d:
                    if s1 in gw.error_states:
                        # Ending in an error state counts as a risky rollout
                        # and earns no discounted reward.
                        risky += 1
                    else:
                        discounted_reward = pow(y, j) * r
                    break
                s = s1
            rewards.append(reward)
            discounted_rewards.append(discounted_reward)
        all_rewards.append(sum(rewards) / float(num_sims))
        all_dis_rewards.append(sum(discounted_rewards) / float(num_sims))
        all_risks.append(float(risky) / num_sims)
    return (
        sum(all_dis_rewards) / len(all_dis_rewards),
        sum(all_rewards) / len(all_rewards),
        sum(all_risks) / len(all_risks),
    )
    

In [24]:
# --- Q-learning experiment configuration ---

# Seed NumPy's global RNG so the exploration noise drawn during training is
# repeatable from run to run.
np.random.seed(1)

# Number of independent training runs, and episodes per run.
trials = 10
num_episodes = 10 ** 5

# Step size applied to every temporal-difference update.
lr = 0.9

# Collects the final Q-table of each trial.
Q_list = list()
# Only referenced by commented-out code in this cell.
reward_trial_list = list()

# Per-episode step and reward lists are created inside the trial loop.

# Train `trials` independent tabular Q-learning runs. Each run starts from an
# all-zero Q-table, appends its final table to Q_list and prints the greedy
# policy derived from it.
for k in range(trials):
    # Initialize the table with all zeros: one value per grid cell
    # (gw.M x gw.N) and per action (4 actions).
    Q = np.zeros((gw.M,gw.N,4))
    # Steps taken and total reward per episode; only the commented-out
    # plotting code at the end of this loop reads them.
    jList = []
    rList = []
    for i in trange(num_episodes):
        #print('Episode', i)
        # Reset the environment and get the first observation
        s = gw.reset()
        rAll = 0
        d = False
        j = 0
        # The Q-table learning algorithm; an episode is cut off after 1000 steps
        while j < 1000:
            j+=1
            # Choose an action greedily from the Q-table after adding Gaussian
            # noise scaled by 1/(i+1), so the noise shrinks as the episode
            # index grows. The sum has shape (1, 4); np.argmax works on the
            # flattened array, so `a` is in 0..3.
            a = np.argmax(Q[s[0],s[1],:] + np.random.randn(1,4)*(1./(i+1)))
#             if np.random.rand() < epsilon:
#                 # take random action
#                 a = np.random.choice([0,1,2,3])
#             else:
#                 # take action according to the q function table
#                 a = np.argmax(Q[s[0],s[1],:])
            # Get the new state, reward and done flag from the environment
            s1,r,d = gw.step(a)

            # Q-learning update: move Q(s, a) towards the bootstrapped target
            # r + y * max_a' Q(s1, a') with step size lr (y is the discount
            # factor defined in an earlier cell).
            Q[s[0], s[1],a] = Q[s[0], s[1],a] + lr*(r + y*np.max(Q[s1[0], s1[1],:]) - Q[s[0], s[1],a])

            rAll += r
            s = s1
            if d == True:
                break
        jList.append(j)    
        rList.append(rAll)
        
    # Keep this trial's final table so the trials can be averaged afterwards.
    Q_list.append(Q)
#     v = np.mean(Q, axis = 2)
#     gw.print_v(v)

    # Greedy policy of this trial: best action index for every cell.
    pi = np.argmax(Q, axis = 2)
    gw.print_policy(pi)
#     reward_trial_list.append(sum(rList)/float(num_episodes))
#     print "Score over time: " +  str(sum(rList)/float(num_episodes))

#     window = int(num_episodes/10)

#     plt.figure(figsize=[9,16])
#     plt.subplot(411)
#     plt.plot(pd.Series(jList).rolling(window).mean())
#     plt.title('Step Moving Average ({}-episode window)'.format(window))
#     plt.ylabel('Moves')
#     plt.xlabel('Episode')

#     plt.subplot(412)
#     plt.plot(pd.Series(rList).rolling(window).mean())
#     plt.title('Reward Moving Average ({}-episode window)'.format(window))
#     plt.ylabel('Reward')
#     plt.xlabel('Episode')

#     plt.tight_layout(pad=2)
#     plt.show()

# Show the grid layout for reference.
gw.print_gw()

# Average the Q-tables of all trials element-wise, then derive the state
# values (best action value per cell) and the greedy policy from the average.
meanQ = np.mean(np.array(Q_list), axis = 0)
maxv = np.max(meanQ, axis = 2)
gw.print_v(maxv)
meanpi = np.argmax(meanQ, axis = 2)
gw.print_policy(meanpi)

# Evaluate the averaged policy with Monte-Carlo rollouts.
E_dis_reward, E_reward, E_risk = estimate_rr(meanpi)

# %-formatting inside print(...) emits the same text as the former Python 2
# print statements while also being valid Python 3 syntax.
# NOTE(review): "Successful Episodes" reports the mean undiscounted reward
# returned by estimate_rr -- confirm this label matches the reward scheme.
print("Mean reward (empirical) %s" % E_dis_reward)
print("Successful Episodes  %s" % E_reward)
print("Risk probab  %s" % E_risk)


  0%|          | 0/100000 [00:00<?, ?it/s][A
  1%|          | 514/100000 [00:00<00:19, 5121.87it/s][A
  1%|          | 875/100000 [00:00<00:21, 4549.19it/s][A
  1%|▏         | 1379/100000 [00:00<00:21, 4683.01it/s][A
  2%|▏         | 1918/100000 [00:00<00:20, 4873.91it/s][A
  2%|▏         | 2392/100000 [00:00<00:20, 4824.39it/s][A
  3%|▎         | 2898/100000 [00:00<00:19, 4880.74it/s][A
  3%|▎         | 3389/100000 [00:00<00:19, 4884.90it/s][A
  4%|▍         | 3993/100000 [00:00<00:18, 5181.42it/s][A
  4%|▍         | 4489/100000 [00:00<00:20, 4621.20it/s][A
  5%|▍         | 4947/100000 [00:01<00:21, 4321.90it/s][A
  5%|▌         | 5381/100000 [00:01<00:22, 4136.55it/s][A
  6%|▌         | 5894/100000 [00:01<00:21, 4389.46it/s][A
  6%|▋         | 6417/100000 [00:01<00:20, 4611.67it/s][A
  7%|▋         | 6945/100000 [00:01<00:19, 4791.53it/s][A
  7%|▋         | 7475/100000 [00:01<00:18, 4933.07it/s][A
  8%|▊         | 8121/100000 [00:01<00:17, 5307.43it/s][A
  9%|▊     

Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,➡,➡,⬆,⬆,⬆,⬆
2,➡,➡,⬇,➡,➡,⬆
3,E,⬇,⬇,⬅,⬇,⬇
4,E,G,⬅,⬅,⬅,⬅
5,E,E,E,⬆,⬆,⬅



  0%|          | 0/100000 [00:00<?, ?it/s][A
  0%|          | 320/100000 [00:00<00:31, 3199.01it/s][A
  1%|          | 707/100000 [00:00<00:29, 3374.41it/s][A
  1%|          | 1067/100000 [00:00<00:28, 3435.38it/s][A
  1%|▏         | 1422/100000 [00:00<00:28, 3468.38it/s][A
  2%|▏         | 1798/100000 [00:00<00:27, 3546.35it/s][A
  2%|▏         | 2192/100000 [00:00<00:26, 3649.85it/s][A
  3%|▎         | 2718/100000 [00:00<00:24, 4018.76it/s][A
  3%|▎         | 3390/100000 [00:00<00:21, 4565.51it/s][A
  4%|▍         | 3863/100000 [00:00<00:23, 4146.44it/s][A
  5%|▍         | 4530/100000 [00:01<00:20, 4676.27it/s][A
  5%|▌         | 5174/100000 [00:01<00:18, 5089.99it/s][A
  6%|▌         | 5836/100000 [00:01<00:17, 5468.54it/s][A
  6%|▋         | 6416/100000 [00:01<00:16, 5536.41it/s][A
  7%|▋         | 6993/100000 [00:01<00:18, 5087.65it/s][A
  8%|▊         | 7526/100000 [00:01<00:20, 4453.10it/s][A
  8%|▊         | 8026/100000 [00:01<00:19, 4601.62it/s][A
  9%|▊     

Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,⬆,⬆,⬆,➡,⬆,⬆
2,➡,⬆,⬆,⬆,⬆,⬅
3,E,⬇,⬇,⬇,⬆,⬆
4,E,G,⬅,⬅,⬆,⬅
5,E,E,E,⬆,⬆,⬅



  0%|          | 0/100000 [00:00<?, ?it/s][A
  0%|          | 469/100000 [00:00<00:21, 4687.32it/s][A
  1%|          | 1037/100000 [00:00<00:20, 4945.33it/s][A
  2%|▏         | 1660/100000 [00:00<00:18, 5266.78it/s][A
  2%|▏         | 2046/100000 [00:00<00:20, 4747.20it/s][A
  3%|▎         | 2640/100000 [00:00<00:19, 5051.17it/s][A
  3%|▎         | 3259/100000 [00:00<00:18, 5345.40it/s][A
  4%|▍         | 3755/100000 [00:00<00:20, 4587.82it/s][A
  4%|▍         | 4206/100000 [00:00<00:21, 4373.44it/s][A
  5%|▍         | 4640/100000 [00:00<00:22, 4207.57it/s][A
  5%|▌         | 5060/100000 [00:01<00:23, 4110.75it/s][A
  5%|▌         | 5471/100000 [00:01<00:23, 4016.38it/s][A
  6%|▌         | 6074/100000 [00:01<00:21, 4463.07it/s][A
  7%|▋         | 6621/100000 [00:01<00:19, 4723.09it/s][A
  7%|▋         | 7208/100000 [00:01<00:18, 5011.45it/s][A
  8%|▊         | 7725/100000 [00:01<00:18, 5037.21it/s][A
  8%|▊         | 8240/100000 [00:01<00:18, 4929.31it/s][A
  9%|▊    

Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,⬇,G
1,⬆,⬆,⬇,⬇,➡,⬆
2,➡,⬇,⬅,⬅,⬆,⬆
3,E,⬇,⬅,⬅,⬆,⬅
4,E,G,⬆,⬅,⬅,⬅
5,E,E,E,⬆,➡,⬆


Unnamed: 0,0,1,2,3,4,5
0,-,-,-,-,-,G
1,-,-,-,-,-,-
2,-,-,-,-,-,-
3,E,-,-,-,-,-
4,E,G,-,-,-,-
5,E,E,E,-,-,-


Unnamed: 0,0,1,2,3,4,5
0,0.027,0.14,0.24,0.35,0.67,G
1,0.039,0.03,0.13,0.12,0.23,0.97
2,0.048,0.19,0.058,0.06,0.16,0.38
3,E,0.79,0.073,0.12,0.06,0.02
4,E,G,0.31,0.28,0.13,0.083
5,E,E,E,0.18,0.08,0.02


Expected Reward  0.16592041831639462


Unnamed: 0,0,1,2,3,4,5
0,➡,➡,➡,➡,➡,G
1,⬆,⬆,⬆,➡,⬆,⬆
2,➡,⬇,⬅,⬅,⬆,⬆
3,E,⬇,⬅,⬅,⬆,⬇
4,E,G,⬅,⬅,⬅,⬅
5,E,E,E,⬆,⬆,⬅



  0%|          | 0/10000 [00:00<?, ?it/s][A
 33%|███▎      | 3343/10000 [00:00<00:00, 33424.29it/s][A
 63%|██████▎   | 6307/10000 [00:00<00:00, 32190.67it/s][A
 95%|█████████▌| 9547/10000 [00:00<00:00, 32248.95it/s][A
100%|██████████| 10000/10000 [00:00<00:00, 31083.88it/s][A
  0%|          | 0/10000 [00:00<?, ?it/s][A
 38%|███▊      | 3777/10000 [00:00<00:00, 37762.47it/s][A
 77%|███████▋  | 7680/10000 [00:00<00:00, 38132.87it/s][A
100%|██████████| 10000/10000 [00:00<00:00, 37121.87it/s][A
  0%|          | 0/10000 [00:00<?, ?it/s][A
 56%|█████▋    | 5642/10000 [00:00<00:00, 56404.72it/s][A
100%|██████████| 10000/10000 [00:00<00:00, 54029.56it/s][A
  0%|          | 0/10000 [00:00<?, ?it/s][A
 82%|████████▏ | 8168/10000 [00:00<00:00, 81679.30it/s][A
100%|██████████| 10000/10000 [00:00<00:00, 75412.57it/s][A
  0%|          | 0/10000 [00:00<?, ?it/s][A
100%|██████████| 10000/10000 [00:00<00:00, 136448.05it/s][A
  0%|          | 0/10000 [00:00<?, ?it/s][A
 24%|██▍       

Mean reward  0.308593515045
Risk probab  0.0489379310345
