In [1]:
import sys
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
class True_MDP():
    """Six-state chain environment with two actions (0 = left, 1 = right).

    The agent's position ``self.s`` is an integer in 0..5 and starts at 0.
    Moving left is deterministic; moving right is stochastic and driven by a
    single ``random.random()`` draw per call. (The method name ``swim``
    suggests this is the RiverSwim benchmark -- presumably; verify against
    the experiment description.)

    Every transition method returns a ``(next_state, reward)`` tuple.
    """

    def __init__(self):
        self.s = 0

    def swim(self, a):
        """Apply action ``a`` and return ``(next_state, reward)``.

        Parameters
        ----------
        a : int or float
            0 moves left, 1 moves right. Float values that compare equal to
            0 or 1 (e.g. entries of a float policy array) are accepted.

        Raises
        ------
        ValueError
            If ``a`` is neither 0 nor 1. Previously this case fell through
            and returned ``None``, which surfaced later as an unpack error.
        """
        if a == 0:
            return self.move_left()
        if a == 1:
            return self.move_right()
        raise ValueError(f"action must be 0 (left) or 1 (right), got {a!r}")

    def move_left(self):
        """Deterministic left move.

        At state 0 the agent stays put and receives reward 0.005; from any
        other state it moves one step left with reward 0.
        Assumes ``self.s`` is within 0..5.
        """
        if self.s == 0:
            return self.s, 0.005
        self.s -= 1
        return self.s, 0

    def move_right(self):
        """Stochastic right move, decided by one uniform draw in [0, 1).

        * state 5: stay with reward 1 (prob. 0.6), else slip back to 4.
        * states 1..4: advance (0.35), slip back (0.05), else stay (0.6).
        * state 0: advance (0.6), else stay (0.4).

        Only the stay-at-5 outcome yields a non-zero reward.
        """
        # Always draw exactly once so seeded runs stay reproducible.
        coin = random.random()

        if self.s == 5:
            if coin < 0.6:
                return self.s, 1
            self.s -= 1
            return self.s, 0

        if self.s == 0:
            if coin < 0.6:
                self.s += 1
            return self.s, 0

        # Interior states 1..4.
        if coin < 0.35:
            self.s += 1
        elif coin < 0.4:
            self.s -= 1
        return self.s, 0

    def get_state(self):
        """Return the current state without changing it."""
        return self.s

    def reset(self):
        """Put the agent back at state 0 and return that state."""
        self.s = 0
        return self.s

In [3]:
class DynamicProgramming():
    """Value iteration for a finite tabular MDP.

    Parameters
    ----------
    psample : array-like, shape (n_states, n_actions, n_states)
        ``psample[s][a][s2]`` is the probability of moving from ``s`` to
        ``s2`` under action ``a``.
    rsample : array-like, shape (n_states, n_actions)
        Expected immediate reward for each state-action pair.
    gamma : float, optional
        Discount factor (default 0.99, the value previously hard-coded).
    theta : float, optional
        Convergence threshold on the largest per-sweep value change
        (default 0.0001).
    """

    def __init__(self, psample, rsample, gamma=0.99, theta=0.0001):
        self.theta = theta
        self.gamma = gamma
        self.psample = np.asarray(psample, dtype=float)
        self.rsample = np.asarray(rsample, dtype=float)

    def One_step_LookAhead(self, v):
        """Return the action-value table implied by state values ``v``.

        ``A[s, a] = rsample[s, a] + gamma * sum_s2 psample[s, a, s2] * v[s2]``

        Parameters
        ----------
        v : array-like, shape (n_states,)

        Returns
        -------
        numpy.ndarray, shape (n_states, n_actions)
        """
        v = np.asarray(v, dtype=float)
        # np.dot of an (S, A, S) array with an (S,) vector contracts the last
        # axis, giving the expected next-state value for every (s, a).
        return self.rsample + self.gamma * np.dot(self.psample, v)

    def value_iteration(self):
        """Run in-place value iteration until the sweep change is below theta.

        Returns
        -------
        policy : numpy.ndarray of float, shape (n_states,)
            Greedy action per state (ties go to the lowest action index).
            Kept as a float array for compatibility with existing callers.
        V : numpy.ndarray, shape (n_states,)
            Converged state values.
        """
        n_states = self.rsample.shape[0]
        V = np.zeros(n_states)
        while True:
            delta = 0
            for s in range(n_states):
                previous = V[s]
                # Only row s is needed for this update; V is modified in
                # place so later states in the sweep see the new value.
                action_values = self.rsample[s] + self.gamma * np.dot(self.psample[s], V)
                V[s] = np.max(action_values)
                delta = max(delta, abs(previous - V[s]))
            if delta < self.theta:
                break

        # V is fixed now, so one look-ahead yields the greedy policy for all states.
        policy = np.argmax(self.One_step_LookAhead(V), axis=1).astype(float)
        return policy, V

In [4]:
class policy_evaluation():
    """Iterative evaluation of a fixed deterministic policy on a tabular MDP.

    Parameters
    ----------
    psample : array-like, shape (n_states, n_actions, n_states)
        Transition probabilities ``psample[s][a][s2]``.
    rsample : array-like, shape (n_states, n_actions)
        Expected immediate rewards.
    policy : array-like, shape (n_states,)
        Action chosen in each state; entries are truncated to ``int`` before
        use, so a float array of whole numbers is accepted.
    gamma : float, optional
        Discount factor (default 0.99, the value previously hard-coded).
    theta : float, optional
        Convergence threshold on the largest value change between sweeps
        (default 0.0001).
    """

    def __init__(self, psample, rsample, policy, gamma=0.99, theta=0.0001):
        self.theta = theta
        self.gamma = gamma
        self.psample = np.asarray(psample, dtype=float)
        self.rsample = np.asarray(rsample, dtype=float)
        self.policy = policy

    def _policy_model(self):
        """Return (P_pi, r_pi): the transition matrix and rewards under the policy."""
        actions = np.asarray(self.policy).astype(int)
        states = np.arange(self.rsample.shape[0])
        return self.psample[states, actions], self.rsample[states, actions]

    def One_step_LookAhead(self, v):
        """Apply one Bellman expectation backup to ``v`` and return the result.

        ``new_v[s] = rsample[s, pi(s)] + gamma * sum_s2 psample[s, pi(s), s2] * v[s2]``

        The input is not modified. (The earlier version reset ``v`` to zeros
        on entry, which discarded the discounted future term entirely.)
        """
        v = np.asarray(v, dtype=float)
        transition_pi, reward_pi = self._policy_model()
        return reward_pi + self.gamma * np.dot(transition_pi, v)

    def value_iteration(self):
        """Repeat the backup until the largest change is below ``theta``.

        Returns
        -------
        numpy.ndarray, shape (n_states,)
            Discounted value of following ``policy`` from each state.
        """
        V = np.zeros(self.rsample.shape[0])
        while True:
            V_next = self.One_step_LookAhead(V)
            delta = np.max(np.abs(V_next - V))
            V = V_next
            if delta < self.theta:
                break
        return V

In [5]:
# Ground-truth model of the chain: expected rewards indexed [state, action] and
# transition probabilities indexed [state, action, next_state].
true_rsample = np.zeros((6, 2))
true_rsample[0, 0] = 0.005   # action 0 at state 0
true_rsample[5, 1] = 0.6     # action 1 at state 5

true_psample = np.zeros((6, 2, 6))

# Action 0: state 0 stays where it is, every other state steps down by one.
true_psample[0, 0, 0] = 1
true_psample[np.arange(1, 6), 0, np.arange(0, 5)] = 1

# Action 1 at the two ends of the chain.
true_psample[0, 1, 0] = 0.4
true_psample[0, 1, 1] = 0.6
true_psample[5, 1, 4] = 0.4
true_psample[5, 1, 5] = 0.6

# Action 1 in states 1..4: step down / stay / step up.
true_psample[np.arange(1, 5), 1, np.arange(0, 4)] = 0.05
true_psample[np.arange(1, 5), 1, np.arange(1, 5)] = 0.6
true_psample[np.arange(1, 5), 1, np.arange(2, 6)] = 0.35


In [7]:
def PSRL(n_runs=50, n_episodes=5000, horizon=20, seed=None):
    """Posterior-sampling RL experiment on ``True_MDP``; plots mean cumulative regret.

    For each independent run the agent keeps a Dirichlet posterior over the
    transitions of every (state, action) pair and a Normal posterior over
    each mean reward. Every episode it samples one MDP from the posterior,
    solves it with ``DynamicProgramming``, follows the resulting policy for
    ``horizon`` steps and then updates the posterior with the observed
    transitions. Regret for an episode is the true optimal value of state 0
    minus the true value of the executed policy at state 0.

    Relies on the notebook-level names ``True_MDP``, ``DynamicProgramming``,
    ``policy_evaluation``, ``true_psample`` and ``true_rsample``.

    Parameters
    ----------
    n_runs : int, optional
        Number of independent repetitions that are averaged (default 50).
    n_episodes : int, optional
        Episodes per repetition (default 5000).
    horizon : int, optional
        Environment steps per episode (default 20).
    seed : int or None, optional
        When given, seeds both ``random`` (used by the environment) and
        ``numpy.random`` (used for posterior sampling).

    Returns
    -------
    None
        The averaged cumulative-regret curve is drawn with pandas/matplotlib.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    n_states, n_actions = 6, 2
    sigma = 1                          # reward observation noise (std)
    obs_precision = 1.0 / sigma ** 2   # precision contributed by one observation

    env = True_MDP()

    # Benchmark for regret: the best achievable value in the TRUE model, not
    # the optimum of whichever model happened to be sampled.
    _, v_star = DynamicProgramming(true_psample, true_rsample).value_iteration()

    # Pre-allocated results table: rows are episodes, columns are runs.
    regret_table = np.zeros((n_episodes, n_runs))

    for m in range(n_runs):
        # Fresh prior and regret counter so the runs are really independent.
        mu = np.zeros((n_states, n_actions))
        precision = np.ones((n_states, n_actions))
        alpha = np.ones((n_states, n_actions, n_states))
        regret = 0.0

        for episode in range(n_episodes):
            # Keep the local state and the environment's state in sync.
            s = env.reset()

            # One Dirichlet draw per (state, action) so each row is a
            # proper next-state distribution.
            psample = np.zeros((n_states, n_actions, n_states))
            for i in range(n_states):
                for j in range(n_actions):
                    psample[i, j] = np.random.dirichlet(alpha[i, j])

            # np.random.normal takes a standard deviation, i.e. precision ** -0.5.
            rsample = np.random.normal(mu, 1.0 / np.sqrt(precision))

            policy, _ = DynamicProgramming(psample, rsample).value_iteration()

            history = []
            for _step in range(horizon):
                a = int(policy[s])
                s_prime, r = env.swim(a)
                history.append((s, a, r, s_prime))
                s = s_prime

            # Conjugate updates: Dirichlet counts, Normal precision and mean.
            for h_s, h_a, h_r, h_next in history:
                alpha[h_s, h_a, h_next] += 1
                precision[h_s, h_a] += obs_precision
                mu[h_s, h_a] += obs_precision * (h_r - mu[h_s, h_a]) / precision[h_s, h_a]

            V = policy_evaluation(true_psample, true_rsample, policy).value_iteration()
            regret += v_star[0] - V[0]
            regret_table[episode, m] = regret

    df = pd.DataFrame(regret_table)
    ax = df.mean(axis=1).plot()
    ax.set_xlabel("episode")
    ax.set_ylabel("mean cumulative regret")

In [8]:
# Launch the experiment with PSRL's defaults (50 repetitions x 5000 episodes of
# 20 steps each). This is long-running; the output recorded for this cell is a
# KeyboardInterrupt, so the run was presumably stopped by hand before finishing.
PSRL()

KeyboardInterrupt: 