In [None]:
# Import libraries
import time

import gym
import numpy as np
import pandas as pd
from IPython import display

In [None]:
def value_iteration(env, max_iters, gamma):
    """Estimate state values by repeated synchronous Bellman backups.

    Every sweep recomputes each state's value as the largest expected
    one-step return over all actions, bootstrapping from the value table of
    the previous sweep. Sweeping stops early as soon as the new table is
    element-wise close (``np.isclose`` defaults) to the previous one.

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` such that
            ``P[state][action]`` yields ``(prob, next_state, reward, done)``
            tuples.
        max_iters: Maximum number of sweeps to perform.
        gamma: Discount factor applied to the next state's value.

    Returns:
        A tuple ``(v_values, Converge)``: the array of state values and a
        list holding the sweep index at which convergence was detected
        (empty when ``max_iters`` is exhausted first).
    """
    n_states = env.observation_space.n
    Converge = []
    v_values = np.zeros(n_states)

    for sweep in range(max_iters):
        # Snapshot of the previous sweep; every backup below reads from it.
        previous = np.copy(v_values)

        for s in range(n_states):
            # Expected return of each action. The ``done`` flag of a
            # transition is unpacked but not used: the next state's value
            # is always bootstrapped.
            action_values = [
                sum(
                    prob * (reward + gamma * previous[next_state])
                    for prob, next_state, reward, _done in env.P[s][a]
                )
                for a in range(env.action_space.n)
            ]
            # The state's value is that of its best action.
            v_values[s] = np.max(action_values)

        # Stop once a full sweep no longer changes the table.
        if np.isclose(v_values, previous).all():
            print(f'Converged at {sweep}-th iteration.')
            Converge.append(sweep)
            break

    return v_values, Converge

In [None]:
def policy_extraction(env, v_values, gamma=0.9):
    """Derive the greedy policy implied by a table of state values.

    For every state, the expected one-step return of each action is
    computed from ``env.P`` and ``v_values``; the action with the largest
    return is selected (ties go to the lowest action index, as with
    ``np.argmax``).

    Args:
        env: Environment exposing ``observation_space.n``,
            ``action_space.n`` and a transition table ``P`` such that
            ``P[state][action]`` yields ``(prob, next_state, reward, done)``
            tuples.
        v_values: Sequence of state values indexed by state.
        gamma: Discount factor applied to the next state's value.

    Returns:
        Integer array with one action index per state.
    """
    # ``np.int`` was only an alias of the builtin ``int`` and no longer
    # exists in current NumPy releases; the builtin gives the same dtype.
    policy = np.zeros(env.observation_space.n, dtype=int)

    for state in range(env.observation_space.n):
        # Expected return of each action from this state.
        q_values = []
        for action in range(env.action_space.n):
            q_value = 0
            for prob, next_state, reward, _done in env.P[state][action]:
                q_value += prob * (reward + gamma * v_values[next_state])
            q_values.append(q_value)

        # Act greedily with respect to the computed returns.
        policy[state] = np.argmax(q_values)

    return policy

In [None]:
def play(env, policy):
    """Run a single episode following ``policy`` and return its total reward.

    Args:
        env: Environment whose ``reset()`` returns the initial state and
            whose ``step(action)`` returns
            ``(next_state, reward, done, info)``.
        policy: Indexable mapping from state to action.

    Returns:
        Sum of the rewards collected until ``step`` reports ``done``.
    """
    # Rendering/animation of the episode is intentionally not performed here.
    current = env.reset()
    episode_return = 0
    finished = False

    while not finished:
        current, reward, finished, _info = env.step(policy[current])
        episode_return += reward

    return episode_return

In [None]:
def play_multiple_times(env, policy, max_episodes):
    """Play several episodes and summarise the successful ones.

    An episode counts as a success when its total reward is strictly
    positive. Only the rewards of successful episodes are recorded, so the
    printed mean is the mean over successes, not over all episodes.

    Args:
        env: Environment passed through to ``play``.
        policy: Policy passed through to ``play``.
        max_episodes: Number of episodes to play.

    Returns:
        A tuple ``(success, score)``: the number of successful episodes and
        the list of their total rewards.
    """
    score = []
    for _ in range(max_episodes):
        reward = play(env, policy)
        if reward > 0:
            score.append(reward)

    success = len(score)
    score = np.array(score)
    # With no successful episode np.mean would warn about an empty slice;
    # report NaN directly (the printed text is unchanged).
    mean_score = np.mean(score) if score.size else float('nan')

    print(f'Number of successes: {success}/{max_episodes}')
    print(f'mean score: {mean_score}')
    return success, list(score)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt


# Cách thực hiện taxi v3

In [None]:
# Taxi-v3 experiment: solve the environment once, then evaluate the greedy
# policy in 50 batches of 1000 episodes and plot the distribution of the
# scores of the successful episodes.

# Planning reads only env.P, so it is done once instead of once per batch.
# NOTE(review): assumes env.P is identical for every gym.make('Taxi-v3')
# call -- confirm.
env = gym.make('Taxi-v3')  # load the environment
v_values, Converge = value_iteration(env, max_iters=1000, gamma=0.9)
policy = policy_extraction(env, v_values, gamma=0.9)

arr_Score = []
for _ in range(50):
    success, score = play_multiple_times(env, policy, 1000)
    arr_Score += score
arr_Score = np.array(arr_Score)

# The Series name becomes the x-axis label of the plot.
x = pd.Series(arr_Score, name="Score Distribution Taxi v3 ")
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot once the installed seaborn version is confirmed.
ax = sns.distplot(x)

Converged at 116-th iteration.
Number of successes: 1000/1000
mean score: 7.878
Converged at 116-th iteration.
Number of successes: 1000/1000
mean score: 8.033


KeyboardInterrupt: ignored

# Cách thực hiện Frozen Lake

In [None]:
# FrozenLake8x8-v0 experiment: solve the environment once, then evaluate the
# greedy policy in 50 batches of 1000 episodes and plot the distribution of
# the per-batch success counts.

# Planning reads only env.P, so it is done once instead of once per batch.
# NOTE(review): assumes env.P is identical for every
# gym.make('FrozenLake8x8-v0') call (default map) -- confirm.
env = gym.make('FrozenLake8x8-v0')  # load the environment
v_values, Converge = value_iteration(env, max_iters=1000, gamma=0.9)
policy = policy_extraction(env, v_values, gamma=0.9)

arr_Success = []
for _ in range(50):
    success, score = play_multiple_times(env, policy, 1000)
    arr_Success.append(success)
arr_Success = np.array(arr_Success)

# The Series name becomes the x-axis label of the plot; it names the map
# size so this figure is distinguishable from the FrozenLake-v0 one.
x = pd.Series(arr_Success, name="Success Distribution Frozen Lake 8x8")
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot once the installed seaborn version is confirmed.
ax = sns.distplot(x)

Converged at 117-th iteration.
Number of successes: 725/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 738/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 761/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 749/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 762/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 754/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 735/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 712/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 732/1000
mean score: 1.0
Converged at 117-th iteration.
Number of successes: 735/1000
mean score: 1.0
Converged at 117-th iteration.


KeyboardInterrupt: ignored

In [None]:
# FrozenLake-v0 experiment: solve the environment once, then evaluate the
# greedy policy in 50 batches of 1000 episodes and plot the distribution of
# the per-batch success counts.

# Planning reads only env.P, so it is done once instead of once per batch.
# NOTE(review): assumes env.P is identical for every
# gym.make('FrozenLake-v0') call (default map) -- confirm.
env = gym.make('FrozenLake-v0')  # load the environment
v_values, Converge = value_iteration(env, max_iters=1000, gamma=0.9)
policy = policy_extraction(env, v_values, gamma=0.9)

arr_Success = []
for _ in range(50):
    success, score = play_multiple_times(env, policy, 1000)
    arr_Success.append(success)
arr_Success = np.array(arr_Success)

# The Series name becomes the x-axis label of the plot; it names the
# environment so this figure is distinguishable from the 8x8 one.
x = pd.Series(arr_Success, name="Success Distribution Frozen Lake (FrozenLake-v0)")
# NOTE(review): sns.distplot is deprecated in recent seaborn releases;
# consider histplot/displot once the installed seaborn version is confirmed.
ax = sns.distplot(x)

Converged at 79-th iteration.
Number of successes: 731/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 732/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 723/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 725/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 748/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 751/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 767/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 750/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 748/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 755/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 739/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 742/1000
mean score: 1.0
Converged at 79-th iteration.
Number of successes: 727/1000
mean score: 1.0
Converged at

NameError: ignored