## I re-used my homework for Prof. Tran's AE598RL course last term. The original code is here: https://github.com/uiuc-ae598-rl-2023-spring/hw1-dp-LXYYY.git

In [1]:
import numpy as np

In [8]:
def coin_game_DP():
    """Solve the 10-flip coin game by backward induction and print the value table.

    A state is ``(n, h)``: ``n`` flips already made, ``h`` heads seen so far.
    The game is won only with exactly 8 heads after 10 flips. Coin ``'A'``
    always adds a head; coin ``'B'`` adds a head with probability 0.5.

    Returns:
        A tuple ``(win_probability, path)`` where ``win_probability`` is the
        table value at state ``(0, 0)`` and ``path`` is a list of 10 decisions
        (``'A'`` or ``'B'``) read from the policy table. The path is traced
        along the single trajectory in which a ``'B'`` flip leaves the head
        count unchanged; it is not the full state-dependent policy.
    """
    total_flips = 10
    winning_heads = 8
    columns = 12  # head counts 0..11; columns above 8 stay at 0 (losing)

    # value[n][h]: best win probability from state (n, h).
    # choice[n][h]: coin picked in that state ('' where never filled).
    value = [[0] * columns for _ in range(total_flips + 1)]
    choice = [[''] * columns for _ in range(total_flips + 1)]

    # Terminal row: only exactly 8 heads wins; every other entry is already 0.
    value[total_flips][winning_heads] = 1

    # Sweep backwards over the number of flips made; each row only reads row n + 1.
    for n in reversed(range(total_flips)):
        for h in reversed(range(winning_heads + 1)):
            after_head = value[n + 1][h + 1]
            after_tail = value[n + 1][h]

            win_with_a = after_head
            win_with_b = 0.5 * after_head + 0.5 * after_tail

            # Ties go to coin B, since A is chosen only when strictly better.
            if win_with_a > win_with_b:
                value[n][h], choice[n][h] = win_with_a, 'A'
            else:
                value[n][h], choice[n][h] = win_with_b, 'B'

    # Read one decision per flip, advancing the head count only on coin A.
    path = []
    heads = 0
    for n in range(total_flips):
        decision = choice[n][heads]
        path.append(decision)
        if decision == 'A':
            heads += 1

    print(value)
    return value[0][0], path


def simulate_policy(policy):
    """Return the exact win probability of a fixed sequence of coin choices.

    The game is won when exactly 8 heads have been collected after 10 flips.
    Coin ``'A'`` always yields a head; coin ``'B'`` yields a head with
    probability 0.5. The sequence is open-loop: the choice for flip ``n`` is
    ``policy[n]`` regardless of earlier outcomes.

    Args:
        policy: Indexable sequence whose first 10 entries are ``'A'`` or
            ``'B'``. Any other entry leaves the head count unchanged for that
            flip.

    Returns:
        Probability (float) of finishing with exactly 8 heads.

    Raises:
        IndexError: If ``policy`` has fewer than 10 entries.
    """
    num_flips = 10
    target_heads = 8

    # heads_dist[h] is the probability of holding exactly h heads so far.
    # Mass that would move past the target is dropped: the head count never
    # decreases, so those outcomes are already lost.
    heads_dist = [0.0] * (target_heads + 1)
    heads_dist[0] = 1.0

    for n in range(num_flips):
        decision = policy[n]
        if decision == 'A':
            # Guaranteed head: shift all mass up by one head.
            heads_dist = [0.0] + heads_dist[:-1]
        elif decision == 'B':
            # Fair coin: half the mass stays, half moves up by one head.
            moved_up = [0.0] + heads_dist[:-1]
            heads_dist = [
                0.5 * stay + 0.5 * up for stay, up in zip(heads_dist, moved_up)
            ]

    return heads_dist[target_heads]


# Solve the game, then report the DP value, the extracted decision sequence,
# and the probability simulate_policy computes for that sequence.
probability, policy_path = coin_game_DP()
print(f"Optimal policy's win probability: {probability}")
print(f"Policy sequence: {policy_path}")
# NOTE: policy_path is one fixed sequence read off the policy table, while the
# DP value belongs to the state-dependent policy, so the two numbers are not
# guaranteed to agree (the recorded run printed 0.25 against 0.9892578125).
print(f"Simulated win probability: {simulate_policy(policy_path)}")



[[0.9892578125, 0.9453125, 0.828125, 0.623046875, 0.376953125, 0.171875, 0.0546875, 0.0107421875, 0.0009765625, 0, 0, 0], [0.998046875, 0.98046875, 0.91015625, 0.74609375, 0.5, 0.25390625, 0.08984375, 0.01953125, 0.001953125, 0, 0, 0], [1, 0.99609375, 0.96484375, 0.85546875, 0.63671875, 0.36328125, 0.14453125, 0.03515625, 0.00390625, 0, 0, 0], [0.0, 1, 0.9921875, 0.9375, 0.7734375, 0.5, 0.2265625, 0.0625, 0.0078125, 0, 0, 0], [0.0, 0.0, 1, 0.984375, 0.890625, 0.65625, 0.34375, 0.109375, 0.015625, 0, 0, 0], [0.0, 0.0, 0.0, 1, 0.96875, 0.8125, 0.5, 0.1875, 0.03125, 0, 0, 0], [0.0, 0.0, 0.0, 0.0, 1, 0.9375, 0.6875, 0.3125, 0.0625, 0, 0, 0], [0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.875, 0.5, 0.125, 0, 0, 0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.75, 0.25, 0, 0, 0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1, 0.5, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]]
Optimal policy's win probability: 0.9892578125
Policy sequence: ['B', 'B', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A']
Simulated win probability: 0.25


In [1]:
from good_bad import GoodBad

# Build the environment with its default constructor arguments.
env = GoodBad()

# Alternative solvers are kept commented out; only one `learn` is imported at a time.
# from models.policy_iteration.policy_iteration import learn
from models.value_iteration.value_iteration import learn

# from models.q_learning.q_learning import learn
# Train with the value-iteration `learn`; the result is bound to `model`.
# NOTE(review): epsilon and alpha are both passed here — confirm which of them
# the value-iteration implementation actually uses.
model = learn(env, scene="good_bad", max_it=1000, gamma=0.9, epsilon=1e-3, alpha=0.1)

delta: 5.434970448421403
Learning finished after 0 epochs, converged False, delta 5.434970448421403
delta: 4.709624691084699
Learning finished after 1 epochs, converged False, delta 4.709624691084699
delta: 4.471906384802198
Learning finished after 2 epochs, converged False, delta 4.471906384802198
delta: 4.246186910029309
Learning finished after 3 epochs, converged False, delta 4.246186910029309
delta: 4.03186062574558
Learning finished after 4 epochs, converged False, delta 4.03186062574558
delta: 3.828352460661069
Learning finished after 5 epochs, converged False, delta 3.828352460661069
delta: 3.6351163702092
Learning finished after 6 epochs, converged False, delta 3.6351163702092
delta: 3.4516338714228887
Learning finished after 7 epochs, converged False, delta 3.4516338714228887
delta: 3.2774126517628233
Learning finished after 8 epochs, converged False, delta 3.2774126517628233
delta: 3.111985248165091
Learning finished after 9 epochs, converged False, delta 3.111985248165091
de

In [10]:
# Reset the environment and keep the returned value as the current state.
s = env.reset()

# Upper bound on the number of steps taken by each simulation loop.
max_sim_steps = 1000000

from models.base_model import ModelBasedAlg


def get_action_from_belief(model: ModelBasedAlg, b, s, random):
    """Choose an action from the current belief ``b`` in state ``s``.

    Args:
        model: Object providing ``get_values``, ``get_policy`` and
            ``update_belief``.
        b: Two-component belief; used as the sampling weights when ``random``
            is truthy. Presumably a probability vector over two hypotheses —
            confirm against ``update_belief``.
        s: Current state, forwarded to ``model.update_belief``.
        random: If truthy, sample one of ``model.get_policy(0)`` and
            ``model.get_policy(1)`` with weights ``b``. Otherwise pick
            greedily among actions 0, 1 and 2.

    Returns:
        The sampled policy output when ``random`` is truthy. Otherwise the
        first action whose belief-weighted value is strictly the largest, or
        ``None`` if no action's value exceeds negative infinity.
    """
    # Values are fetched up front in both modes, matching the call order on model.
    values_0 = model.get_values(0)
    values_1 = model.get_values(1)

    if random:
        candidates = [model.get_policy(0), model.get_policy(1)]
        return np.random.choice(candidates, p=b)

    # Greedy mode: score each action by the value weighted with the belief
    # that results from taking it.
    best_action = None
    best_value = -np.inf
    for action in [0, 1, 2]:
        belief_after = model.update_belief(s, b, action)
        expected = belief_after[0] * values_0 + belief_after[1] * values_1
        if expected > best_value:
            best_value = expected
            best_action = action
    return best_action


# Rollout (1): greedy action selection (random=False), starting from belief [1, 0].
r_sum = 0
b = np.asarray([1, 0])
for i in range(max_sim_steps):
    a = get_action_from_belief(model, b, s, random=False)
    # The belief is updated with the state observed before the environment step.
    b = model.update_belief(s, b, a)
    s1, r, done = env.step(a)
    r_sum += r
    s = s1
    if done:
        break
print(f"Total reward with policy (1): {r_sum}")

# Rollout (2): same loop after a fresh reset, with the action sampled according
# to the belief (random=True).
s = env.reset()
r_sum = 0
b = np.asarray([1, 0])
for i in range(max_sim_steps):
    a = get_action_from_belief(model, b, s, random=True)
    b = model.update_belief(s, b, a)
    s1, r, done = env.step(a)
    r_sum += r
    s = s1
    if done:
        break
print(f"Total reward with policy (2): {r_sum}")

Total reward with policy (1): -1000000


KeyboardInterrupt: 

In [2]:
from models.q_learning.q_learning import learn as q_learn
from good_bad import GoodBad

# NOTE(review): the meaning of the positional argument 100 is defined by
# GoodBad's constructor, which is not shown here — confirm.
env = GoodBad(100)
# Train with Q-learning; the result replaces any earlier `model`.
model = q_learn(env, scene="good_bad", max_it=1000, gamma=0.9, epsilon=0.3, alpha=0.1)

Episode 0 of 1000 finished
Episode 100 of 1000 finished
Episode 200 of 1000 finished
Episode 300 of 1000 finished
Episode 400 of 1000 finished
Episode 500 of 1000 finished
Episode 600 of 1000 finished
Episode 700 of 1000 finished
Episode 800 of 1000 finished
Episode 900 of 1000 finished
[[74.55007247 87.3952746  80.92021661]
 [73.03538937 69.23336605 71.97633751]]
[1 0]


In [1]:
# Roll out the learned policy, indexing it directly by the environment state.
# NOTE: this cell relies on `env`, `model`, `max_sim_steps` and `np` defined in
# other cells; the recorded run failed with NameError because `env` was missing.
s = env.reset()
r_sum = 0
# NOTE(review): `b` is assigned but never read in this cell.
b = np.asarray([1, 0])
for i in range(max_sim_steps):
    a = model.get_policy(s)
    s1, r, done = env.step(a)
    r_sum += r
    s = s1
    if done:
        break
print(f"Total reward with policy (1): {r_sum}")

NameError: name 'env' is not defined

In [1]:
from mountain_gridworld import MountainGridWorld

# Build the environment with its default constructor arguments.
env = MountainGridWorld()

from models.q_learning.q_learning import learn as q_learn
# PRECISE is imported alongside NOISY, but only NOISY is used in this cell.
from models.q_learning.q_learning import PRECISE, NOISY

# Train with Q-learning using the NOISY observation mode.
model = q_learn(env, scene="mountain_gridworld", max_it=100000, gamma=0.9, epsilon=0.3, alpha=0.1, obs_mode=NOISY)
# Print every 10th row of the Q table (all columns).
print(model.Q[::10, :])

Episode 0 of 100000 finished
Episode 10000 of 100000 finished
Episode 20000 of 100000 finished
Episode 30000 of 100000 finished
Episode 40000 of 100000 finished
Episode 50000 of 100000 finished
Episode 60000 of 100000 finished
Episode 70000 of 100000 finished
Episode 80000 of 100000 finished
Episode 90000 of 100000 finished
[[ 2.88625420e-01  8.09418006e-01  6.74802348e-01  8.66012832e-01]
 [ 7.00683109e-01  2.28076797e-01  3.82835118e-01  6.92589967e-01]
 [ 7.56250717e-01  4.22848348e-01  5.00436106e-01  8.75481984e-01]
 ...
 [ 3.12890252e-01  5.88496802e-01  6.49462707e-01  7.54496301e-02]
 [ 3.04901714e+00  1.28318460e+02  1.38134443e+02 -9.98051814e-01]
 [-1.33751822e-01  1.23550646e+02  4.43318929e+01  5.93982330e+01]]
(490, 4)
[3 0 3 0 0 2 3 3 1 3 2 1 1 0 3 2 3 3 3 3 3 3 1 3 3 0 3 3 3 3 2 2 2 1 3 3 2
 1 3 2 0 1 0 0 3 3 3 2 1 0 3 3 0 3 3 0 1 1 3 1 3 3 0 0 3 1 2 2 2 3 0 1 2 1
 3 0 0 0 2 3 3 1 1 1 0 3 2 3 2 3 3 2 2 3 1 1 3 3 3 0 3 1 1 1 1 0 3 1 3 3 0
 3 3 0 2 1 3 3 3 1 3 3 0 1 3 3 2

In [8]:
# DISCRETIZATION_RESOLUTION is imported but not referenced in this cell.
from models.q_learning.q_learning import discretize_state_and_confidence, DISCRETIZATION_RESOLUTION

# test
# Run the learned policy for at most 100 steps, recording the visited states.
s = env.reset()
# Before the first step the observation is the reset state and confidence is 1.
obs = s
confidence = 1
r_sum = 0
trajectory = [s]
for i in range(100):
    # The policy is indexed by the discretized (observation, confidence) pair,
    # not by the state `s` itself.
    s_obs_and_confidence = discretize_state_and_confidence(obs, confidence)
    a = model.get_policy(s_obs_and_confidence)
    # With the extra argument True, step is unpacked into five values here,
    # including the next observation and its confidence.
    s1, r, done, obs, confidence = env.step(a, True)
    r_sum += r
    s = s1
    trajectory.append(s)
    if done:
        break
print(f"Total reward with policy (1): {r_sum}")
print(trajectory)

from state 8 action 3 to state 15 reward -1 done False confidence 0.86
from state 15 action 3 to state 22 reward -1 done False confidence 0.5
from state 22 action 3 to state 23 reward -1 done False confidence 0.5
from state 23 action 3 to state 30 reward -1 done False confidence 0.77
from state 30 action 2 to state 31 reward -1 done False confidence 0.95
from state 31 action 2 to state 32 reward -1 done False confidence 0.77
from state 32 action 2 to state 31 reward -1 done False confidence 0.95
from state 31 action 2 to state 32 reward -1 done False confidence 0.77
from state 32 action 2 to state 33 reward -1 done False confidence 0.6799999999999999
from state 33 action 2 to state 32 reward -1 done False confidence 0.77
from state 32 action 2 to state 31 reward -1 done False confidence 0.95
from state 31 action 2 to state 30 reward -1 done False confidence 0.77
from state 30 action 2 to state 29 reward -1 done False confidence 0.6799999999999999
from state 29 action 3 to state 28 rewa