# Model-Free prediction

## Monte Carlo

In [None]:
import random
# Problem instance for the stochastic knapsack.
n = 10       # number of items
w_max = 25   # weight budget available in the start state (0, w_max)
# Three independent lists of n integers drawn uniformly from 1..10, generated
# in this order: the value gained by adding each item, then the two weights
# the item may turn out to have.
values, weights1, weights2 = (
    [random.randint(1, 10) for _ in range(n)] for _ in range(3)
)

# state = (item #, weight_remaining)
def available_actions(state):
  """Return the actions that can be taken in `state`.

  The same two actions are offered in every state: 0 (skip the current item)
  and 1 (add it). `state` is accepted but not inspected.
  """
  skip_item, add_item = 0, 1
  return [skip_item, add_item]

def reward(state, action):
  """Immediate reward for taking `action` in `state`.

  Skipping (action 0) earns nothing; any other action earns the value of the
  current item, values[state[0]].
  """
  return 0 if action == 0 else values[state[0]]

def possible_states(state, action):
  """Return the successor distribution for taking `action` in `state`.

  Args:
    state: (item index, remaining weight) tuple.
    action: 0 to skip the current item, 1 to add it.

  Returns:
    A list of (probability, next_state) pairs. Skipping moves on to the next
    item with the remaining weight unchanged. Adding moves on to the next
    item and subtracts either weights1[item] or weights2[item] from the
    remaining weight, each with probability 0.5.

  Raises:
    ValueError: if `action` is neither 0 nor 1.
  """
  item_ind, weight_left = state[0], state[1]
  next_item = item_ind + 1
  if action == 0:
    return [(1, (next_item, weight_left))]
  if action == 1:
    return [(0.5, (next_item, weight_left - weights1[item_ind])),
            (0.5, (next_item, weight_left - weights2[item_ind]))]
  # Fail loudly instead of implicitly returning None, which callers would
  # then try to iterate over.
  raise ValueError(f'unknown action: {action!r}')


def terminal_state(state):
  """Tell whether `state` ends the episode and, if so, what it is worth.

  Returns a pair (is_terminal, result):
    * remaining weight below zero -> (True, (-1000000, -1)): the budget was
      exceeded, which is punished with a large negative value;
    * item index past the last item -> (True, (0, -1)): nothing left to decide;
    * otherwise -> (False, ()).
  For terminal states `result` has the same (value, action) shape that
  bellman() returns, with -1 in the action slot.
  """
  if state[1] < 0:
    return True, (-1000000, -1)
  if state[0] < len(values):
    return False, ()
  return True, (0, -1)

# Memo table filled by bellman(): non-terminal state -> (value, action).
cache = {}
def bellman(state):
  """Solve the Bellman optimality equation for `state` by memoised recursion.

  Returns a (value, action) pair: the best expected total reward obtainable
  from `state` and the action that achieves it. Terminal states return the
  pair supplied by terminal_state() and are never stored in `cache`.
  Ties between actions keep the action listed first by available_actions().
  """
  done, terminal_result = terminal_state(state)
  if done:
    return terminal_result
  memoised = cache.get(state)
  if memoised is not None:
    return memoised
  top_value = None
  top_action = None
  for candidate in available_actions(state):
    # Expectation over successors of (immediate reward + value of successor).
    expected = sum(
        prob * (reward(state, candidate) + bellman(successor)[0])
        for prob, successor in possible_states(state, candidate))
    # Strict comparison: an equally good later action does not replace it.
    if top_value is None or expected > top_value:
      top_value, top_action = expected, candidate
  cache[state] = (top_value, top_action)
  return top_value, top_action

In [None]:
# Solve from the start state (first item, full weight budget); the result is
# the (value, action) pair returned by bellman().
bellman((0,w_max))

(33.71875, 1)

In [None]:
# One (value, weight1, weight2) row per item.
list(zip(values, weights1, weights2))

[(4, 2, 5),
 (7, 1, 7),
 (1, 6, 3),
 (1, 3, 10),
 (4, 3, 2),
 (9, 10, 5),
 (9, 5, 2),
 (4, 6, 3),
 (4, 7, 5),
 (2, 2, 4)]

In [None]:
# Simulate one episode from the start state, always taking the action that
# bellman() reports as best, and record each (state, action) decision.
state = (0, w_max)
state_actions = []
while True:
  finished, _ = terminal_state(state)
  if finished:
    break
  best_value, best_action = bellman(state)
  state_actions.append((state, best_action))
  # Sample the successor according to the transition probabilities.
  probs, successors = zip(*possible_states(state, best_action))
  state = random.choices(successors, weights=probs)[0]
for s, a in state_actions:
  print(f'item#:{s[0]}\t Weight:{s[1]}\t add?:{a}')

item#:0	 Weight:25	 add?:1
item#:1	 Weight:20	 add?:1
item#:2	 Weight:13	 add?:0
item#:3	 Weight:13	 add?:0
item#:4	 Weight:13	 add?:1
item#:5	 Weight:11	 add?:1
item#:6	 Weight:6	 add?:1
item#:7	 Weight:4	 add?:0
item#:8	 Weight:4	 add?:0
item#:9	 Weight:4	 add?:1


In [None]:
# Show the memo table built by bellman(): state -> (value, action).
cache

{(9, 25): (2.0, 1),
 (9, 18): (2.0, 1),
 (9, 20): (2.0, 1),
 (8, 25): (6.0, 1),
 (9, 19): (2.0, 1),
 (9, 12): (2.0, 1),
 (9, 14): (2.0, 1),
 (8, 19): (6.0, 1),
 (9, 22): (2.0, 1),
 (9, 15): (2.0, 1),
 (9, 17): (2.0, 1),
 (8, 22): (6.0, 1),
 (7, 25): (10.0, 1),
 (9, 13): (2.0, 1),
 (8, 20): (6.0, 1),
 (9, 7): (2.0, 1),
 (9, 9): (2.0, 1),
 (8, 14): (6.0, 1),
 (9, 10): (2.0, 1),
 (8, 17): (6.0, 1),
 (7, 20): (10.0, 1),
 (9, 23): (2.0, 1),
 (9, 16): (2.0, 1),
 (8, 23): (6.0, 1),
 (7, 23): (10.0, 1),
 (6, 25): (19.0, 1),
 (9, 8): (2.0, 1),
 (8, 15): (6.0, 1),
 (9, 2): (0, 0),
 (9, 4): (2.0, 1),
 (8, 9): (5.0, 1),
 (9, 5): (2.0, 1),
 (8, 12): (6.0, 1),
 (7, 15): (9.5, 1),
 (9, 3): (0, 0),
 (8, 10): (5.0, 1),
 (8, 4): (2.0, 0),
 (9, 0): (0, 0),
 (8, 7): (4.0, 1),
 (7, 10): (7.0, 1),
 (9, 6): (2.0, 1),
 (8, 13): (6.0, 1),
 (7, 13): (8.5, 1),
 (6, 15): (16.75, 1),
 (9, 11): (2.0, 1),
 (8, 18): (6.0, 1),
 (7, 18): (10.0, 1),
 (6, 20): (18.75, 1),
 (5, 25): (26.75, 1),
 (8, 16): (6.0, 1),
 (7, 22

In [None]:
# Optimal policy: keep only the action from each memoised (value, action) pair.
opt_policy = {solved_state: value_action[1]
              for solved_state, value_action in cache.items()}

## Monte Carlo Policy Evaluation

In [None]:
# Random policy over every (item, remaining weight) pair: skip (0) whenever the
# heavier of the item's two possible weights exceeds the remaining weight,
# otherwise pick 0 or 1 at random.
policy = {}
for item in range(n):
  heaviest = max(weights1[item], weights2[item])
  for weight in range(w_max + 1):
    can_always_fit = heaviest <= weight
    policy[(item, weight)] = random.randint(0, 1) if can_always_fit else 0

In [None]:
# Monte Carlo statistics, keyed by state:
#   state -> [N, TotalReward]
# where N is the number of visits and TotalReward is the sum, over those
# visits, of the rewards collected from that state to the end of the episode.
mc_cache = {}

In [None]:
# Monte Carlo evaluation of opt_policy: run 10000 episodes and accumulate, for
# every visited state, its visit count and the total reward collected from
# that state onwards.
for _ in range(10000):
  state = (0, w_max)
  state_actions_reward = []
  while not terminal_state(state)[0]:
    stats = mc_cache.setdefault(state, [0, 0])
    action = opt_policy[state]
    r = reward(state, action)
    # Count this visit and credit the state with its own immediate reward.
    stats[0] += 1
    stats[1] += r
    # Every state seen earlier in this episode also earns this reward.
    for earlier_state, _earlier_action, _earlier_reward in state_actions_reward:
      mc_cache[earlier_state][1] += r
    state_actions_reward.append((state, action, r))
    # Sample the successor according to the transition probabilities.
    probs, successors = zip(*possible_states(state, action))
    state = random.choices(successors, weights=probs)[0]

In [None]:
# Show the (state, action, reward) trajectory of the last simulated episode.
state_actions_reward

[((0, 25), 0, 0),
 ((1, 25), 0, 0),
 ((2, 25), 0, 0),
 ((3, 25), 0, 0),
 ((4, 25), 1, 4),
 ((5, 22), 1, 9),
 ((6, 12), 1, 9),
 ((7, 7), 0, 0),
 ((8, 7), 0, 0),
 ((9, 7), 1, 2)]

In [None]:
# Show the raw Monte Carlo statistics: state -> [visit count, total reward].
mc_cache

{(0, 25): [10000, 337648],
 (1, 20): [4993, 142093],
 (2, 13): [2534, 45248],
 (3, 13): [2534, 45248],
 (4, 13): [2534, 45248],
 (5, 11): [1243, 17553],
 (6, 1): [607, 0],
 (7, 1): [1836, 0],
 (8, 1): [2155, 0],
 (9, 1): [2155, 0],
 (1, 23): [5007, 155555],
 (2, 16): [2436, 50482],
 (3, 16): [2436, 50482],
 (4, 16): [2436, 50482],
 (5, 16): [3648, 75754],
 (6, 6): [2462, 24624],
 (7, 4): [1565, 3130],
 (8, 4): [2182, 4364],
 (9, 4): [2919, 5838],
 (6, 11): [1822, 24664],
 (7, 9): [1276, 6408],
 (8, 9): [1436, 7218],
 (2, 19): [2459, 61894],
 (3, 19): [2459, 61894],
 (4, 19): [2459, 61894],
 (5, 17): [1247, 26786],
 (6, 12): [617, 9243],
 (7, 7): [643, 3220],
 (2, 22): [2571, 70024],
 (3, 22): [2571, 70024],
 (4, 22): [2571, 70024],
 (5, 19): [1279, 29222],
 (6, 9): [650, 8108],
 (7, 6): [870, 3480],
 (8, 3): [760, 0],
 (9, 3): [837, 0],
 (8, 0): [1401, 0],
 (9, 0): [1644, 0],
 (5, 10): [1291, 17559],
 (6, 5): [660, 5940],
 (7, 0): [976, 0],
 (5, 20): [1292, 30518],
 (6, 15): [647, 1084

In [None]:
# Monte Carlo value estimate per state: total reward divided by visit count.
for visited_state, (visits, total_reward) in mc_cache.items():
  print(visited_state, total_reward / visits)

(0, 25) 33.7648
(1, 20) 28.458441818545964
(2, 13) 17.85635359116022
(3, 13) 17.85635359116022
(4, 13) 17.85635359116022
(5, 11) 14.121480289621882
(6, 1) 0.0
(7, 1) 0.0
(8, 1) 0.0
(9, 1) 0.0
(1, 23) 31.067505492310765
(2, 16) 20.723316912972084
(3, 16) 20.723316912972084
(4, 16) 20.723316912972084
(5, 16) 20.765899122807017
(6, 6) 10.001624695369618
(7, 4) 2.0
(8, 4) 2.0
(9, 4) 2.0
(6, 11) 13.536772777167947
(7, 9) 5.021943573667712
(8, 9) 5.0264623955431755
(2, 19) 25.17039446929646
(3, 19) 25.17039446929646
(4, 19) 25.17039446929646
(5, 17) 21.480352846832396
(6, 12) 14.980551053484604
(7, 7) 5.0077760497667185
(2, 22) 27.23609490470634
(3, 22) 27.23609490470634
(4, 22) 27.23609490470634
(5, 19) 22.847537138389367
(6, 9) 12.473846153846154
(7, 6) 4.0
(8, 3) 0.0
(9, 3) 0.0
(8, 0) 0.0
(9, 0) 0.0
(5, 10) 13.601084430673897
(6, 5) 9.0
(7, 0) 0.0
(5, 20) 23.620743034055728
(6, 15) 16.75579598145286
(7, 10) 7.044045676998369
(6, 14) 15.267090620031796
(7, 12) 7.60655737704918
(9, 2) 0.0
(

## TD(0)

In [None]:
# TD(0) evaluation of opt_policy.
# td_cache maps state -> current value estimate; states not seen yet count as 0.
td_cache = {}
alpha = 0.01  # step size of the TD update
for _ in range(100000):
  state = (0, w_max)
  # NOTE: td_cache[state] must not be reset here. The estimates have to
  # persist across episodes; resetting the start state each time was a bug.
  state_actions_reward = []
  while not terminal_state(state)[0]:
    action = opt_policy[state]
    r = reward(state, action)
    # Record the step taken in this episode.
    state_actions_reward.append((state, action, r))
    # Sample the successor according to the transition probabilities.
    probs, successors = zip(*possible_states(state, action))
    next_state = random.choices(successors, weights=probs)[0]

    # Move the estimate a fraction alpha towards the one-step target
    # (reward + estimate of the successor).
    current_estimate = td_cache.get(state, 0)
    td_target = r + td_cache.get(next_state, 0)
    td_cache[state] = current_estimate + alpha * (td_target - current_estimate)
    state = next_state


In [None]:
# Show the TD(0) value estimates: state -> estimated value.
td_cache

{(0, 25): 33.768606322549196,
 (1, 23): 31.313217565965772,
 (2, 16): 20.773975342438227,
 (3, 16): 20.79100155835655,
 (4, 16): 20.794893597156932,
 (5, 16): 20.655913587740336,
 (6, 6): 10.05867681881975,
 (7, 4): 1.9999999999999667,
 (8, 4): 1.9999999999999778,
 (9, 4): 1.999999999999989,
 (6, 11): 13.528255282951477,
 (7, 6): 3.999999999999978,
 (8, 0): 0.0,
 (9, 0): 0.0,
 (2, 22): 27.363951924768045,
 (3, 22): 27.34984203618286,
 (4, 22): 27.31075857027004,
 (5, 20): 23.765496185576175,
 (6, 10): 12.476990389418393,
 (7, 8): 5.062798640347908,
 (8, 2): 0.0,
 (9, 2): 0.0,
 (1, 20): 28.15584214655052,
 (2, 19): 25.115405028912136,
 (3, 19): 25.152019259694836,
 (4, 19): 25.20731360008003,
 (5, 17): 21.716490734415114,
 (6, 12): 14.958020065705785,
 (7, 7): 5.094352340081348,
 (2, 13): 17.629294773614912,
 (3, 13): 17.570440155509072,
 (4, 13): 17.506731623244896,
 (5, 11): 13.584458380167312,
 (8, 3): 0.0,
 (9, 3): 0.0,
 (7, 1): 0.0,
 (8, 1): 0.0,
 (9, 1): 0.0,
 (5, 10): 13.57686421