# Knapsack

In [8]:
import numpy as np
import random
n = 100
w_max = 250
values = [random.randint(1,10) for _ in range(n)]
weights = [random.randint(1,10) for _ in range(n)]

In [9]:
def available_actions(state):
  return [0,1]

def reward(state, action):
  if action == 0:
    return 0
  return values[state[0]]

def next_state(state, action):
  item_ind = state[0]
  if action == 0:
    return (item_ind+1, state[1])
  if action == 1:
    return (item_ind+1, state[1] - weights[item_ind])


def terminal_state(state):
  if state[1] < 0:
    return True, (-1000000, -1)
  if state[0] >= len(values):
    return True, (0, -1)
  else:
    return False, ()

cache = {}
def bellman(state):
  is_terminal, term_return = terminal_state(state)
  if is_terminal:
    return term_return
  if state in cache:
    return cache[state]
  best_value = None
  best_action = None
  for action in available_actions(state):
    action_value = reward(state, action) + bellman(next_state(state,action))[0]
    if best_value is None or action_value > best_value:
      best_value = action_value
      best_action = action
  cache[state] = (best_value, best_action)
  return best_value, best_action

In [10]:
start_state = (0, w_max)

state = start_state

actions = []
for _ in range(n):
  best_value, best_action = bellman(state)
  actions.append(best_action)
  state = next_state(state, best_action)

In [11]:
np.array(actions)

array([1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0])

# Egg Drop

In [12]:
def available_actions(state):
  return range(1,state[0] + 1)

def reward(state, action):
  return -1

def next_state(state, action):
  surv_state = (state[0]-action, state[1])
  break_state = (action-1, state[1]-1)
  if bellman(surv_state)[0] < bellman(break_state)[0]:
    return surv_state
  else:
    return break_state


def terminal_state(state):
  if state[0] == 0:
    return True, (0, -1)
  if state[1] == 0:
    return True, (-10000000, -1)
  else:
    return False, ()

cache = {}
def bellman(state):
  is_terminal, term_return = terminal_state(state)
  if is_terminal:
    return term_return
  if state in cache:
    return cache[state]
  best_value = None
  best_action = None
  for action in available_actions(state):
    action_value = reward(state, action) + bellman(next_state(state,action))[0]
    if best_value is None or action_value > best_value:
      best_value = action_value
      best_action = action
  cache[state] = (best_value, best_action)
  return best_value, best_action

In [13]:
start_state = (100, 5)

state = start_state

state_actions = []
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  state_actions.append((state, best_action))
  state = next_state(state, best_action)

state_actions

[((100, 5), 38),
 ((37, 4), 7),
 ((30, 4), 15),
 ((14, 3), 7),
 ((6, 2), 3),
 ((2, 1), 1),
 ((1, 1), 1)]

# Stochastic Egg Drop

In [14]:
def available_actions(state):
  return range(1,state[0] + 1)

def reward(state, action):
  return -1

def possible_states(state, action):
  surv_state = (state[0] - action, state[1])
  break_state = (action - 1, state[1] - 1)
  p_break = action/state[0]
  p_surv = 1 - p_break
  return [(p_surv, surv_state),
          (p_break, break_state)]

def terminal_state(state):
  if state[0] == 0:
    return True, (0, -1)
  if state[1] == 0:
    return True, (-10000000, -1)
  else:
    return False, ()

cache = {}
def bellman(state):
  is_terminal, term_return = terminal_state(state)
  if is_terminal:
    return term_return
  if state in cache:
    return cache[state]
  best_value = None
  best_action = None
  for action in available_actions(state):
    exp_action_value = 0
    for p_state, next_state in possible_states(state, action):
      exp_action_value += p_state*(reward(state, action) + bellman(next_state)[0])
    if best_value is None or exp_action_value > best_value:
      best_value = exp_action_value
      best_action = action
  cache[state] = (best_value, best_action)
  return best_value, best_action

In [15]:
import random
start_state = (100, 5)
state = start_state
state_actions = []
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  state_actions.append((state, best_action))
  states = possible_states(state, best_action)
  state = random.choices([s[1] for s in states],
                          weights=[s[0] for s in states])[0]
#  if bellman(states[0][1])[0] < bellman(states[1][1])[1]:
#    state = states[0][1]
#  else:
#    state = states[1][1]
state_actions

[((100, 5), 49),
 ((48, 4), 23),
 ((25, 4), 12),
 ((11, 3), 6),
 ((5, 3), 3),
 ((2, 3), 1)]

In [16]:
start_state = (100, 5)

state = start_state

actions = []
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  actions.append((state,best_action))
  state = next_state(state, best_action)

In [17]:
state

(0, 3)

In [18]:
actions

[((100, 5), 49),
 ((51, 5), 26),
 ((25, 4), 12),
 ((13, 4), 6),
 ((7, 4), 3),
 ((4, 4), 2),
 ((2, 4), 1),
 ((1, 4), 1)]

# Optimal Replacement

In [19]:
# 0 maintain
# 1 replace

max_miles = 50
gamma = 0.9
threshold = 0.00001
def available_actions(state):
  return [0,1]

def reward(state, action):
  if action == 1:
    return -15
  else:
    return -state

def possible_states(state, action):
  if action == 1:
    state = 0
  if state == max_miles-1:
    return [(1, state+1)]
  if state == max_miles:
    return [(1, state)]
  return [(0.5, state+1), (0.5, state+2)]

values = [(0,0) for _ in range(max_miles + 1)]

it = 0
done = False
while not done:
  it += 1
  new_values = [0 for _ in range(max_miles+1)]
  for state in range(max_miles+1):
    best_value = None
    best_action = None
    for action in available_actions(state):
      exp_action_value = 0
      for p_state, next_state in possible_states(state, action):
        exp_action_value += p_state*(reward(state, action) + gamma*values[next_state][0])
      if best_value is None or exp_action_value > best_value:
        best_value = exp_action_value
        best_action = action
    new_values[state] = best_value, best_action
  max_change = max(abs(v[0] - new_v[0]) for v,new_v in zip(values, new_values))
  if max_change < threshold:
    done = True
  values = new_values


In [20]:
values

[(-47.898352309227604, 0),
 (-51.596584791667766, 0),
 (-54.844217151267486, 0),
 (-57.59265697484717, 0),
 (-59.83895572813033, 0),
 (-61.47807880501428, 0),
 (-62.60850851321201, 0),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62

# Optimal Replacement Policy Iteration

In [21]:
max_miles = 50
gamma = 0.9
threshold = 0.00001
def available_actions(state):
  return [0,1]

def reward(state, action):
  if action == 1:
    return -15
  else:
    return -state

def possible_states(state, action):
  if action == 1:
    state = 0
  if state == max_miles-1:
    return [(1, state+1)]
  if state == max_miles:
    return [(1, state)]
  return [(0.5, state+1), (0.5, state+2)]

policy = [0 for _ in range(max_miles + 1)]
policy_value = [0 for _ in range(max_miles + 1)]

In [22]:
# Policy Evaluation
for _ in range(100):
  for miles in range(max_miles + 1):
    action = policy[miles]
    value_s = 0
    for prob_s, s_prime in possible_states(miles, action):
      value_s += prob_s*(reward(miles, action) + gamma*policy_value[s_prime])
    policy_value[miles] = value_s

# Policy Update
for miles in range(max_miles + 1):
  best_value = None
  best_action = None
  for action in available_actions(miles):
    value_s = 0
    for prob_s, s_prime in possible_states(miles, action):
      value_s += prob_s*(reward(miles, action) + gamma*policy_value[s_prime])
    if best_value is None or value_s > best_value:
      best_value = value_s
      best_action = action
  policy[miles] = best_action


In [23]:
policy_value

[-130.8738215585101,
 -140.57633488882237,
 -150.25733095218752,
 -159.91525340062745,
 -169.5484333152212,
 -179.1550810638273,
 -188.73327756987396,
 -198.28096494961872,
 -207.79593647219983,
 -217.27582579349593,
 -226.71809541126896,
 -236.120024285265,
 -245.47869456187567,
 -254.79097733858964,
 -264.05351739878824,
 -273.2627168423985,
 -282.4147175325629,
 -291.5053822726436,
 -300.53027462181706,
 -309.4846372506007,
 -318.36336873109144,
 -327.1609986478975,
 -335.8716609099597,
 -344.48906512968887,
 -353.0064659367647,
 -361.41663006242027,
 -369.7118010635999,
 -377.883661452766,
 -385.9232921773324,
 -393.8211289842461,
 -401.56691600970237,
 -409.14965522183195,
 -416.5575538440495,
 -423.77796458212026,
 -430.79732856008843,
 -437.6010995556107,
 -444.1736929555734,
 -450.498368434549,
 -456.5572338443793,
 -462.33098061005035,
 -467.7991569773356,
 -472.939306755986,
 -477.7284377935251,
 -482.13963959743137,
 -486.14872898805635,
 -489.72008805055634,
 -492.835594300

# RLlib Optimal Replacement

In [24]:
!pip install ray[rllib]

Collecting ray[rllib]
  Downloading ray-2.9.3-cp310-cp310-win_amd64.whl.metadata (14 kB)
Collecting msgpack<2.0.0,>=1.0.0 (from ray[rllib])
  Downloading msgpack-1.0.7-cp310-cp310-win_amd64.whl.metadata (9.4 kB)
Collecting pyarrow<7.0.0,>=6.0.1 (from ray[rllib])
  Downloading pyarrow-6.0.1-cp310-cp310-win_amd64.whl.metadata (2.9 kB)
Collecting lz4 (from ray[rllib])
  Downloading lz4-4.3.3-cp310-cp310-win_amd64.whl.metadata (3.8 kB)
Collecting scikit-image (from ray[rllib])
  Downloading scikit_image-0.22.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting typer (from ray[rllib])
  Downloading typer-0.9.0-py3-none-any.whl.metadata (14 kB)
Collecting pillow>=9.0.1 (from scikit-image->ray[rllib])
  Downloading pillow-10.2.0-cp310-cp310-win_amd64.whl.metadata (9.9 kB)
Collecting imageio>=2.27 (from scikit-image->ray[rllib])
  Downloading imageio-2.34.0-py3-none-any.whl.metadata (4.9 kB)
Collecting tifffile>=2022.8.12 (from scikit-image->ray[rllib])
  Downloading tifffile-2024.2.12-py3-

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
diffusers 0.6.0 requires Pillow<10.0, but you have pillow 10.2.0 which is incompatible.


In [33]:
import gymnasium as gym
from gymnasium import spaces, vector
import numpy as np
from scipy.stats import binom, nbinom, beta, poisson, gamma, norm, geom

In [34]:
import random

In [35]:
def available_actions(state):
  return [0,1]

def reward(state, action):
  if action == 1:
    return -15
  else:
    return -state

def possible_states(state, action):
  if action == 1:
    state = 0
  if state == max_miles-1:
    return [(1, state+1)]
  if state == max_miles:
    return [(1, state)]
  return [(0.5, state+1), (0.5, state+2)]

class OptReplaceEnv(gym.Env):
  def __init__(self, seed=None):
    self.observation_space = gym.spaces.Discrete(51)
    self.action_space = gym.spaces.Discrete(2)

  def reset(self, seed=None, options=None):
    self.miles = 0
    return self.miles, {}

  def step(self, action):
    if action == 0:
      reward = -self.miles
    else:
      reward = -15
      self.miles = 0
    self.miles += random.choice([1,2])
    return self.miles, reward, False, False, {}

In [36]:
env = OptReplaceEnv()

In [37]:
obs, _ = env.reset()

In [38]:
env.observation_space.sample()

33

In [39]:
env.step(env.action_space.sample())

(1, -15, False, False, {})

In [40]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

ImportError: DLL load failed while importing _fs: The specified procedure could not be found.

In [41]:
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=1)

2024-02-26 12:29:43,660	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.5
Ray version:,2.9.3


In [42]:
config = (PPOConfig()
          .environment(OptReplaceEnv)
          .rollouts(num_rollout_workers=1)
)

stop = {"timesteps_total": 10000}

tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

tuner.fit()

NameError: name 'PPOConfig' is not defined