# Knapsack

In [None]:
import random
# Fixed seed so the randomly generated instance (and every result derived from
# it) is the same on each Restart & Run All.
SEED = 0
random.seed(SEED)

n = 100  # number of candidate items
w_max = 250  # total weight the knapsack can hold
# Item i has value values[i] and weight weights[i], each drawn uniformly from 1..10.
values = [random.randint(1,10) for _ in range(n)]
weights = [random.randint(1,10) for _ in range(n)]

In [None]:
def available_actions(state):
  """Return the two choices for the current item: 0 (skip it) or 1 (take it)."""
  choices = [0, 1]
  return choices

def reward(state, action):
  """Immediate payoff of `action` in `state` = (item index, remaining capacity).

  Skipping the item (action 0) pays nothing; any other action pays the value
  of the current item.
  """
  return 0 if action == 0 else values[state[0]]

def next_state(state, action):
  """Advance to the next item, charging the current item's weight if it is taken.

  `state` is (item index, remaining capacity). Action 1 takes the item,
  action 0 skips it; any other action yields None.
  """
  current = state[0]
  if action == 1:
    remaining = state[1] - weights[current]
    return (current + 1, remaining)
  if action == 0:
    return (current + 1, state[1])
  return None


def terminal_state(state):
  """Report whether `state` ends the episode and, if so, its (value, action).

  Returns (True, (-1000000, -1)) when the remaining capacity is negative,
  (True, (0, -1)) when every item has been considered, and (False, ())
  otherwise. The capacity check comes first.
  """
  overweight = state[1] < 0
  if overweight:
    return True, (-1000000, -1)
  out_of_items = state[0] >= len(values)
  if out_of_items:
    return True, (0, -1)
  return False, ()

# Memo table: state -> (best value, best action).
cache = {}
def bellman(state):
  """Return (best total reward, best action) obtainable from `state`.

  Terminal states return the pair supplied by terminal_state(). Other states
  are solved recursively and stored in `cache`; ties go to the action listed
  first by available_actions().
  """
  finished, terminal_result = terminal_state(state)
  if finished:
    return terminal_result
  memoised = cache.get(state)
  if memoised is not None:
    return memoised
  best_value, best_action = None, None
  for action in available_actions(state):
    immediate = reward(state, action)
    total = immediate + bellman(next_state(state, action))[0]
    if best_value is None or total > best_value:
      best_value, best_action = total, action
  result = (best_value, best_action)
  cache[state] = result
  return result

In [None]:
# Roll the computed policy forward from the first item with the full capacity.
start_state = (0, w_max)

state = start_state

# actions[i] is the decision bellman() returns for item i (1 = take, 0 = skip).
actions = []
for _ in range(n):  # one decision per item
  best_value, best_action = bellman(state)
  actions.append(best_action)
  state = next_state(state, best_action)

In [None]:
import numpy as np
np.array(actions)

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1])

# Egg Drop

In [None]:
def available_actions(state):
  """Return the floors that can be chosen next: 1 .. state[0] inclusive."""
  floors_left = state[0]
  return range(1, floors_left + 1)

def reward(state, action):
  """Every drop costs one unit, regardless of the state or the floor chosen."""
  step_cost = -1
  return step_cost

def next_state(state, action):
  """Return the worse of the two outcomes of dropping from floor `action`.

  `state` is (floors left, eggs left). If the egg survives, the floors at or
  below `action` are ruled out; if it breaks, the `action - 1` floors below
  remain and one egg is lost. The successor with the strictly lower bellman()
  value is returned; on a tie the break outcome is chosen.
  """
  survived = (state[0] - action, state[1])
  broke = (action - 1, state[1] - 1)
  survive_value = bellman(survived)[0]
  break_value = bellman(broke)[0]
  return survived if survive_value < break_value else broke


def terminal_state(state):
  """Report whether `state` = (floors left, eggs left) is terminal.

  Returns (True, (0, -1)) when no floors remain, (True, (-10000000, -1)) when
  floors remain but no eggs do, and (False, ()) otherwise.
  """
  if state[0] == 0:
    outcome = (0, -1)
  elif state[1] == 0:
    outcome = (-10000000, -1)
  else:
    return False, ()
  return True, outcome

# Memo table for this section: state -> (best value, best floor).
cache = {}
def bellman(state):
  """Return (best value, best floor to drop from) for `state`.

  Terminal states return the pair from terminal_state(). Otherwise each floor
  from available_actions() is scored as reward plus the value of the successor
  picked by next_state(); the first floor with the highest score wins and the
  result is memoised in `cache`.
  """
  done, outcome = terminal_state(state)
  if done:
    return outcome
  if state not in cache:
    top_value = top_action = None
    for floor in available_actions(state):
      candidate = reward(state, floor) + bellman(next_state(state, floor))[0]
      if top_value is None or candidate > top_value:
        top_value, top_action = candidate, floor
    cache[state] = (top_value, top_action)
  return cache[state]

In [None]:
# Trace the policy from 100 floors with 5 eggs, following next_state() at each step.
start_state = (100, 5)

state = start_state

# Each entry is (state, floor chosen by bellman() for that state).
state_actions = []
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  state_actions.append((state, best_action))
  state = next_state(state, best_action)

state_actions

[((100, 5), 38),
 ((37, 4), 7),
 ((30, 4), 15),
 ((14, 3), 7),
 ((6, 2), 3),
 ((2, 1), 1),
 ((1, 1), 1)]

# Stochastic Egg Drop

In [None]:
def available_actions(state):
  """Return the candidate drop floors, 1 through state[0] inclusive."""
  highest_floor = state[0]
  return range(1, highest_floor + 1)

def reward(state, action):
  """Constant cost of -1 per drop; `state` and `action` are not used."""
  cost_per_drop = -1
  return cost_per_drop

def possible_states(state, action):
  """Return the outcome distribution of dropping from floor `action`.

  `state` is (floors left, eggs left). The result is a two-element list of
  (probability, successor) pairs: first the egg surviving, then the egg
  breaking. The break probability is action / floors left.
  """
  floors, eggs = state[0], state[1]
  survived = (floors - action, eggs)
  broke = (action - 1, eggs - 1)
  p_break = action / floors
  return [(1 - p_break, survived), (p_break, broke)]

def terminal_state(state):
  """Classify `state` = (floors left, eggs left) as terminal or not.

  No floors left -> (True, (0, -1)); floors left but no eggs ->
  (True, (-10000000, -1)); anything else -> (False, ()).
  """
  no_floors_left = state[0] == 0
  if no_floors_left:
    return True, (0, -1)
  no_eggs_left = state[1] == 0
  if no_eggs_left:
    return True, (-10000000, -1)
  return False, ()

# Memo table for the stochastic model: state -> (best expected value, best floor).
cache = {}
def bellman(state):
  """Return (best expected value, best floor) for `state`.

  Each floor from available_actions() is scored by the probability-weighted
  sum of reward plus successor value over possible_states(). The first floor
  with the highest expectation wins and the result is memoised in `cache`.
  """
  finished, terminal_result = terminal_state(state)
  if finished:
    return terminal_result
  if state in cache:
    return cache[state]
  best_value, best_action = None, None
  for action in available_actions(state):
    expected = 0
    # `successor` (not `next_state`) so the loop variable does not shadow the
    # notebook-level next_state() function inside this body.
    for probability, successor in possible_states(state, action):
      expected += probability*(reward(state, action) + bellman(successor)[0])
    if best_value is None or expected > best_value:
      best_value, best_action = expected, action
  solution = (best_value, best_action)
  cache[state] = solution
  return solution

In [None]:
import random
# Simulate one random episode under the computed policy, starting from (100, 5).
start_state = (100, 5)
state = start_state
state_actions = []  # (state, chosen floor) for every step taken
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  state_actions.append((state, best_action))
  states = possible_states(state, best_action)
  # Sample the outcome (survive / break) according to its probability.
  # No seed is set in this cell, so the trace can differ between runs.
  state = random.choices([s[1] for s in states],
                          weights=[s[0] for s in states])[0]
# A deterministic variant that always follows the lower-valued outcome instead
# of sampling was tried here and left disabled.
state_actions

[((100, 5), 49),
 ((48, 4), 23),
 ((22, 3), 10),
 ((9, 2), 4),
 ((5, 2), 3),
 ((2, 2), 1)]

In [None]:
# Roll out from (100, 5) again, this time stepping with next_state().
# NOTE(review): when the notebook is run top to bottom, next_state() is still
# the deterministic Egg Drop function while bellman(), terminal_state() and
# cache are the Stochastic Egg Drop versions. The recorded output below equals
# the deterministic Egg Drop trace, which suggests it was produced from a
# different kernel state - confirm which combination is intended.
start_state = (100, 5)

state = start_state

# Rebinds `actions` (a list of 0/1 decisions in the Knapsack section) to a
# list of (state, chosen floor) pairs.
actions = []
while not terminal_state(state)[0]:
  best_value, best_action = bellman(state)
  actions.append((state,best_action))
  state = next_state(state, best_action)

In [None]:
state

(0, 0)

In [None]:
actions

[((100, 5), 38),
 ((37, 4), 7),
 ((30, 4), 15),
 ((14, 3), 7),
 ((6, 2), 3),
 ((2, 1), 1),
 ((1, 1), 1)]

# Optimal Replacement

In [None]:
# Actions:
# 0 maintain
# 1 replace

max_miles = 50  # largest mileage state; states are 0..max_miles
# Discount factor applied to the next state's value.
# NOTE: a later cell runs `from scipy.stats import ... gamma`, which rebinds
# this name; re-run this cell before re-running the solver after that import.
gamma = 0.9
threshold = 0.00001  # value iteration stops once the largest per-sweep change is below this
def available_actions(state):
  """Return both actions for any mileage: 0 (maintain) and 1 (replace)."""
  maintain, replace = 0, 1
  return [maintain, replace]

def reward(state, action):
  """Immediate reward: -15 for replacing, otherwise minus the current mileage."""
  return -15 if action == 1 else -state

def possible_states(state, action):
  """Return the transition distribution as (probability, next mileage) pairs.

  Replacing (action 1) restarts from mileage 0 before the step. Mileage then
  rises by 1 or 2 with equal probability, except that it moves to max_miles
  with certainty from max_miles - 1 and stays put at max_miles.
  """
  origin = 0 if action == 1 else state
  if origin == max_miles:
    return [(1, origin)]
  if origin == max_miles - 1:
    return [(1, origin + 1)]
  return [(0.5, origin + 1), (0.5, origin + 2)]

# Value iteration for the replacement problem.
# values[s] holds (current value estimate of mileage s, greedy action at s).
# NOTE: this rebinds the notebook-level name `values` that the Knapsack section
# also uses; re-run the Knapsack setup cell before re-running that section.
values = [(0,0) for _ in range(max_miles + 1)]

it = 0  # number of sweeps performed so far
done = False
while not done:
  it += 1
  new_values = [0 for _ in range(max_miles+1)]
  for state in range(max_miles+1):
    best_value = None
    best_action = None
    for action in available_actions(state):
      exp_action_value = 0
      # The successor is named `s_prime`, not `next_state`: at module scope
      # that name would overwrite the next_state() function defined earlier in
      # the notebook and break its rollout cells on re-execution.
      for p_state, s_prime in possible_states(state, action):
        exp_action_value += p_state*(reward(state, action) + gamma*values[s_prime][0])
      if best_value is None or exp_action_value > best_value:
        best_value = exp_action_value
        best_action = action
    new_values[state] = best_value, best_action
  # Converged once no state's value moved by `threshold` or more in this sweep.
  max_change = max(abs(v[0] - new_v[0]) for v,new_v in zip(values, new_values))
  if max_change < threshold:
    done = True
  values = new_values


In [None]:
values

[(-47.898352309227604, 0),
 (-51.596584791667766, 0),
 (-54.844217151267486, 0),
 (-57.59265697484717, 0),
 (-59.83895572813033, 0),
 (-61.47807880501428, 0),
 (-62.60850851321201, 0),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62.898352309227604, 1),
 (-62

# Optimal Replacement Policy Iteration

In [None]:
max_miles = 50  # largest mileage state; states are 0..max_miles
# Discount factor applied to the next state's value.
# NOTE: a later cell runs `from scipy.stats import ... gamma`, which rebinds
# this name; re-run this cell before re-running the solver after that import.
gamma = 0.9
threshold = 0.00001  # not read by the policy-iteration cells visible in this notebook
def available_actions(state):
  """Return the action set, identical for every mileage: [0, 1]."""
  action_set = [0, 1]
  return action_set

def reward(state, action):
  """Return -15 when action is 1, otherwise the negated mileage `state`."""
  if action != 1:
    return -state
  return -15

def possible_states(state, action):
  """List the (probability, next mileage) pairs reachable from `state`.

  Action 1 resets the mileage to 0 first. From there the mileage advances by
  1 or 2 with probability 0.5 each, clamped so it never exceeds max_miles.
  """
  base = 0 if action == 1 else state
  at_cap = base == max_miles
  one_below_cap = base == max_miles - 1
  if one_below_cap:
    return [(1, base + 1)]
  if at_cap:
    return [(1, base)]
  return [(0.5, base + 1), (0.5, base + 2)]

# Start from action 0 at every mileage, with all-zero value estimates.
policy = [0] * (max_miles + 1)
policy_value = [0] * (max_miles + 1)

In [None]:
# Policy iteration: alternate evaluation and greedy improvement until the
# policy stops changing. A single evaluate/improve pass only moves one step
# away from the initial policy, so the cell previously had to be re-run by
# hand to reach the converged values.
MAX_POLICY_ITERATIONS = 100  # safety cap in case near-ties keep flipping an action
EVALUATION_SWEEPS = 100      # in-place sweeps per evaluation, same count as before

for _policy_iteration in range(MAX_POLICY_ITERATIONS):
  # Policy Evaluation
  # Update policy_value in place under the current policy.
  for _ in range(EVALUATION_SWEEPS):
    for miles in range(max_miles + 1):
      action = policy[miles]
      value_s = 0
      for prob_s, s_prime in possible_states(miles, action):
        value_s += prob_s*(reward(miles, action) + gamma*policy_value[s_prime])
      policy_value[miles] = value_s

  # Policy Update
  # Make the policy greedy with respect to the freshly evaluated values.
  policy_changed = False
  for miles in range(max_miles + 1):
    best_value = None
    best_action = None
    for action in available_actions(miles):
      value_s = 0
      for prob_s, s_prime in possible_states(miles, action):
        value_s += prob_s*(reward(miles, action) + gamma*policy_value[s_prime])
      if best_value is None or value_s > best_value:
        best_value = value_s
        best_action = action
    if best_action != policy[miles]:
      policy_changed = True
    policy[miles] = best_action

  # A greedy step that changes nothing means the policy is stable.
  if not policy_changed:
    break


In [None]:
policy_value

[-47.89831942947086,
 -51.596554047321646,
 -54.844188458069674,
 -57.592630387445126,
 -59.83893102785208,
 -61.478056316899306,
 -62.60848748652378,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983341274261,
 -62.8983

# RLlib Optimal Replacement

In [None]:
!pip install ray[rllib]

Collecting ray[rllib]
  Downloading ray-2.9.2-cp310-cp310-manylinux2014_x86_64.whl (64.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting tensorboardX>=1.9 (from ray[rllib])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
Collecting gymnasium==0.28.1 (from ray[rllib])
  Downloading gymnasium-0.28.1-py3-none-any.whl (925 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m925.5/925.5 kB[0m [31m62.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting lz4 (from ray[rllib])
  Downloading lz4-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
Collecting jax-jumpy>=1.0.0 (from gymnasium==0.28.1->ray[rllib])
  Downloa

In [None]:
import gymnasium as gym
from gymnasium import spaces, vector
import numpy as np
from scipy.stats import binom, nbinom, beta, poisson, gamma, norm, geom

In [None]:
import random

In [None]:
def available_actions(state):
  """Return [0, 1]: maintain or replace, whatever the current mileage."""
  return list(range(2))

def reward(state, action):
  """Cost of one step: a flat -15 to replace (action 1), else -state."""
  replacing = action == 1
  return -15 if replacing else -state

def possible_states(state, action):
  """Return [(probability, next mileage), ...] for taking `action` at `state`.

  With action 1 the step starts from mileage 0. The mileage then goes up by 1
  or 2 (probability 0.5 each); one below max_miles it moves to max_miles with
  certainty, and at max_miles it stays there.
  """
  current = 0 if action == 1 else state
  if current == max_miles - 1:
    successors = [(1, current + 1)]
  elif current == max_miles:
    successors = [(1, current)]
  else:
    successors = [(0.5, current + 1), (0.5, current + 2)]
  return successors

class OptReplaceEnv(gym.Env):
  """Gymnasium environment for the optimal-replacement problem.

  Observation: current mileage, an int in 0..50 (Discrete(51)).
  Action: 0 = maintain (reward is minus the current mileage),
          1 = replace (reward -15, mileage resets to 0).
  After either action the mileage grows by 1 or 2 with equal probability and
  is capped at 50 so it always stays inside the observation space.
  The episode never terminates or truncates on its own.
  """

  def __init__(self, seed=None):
    # `seed` is accepted but not used here; seeding goes through reset(seed=...).
    # NOTE(review): RLlib appears to pass its env_config as the first positional
    # argument, so this parameter is what receives it - confirm before renaming.
    self.max_miles = 50
    self.observation_space = gym.spaces.Discrete(self.max_miles + 1)
    self.action_space = gym.spaces.Discrete(2)
    self.miles = 0

  def reset(self, seed=None, options=None):
    """Reset the mileage to 0 and return (observation, info)."""
    # Lets gymnasium (re)seed self.np_random when a seed is supplied.
    super().reset(seed=seed)
    self.miles = 0
    return self.miles, {}

  def step(self, action):
    """Apply `action`; return (observation, reward, terminated, truncated, info)."""
    if action == 0:
      step_reward = -self.miles
    else:
      step_reward = -15
      self.miles = 0
    # Draw 1 or 2 from the env's own seeded generator (upper bound exclusive).
    growth = int(self.np_random.integers(1, 3))
    # Without the cap the mileage would leave Discrete(51) under repeated maintenance.
    self.miles = min(self.miles + growth, self.max_miles)
    return self.miles, step_reward, False, False, {}

In [None]:
# Local instance for the manual checks in the next cells.
env = OptReplaceEnv()

In [None]:
# reset() returns (observation, info); the info dict is discarded here.
obs, _ = env.reset()

In [None]:
env.observation_space.sample()

42

In [None]:
env.step(env.action_space.sample())

(1, -15, False, False, {})

In [None]:
import ray
from ray import tune, air
from ray.rllib.algorithms.ppo import PPOConfig
from functools import partial

  if (distutils.version.LooseVersion(tf.__version__) <


In [None]:
# Shut down any Ray instance left over from a previous run of this cell before
# starting a fresh local one.
if ray.is_initialized():
  ray.shutdown()
ray.init(num_cpus=1)  # local Ray instance started with num_cpus=1

  and should_run_async(code)
2024-02-20 05:44:43,187	INFO worker.py:1724 -- Started a local Ray instance.


0,1
Python version:,3.10.12
Ray version:,2.9.2


In [None]:
# PPO configuration: train on OptReplaceEnv with a single rollout worker.
config = (PPOConfig()
          .environment(OptReplaceEnv)
          .rollouts(num_rollout_workers=1)
)

# Stop criterion handed to Tune, keyed on the `timesteps_total` metric.
stop = {"timesteps_total": 10000}

# Run the "PPO" trainable through Ray Tune with the config above.
tuner = tune.Tuner(
    "PPO",
    param_space=config.to_dict(),
    run_config=air.RunConfig(stop=stop),
)

# Launches the run. The return value is not stored, so it is only shown as
# this cell's output.
tuner.fit()

2024-02-20 05:45:33,179	INFO tune.py:592 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+------------------------------------------------------------+
| Configuration for experiment     PPO_2024-02-20_05-45-33   |
+------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator     |
| Scheduler                        FIFOScheduler             |
| Number of trials                 1                         |
+------------------------------------------------------------+

View detailed results here: /root/ray_results/PPO_2024-02-20_05-45-33
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/PPO_2024-02-20_05-45-33`

Trial status: 1 PENDING
Current time: 2024-02-20 05:45:33. Total running time: 0s
Logical resource usage: 0/1 CPUs, 0/0 GPUs
+------------------------------------------+
| Trial name                      status   |
+------------------------------------------+
| PPO_OptReplaceEnv_43650_00000   PENDING  |
+------------------------------------------+
Trial status: 1 PENDING