In [1]:
import gymnasium as gym
from gymnasium import spaces
import numpy as np
import math
import random
from scipy.stats import norm

# --- CMDP Dynamics Functions (from previous code) ---
# KinematicModel definition
class KinematicModel:
    def __init__(self, L, y, theta, Lmax, l, d, p1, sigmaC, p2):
        self.L = L
        self.y = y
        self.theta = theta
        self.Lmax = Lmax
        self.l = l
        self.d = d
        self.p1 = p1
        self.sigmaC = sigmaC
        self.p2 = p2

    def input(self, action):
        noise = norm.rvs(loc=0, scale=action[0] / 4)
        self.y = self.y + action[0] * math.sin(math.radians(action[1] + noise + self.theta))
        if random.uniform(0, 1) < self.p2:
            self.d = self.d + (self.l - action[0] * math.cos(math.radians(action[1] + noise + self.theta)))
        else:
            self.d = 40
        self.theta = self.theta + (action[0] / self.L) * math.tan(math.radians(action[1] + noise))
        if random.uniform(0, 1) < self.p1:
            curveAngle = norm.rvs(loc=0, scale=self.sigmaC)
            self.theta = self.theta + curveAngle

    def __str__(self):
        return f"y = {self.y:.2f} / theta = {self.theta:.2f} / d = {self.d:.2f}"

# Transition probability functions (one per outcome)
def P1(action, env, sigma, show=True):
    ratio = (env.Lmax - env.y) / action[0]
    if ratio > 1 or ratio < -1:
        term1 = 0
    else:
        angle = math.degrees(math.asin(ratio))
        term1 = 1 - norm.cdf((1 / sigma) * (angle - action[1] - env.theta))
    ratio2 = (-env.Lmax - env.y) / action[0]
    if ratio2 > 1 or ratio2 < -1:
        term2 = 0
    else:
        angle = math.degrees(math.asin(ratio2))
        term2 = norm.cdf((1 / sigma) * (angle - action[1] - env.theta))
    value = (1 - env.p1) * (term1 + term2)
    return value

def P2(action, env, sigma, show=True):
    ratio = (env.Lmax - env.y) / action[0]
    if ratio > 1 or ratio < -1:
        term1 = 0
    else:
        angle = math.degrees(math.asin(ratio))
        term1 = 1 - norm.cdf((1 / sigma) * (angle - action[1] - env.theta))
    ratio2 = (-env.Lmax - env.y) / action[0]
    if ratio2 > 1 or ratio2 < -1:
        term2 = 0
    else:
        angle = math.degrees(math.asin(ratio2))
        term2 = norm.cdf((1 / sigma) * (angle - action[1] - env.theta))
    value = env.p1 * (term1 + term2)
    return value

def P3(action, env, sigma, show=True):
    if (1 / action[0]) * (env.l + (((action[0] * 3.6) / -2) + env.d)) > 1:
        return 0
    angle = math.degrees(math.acos((1 / action[0]) * (env.l + (((action[0] * 3.6) / -2) + env.d))))
    value = norm.cdf((1 / sigma) * (angle - abs(action[1]) - abs(env.theta)))
    return env.p2 * (1 - env.p1) * value

def P4(action, env, sigma, show=True):
    if (1 / action[0]) * (env.l + (((action[0] * 3.6) / -2) + env.d)) > 1:
        return 0
    angle = math.degrees(math.acos((1 / action[0]) * (env.l + (((action[0] * 3.6) / -2) + env.d))))
    value = norm.cdf((1 / sigma) * (angle - abs(action[1]) - abs(env.theta)))
    return env.p2 * env.p1 * value

# Combined transition functions to yield probability of each outcome:
def anyToG(action, env):
    p1 = P1(action, env, action[0] / 4, show=False)
    p2 = P2(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False)
    term1 = 1 - (p1 + p2)
    term2 = 1 - (P3(action, env, action[0] / 4, show=False) + 
                 P4(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False))
    return term1 * term2

def anyToX(action, env):
    p1 = P1(action, env, action[0] / 4, show=False)
    p2 = P2(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False)
    term1 = p1 + p2
    term2 = 1 - (P3(action, env, action[0] / 4, show=False) + 
                 P4(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False))
    return term1 * term2

def anyToI(action, env):
    p1 = P1(action, env, action[0] / 4, show=False)
    p2 = P2(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False)
    term1 = 1 - (p1 + p2)
    term2 = (P3(action, env, action[0] / 4, show=False) + 
             P4(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False))
    return term1 * term2

def anyToXI(action, env):
    p1 = P1(action, env, action[0] / 4, show=False)
    p2 = P2(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False)
    term1 = p1 + p2
    term2 = (P3(action, env, action[0] / 4, show=False) + 
             P4(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False))
    return term1 * term2

# Expected reward function
def rewardCenterProbability(action, env, ratio, show=True):
    numerator1 = (env.Lmax * ratio) - env.y
    if abs(numerator1 / action[0]) > 1:
        term1 = term1C = 0
    else:
        angle = math.degrees(math.asin(numerator1 / action[0]))
        term1 = norm.cdf((4 / action[0]) * (angle - action[1] - env.theta))
        term1C = norm.cdf((1 / math.sqrt(action[0] / 4 + env.sigmaC)) * (angle - action[1] - env.theta))
    numerator2 = (-env.Lmax * ratio) - env.y
    if abs(numerator2 / action[0]) > 1:
        term2 = term2C = 0
    else:
        angle = math.degrees(math.asin(numerator2 / action[0]))
        term2 = norm.cdf((4 / action[0]) * (angle - action[1] - env.theta))
        term2C = norm.cdf((1 / math.sqrt(action[0] / 4 + env.sigmaC)) * (angle - action[1] - env.theta))
    return ((1 - env.p1) * (term1 - term2)) + (env.p1 * (term1C - term2C))

def rewardSpeed(action, env):
    return -0.5 * abs(env.l - action[0])

def rewardDistanceProbability(action, env):
    term1 = P3(action, env, action[0] / 4, show=False)
    term2 = P4(action, env, math.sqrt(action[0] / 4 + env.sigmaC), show=False)
    return term1 + term2

def expectedReward(action, env, r1, r2, r3, r4, show=True):
    termR1 = r1 * rewardCenterProbability(action, env, 0.5, show=False)
    termR2 = r2 * rewardCenterProbability(action, env, 0.25, show=False)
    rSpeed = rewardSpeed(action, env)
    termR3 = r3 * (1 - rewardDistanceProbability(action, env))
    termR4 = r4 * rewardDistanceProbability(action, env)
    return termR1 + termR2 + rSpeed + termR3 + termR4

# --- Custom Gym Environment using the CMDP ---
class KinematicEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, kinematic_model):
        super(KinematicEnv, self).__init__()
        self.kinematic_model = kinematic_model
        
        # Define observation: discrete state (0: G, 1: X, 2: I, 3: XI)
        self.observation_space = spaces.Discrete(4)
        # Define action space: a finite set of actions (velocity, steering angle)
        self.actions = [(v, delta) for v in [20, 25, 30] for delta in [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]]
        self.action_space = spaces.Discrete(len(self.actions))
        
        # Start in a "Good" state by default
        self.state = 0
    
    def reset(self, seed=None, options=None):
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
        # Reset the kinematic model parameters
        self.kinematic_model.y = 0
        self.kinematic_model.theta = 0
        self.kinematic_model.d = 40
        self.state = 0  # Good state (G)
        return self.state, {}
    
    def step(self, action_index):
        action = self.actions[action_index]
        # Compute transition probabilities for each state
        probs = np.array([
            anyToG(action, self.kinematic_model),
            anyToX(action, self.kinematic_model),
            anyToI(action, self.kinematic_model),
            anyToXI(action, self.kinematic_model)
        ])
        if probs.sum() == 0:
            probs = np.array([1.0, 0, 0, 0])
        else:
            probs = probs / probs.sum()
        # Sample the next state from the discrete outcomes {0:G, 1:X, 2:I, 3:XI}
        next_state = int(np.random.choice([0, 1, 2, 3], p=probs))
        # Calculate reward using our expectedReward function
        r1, r2, r3, r4 = 50, 100, -1, 0
        reward = expectedReward(action, self.kinematic_model, r1, r2, r3, r4, show=False)
        self.state = next_state
        # Mark episode as done if state is unsafe (X or XI)
        done = self.state in [1, 3]
        return self.state, reward, done, False, {}
    
    def render(self, mode='human'):
        print(f"State: {self.state}, Kinematic Model: {self.kinematic_model}")
    
    def close(self):
        pass

# --- Testing the Environment with a Random Policy ---
if __name__ == "__main__":
    # Instantiate a kinematic model with fixed parameters
    km = KinematicModel(L=1, y=0, theta=0, Lmax=2, l=25, d=40, p1=0.05, sigmaC=5, p2=0.3)
    env = KinematicEnv(km)
    obs, _ = env.reset(seed=42)
    total_reward = 0
    done = False
    step_count = 0
    
    while not done and step_count < 20:
        # For testing, choose a random action
        action = env.action_space.sample()
        obs, reward, done, truncated, info = env.step(action)
        env.render()
        total_reward += reward
        step_count += 1
    
    print(f"Episode finished after {step_count} steps with total reward {total_reward:.2f}.")


State: 0, Kinematic Model: y = 0.00 / theta = 0.00 / d = 40.00
State: 3, Kinematic Model: y = 0.00 / theta = 0.00 / d = 40.00
Episode finished after 2 steps with total reward 46.70.


In [2]:
import numpy as np
import math
import random
from scipy.stats import norm

# (Reuse the same definitions for KinematicModel, anyToG, anyToX, anyToI, anyToXI, expectedReward, etc.)
# For brevity, we assume the same functions defined above (P1, P2, P3, P4, reward functions, etc.) are already present.

# Define states and actions (as before)
states = ['G', 'X', 'I', 'XI']
anglesDeg = [-5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5]
velocities = [20, 25, 30]
actions = [(v, delta) for v in velocities for delta in anglesDeg]

def transition_func(a, env):
    # For a given action, return a list of (state, probability) pairs.
    probabilities = [
        anyToG(a, env),
        anyToX(a, env),
        anyToI(a, env),
        anyToXI(a, env)
    ]
    # Normalize probabilities if needed
    s = sum(probabilities)
    if s > 0:
        probabilities = [p/s for p in probabilities]
    else:
        probabilities = [1.0, 0, 0, 0]
    return list(zip(states, probabilities))

def reward_func(a, env, r1=50, r2=100, r3=-1, r4=0):
    return expectedReward(a, env, r1, r2, r3, r4, show=False)

def value_iteration(env, gamma=0.9, theta_thresh=1e-4):
    # Initialize value function for each state
    V = {s: 0 for s in states}
    policy = {s: None for s in states}
    r1, r2, r3, r4 = 50, 100, -1, 0

    iteration = 0
    while True:
        delta = 0
        for s in states:
            v_old = V[s]
            action_values = {}
            for a in actions:
                # For a stationary CMDP model the transition probabilities and rewards
                # do not depend on s, so we simply compute the expected value.
                exp_value = 0
                for s_prime, p in transition_func(a, env):
                    exp_value += p * (reward_func(a, env, r1, r2, r3, r4) + gamma * V[s_prime])
                action_values[a] = exp_value
            best_action = max(action_values, key=action_values.get)
            V[s] = action_values[best_action]
            policy[s] = best_action
            delta = max(delta, abs(v_old - V[s]))
        if delta < theta_thresh:
            break
        iteration += 1
    return V, policy

# Define a grid of initial conditions (varying lateral offset and heading)
initial_conditions = [(y, theta) for y in np.linspace(-1, 1, 5) for theta in np.linspace(-5, 5, 5)]
results = {}

for (y, theta) in initial_conditions:
    # Create a new kinematic model instance for each initial condition
    km = KinematicModel(L=1, y=y, theta=theta, Lmax=2, l=25, d=40, p1=0.05, sigmaC=5, p2=0.3)
    V, policy = value_iteration(km, gamma=0.9, theta_thresh=1e-4)
    results[(y, theta)] = (V, policy)
    print(f"Initial condition y = {y:.2f}, theta = {theta:.2f}")
    print("Value Function:", V)
    print("Policy:", policy)
    print("-" * 40)


KeyboardInterrupt: 

## prove safetyGym


In [None]:
import safety_gymnasium

env = safety_gymnasium.make('SafetyPointCircle0-v0', render_mode='human')
'''
Vision Environment
    env = safety_gymnasium.make('SafetyPointCircle0Vision-v0', render_mode='human')
Keyboard Debug environment
due to the complexity of the agent's inherent dynamics, only partial support for the agent.
    env = safety_gymnasium.make('SafetyPointCircle0Debug-v0', render_mode='human')
'''
obs, info = env.reset()
# Set seeds
# obs, _ = env.reset(seed=0)
terminated, truncated = False, False
ep_ret, ep_cost = 0, 0
for _ in range(1000):
    assert env.observation_space.contains(obs)
    act = env.action_space.sample()
    assert env.action_space.contains(act)
    # modified for Safe RL, added cost
    obs, reward, cost, terminated, truncated, info = env.step(act)
    ep_ret += reward
    ep_cost += cost
    if terminated or truncated:
        observation, info = env.reset()

    env.close()

: 

In [4]:
import safety_gymnasium  # the Safety Gymnasium package
import time
import numpy as np

# -------------------------------
# Part 1: Single-Episode Simulation
# -------------------------------

# Create the environment.
# "Safexp-RaceCarGoal1-v0" is a pre-built task where the agent is a racecar.
# In this configuration the environment includes Sigwalls that act as road boundaries.
env = safety_gymnasium.make("Safexp-RaceCarGoal1-v2", render_mode="human")

# Reset the environment (optionally specify a seed for reproducibility)
obs, info = env.reset(seed=42)

done = False
total_reward = 0.0

print("Starting single-episode simulation...")
while not done:
    # For demonstration, we sample a random action.
    # In practice, you could use a learned policy.
    action = env.action_space.sample()
    
    # Step the environment
    obs, reward, done, truncated, info = env.step(action)
    total_reward += reward
    
    # Render the environment (the "racecar" along a straight road with sigwalls)
    env.render()
    time.sleep(0.05)  # slow down the simulation for visualization

print(f"Episode finished with total reward: {total_reward:.2f}")
env.close()

# -------------------------------
# Part 2: Multiple Episodes with Varying Initial Conditions
# -------------------------------

# In Safety Gymnasium, initial conditions are typically randomized.
# We can simulate multiple episodes by resetting with different seeds.
num_episodes = 5

print("\nStarting multiple-episode simulation with varying seeds:")
for seed in range(100, 100 + num_episodes):
    env = safety_gymnasium.make("Safexp-RaceCarGoal1-v0", render_mode="human")
    obs, info = env.reset(seed=seed)
    
    done = False
    episode_reward = 0.0
    steps = 0
    
    while not done:
        action = env.action_space.sample()
        obs, reward, done, truncated, info = env.step(action)
        episode_reward += reward
        env.render()
        time.sleep(0.05)
        steps += 1
        if steps > 200:  # safety exit condition for long episodes
            break
            
    print(f"Episode with seed {seed} finished with reward: {episode_reward:.2f} in {steps} steps.")
    env.close()


AssertionError: Environment Safexp-RaceCarGoal1-v2 is not registered in safety-gymnasium.

In [None]:
import time
import safety_gymnasium
import numpy as np

# -------------------------------------------------
# Single-Episode Simulation of a Racecar Environment
# -------------------------------------------------

# Try to create the Racecar environment with sigwalls (representing road boundaries).
# According to the documentation, the Racecar agent uses realistic dynamics and has
# an action space defined as a Box with limits on rear-wheel velocity and front-wheel steering.
# If the environment "Safexp-RaceCarGoal1-v2" is not available, you may need to try a different ID.
#env_id = "Safexp-RaceCarGoal1-v2"
env_id= "SafetyRacecarCircle1-v0"
try:
    env = safety_gymnasium.make(env_id, render_mode="human")
except AssertionError as e:
    print(f"Environment {env_id} is not registered in Safety Gymnasium. Please check available environment IDs.")
    exit()

# Reset the environment with a fixed seed (for reproducibility)
obs, info = env.reset(seed=42)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

done = False
total_reward = 0.0
step_count = 0

print("Starting single-episode simulation of the Racecar environment...")
while not done:
    # For testing, we choose a random action.
    action = env.action_space.sample()
    
    # Step the environment using the sampled action.
    obs, reward, done, truncated, info = env.step(action)
    total_reward += reward
    step_count += 1
    
    # Render the simulation (the Racecar, sigwalls, and other static objects)
    env.render()
    time.sleep(0.05)  # Slow down the simulation for visualization
    
    # Optional exit condition: terminate after a fixed number of steps to prevent endless episodes.
    if step_count >= 300:
        break

print(f"Episode finished after {step_count} steps with total reward: {total_reward:.2f}")
env.close()


Observation space: Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf   0.   0.
   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.], [inf inf inf inf inf inf inf inf inf inf inf inf  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.], (28,), float64)
Action space: Box([-20.          -0.78500003], [20.          0.78500003], (2,), float64)
Starting single-episode simulation of the Racecar environment...


ValueError: too many values to unpack (expected 5)

: 