<a href="https://colab.research.google.com/github/manasuii/Beginner-ML-Projects/blob/main/Mass_Spring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import gymnasium as gym
import numpy as np

class PIDEnv(gym.Env):
    def __init__(self):
        super(PIDEnv, self).__init__()

        self.observation_space = gym.spaces.Box(
            low=np.array([-10, -10, -10], dtype=np.float32),
            high=np.array([10, 10, 10], dtype=np.float32)
        )

        self.action_space = gym.spaces.Box(
            low=np.array([-5.0], dtype=np.float32),
            high=np.array([5.0], dtype=np.float32)
        )

        self.setpoint = 1.0
        self.process_variable = 0.0
        self.prev_error = 0.0
        self.integral = 0.0

    def reset(self, seed=None, options=None):
        self.process_variable = np.random.uniform(0, 0.2)
        self.prev_error = 0.0
        self.integral = 0.0

        error = self.setpoint - self.process_variable
        state = np.array([error, self.integral, 0.0], dtype=np.float32)

        return state, {}

    def step(self, action):
        action = float(action)

        self.process_variable += 0.1 * action
        self.process_variable = np.clip(self.process_variable, -2, 2)

        error = self.setpoint - self.process_variable
        derivative = error - self.prev_error
        self.integral += error

        reward = -abs(error)

        terminated = abs(error) < 0.01

        self.prev_error = error

        state = np.array([error, self.integral, derivative], dtype=np.float32)

        return state, reward, terminated, False, {}


In [8]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

env = DummyVecEnv([lambda: PIDEnv()])

model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=64,
)

model.learn(total_timesteps=50_000)
model.save("pid_rl_agent")


Using cpu device


  action = float(action)
  return datetime.utcnow().replace(tzinfo=utc)


-----------------------------
| time/              |      |
|    fps             | 1408 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 970          |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0065582204 |
|    clip_fraction        | 0.0879       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.41        |
|    explained_variance   | -0.016       |
|    learning_rate        | 0.0003       |
|    loss                 | 196          |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.00808     |
|    std                  | 0.992        |
|    value_loss           | 466          |
----------------

In [9]:
model = PPO.load("pid_rl_agent")

env = PIDEnv()
obs, _ = env.reset()

for step in range(50):
    action, _ = model.predict(obs)
    obs, reward, done, _, _ = env.step(action)
    print(f"Step {step}, PV={env.process_variable:.3f}, Reward={reward:.3f}")
    if done:
        print("Target reached!")
        break


Step 0, PV=0.497, Reward=-0.503
Step 1, PV=0.779, Reward=-0.221
Step 2, PV=1.000, Reward=-0.000
Target reached!


  action = float(action)
