# PID, MPC, Offline, Offline + Online 평가

시뮬레이터를 통한 평가를 진행한다.

고정된 에피소드 7개에 대해 3개의 seed를 적용하여 L2(t,tsp)을 평균낸다.

In [1]:
import tclab
import numpy as np
import time
import csv
import matplotlib.pyplot as plt
import torch
import random
import os
root_path="/home/minchanggi/code/TCLab/data/eval_data"

epi_gen_seed=[0,1,2,3,4]
rl_seed=[0]
max_episode_steps=1200
sleep_max=1.0


In [2]:
def save_csv_png(all_data,step_num):
    import csv
    import matplotlib.pyplot as plt
    for data in all_data:
        path = data["path"]
        tm = data["tm"]
        Q1 = data["Q1"]
        Q2 = data["Q2"]
        T1 = data["T1"]
        T2 = data["T2"]
        Tsp1 = data["Tsp1"]
        Tsp2 = data["Tsp2"]
        os.makedirs(path, exist_ok=True)
        csv_filename = os.path.join(path, f'episode_{step_num}_data.csv')
        with open(csv_filename, 'w', newline='') as fid:
            writer = csv.writer(fid)
            writer.writerow(['step_num', 'Time', 'Q1', 'Q2', 'T1', 'T2', 'TSP1', 'TSP2'])
            for i in range(len(tm)):
                writer.writerow([
                    step_num,
                    f"{tm[i]:.2f}", f"{Q1[i]:.2f}", f"{Q2[i]:.2f}",
                    f"{T1[i]:.2f}", f"{T2[i]:.2f}", f"{Tsp1[i]:.2f}", f"{Tsp2[i]:.2f}"
                ])

        plt.figure(figsize=(10, 7))
        ax = plt.subplot(2, 1, 1)
        ax.grid()
        plt.plot(tm, Tsp1, 'k--', label=r'$T_1$ set point')
        plt.plot(tm, T1, 'b.', label=r'$T_1$ measured')
        plt.plot(tm, Tsp2, 'k-', label=r'$T_2$ set point')
        plt.plot(tm, T2, 'r.', label=r'$T_2$ measured')
        plt.ylabel(r'Temperature ($^oC$)')
        plt.title(f'Episode {step_num}')
        plt.legend(loc='best')

        ax = plt.subplot(2, 1, 2)
        ax.grid()
        plt.plot(tm, Q1, 'b-', label=r'$Q_1$')
        plt.plot(tm, Q2, 'r:', label=r'$Q_2$')
        plt.ylabel('Heater Output (%)')
        plt.xlabel('Time (sec)')
        plt.legend(loc='best')

        plt.tight_layout()
        png_filename = os.path.join(path, f'episode_{step_num}_plot.png')
        plt.savefig(png_filename)
        plt.close()


def set_seed(seed, env=None):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    if env is not None:
        env.seed(seed)



def generate_random_tsp(length, name='TSP'):
    i = 0
    tsp = np.zeros(length)
    #print(f'duration {length}: [{name} 설정 정보]')
    while i < length:
        if length == 600: 
            duration = int(np.clip(np.random.normal(240, 50), 80, 400))
        elif length == 900:
            duration = int(np.clip(np.random.normal(360, 75), 120, 600))
        elif length == 1200:
            duration = int(np.clip(np.random.normal(480, 100), 160, 800))
        else:
            duration = 5
        temp = np.random.uniform(25, 65)
        end = min(i + duration, length)
        tsp[i:end] = temp
        #print("@@@@@@@@@@@@@@@@")
        #print(f'  구간: {i:>3} ~ {end - 1:>3}, 목표 온도: {temp:.2f}°C')
        i = end
    return tsp

## PID

In [3]:
# PID 상수
Kc   = 9.24
tauI = 126.6 # sec
tauD = 8.90  # sec
Kff  = -0.66
st_temp = 29.0

def pid(sp, pv, pv_last, ierr, dt, d, cid):
    if cid == 1:
        KP = Kc
        Kf = Kff
    else:
        KP = Kc * 2.0
        Kf = Kff * 2.0
    KI = Kc / tauI
    KD = Kc * tauD
    op0 = 0
    ophi = 100
    oplo = 0
    error = sp - pv
    ierr += KI * error * dt
    dpv = (pv - pv_last) / dt
    P = KP * error
    I = ierr
    D = -KD * dpv
    FF = Kf * d
    op = op0 + P + I + D + FF
    if op < oplo or op > ophi:
        I -= KI * error * dt
        op = max(oplo, min(ophi, op))
    return op, P, I, D, FF


# PID Sim

In [5]:
all_datas = []
PID_SIM_path=os.path.join(root_path,"PID_SIM_7epi")
for rl in rl_seed:
    path = os.path.join(PID_SIM_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        from tclab import setup
        lab = setup(connected=False)
        env = lab(synced=False)
        
        env.Q1(0)
        env.Q2(0)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        total_reward = 0.0
        ierr1 = 0.0
        ierr2 = 0.0
        for i in range(max_episode_steps):
            sim_time = i * sleep_max
            env.update(t=sim_time)
            tm[i] = sim_time

            T1[i] = env.T1
            T2[i] = env.T2
            
            d1 = T1[i] - 23.0
            d2 = T2[i] - 23.0
            Q1[i],P,ierr1,D,FF = pid(Tsp1[i],T1[i],T1[i-1],ierr1,sleep_max,d2,1)
            Q2[i],P,ierr2,D,FF = pid(Tsp2[i],T2[i],T2[i-1],ierr2,sleep_max,d1,2)
            
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]
print("Total Rewards:", all_rewards)
# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


Total Rewards: [np.float64(-7806.524405653653), np.float64(-7089.993977819802), np.float64(-4722.3837660146555), np.float64(-4452.512307181782), np.float64(-11838.677839709766), np.float64(-9057.39499326673), np.float64(-9310.38821206253), np.float64(-7831.846252922945), np.float64(-7056.477856414248), np.float64(-4726.546240184996), np.float64(-4463.221704037656), np.float64(-11825.957235841135), np.float64(-9049.436231840753), np.float64(-9300.530888378662), np.float64(-7820.940954137004), np.float64(-7073.09101129539), np.float64(-4717.97224565917), np.float64(-4457.887721877617), np.float64(-11845.76482527988), np.float64(-9046.540441052328), np.float64(-9316.557241358047)]
Average Total Reward: -7752.887921523272


# PID 키트


In [None]:
# PID 키트


PID_KIT_path=os.path.join(root_path,"PID_KIT")
all_datas = []
st_temp = 29.0

for rl in rl_seed:
    path = os.path.join(PID_KIT_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        
        env = tclab.TCLab()
        
        env.Q1(0)
        env.Q2(0)

        while env.T1 >= st_temp or env.T2 >= st_temp:
            print(f'Time: {i} T1: {env.T1} T2: {env.T2}')
            i += 20
            time.sleep(20)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        total_reward = 0.0
        ierr1 = 0.0
        ierr2 = 0.0
        dt_error = 0.0
        start_time = time.time()
        prev_time = start_time
        for i in range(max_episode_steps):
            sleep = sleep_max - (time.time() - prev_time) - dt_error
            if sleep >= 1e-4:
                time.sleep(sleep - 1e-4)
            else:
                print('exceeded max cycle time by ' + str(abs(sleep)) + ' sec')
                time.sleep(1e-4)

            t = time.time()
            dt = t - prev_time
            if (sleep>=1e-4):
                dt_error = dt-sleep_max+0.009
            else:
                dt_error = 0.0
            prev_time = t
            tm[i] = t - start_time

            T1[i] = env.T1
            T2[i] = env.T2
            
            d1 = T1[i] - 23.0
            d2 = T2[i] - 23.0
            Q1[i],P,ierr1,D,FF = pid(Tsp1[i],T1[i],T1[i-1],ierr1,sleep_max,d2,1)
            Q2[i],P,ierr2,D,FF = pid(Tsp2[i],T2[i],T2[i-1],ierr2,sleep_max,d1,2)
            
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]

# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
Time: 1199 T1: 45.713 T2: 46.035
Time: 1219 T1: 44.746 T2: 45.584
Time: 1239 T1: 43.457 T2: 44.102
Time: 1259 T1: 41.846 T2: 42.49
Time: 1279 T1: 40.46 T2: 41.233
Time: 1299 T1: 39.268 T2: 39.912
Time: 1319 T1: 37.979 T2: 38.655
Time: 1339 T1: 37.012 T2: 37.656
Time: 1359 T1: 36.045 T2: 36.528
Time: 1379 T1: 35.078 T2: 35.723
Time: 1399 T1: 34.401 T2: 34.756
Time: 1419 T1: 33.757 T2: 34.144
Time: 1439 T1: 33.145 T2: 33.499
Time: 1459 T1: 32.5 T2: 32.726
Time: 1479 T1: 32.049 T2: 32.178
Time: 1499 T1: 31.533 T2: 31.759
Time: 1519 T1: 31.211 T2: 31.211
Time: 1539 T1: 30.889 T2: 30.889
Time: 1559 T1: 30.566 T2: 30.309
Time: 1579 T1: 30.212 T2: 29.922
Time: 1599 T1: 29.922 T2: 29.567
Time: 1619 T1: 29.922 T2: 29.567
Time: 1639 T1: 29.63

# IQL MLP
## lab8 dT(5) 

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from pathlib import Path
from IQL.src.policy import GaussianPolicy
from IQL.src.iql import ImplicitQLearning
from IQL.src.value_functions import TwinQ, ValueFunction
from IQL.src.util import normalize, torchify

obs_dim = 6
act_dim = 2
hidden_dim = 256
n_hidden = 2
tau=0.8
beta=3.0
alpha=0.005
discount=0.99
max_steps=1000000
learning_rate=3e-4

policy = GaussianPolicy(obs_dim, act_dim, hidden_dim, n_hidden).to(device)
iql = ImplicitQLearning(
        qf=TwinQ(obs_dim, act_dim, hidden_dim, n_hidden).to(device),
        vf=ValueFunction(obs_dim, hidden_dim, n_hidden).to(device),
        policy=policy,
        optimizer_factory=lambda p: torch.optim.Adam(p, lr=learning_rate),
        max_steps=max_steps,
        tau=tau,
        beta=beta,
        alpha=alpha,
        discount=discount
    )


In [9]:
#IQL offline 

model_path = Path("/home/minchanggi/code/TCLab/IQL/log/static_False_exp_1.0_noise_10/05-16-25_09.21.05_fxqc/best.pt")
iql.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

# IQL_MLP_SIM


In [None]:
IQL_MLP_offline_path=os.path.join(root_path,"IQL_MLP_offine_dT5_SIM_Train_best")
all_datas = []
st_temp = 29.0

for rl in rl_seed:
    path = os.path.join(IQL_MLP_offline_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        from tclab import setup
        lab = setup(connected=False)
        env = lab(synced=False)
        
        
        env.Q1(0)
        env.Q2(0)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        
        total_reward = 0.0
        dt_error = 0.0

        for i in range(max_episode_steps):
            sim_time = i * sleep_max
            env.update(t=sim_time)
            tm[i] = sim_time


            T1[i] = env.T1
            T2[i] = env.T2
            if i == 0:
                dT1, dT2= 0.0, 0.0
            elif i<4:
                dT1, dT2 = T1[i]-T1[i-1], T2[i]-T2[i-1]
            else:
                dT1 = T1[i] - T1[i - 4] 
                dT2 = T2[i] - T2[i - 4]

            obs = np.array([T1[i], Tsp1[i], dT1, T2[i], Tsp2[i], dT2], dtype=np.float32)    
            
            with torch.no_grad():
                action = policy.act(torchify(obs), deterministic=True).cpu().numpy()
            Q1[i], Q2[i] = action
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
            
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]

# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


Average Total Reward: -6705.670830510384


In [10]:

IQL_MLP_offline_path=os.path.join(root_path,"IQL_MLP_Noise10_best")
all_datas = []
st_temp = 29.0

for rl in rl_seed:
    path = os.path.join(IQL_MLP_offline_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        
        env = tclab.TCLab()
        
        env.Q1(0)
        env.Q2(0)

        while env.T1 >= st_temp or env.T2 >= st_temp:
            print(f'Time: {i} T1: {env.T1} T2: {env.T2}')
            i += 20
            time.sleep(20)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        
        total_reward = 0.0
        dt_error = 0.0
        
        start_time = time.time()
        prev_time = start_time
        for i in range(max_episode_steps):
            sleep = sleep_max - (time.time() - prev_time) - dt_error
            if sleep >= 1e-4:
                time.sleep(sleep - 1e-4)
            else:
                print('exceeded max cycle time by ' + str(abs(sleep)) + ' sec')
                time.sleep(1e-4)

            t = time.time()
            dt = t - prev_time
            if (sleep>=1e-4):
                dt_error = dt-sleep_max+0.009
            else:
                dt_error = 0.0
            prev_time = t
            tm[i] = t - start_time


            T1[i] = env.T1
            T2[i] = env.T2
            if i == 0:
                dT1, dT2= 0.0, 0.0
            elif i<4:
                dT1, dT2 = T1[i]-T1[i-1], T2[i]-T2[i-1]
            else:
                dT1 = T1[i] - T1[i - 4] 
                dT2 = T2[i] - T2[i - 4]

            obs = np.array([T1[i], Tsp1[i], dT1, T2[i], Tsp2[i], dT2], dtype=np.float32)    
            
            with torch.no_grad():
                action = policy.act(torchify(obs), deterministic=True).cpu().numpy()
            Q1[i], Q2[i] = action
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
            
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]

# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
Time: 1199 T1: 30.405 T2: 28.665
Time: 1219 T1: 29.954 T2: 28.568
Time: 1239 T1: 29.857 T2: 28.311
Time: 1259 T1: 29.6 T2: 28.021
Time: 1279 T1: 29.277 T2: 27.988
Time: 1299 T1: 29.277 T2: 27.988
TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
Time: 1199 T1: 46.035 T2: 46.261
Time: 1219 T1: 44.424 T2: 45.068
Time: 1239 T1: 42.941 T2: 43.457
Time: 1259 T1: 41.556 T2: 41.781
Time: 1279 T1: 40.234 T2: 40.009
Time: 1299 T1: 38.945 T2: 38.623
Time: 1319 T1: 37.656 T2: 37.302
Time: 1339 T1: 36.689 T2: 36.045
Time: 1359 T1: 35.497 T2: 34.756
Time: 1379 T1: 34.756 T2: 33.854
Time: 1399 T1: 33.918 T2: 33.145
Time: 1419 T1: 33.145 T2: 32.274
Time: 1439 T1: 32.5 T2: 31.533
Time: 1459 T1: 31.855 T2: 30.889
Time: 1479 T1: 31.533 T2: 30.309
Time: 1499 T1: 30.985 T2: 29.922
Time: 1519 T1: 30.56

In [11]:
#IQL offline 

model_path = Path("/home/minchanggi/code/TCLab/IQL/log/lab8_afalse_nfalse_rtrue/05-22-25_09.12.39_rugj/best7224.pt")
iql.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

# IQL_MLP_KIT

In [12]:

IQL_MLP_offline_path=os.path.join(root_path,"IQL_MLP_online_KIT_Train_best")
all_datas = []
st_temp = 29.0

for rl in rl_seed:
    path = os.path.join(IQL_MLP_offline_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        
        env = tclab.TCLab()
        
        env.Q1(0)
        env.Q2(0)

        while env.T1 >= st_temp or env.T2 >= st_temp:
            print(f'Time: {i} T1: {env.T1} T2: {env.T2}')
            i += 20
            time.sleep(20)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        
        total_reward = 0.0
        dt_error = 0.0
        
        start_time = time.time()
        prev_time = start_time
        for i in range(max_episode_steps):
            sleep = sleep_max - (time.time() - prev_time) - dt_error
            if sleep >= 1e-4:
                time.sleep(sleep - 1e-4)
            else:
                print('exceeded max cycle time by ' + str(abs(sleep)) + ' sec')
                time.sleep(1e-4)

            t = time.time()
            dt = t - prev_time
            if (sleep>=1e-4):
                dt_error = dt-sleep_max+0.009
            else:
                dt_error = 0.0
            prev_time = t
            tm[i] = t - start_time


            T1[i] = env.T1
            T2[i] = env.T2
            if i == 0:
                dT1, dT2= 0.0, 0.0
            elif i<4:
                dT1, dT2 = T1[i]-T1[i-1], T2[i]-T2[i-1]
            else:
                dT1 = T1[i] - T1[i - 4] 
                dT2 = T2[i] - T2[i - 4]

            obs = np.array([T1[i], Tsp1[i], dT1, T2[i], Tsp2[i], dT2], dtype=np.float32)    
            
            with torch.no_grad():
                action = policy.act(torchify(obs), deterministic=True).cpu().numpy()
            Q1[i], Q2[i] = action
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
            
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]

# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
Time: 1199 T1: 62.793 T2: 38.623
Time: 1219 T1: 60.859 T2: 37.979
Time: 1239 T1: 56.67 T2: 37.334
Time: 1259 T1: 52.803 T2: 36.367
Time: 1279 T1: 48.936 T2: 35.4
Time: 1299 T1: 46.647 T2: 34.756
Time: 1319 T1: 44.102 T2: 34.111
Time: 1339 T1: 42.104 T2: 33.467
Time: 1359 T1: 39.944 T2: 32.822
Time: 1379 T1: 38.301 T2: 32.178
Time: 1399 T1: 36.851 T2: 31.533
Time: 1419 T1: 35.723 T2: 30.921
Time: 1439 T1: 34.434 T2: 30.566
Time: 1459 T1: 33.467 T2: 29.89
Time: 1479 T1: 32.5 T2: 29.438
Time: 1499 T1: 31.855 T2: 28.987
Time: 1519 T1: 31.211 T2: 28.665
Time: 1539 T1: 30.566 T2: 28.311
Time: 1559 T1: 30.212 T2: 28.214
Time: 1579 T1: 29.857 T2: 27.988
Time: 1599 T1: 29.567 T2: 27.666
Time: 1619 T1: 29.277 T2: 27.666
TCLab version 1.0.0
Arduino Leonardo connected on port /dev/ttyACM0 at 115200 baud.
TCLab Firmware 2.0.1 Arduino Leonardo/Micro.
Time: 1199 T1: 46.035 

# LSTM

In [29]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from pathlib import Path
from IQL.src.lstm_policy import LSTMPolicy
from IQL.src.iql import LSTM_ImplicitQLearning
from IQL.src.value_functions import TwinQ, ValueFunction
from IQL.src.util import normalize, torchify, unnormalize

obs_dim = 4
act_dim = 2
hidden_dim = 256
n_hidden = 2
tau=0.8
beta=3.0
alpha=0.005
discount=0.99
max_steps=1000000
learning_rate=3e-4
act_scale=1.0
obs_scale=1.0
policy = LSTMPolicy(obs_dim, act_dim, hidden_dim, n_hidden).to(device)
iql = LSTM_ImplicitQLearning(
        qf=TwinQ(obs_dim, act_dim, hidden_dim, n_hidden).to(device),
        vf=ValueFunction(obs_dim, hidden_dim, n_hidden).to(device),
        policy=policy,
        optimizer_factory=lambda p: torch.optim.Adam(p, lr=learning_rate),
        max_steps=max_steps,
        tau=tau,
        beta=beta,
        alpha=alpha,
        discount=discount
    )


def last_value_hold(seq, seq_length):
    current_length = seq.size(0)
    if current_length >= seq_length:
        return seq.unsqueeze(0)  # (1, Seq, Feature)

    last_value = seq[-1].unsqueeze(0).repeat(seq_length - current_length, 1)

    padded_seq = torch.cat((seq,last_value), dim=0)
    return padded_seq 

In [30]:

model_path = Path("/home/minchanggi/code/TCLab/IQL/log/fixed_lab8_rs5_lr1e-3_sl5/05-13-25_15.33.04_tbvs/best.pt")
iql.load_state_dict(torch.load(model_path, map_location=device))

<All keys matched successfully>

In [41]:
IQL_MLP_offline_path=os.path.join(root_path,"IQL_MLP_LSTM_online_SIM")
all_datas = []
st_temp = 29.0
obs_mins = np.array([24.0, 25.0, 24.0, 25.0], dtype=np.float32)
obs_maxs = np.array([66.0, 65.0, 66.0, 65.0], dtype=np.float32)

# obs_mins = torch.tensor([23.0, 25.0, 23.0, 25.0], device=device)
# obs_maxs = torch.tensor([67.0, 65.0, 67.0, 65.0], device=device)
act_mins = torch.tensor([0.0, 0.0], device=device)
act_maxs = torch.tensor([100.0, 100.0], device=device)
seq_length= 5
for rl in rl_seed:
    path = os.path.join(IQL_MLP_offline_path, f"seed_{rl}")
    for epi_seed in epi_gen_seed:
        
        set_seed(epi_seed)
        Tsp1 = generate_random_tsp(max_episode_steps, 'TSP1')
        Tsp2 = generate_random_tsp(max_episode_steps, 'TSP2')
        set_seed(rl)
        from tclab import setup
        lab = setup(connected=False)
        env = lab(synced=False)
        
        
        env.Q1(0)
        env.Q2(0)

        tm = np.zeros(max_episode_steps)
        T1 = np.ones(max_episode_steps) * env.T1
        T2 = np.ones(max_episode_steps) * env.T2
        Q1 = np.zeros(max_episode_steps)
        Q2 = np.zeros(max_episode_steps)
        
        total_reward = 0.0
        dt_error = 0.0
        obs_sequence = []
        h=None
        for i in range(max_episode_steps):
            sim_time = i * sleep_max
            env.update(t=sim_time)
            tm[i] = sim_time


            T1[i] = env.T1
            T2[i] = env.T2

            obs = np.array([T1[i], Tsp1[i], T2[i], Tsp2[i]], dtype=np.float32) 
            
            # obs_tensor = normalize(
            #     obs,
            #     min_val=obs_mins,  
            #     max_val=obs_maxs,  
            #     scale=obs_scale,
            #     mode='zero_one'   
            # ).unsqueeze(0)   
            # obs_sequence.append(obs)
            
            obs_normalized = (obs - obs_mins) / (obs_maxs - obs_mins+ 1e-8)
            obs_sequence.append(obs_normalized)
            obs_tensor = torch.from_numpy(np.array(obs_sequence)).to(device).float()
            
            if len(obs_sequence) < seq_length:
                obs_tensor = last_value_hold(obs_tensor, seq_length=seq_length)
            else:
                obs_tensor = obs_tensor[-seq_length:]
                
            obs_tensor = obs_tensor.unsqueeze(0) 
            print(obs_tensor)
            with torch.no_grad():
                action,h = policy.act(obs_tensor,  hidden_state= h, deterministic=True)
                #print(action)
                action = action.cpu().numpy()
            #print(action)
            action = unnormalize(
                action,
                min_val=act_mins,
                max_val=act_maxs,
                scale=act_scale,
                mode='zero_one'
            )
            
            Q1[i], Q2[i] = action.squeeze(0)
            env.Q1(Q1[i])
            env.Q2(Q2[i])
            
            reward = -np.linalg.norm([T1[i] - Tsp1[i], T2[i] - Tsp2[i]])
            total_reward += reward
            
        final_path = os.path.join(path, f"{epi_seed}epi")
        data={
            "path": final_path,
            "tm": tm,
            "Q1": Q1,
            "Q2": Q2,
            "T1": T1,
            "T2": T2,
            "Tsp1": Tsp1,
            "Tsp2": Tsp2,
            "total_reward": total_reward,
        }
        all_datas.append(data)
        
        env.close()

all_rewards = [d["total_reward"] for d in all_datas]

# 평균 계산 및 출력
mean_reward = np.mean(all_rewards)
print("Average Total Reward:", mean_reward)
save_csv_png(all_datas, 0)


tensor([[[0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918]]], device='cuda:0')
tensor([[[0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918]]], device='cuda:0')
tensor([[[0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918]]], device='cuda:0')
tensor([[[0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6028, 0.1192, 0.8918],
         [0.1192, 0.6028, 0.1192, 0.8918]]], device='cuda:0')
tensor([[[0.1192, 0.6028, 0.1115, 0.8918],
         [0.1192, 0.6