# DQN Training Notebook

**Description**: This notebook implements and trains a Deep Q-Network (DQN) agent to optimize energy storage arbitrage using the `BidSimulator`.

**Steps**:
1. Load and preprocess the CAISO price data.
2. Initialize the `BidSimulator` with the data.
3. Define the Deep Q-Network (DQN) and replay buffer.
4. Train the DQN agent.
5. Evaluate the trained agent's performance.

In [1]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import pandas as pd

from src.BidSimulator import BidSimulator
from src.dqn.QFunction import Qfunction
from src.dqn.ReplayBuffer import ReplayBuffer

In [2]:
# load day-ahead and real-time prices
DAP = pd.read_csv("./data/CAISO_DAP.csv")
RTP = pd.read_csv("./data/CAISO_RTP.csv")

# read datetime and drop duplicate data
RTP["Date"] = pd.to_datetime(RTP["Date"], format="%m/%d/%Y %I:%M:%S %p")
RTP = RTP.drop_duplicates(subset=["Date", "hub"])
DAP["Date"] = pd.to_datetime(DAP["Date"], format="%m/%d/%Y %I:%M:%S %p")
DAP = DAP.drop_duplicates(subset=["Date", "zone"])

# pivot data
DAP_pivoted = DAP.pivot(index="Date", columns="zone", values="price")
RTP_pivoted = RTP.pivot(index="Date", columns="hub", values="price")

# rename 'Date' column to 'ts'
DAP_pivoted.index.names = ["ts"]
RTP_pivoted.index.names = ["ts"]

# merge dataframes on index
CAISO_PRICES = pd.merge(DAP_pivoted, RTP_pivoted, on=["ts"], how="outer")
CAISO_PRICES = CAISO_PRICES.ffill().reset_index()

In [3]:
# form datasets
PGAE_NP15 = CAISO_PRICES[["ts", "PGAE", "TH_NP15"]].rename(
    columns={"PGAE": "dap", "TH_NP15": "rtp"}
)
PGAE_ZP26 = CAISO_PRICES[["ts", "PGAE", "TH_ZP26"]].rename(
    columns={"PGAE": "dap", "TH_ZP26": "rtp"}
)
SCE_SP15 = CAISO_PRICES[["ts", "SCE", "TH_SP15"]].rename(
    columns={"SCE": "dap", "TH_SP15": "rtp"}
)
SDGE_SP15 = CAISO_PRICES[["ts", "SDGE", "TH_SP15"]].rename(
    columns={"SDGE": "dap", "TH_SP15": "rtp"}
)

print(len(PGAE_NP15))

144457


In [8]:
bidder = BidSimulator(
    data=PGAE_NP15,
    lookback_periods=12 * 24 * 5,  # start after first 5 days
    end=10000,
    eff=0.8,
    discharge_cost=0.0,
    initial_soc=0.5,
    capacity=8.0,
    power_max=2.0,
    resting_draw=0.01,
    timestep=(5.0 / 60),
    params=None,
)

print(bidder)

Bid Summary
	SOC: 0.5
	Profit: 0
	Timestamp: 1440/144457
	Capacity: 8.0
	Maximum Power: 2.0
	Efficiency: 0.8
	Discharge Cost: 0.0
	Resting Draw: 0.01


In [5]:
# Hyperparameters
lr = 1e-3  # learning rate for gradient update
batchsize = 64  # batch size for buffer sampling
maxlength = 1000  # max number of tuples held by buffer
tau = 100  # time steps for target update
episodes = 300  # number of episodes to run
initialsize = 500  # initial time steps before starting training
epsilon = 1.0  # starting exploration rate
epsilon_decay = 0.995  # decay factor for exploration rate
epsilon_min = 0.01  # minimum exploration rate
gamma = 0.99  # discount factor

In [6]:
def run_target_update(Qprincipal, Qtarget):
    for v,v_ in zip(Qprincipal.model.parameters(), Qtarget.model.parameters()):
        v_.data.copy_(v.data)

In [22]:
bidder.reset_simulation()
# Initialize Q-function networks (principal and target)
obssize = 2  # state representation: price and SOC
actsize = 3  # action representation: charge, hold, discharge
Qprincipal = Qfunction(obssize, actsize, lr)
Qtarget = Qfunction(obssize, actsize, lr)

# Initialize replay buffer
buffer = ReplayBuffer(maxlength)

# Main training loop
rrecord = []  # to keep track of rewards per episode
totalstep = 0  # total steps across episodes

for episode in range(episodes):
    # Reset the simulator and get initial state
    bidder.reset_simulation()
    soc, running_profit, done, lookback = bidder.get_state()
    state = [lookback.rtp.iloc[-1], soc]  # Initial state (price, SOC)

    # Calculate bid threshold and tolerance
    bid_threshold = lookback.rtp.median()
    tolerance = 4  # Tolerance value for price deviation
    done = False
    rsum = 0  # cumulative reward for the current episode

    while not done:
        # Determine action based on epsilon-greedy policy
        if np.random.rand() < max(epsilon_min, epsilon):
            action = np.random.choice(actsize)  # Random action (exploration)
        else:
            action = Qprincipal.compute_argmaxQ(np.expand_dims(state, axis=0))  # Greedy action (exploitation)

        # Step in the environment
        next_soc, reward, _, done, lookback = bidder.step(bid)
        next_state = [lookback.rtp.iloc[-1], next_soc]

        # Store experience in replay buffer
        buffer.append((state, action, reward, next_state))

        # Train Qprincipal using experiences from the replay buffer
        if totalstep > initialsize and len(buffer) >= batchsize:
            # Sample a minibatch from buffer
            minibatch = buffer.sample(batchsize)
            states, actions, rewards, next_states = zip(*minibatch)

            # Compute target values using Qtarget
            max_next_q_values = Qtarget.compute_maxQvalues(next_states)
            targets = [r + gamma * max_next_q for r, max_next_q in zip(rewards, max_next_q_values)]

            # Train Qprincipal on the targets
            Qprincipal.train(states, actions, targets)

        # Update target network every `tau` steps
        if totalstep % tau == 0:
            run_target_update(Qprincipal, Qtarget)

        # Update state and cumulative reward
        state = next_state
        rsum += reward
        totalstep += 1

    # Decay epsilon after each episode
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    # Record the total reward for this episode
    rrecord.append(rsum)
    print(episode)
    # Print progress every 10 episodes
    print(f"Episode {episode}/{episodes}, Total Reward: {rsum}, Epsilon: {epsilon:.3f}")

power:  1.0
self.soc_hist[-1]:  0.5
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  8.0
0.5082291666666666
power:  1.0
self.soc_hist[-1]:  0.5082291666666666
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  8.0
0.5164583333333332
power:  1.0
self.soc_hist[-1]:  0.5164583333333332
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  8.0
0.5246874999999999
power:  1.0
self.soc_hist[-1]:  0.5246874999999999
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  8.0
0.5329166666666665
power:  1.0
self.soc_hist[-1]:  0.5329166666666665
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  8.0
0.5411458333333331
power:  1.0
self.soc_hist[-1]:  0.5411458333333331
corrected_power:  0.8
self.resting_draw:  0.01
self.timestep:  0.08333333333333333
self.capacity:  

AssertionError: 