# Reinforcement Learning for the Unit Commitment Problem

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

from collections import deque

In [None]:
from ppo.main import Agent as PPO_Agent
from ppo.buffer import ReplayBuffer
from environment.main import Environment

## Set Environment

In [None]:
# Create the project environment (environment.main.Environment) with its
# default constructor arguments. The training loop below reads
# env.state_size / env.action_size and drives it via env.reset() / env.step().
env = Environment()

## Set Buffer

In [None]:
# Temporary store for transitions collected during training.
# NOTE(review): capacity=240 presumably corresponds to 10 generators x 24
# hours -- confirm against ppo.buffer.ReplayBuffer.
buffer = ReplayBuffer(capacity=240)

## Set Agent

In [None]:
# Build the PPO agent from the environment's state and action sizes.
# NOTE(review): the third positional argument ('agent') looks like a name or
# identifier for the agent -- verify against ppo.main.Agent.
agent = PPO_Agent(env.state_size, env.action_size, 'agent')

## Train The Agent

In [None]:
# Training configuration consumed by the training loop.
max_time_step = 24    # hours stepped through in each episode
num_generator = 10    # generators decided one after another within each hour
num_episodes = 5_000  # number of training episodes

In [None]:
# Train the agent for `num_episodes` episodes.
#
# Within every hour of an episode each generator's on/off decision is taken
# in turn and the resulting transition is parked in `buffer`. Once the last
# generator of the hour has acted, the hour's transitions are relabelled with
# a shaped reward (derived from that last step's reward) and handed to the
# agent via `agent.memorize`; the buffer is then emptied for the next hour.
total_rewards = []

# Derive the "last index" markers from the configured sizes rather than
# hard-coding 9 and 23, so that changing num_generator / max_time_step keeps
# the end-of-hour and end-of-episode flags correct.
last_gen_index = num_generator - 1
last_time_step = max_time_step - 1

for i_episode in range(1, num_episodes + 1):

    env.reset()
    # NOTE(review): the return value of env.reset() is ignored and every
    # episode starts from an all-zero state -- confirm this matches the
    # environment's real initial observation.
    state = np.zeros(env.state_size)

    for time_step in range(max_time_step):

        hour = 'Hour-' + str(time_step)
        # The episode terminates with the final hour; this is the same for
        # every generator in the hour, so compute it once here.
        done = time_step == last_time_step

        for i_gen in range(num_generator):

            gen = 'gen-' + str(i_gen)

            # Use the index of the largest entry of the agent's output as the
            # discrete action written into the on/off schedule.
            action = agent.act(state)
            action = np.argmax(action)

            env.net.net.res_on_off_schedule.loc[hour, gen] = action
            next_state, reward = env.step(gen, hour, action)

            gen_end = i_gen == last_gen_index

            buffer.add(state, action, reward, next_state, done)

            if gen_end:
                schedule = env.net.net.res_use_schedule.loc[hour, :].values
                on_off = env.net.net.res_on_off_schedule.loc[hour, :].values

                # The buffer is emptied after every hour, so entry `index`
                # lines up with position `index` of this hour's schedule rows.
                # Each stored reward is discarded and replaced by a shaped
                # version of the reward returned by the hour's last step.
                for index, (sample_state, sample_action, _, sample_next_state, sample_done) in enumerate(buffer.memory):

                    if schedule[index] == 1 and on_off[index] == 1:
                        # Switched on and flagged in res_use_schedule:
                        # full reward.
                        shaped_reward = reward

                    elif schedule[index] == 0 and on_off[index] == 1:
                        # Switched on but not flagged in res_use_schedule:
                        # sign-flipped, scaled reward.
                        shaped_reward = -reward * 0.8

                    else:
                        # Every other combination: heavily damped reward.
                        shaped_reward = reward * 0.01

                    agent.memorize(sample_state, sample_action, shaped_reward, sample_next_state, sample_done)

                # Start the next hour with an empty buffer.
                # NOTE(review): maxlen=96 differs from the capacity=240 the
                # buffer was created with -- confirm which one is intended.
                buffer.memory = deque(maxlen=96)

            state = next_state

    # NOTE(review): this records only the reward of the episode's final step,
    # although it is labelled "Total Reward" -- confirm whether a sum over the
    # episode is intended.
    total_rewards.append(reward)
    print(f'Episode {i_episode}, Total Reward: {reward:.4f}')

print('Training done.')

## Evaluate The Agent

In [None]:
# Plot the reward recorded for each training episode, save the figure to
# disk, then display it.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(total_rewards, color='green')
ax.set_xlabel('Number of Episodes')
ax.set_ylabel('Total Rewards')
fig.savefig('./images/plot_PPO_total_rewards.png')
plt.show()

In [None]:
# Print the result tables held on the environment's network object after
# training, one labelled section per table.
results = env.net.net

print('--- UNIT COMMITMENT REPORT')

report_sections = (
    ('\nUC Hourly Schedule:', results.res_on_off_schedule),
    ('\nSimulation Result:', results.res_generation),
    ('\nTotal Generation (MW):', results.res_generation.sum(axis=1)),
    ('\nFuel Cost:', results.res_cost),
    ('\nPenalty:', results.res_reward),
)
for title, table in report_sections:
    print(title, table)

---