# Implementing Soft-Actor-Critic from original paper (2018)  
(https://arxiv.org/pdf/1801.01290.pdf)
### References: 
OpenAI Gym <br/>
https://github.com/vaishak2future/sac/blob/master/sac.ipynb <br/>
https://towardsdatascience.com/soft-actor-critic-demystified-b8427df61665

In [134]:
import simulation
import sys
import random
import pandas as pd
import importlib
import numpy as np
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal
import matplotlib.pyplot as plt

## Using Vaishak Kumar's implementation of version 1

In [155]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0
    
    def push(self, state, action, reward, next_state, done):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state, done)
        self.position = (self.position + 1) % self.capacity
    
    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done
    
    def __len__(self):
        return len(self.buffer)

In [156]:
class Space():
    def __init__(self, low, high, length, dtype):
        self.shape = (length, 1)
        self.dtype = np.dtype(dtype)
        self.low = low
        self.high = high

class ActionSpace(Space):
    def __init__(self, low = 0, high = 100, length=24, dtype= np.float32):
        super().__init__(low, high, length, dtype)
    
    def sample(self):
        return np.random.uniform(self.low,self.high,self.shape)

class StateSpace(Space):
    def __init__(self, low = 0, high = 100, length=24, dtype= np.float32):
        super().__init__(low, high, length, dtype)

In [157]:
class NormalizedActions(ActionSpace):
    def __init__(self):
        self.action_space = ActionSpace()
        self.state_space = StateSpace()
        
    def _action(self, action):
        """
        Input: Action- a np_array of length= self.length (self refers to NormalizedActions)
        Returns: "normalized action"
        """
        low  = np.ones(self.length) * self.low
        high = np.ones(self.length) * self.high
        action = low + (action + 1.0) * 0.5 * (high - low)
        action = np.clip(action, low, high)
        
        return action

    def _reverse_action(self, action):
        low  = self.low
        high = self.high
        action_prime = []
        for x in np.nditer(action):
            x_prime = 2*(x-low) / (high-low) - 1
            action_prime.append(x_prime)
        action_prime = np.array(action_prime)
        action_prime = np.clip(action_prime, low, high)
        
        return action_prime
    
    def step(self, office, price, controllers_points):
        end = False

        energy_dict = {}
        rewards_dict = {}
        for player_name in office.players_dict:
            player = office.players_dict.get(player_name)
            player_energy = player.threshold_exp_response(controllers_points)
            last_player_energy = player_energy
            energy_dict[player_name] = player_energy
            
            player_min_demand = player.get_min_demand()
            player_max_demand = player.get_max_demand()
            player_reward = simulation.Reward(player_energy, price, player_min_demand, player_max_demand)
            player_ideal_demands = player_reward.ideal_use_calculation()
            last_player_ideal = player_ideal_demands
            reward = player_reward.scaled_cost_distance_neg(player_ideal_demands)
            rewards_dict[player_name] = reward
        
        total_reward = sum(rewards_dict.values())
        office._timestep = office._timestep + office._time_interval
        
        if office._timestep>office._end_timestamp:
            office._timestep = office._start_timestamp
            
#         if office.current_iter >= office.num_iters:
#             end = True
#         office.current_iter += 1
        
        diff = (office.get_timestep()-office._start_timestamp)
        next_price = office.price_signal(diff.days + 1)
        
        return next_price, total_reward, end

In [158]:
class ValueNetwork(nn.Module):
    def __init__(self, state_dim, hidden_dim, init_w=3e-3):
        super(ValueNetwork, self).__init__()
        
        self.linear1 = nn.Linear(state_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear3 = nn.Linear(hidden_dim, 1)
        
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)
        
    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x
        
        
class SoftQNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3):
        super(SoftQNetwork, self).__init__()
        
        self.linear1 = nn.Linear(num_inputs + num_actions, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        self.linear3 = nn.Linear(hidden_size, 1)
        
        self.linear3.weight.data.uniform_(-init_w, init_w)
        self.linear3.bias.data.uniform_(-init_w, init_w)
        
    def forward(self, state, action):
        x = torch.cat([state, action], 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)
        return x
        
        
class PolicyNetwork(nn.Module):
    def __init__(self, num_inputs, num_actions, hidden_size, init_w=3e-3, log_std_min=-20, log_std_max=2):
        super(PolicyNetwork, self).__init__()
        
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max
        
        self.linear1 = nn.Linear(num_inputs, hidden_size)
        self.linear2 = nn.Linear(hidden_size, hidden_size)
        
        self.mean_linear = nn.Linear(hidden_size, num_actions)
        self.mean_linear.weight.data.uniform_(-init_w, init_w)
        self.mean_linear.bias.data.uniform_(-init_w, init_w)
        
        self.log_std_linear = nn.Linear(hidden_size, num_actions)
        self.log_std_linear.weight.data.uniform_(-init_w, init_w)
        self.log_std_linear.bias.data.uniform_(-init_w, init_w)
        
    def forward(self, state):
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        
        mean    = self.mean_linear(x)
        log_std = self.log_std_linear(x)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
        
        return mean, log_std
    
    def evaluate(self, state, epsilon=1e-6):
        mean, log_std = self.forward(state)
        std = log_std.exp()
        
        normal = Normal(0, 1)
        #z      = normal.sample()
        z = torch.empty(24).normal_(mean=0,std=1)
        action = torch.tanh(mean+ std*z) #.to(device))
#       log_prob = Normal(mean, std).log_prob(mean+ std*z.to(device)) - torch.log(1 - action.pow(2) + epsilon)
        log_prob = Normal(mean, std).log_prob(mean+ std*z) - torch.log(1 - action.pow(2) + epsilon)
        return action, log_prob, z, mean, log_std
        
    
    def get_points(self, state):
        state = torch.FloatTensor(state).unsqueeze(0) #.to(device)
        mean, log_std = self.forward(state)
        std = log_std.exp()
        
#         normal = Normal(0, 1)
#         z      = normal.sample() #.to(device)
        z = torch.empty(24).normal_(mean=0,std=1)
        action = torch.tanh(mean + std*z)
        
        action  = action.cpu()#.detach().cpu().numpy()
        return action[0]

In [159]:
def update(batch_size,gamma=0.99,soft_tau=1e-2,):
    
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state      = torch.FloatTensor(state) #.to(device)
    next_state = torch.FloatTensor(next_state) #.to(device)
    action     = torch.FloatTensor(action) #.to(device)
    reward     = torch.FloatTensor(reward).unsqueeze(1) #.to(device)
    done       = torch.FloatTensor(np.float32(done)).unsqueeze(1) #.to(device)

    predicted_q_value1 = soft_q_net1(state, action)
    predicted_q_value2 = soft_q_net2(state, action)
    predicted_value    = value_net(state)
    new_action, log_prob, epsilon, mean, log_std = policy_net.evaluate(state)

    
    
# Training Q Function
    target_value = target_value_net(next_state)
    target_q_value = reward + (1 - done) * gamma * target_value
    q_value_loss1 = soft_q_criterion1(predicted_q_value1, target_q_value.detach())
    q_value_loss2 = soft_q_criterion2(predicted_q_value2, target_q_value.detach())


    soft_q_optimizer1.zero_grad()
    q_value_loss1.backward()
    soft_q_optimizer1.step()
    soft_q_optimizer2.zero_grad()
    q_value_loss2.backward()
    soft_q_optimizer2.step()    
# Training Value Function
    predicted_new_q_value = torch.min(soft_q_net1(state, new_action),soft_q_net2(state, new_action))
    target_value_func = predicted_new_q_value - log_prob
    value_loss = value_criterion(predicted_value, target_value_func.detach())

    
    value_optimizer.zero_grad()
    value_loss.backward()
    value_optimizer.step()
# Training Policy Function
    policy_loss = (log_prob - predicted_new_q_value).mean()

    policy_optimizer.zero_grad()
    policy_loss.backward()
    policy_optimizer.step()
    
    
    for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
        target_param.data.copy_(
            target_param.data * (1.0 - soft_tau) + param.data * soft_tau
        )

In [160]:
env = NormalizedActions()

action_dim = env.action_space.shape[0]
state_dim  = env.state_space.shape[0]
hidden_dim = 32

value_net        = ValueNetwork(state_dim, hidden_dim) #.to(device)
target_value_net = ValueNetwork(state_dim, hidden_dim) #.to(device)

soft_q_net1 = SoftQNetwork(state_dim, action_dim, hidden_dim) #.to(device)
soft_q_net2 = SoftQNetwork(state_dim, action_dim, hidden_dim) #.to(device)
policy_net = PolicyNetwork(state_dim, action_dim, hidden_dim) #.to(device)

for target_param, param in zip(target_value_net.parameters(), value_net.parameters()):
    target_param.data.copy_(param.data)
    

value_criterion  = nn.MSELoss()
soft_q_criterion1 = nn.MSELoss()
soft_q_criterion2 = nn.MSELoss()

value_lr  = 3e-4
soft_q_lr = 3e-4
policy_lr = 3e-4

value_optimizer  = optim.Adam(value_net.parameters(), lr=value_lr)
soft_q_optimizer1 = optim.Adam(soft_q_net1.parameters(), lr=soft_q_lr)
soft_q_optimizer2 = optim.Adam(soft_q_net2.parameters(), lr=soft_q_lr)
policy_optimizer = optim.Adam(policy_net.parameters(), lr=policy_lr)


replay_buffer_size = 100
replay_buffer = ReplayBuffer(replay_buffer_size)


In [161]:
max_steps = 1000
batch_size = 5 #idk why need to check
office = simulation.Office()
diff = (office.get_timestep()-office._start_timestamp)
state = office.price_signal(diff.days + 1)
rewards = []
for step in range(max_steps):
    print(step)
    if step >110:
        action = policy_net.get_points(state).detach()
        next_state, reward, done = env.step(office, state, action.numpy())
    else:
        action = env.action_space.sample()
        next_state, reward, done = env.step(office, state, action) 
        #office.step -> returns reward, action, kwh, end
            # State = price for today and energy used yesterday(?)
            # need to add price to state
        
        
    replay_buffer.push(state, action, reward, next_state, done)
        
    state = next_state
        
    if len(replay_buffer) > batch_size:
            update(batch_size)
        
    if done:
        break
        
    rewards.append(reward)

rewards = [r[0] if r is np.ndarray else r for r in rewards]
plt.plot(np.array(range(max_steps)),rewards)

#Need to fix update batch issue

creating agents
creating controller
0
1
2
3
4
5


RuntimeError: invalid argument 0: Tensors must have same number of dimensions: got 2 and 3 at ../aten/src/TH/generic/THTensor.cpp:680