In [2]:
from citylearn import  CityLearn, building_loader, auto_size
from energy_models import HeatPump, EnergyStorage, Building
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
np.random.seed(3)

import ray 
import ray.rllib.agents.ppo as ppo
from ray.tune.logger import pretty_print


import math
import random

import gym
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Normal


import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
#Use only one building for SINGLE AGENT environment, unmark multiple building IDs to simulate MULTI-AGENT environment. In the multi-agent environment
#the reward of each agent depend partially on the actions of the other agents or buildings (see reward_function.py)
building_ids = [8]#, 5, 9, 16, 21, 26, 33, 36, 49, 59]

In [4]:
'''
Building the RL environment with heating and cooling loads and weather files
CityLearn
    Weather file
    Buildings
        File with heating and cooling demands
        CoolingDevices (HeatPump)
        CoolingStorages (EnergyStorage)
'''

data_folder = Path("data/")

demand_file = data_folder / "AustinResidential_TH.csv"
weather_file = data_folder / 'Austin_Airp_TX-hour.csv'

heat_pump, heat_tank, cooling_tank = {}, {}, {}

#Ref: Assessment of energy efficiency in electric storage water heaters (2008 Energy and Buildings)
loss_factor = 0.19/24
buildings = []
for uid in building_ids:
    heat_pump[uid] = HeatPump(nominal_power = 9e12, eta_tech = 0.22, t_target_heating = 45, t_target_cooling = 10)
    heat_tank[uid] = EnergyStorage(capacity = 9e12, loss_coeff = loss_factor)
    cooling_tank[uid] = EnergyStorage(capacity = 9e12, loss_coeff = loss_factor)
    buildings.append(Building(uid, heating_storage = heat_tank[uid], cooling_storage = cooling_tank[uid], heating_device = heat_pump[uid], cooling_device = heat_pump[uid]))
    buildings[-1].state_space(np.array([1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 40.0, 1.001]), np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 17.0, -0.001]))
    buildings[-1].action_space(np.array([0.5]), np.array([-0.3]))
    
building_loader(demand_file, weather_file, buildings)  
auto_size(buildings, t_target_heating = 45, t_target_cooling = 10)

env = CityLearn(demand_file, weather_file, buildings = buildings, time_resolution = 1, simulation_period = (3500,6000))

[-3.00000000e-01 -2.00000000e-01 -1.00000000e-01  5.55111512e-17
  1.00000000e-01  2.00000000e-01  3.00000000e-01  4.00000000e-01]


In [48]:
from reward_function import reward_function
observations_space, actions_space = [],[]
for building in buildings:
    observations_space.append(building.observation_spaces)
    actions_space.append(building.action_spaces)

In [49]:
from reward_function import reward_function

In [50]:
alpha=0.9
w=np.ones(24+8)

def generate_state_action_matrix(state,action_shape):
    ns=len(state)
    x_s_a=np.zeros((action_shape,ns+action_shape))
    for i in range(action_shape):
        l=np.zeros(action_shape)
        l[i]=1
        x_s_a[i,:]=np.concatenate([np.array(state),l])
    return(x_s_a)
    
def get_q_values(x_s_a,w):
    na=x_s_a.shape[0]
    ns_na=x_s_a.shape[1]
    q_s_a=np.zeros(na)
    for i in range(na):
        #print(w.reshape(1,32).shape)
        #print(x_s_a[0,:].reshape(32,1).shape)
        q_s_a[i]=np.matmul(w.reshape(1,ns_na),x_s_a[i,:].reshape(ns_na,1))
        #print(q_s_a[i])
    return(q_s_a)

In [18]:
action_space=np.arange(-0.3,0.5,0.1)
action_space

array([-3.00000000e-01, -2.00000000e-01, -1.00000000e-01,  5.55111512e-17,
        1.00000000e-01,  2.00000000e-01,  3.00000000e-01,  4.00000000e-01])

In [43]:
#on policy sarsa
#on policy sarsa with epsilon greedy much worse than greedy
cost, cum_reward = {}, {}
gamma=0.9
alpha=0.1
na=8
for ep in range(100):
    q=[]
    states=[]
    state = env.reset()
    state=state[0][:24]
    states.append(state)
    done = False
    
    
    x_s_a=generate_state_action_matrix(state,8)
    q_s_a_s=get_q_values(x_s_a,w)
    action=action_space[np.argmax(q_s_a_s)]
    x=x_s_a[np.argmax(q_s_a_s)]
    q.append(np.max(q_s_a_s))
    while not done:
        next_state, reward, done, _ = env.step([[action]])
        reward = reward_function(reward)[0] 
        #print('rea',reward)
        state = next_state[0][:24]
        
        x_s_dash_a=generate_state_action_matrix(state,8)
        q_s_dash_a_s=get_q_values(x_s_dash_a,w)
        
        epsilon=np.random.rand(1)
        if epsilon<0.7:
            action_dash=action_space[np.argmax(q_s_dash_a_s)]
            q_s_dash=np.max(q_s_dash_a_s)
            x_=x_s_dash_a[np.argmax(q_s_dash_a_s)]
            
        else:
            ac=np.random.choice(na)
            action_dash=action_space[ac]
            q_s_dash=q_s_dash_a_s[ac]
            x_=x_s_dash_a[ac]
        #print(np.matmul(w,x))
        #print(np.matmul(w,x_s_dash_a[np.argmax(q_s_dash_a_s)]))
        q_s=q[-1]
        w=w+alpha*(reward+gamma*q_s_dash-q_s)*(gamma*x_-x)
        
        q.append(q_s_dash)
        
        action=action_dash
        x=x_
    cost[ep] = env.cost()
    print(cost[ep])
        
        
    
    

209.45633242157078
207.79708014301963
212.91057209964603
215.87820965597064
208.76994143248322
209.58828467767987
209.81606503279212
209.1824008933386
212.85672561944767
211.93615806702576
206.58280187793093
214.453297283236
209.3439791175629
213.2629431353172
210.7669777213256
214.19528807366063
209.66856026299118
211.043631403325
211.95450445279639
207.68568509317348
207.6896114752634
211.71039069141568
208.7884613345101
213.40713102300717
210.68526247574806
213.47169311329355
211.0302076925484
214.07172795661333
211.3828654175321
211.8865258929652
209.9543186365555
210.6980112989772
212.07330482066237
213.3428237207736
206.5225129033473
210.50196558405713
212.9327015333381
210.55848228730963
209.15268610190793
212.6062453858052
212.40819617232222
210.38517969933645
211.09370565000813
213.50970185771033
209.90619850002528
212.0414742040319
212.90959785587717
214.64663938882256
212.68493893342037
213.75136350809328
209.5415986646015
212.0320942128574
210.84236534610437
209.88723223765

In [53]:
#sarsa greedy
cost, cum_reward = {}, {}
gamma=0.9
alpha=0.1
na=8
w=np.ones(32)
for ep in range(100):
    q=[]
    states=[]
    state = env.reset()
    state=state[0][:24]
    states.append(state)
    done = False
    
    
    x_s_a=generate_state_action_matrix(state,8)
    q_s_a_s=get_q_values(x_s_a,w)
    action=action_space[np.argmax(q_s_a_s)]
    x=x_s_a[np.argmax(q_s_a_s)]
    q.append(np.max(q_s_a_s))
    while not done:
        next_state, reward, done, _ = env.step([[action]])
        reward = reward_function(reward)[0] 
        #print('rea',reward)
        state = next_state[0][:24]
        
        x_s_dash_a=generate_state_action_matrix(state,8)
        q_s_dash_a_s=get_q_values(x_s_dash_a,w)
        
        epsilon=np.random.rand(1)
        #if epsilon<0.7:
        action_dash=action_space[np.argmax(q_s_dash_a_s)]
        q_s_dash=np.max(q_s_dash_a_s)
        x_=x_s_dash_a[np.argmax(q_s_dash_a_s)]
            
#         else:
#             ac=np.random.choice(na)
#             action_dash=action_space[ac]
#             q_s_dash=q_s_dash_a_s[ac]
#             x_=x_s_dash_a[ac]
        #print(np.matmul(w,x))
        #print(np.matmul(w,x_s_dash_a[np.argmax(q_s_dash_a_s)]))
        q_s=q[-1]
        w=w+alpha*(reward+gamma*q_s_dash-q_s)*(gamma*x_-x)
        
        q.append(q_s_dash)
        
        action=action_dash
        x=x_
    cost[ep] = env.cost()
    print(cost[ep])
        
        
    
    

180.40900847061246
180.53393503856896
180.66307259918216
180.52080049387337
180.49528783022978
180.47274440159893
180.434599219371
180.5257305804471
180.52994307696972
180.62521659615845
180.52080049387337
180.49528783022978
180.47274440159893
180.434599219371
180.5257305804471
180.52994307696972
180.62521659615845
180.52080049387337
180.49528783022978
180.47274440159893
180.434599219371
180.5257305804471




180.3170853281487
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.25737295269303
180.257372952

In [42]:
#Qlearning
cost, cum_reward = {}, {}
gamma=0.9
alpha=0.1
na=8
for ep in range(100):
    q=[]
    states=[]
    state = env.reset()
    state=state[0][:24]
    states.append(state)
    done = False
    
    
    x_s_a=generate_state_action_matrix(state,8)
    q_s_a_s=get_q_values(x_s_a,w)
    action=action_space[np.argmax(q_s_a_s)]
    x=x_s_a[np.argmax(q_s_a_s)]
    q.append(np.max(q_s_a_s))
    while not done:
        next_state, reward, done, _ = env.step([[action]])
        reward = reward_function(reward)[0] 
        state = next_state[0][:24]
        
        x_s_dash_a=generate_state_action_matrix(state,8)
        q_s_dash_a_s=get_q_values(x_s_dash_a,w)
        
        epsilon=np.random.rand(1)
        if epsilon<0.7:
            action_dash=action_space[np.argmax(q_s_dash_a_s)]
            
        else:
            ac=np.random.choice(na)
            action_dash=action_space[ac]
        p=np.argmax(q_s_dash_a_s)    
        q_s_dash=np.max(q_s_dash_a_s)
        x_=x_s_dash_a[p]
        #print(np.matmul(w,x))
        #print(np.matmul(w,x_s_dash_a[np.argmax(q_s_dash_a_s)]))
        q_s=q[-1]
        w=w+alpha*(reward+gamma*q_s_dash-q_s)*(gamma*x_-x)
        
        q.append(q_s_dash)
        
        action=action_dash
        x=x_
    cost[ep] = env.cost()
    print(cost[ep])
        
        
    
    

195.7785537611777
195.87893517418928
197.31352797014839
196.25813051958207
194.84779475397622
195.47935961556
195.41635263119434
195.02502920559255
195.05472343671522
196.06723487103153
196.60921795038834
197.0381282875192
195.70548139389857
194.91254911564752
195.37059015725123
195.2383221258765
195.50643750011403
196.08142165924005
194.36577593074261
195.702408206766
196.1813634084208
195.10260042612197




204.4504002066826
212.21939707456477
206.05481775015213
210.26302373255905
210.8358306629208
210.1672014531316
210.27405438587272
214.1022604052116
214.82426070308873
211.6755645439997
212.8408300352563
211.4525275681009
214.7246346057107
212.6084896541909
214.53243791277663
210.5907491950985
213.13451297009874
211.61213750362012
212.26357596786434
212.31616898825652
211.03088113231914
210.27595003849373
211.080874502206
210.3424342078972
210.4376143255399
213.20713861923196
210.32883585368074
211.19812658735157
213.60847667326493
214.83895312158296
209.79274954147462
212.18296447750265
212.2783337516782
214.83981473012022
209.6105243787246
209.77321862969552
208.0574583606622
216.65203808574105
213.57866693705733
212.55831316553153
211.96614624194987
206.57469309570416
211.62979072025746
209.61803635310238
209.3438867415649
213.35301851933067
211.4229131778351
209.05881392804207
212.39510880384825
208.1255381894592
210.30904634521949
212.6404526782735
213.31702230508157
213.4594617189