In [0]:
import gym
import numpy as np
import torch as t
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [0]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = t.device('cuda') if t.cuda.is_available() else t.device('cpu')

class Actor(nn.Module):
  """Fully connected network mapping a state vector to `dim_action` raw outputs.

  The output is not passed through any activation, so it can be used as
    1. a DDPG action vector for further processing, or
    2. scores over discrete actions for a subsequent softmax.

  Layer widths follow a halving schedule: dim_state, dim_state // 2, ...
  (`num_layers` widths in total). Each consecutive pair of widths becomes a
  Linear+ReLU block, and a final Linear layer maps the last width to
  `dim_action`.

  Args:
    dim_state: size of the input state vector.
    dim_action: size of the output vector.
    alpha: learning rate of the Adam optimizer owned by this module.
    checkpoint: file path used by saveChk / loadChk.
    num_layers: number of widths in the halving schedule (must be >= 1);
      yields num_layers - 1 hidden Linear+ReLU blocks.

  Raises:
    ValueError: if num_layers is smaller than 1.
  """

  def __init__(self, dim_state, dim_action, alpha, checkpoint='actor.pt', num_layers=2):
    super(Actor, self).__init__()
    if num_layers < 1:
      # An empty width schedule would otherwise fail later with an opaque IndexError.
      raise ValueError('num_layers must be >= 1, got {}'.format(num_layers))
    self.alpha = alpha
    self.checkpoint = checkpoint

    widths = [dim_state // 2**i for i in range(num_layers)]
    blocks = [self.block(i, j) for i, j in zip(widths[:-1], widths[1:])]
    self.total_mc = nn.Sequential(*blocks)
    self.linear = nn.Linear(widths[-1], dim_action)
    # Move the parameters to the target device first, then build the optimizer
    # over them (the ordering recommended for torch optimizers).
    self.to(device)
    self.optimizer = optim.Adam(self.parameters(), lr=alpha, weight_decay=1e-4)

  def saveChk(self):
    """Write this module's state_dict to `self.checkpoint`."""
    print('save model...')
    t.save(self.state_dict(), self.checkpoint)

  def loadChk(self):
    """Load the state_dict stored at `self.checkpoint` into this module."""
    print('load model')
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
    self.load_state_dict(t.load(self.checkpoint, map_location=device))

  def block(self, dim_in, dim_out):
    """Return one hidden block: Linear(dim_in, dim_out) followed by ReLU."""
    return nn.Sequential(
      nn.Linear(dim_in, dim_out),
      nn.ReLU()
    )

  def forward(self, x):
    """Return the unactivated network output for input tensor `x`."""
    output = x.to(device)

    output = self.total_mc(output)
    output = self.linear(output)
    return output
        

In [0]:
class Critic(nn.Module):
  """Fully connected network producing one scalar per (state, action) pair.

  State and action are concatenated along dim=1, so both inputs must be
  batched 2-D tensors. Layer widths follow a halving schedule starting at
  dim_state + dim_action (`num_layers` widths in total); each consecutive pair
  of widths becomes a Linear+ReLU block, and a final Linear layer maps the
  last width to a single output.

  Args:
    dim_state: size of the state vector.
    dim_action: size of the action vector.
    alpha: learning rate of the Adam optimizer owned by this module.
    checkpoint: file path used by saveChk / loadChk.
    num_layers: number of widths in the halving schedule (must be >= 1);
      yields num_layers - 1 hidden Linear+ReLU blocks.

  Raises:
    ValueError: if num_layers is smaller than 1.
  """

  def __init__(self, dim_state, dim_action, alpha, checkpoint='critic.pt',num_layers=2):
    super(Critic, self).__init__()
    if num_layers < 1:
      # An empty width schedule would otherwise fail later with an opaque IndexError.
      raise ValueError('num_layers must be >= 1, got {}'.format(num_layers))
    self.checkpoint = checkpoint
    self.alpha = alpha

    widths = [(dim_state + dim_action) // 2**i for i in range(num_layers)]
    total_blocks = [self.blocks(i, j) for i, j in zip(widths[:-1], widths[1:])]

    self.total_fc = nn.Sequential(*total_blocks)
    self.linear = nn.Linear(widths[-1], 1)
    # Move the parameters to the target device first, then build the optimizer
    # over them (the ordering recommended for torch optimizers).
    self.to(device)
    self.optimizer = optim.Adam(self.parameters(), lr=alpha, weight_decay=1e-4)

  def blocks(self, dim_in, dim_out):
    """Return one hidden block: Linear(dim_in, dim_out) followed by ReLU."""
    return nn.Sequential(
      nn.Linear(dim_in, dim_out),
      nn.ReLU()
    )

  def saveChk(self):
    """Write this module's state_dict to `self.checkpoint`."""
    print('save model...')
    t.save(self.state_dict(), self.checkpoint)

  def loadChk(self):
    """Load the state_dict stored at `self.checkpoint` into this module."""
    print('load model...')
    # map_location lets a checkpoint saved on a GPU be restored on a CPU-only host.
    self.load_state_dict(t.load(self.checkpoint, map_location=device))

  def forward(self, state, action):
    """Return the network output for batched `state` and `action` tensors."""
    state = state.to(device)
    action = action.to(device)

    output = self.total_fc(t.cat((state, action), dim=1))
    output = self.linear(output)
    return output
#critic = Critic(50, 50, 1e-4, checkpoint='critic.pt',num_layers=2)

In [0]:
#Simulator class
class RecModel(nn.Module):
  """Simulator network: halving-width MLP ending in a single sigmoid output.

  `layers` widths are generated as dim_input, dim_input // 2, ...; each
  consecutive pair of widths becomes a Linear+ReLU block, and a final Linear
  layer maps the last width to one value that is squashed with a sigmoid.
  """

  def __init__(self, dim_input, layers):
    super(RecModel, self).__init__()

    widths = [dim_input // 2**depth for depth in range(layers)]
    print('layers: ', widths)

    hidden = []
    for width_in, width_out in zip(widths[:-1], widths[1:]):
      hidden.append(self.block(width_in, width_out))
    self.total_mc = nn.Sequential(*hidden)
    self.linear = nn.Linear(widths[-1], 1)
    self.to(device)

  def block(self, dim_in, dim_out):
    """Return one hidden block: Linear(dim_in, dim_out) followed by ReLU."""
    linear = nn.Linear(dim_in, dim_out)
    activation = nn.ReLU()
    return nn.Sequential(linear, activation)

  def forward(self, x):
    """Return the sigmoid of the network output for input tensor `x`."""
    # Inputs built from NumPy arrays arrive as double tensors; cast to float
    # so they match the dtype of the layer weights.
    features = x.to(device).float()
    hidden = self.total_mc(features)
    return t.sigmoid(self.linear(hidden))


In [0]:
# Build an environment compatible with RLlib
import gym
from gym.spaces import Box, Discrete

class Env(gym.Env):
  """Gym environment whose transitions are decided by a learned simulator.

  Required setup after construction and before the first reset():
    env.getStateActions(states, actions)
    env.getSimulator(simulator)

  Each reset() starts an episode from the next entry of `states`, wrapping
  around once every entry has been used. Each step() feeds the concatenated
  (state, action) vector to the simulator: an output >= 0.5 gives reward 1 and
  moves the state to state + action; otherwise the reward is 0 and the state
  is unchanged. An episode ends after MAX_STEPS steps.
  """

  # Episode horizon (number of steps after which `done` becomes True).
  MAX_STEPS = 1000

  def __init__(self, cofig=None):
    """Create the spaces and reset the bookkeeping counters.

    Args:
      cofig: environment config object (original parameter spelling kept so
        existing callers are unaffected); it is not read by this class.
    """
    # Two required class members
    high = 1000 * np.ones(50, dtype=np.float32)
    low = -high

    self.action_space = Box(low=low, high=high, dtype=np.float32)
    self.observation_space = Box(low=low, high=high, dtype=np.float32)

    # Filled in by getStateActions / getSimulator.
    self.states = None
    self.actions = None
    self.simulator = None

    self.state = None
    self.n = 0  # index of the next start state handed out by reset()
    self.t = 0  # number of steps taken in the current episode

  def getStateActions(self, states, actions):
    """Store the start states (used by reset) and the associated actions."""
    self.states = states
    self.actions = actions

  def getSimulator(self, simulator):
    """Store the callable that scores a concatenated (state, action) tensor."""
    self.simulator = simulator

  def reset(self):
    """Start a new episode from the next stored state and return that state.

    Raises:
      RuntimeError: if getStateActions has not supplied any states.
    """
    if self.states is None or len(self.states) == 0:
      raise RuntimeError('call getStateActions(states, actions) with at least one state before reset()')
    # Wrap around so repeated resets never index past the end of `states`.
    self.state = self.states[self.n % len(self.states)]
    self.n += 1

    # The step counter lives in self.t; assigning to self.step would replace
    # the step() method on this instance.
    self.t = 0
    return self.state

  def step(self, action):
    """Apply `action` and return (state, reward, done, info).

    Raises:
      RuntimeError: if no simulator was set or reset() was not called first.
    """
    if self.simulator is None:
      raise RuntimeError('call getSimulator(simulator) before step()')
    if self.state is None:
      raise RuntimeError('call reset() before step()')

    features = t.cat((t.tensor(self.state).float(), t.tensor(action).float()))
    # Only the scalar value is needed, so skip autograd bookkeeping.
    with t.no_grad():
      prob = self.simulator(features)

    if prob.item() >= 0.5:
      reward = 1
      state_new = self.state + action
    else:
      reward = 0
      state_new = self.state
    self.t += 1
    done = (self.t >= self.MAX_STEPS)

    self.state = state_new
    return self.state, reward, done, {}

In [0]:
class replayBuffer():
  """Fixed-capacity circular buffer of (s1, a1, r1, s2, done) transitions.

  Transitions are stored in preallocated NumPy arrays; once `maxm_size`
  transitions have been stored, new ones overwrite the oldest slots.

  Args:
    maxm_size: capacity of the buffer (must be >= 1).
    dim_state: length of a state vector.
    dim_action: length of an action vector.

  Raises:
    ValueError: if maxm_size is smaller than 1.
  """

  def __init__(self, maxm_size,dim_state, dim_action):
    if maxm_size < 1:
      # A zero capacity would make the modulo in store_transaction divide by zero.
      raise ValueError('maxm_size must be >= 1, got {}'.format(maxm_size))
    self.counter = 0  # total number of transitions stored so far
    self.maxm_size = maxm_size
    self.state_mem = np.zeros((maxm_size, dim_state))
    self.action_mem = np.zeros((maxm_size, dim_action))
    self.reward_mem = np.zeros(maxm_size)
    self.state_next_mem = np.zeros((maxm_size, dim_state))
    self.done_mem = np.zeros(maxm_size)

  def store_transaction(self,s1, a1, r1, s2, done):
    """Write one transition into the next slot, overwriting the oldest when full."""
    index = self.counter % self.maxm_size

    self.state_mem[index] = s1
    self.action_mem[index] = a1
    self.reward_mem[index] = r1
    self.state_next_mem[index] = s2
    self.done_mem[index] = done

    self.counter += 1

  def sample_batch(self, batch_size=16):
    """Return `batch_size` stored transitions drawn uniformly with replacement.

    Returns:
      Tuple (state, action, reward, next_state, done) of NumPy arrays whose
      first dimension is batch_size.

    Raises:
      ValueError: if no transition has been stored yet.
    """
    filled = min(self.counter, self.maxm_size)
    if filled == 0:
      raise ValueError('cannot sample from an empty replay buffer')
    # Draw only from slots that have been written.
    batch = np.random.choice(filled, batch_size)

    state_batch = self.state_mem[batch]
    action_batch = self.action_mem[batch]
    reward_batch = self.reward_mem[batch]
    state_next_batch = self.state_next_mem[batch]
    done_batch = self.done_mem[batch]

    return state_batch, action_batch, reward_batch, state_next_batch, done_batch
    