In [1]:
"""
This part of code is the Deep Q Network (DQN) brain.
view the tensorboard picture about this DQN structure on: https://morvanzhou.github.io/tutorials/machine-learning/reinforcement-learning/4-3-DQN3/#modification
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
Using:
Tensorflow: r1.2
"""

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
from numba import jit
import random
import heapq
from heapq import heapqpop
from heapq import heapqpush


# np.random.seed(1)
# USE_CUDA = torch.cuda.is_available()


# Deep Q Network off-policy
class DQN(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(DQN, self).__init__()
        self.line1 = nn.Linear(num_inputs, 64)
        self.line2 = nn.Linear(64, 32)
        self.line3 = nn.Linear(32, num_outputs)

    def forward(self, x):
        x = F.relu(self.line1(x))
        x = F.relu(self.line2(x))
        x = self.line3(x)
        return x


class Train():
    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))

        # consist of [target_net, evaluate_net]
        self.target_net = DQN(n_features, n_actions)
        self.eval_net = DQN(n_features, n_actions)
        #self.eval_net.load_state_dict(torch.load('C:/Rebalancing/data/result/pytorchmodel/params.pkl'))

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adagrad(self.eval_net.parameters(), lr=self.lr)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):
            self.memory_counter = 0
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every actions
            actions_value = self.eval_net(Variable(torch.from_numpy(observation).float())).detach().numpy()
            action = np.argmax(actions_value)
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def learn(self):
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            # self.sess.run(self.target_replace_op)
            self.target_net.load_state_dict(self.eval_net.state_dict())
            # print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        self.s = Variable(torch.from_numpy(batch_memory[:, :self.n_features]).float(), requires_grad=True)
        self.a = Variable(torch.from_numpy(batch_memory[:, self.n_features]).long())
        self.r = Variable(torch.from_numpy(batch_memory[:, self.n_features + 1]).float())
        self.s_ = Variable(torch.from_numpy(batch_memory[:, -self.n_features:]).float())


        current_Q_values = self.eval_net(self.s).gather(1, self.a.unsqueeze(1)).view(-1)
        next_Q_values = self.target_net(self.s_).detach().max(1)[0]
        # Compute the target of the current Q values
        target_Q_values = self.r + (self.gamma * next_Q_values)
        # Compute Bellman error
        loss = self.criterion(current_Q_values, target_Q_values)

        self.optimizer.zero_grad()
        # run backward pass
        loss.backward()

        # Perfom the update
        self.optimizer.step()

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

# if __name__ == '__main__':
#     DQN = DeepQNetwork(3,4, output_graph=True)

import numpy as np
import pandas as pd

class Area():
    def __init__(self, n, a_id):
        self.a_id = a_id
        self.normal_bike = n
        self.broken_bike = 0

    def move(self):
        self.normal_bike -= 1
        self.broken_bike += 1
        
    def repair(self):
        self.normal_bike += 1
        self.broken_bike -= 1



class BikeNet():
    def __init__(self, N, R, A, Q, repair, P, time_limit):
        self.N = N
        self.R = R
        self.A = A
        self.Q = Q
        self.area = list(range(self.A+1))
        self.repair = repair
        self.P = P
        self.time_limit = time_limit
        self.reset()
        self.trans = {}

    def reset(self):

        # initiation of instances of Area and scheduler
        self.T = 0
        self.carrier_position = 0
        self.scheduler = []
        self.s = [int(self.N/self.A)] * self.A + [0]*self.A
        for i in range(A):
            self.scheduler.append([random.expovariate(self.R[i][0]), 1, i])
        heapq.heapify(self.scheduler)

        return self.s.copy()

    def step(self, action):
        # time for carrier to take the action and repair one bicycle
        t = (abs(self.carrier_position % 3 - action % 3) + abs(self.carrier_position // 3 - action // 3)) *0.5 + self.repair
        t_cursor = self.T + t
        self.carrier_position = action
        reward = 0
        self.T = self.scheduler[0][0]

        # update the atate of QN during the tansformation time
        while self.T < t_cursor:
            event = heapqpop(self.scheduler)
            kind, place = event[1], event[2]
            if kind == 1:
                if self.s[place] == 0:
                    # this is a loss
                    reward -= 1
                    next_event = [self.T + random.expovariate(self.R[place][0]), 1, place]
                    heapq.heappush(self.scheduler, next_event)
                else:
                    target = random.choices(self.area, self.Q[place], k=1)[0]
                    if target == self.A:
                        self.s[place] -= 1
                        self.s[place+self.A] += 1
                        continue
                    else:
                        self.s[place] -= 1
                        next_event1 = [self.T + random.expovariate(self.R[place][1]), 2, target]
                        next_event2 = [self.T + random.expovariate(self.R[place][0]), 1, place]
                        heapq.heappush(self.scheduler, next_event1)
                        heapq.heappush(self.scheduler, next_event2)
            else:
                self.s[place] += 1
            heapq.heappop(self.scheduler)
            
        self.carrier_position = action
        if self.s[action+self.A]>0:
            self.s[action+self.A] -= 1
            self.s[action] += 1
        s_ = self.s.copy()

        self.T = t_cursor
        if self.T < self.time_limit:
            return s_, reward, 0
        else:
            return s_, reward, 1

# from maze_env import Maze
# from RL_brain import DeepQNetwork
from tqdm import tqdm
from numba import jit


@jit
def simulate():
    n_episodes = 10
    result = []
    for episode in tqdm(range(n_episodes)):
        step = 0
        sum_r = 0
        # initial observation
        observation = env.reset()
        action = 0
        #observation = np.array(int(N/A)) #devide all the normal bikes to all the areas evenly at the beginning

        while True:
            # fresh env
            #env.render()

            # RL choose action based on observation
            if not env.s[action+env.A]:
            #action = RL.choose_action(observation)
                action = (action+1)%A

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            #RL.store_transition(observation, action, reward, observation_)

            #if (step > 200) and (step % 5 == 0):
            #    RL.learn()
            #RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            #step += 1
            sum_r += reward
            
        result.append([episode, sum_r])

    # end of game
    print('learning over')
    return result


if __name__ == "__main__":
    # maze game
    np.random.seed(1)
    N = 80 #total number of bikes in the QN
    A = 4 #A for areas, indicates the aumber of areas and the action space
    R = {}  # [customer_arrval, ride]
    for i in range(A): R[i] = [1.0, 0.5]
    Q = [np.random.rand(A) for i in range(A)]
    Q = [q / sum(q)*0.99 for q in Q]
    Q = [np.append(q, 0.01) for q in Q]
    #Q = [[0,0.9,0.1], [0.9,0,0.1]]
    t_repair = 5
    P = 0
    time_limit = 180

    env = BikeNet(N, R, A, Q, t_repair, P, time_limit)
    
    RL = Train(A, 2*A,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    output = simulate()
    #RL.plot_cost()

  0%|          | 0/10 [00:00<?, ?it/s]


AttributeError: 'BikeNet' object has no attribute 'a'

In [None]:
class DQN(nn.Module):
    def __init__(self, in_channels=4, num_actions=18):
        """
        Initialize a deep Q-learning network as described in
        https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
        Arguments:
            in_channels: number of channel of input.
                i.e The number of most recent frames stacked together as describe in the paper
            num_actions: number of action-value to output, one-to-one correspondence to action in game.
        """
        super(DQN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
        self.fc4 = nn.Linear(7 * 7 * 64, 512)
        self.fc5 = nn.Linear(512, num_actions)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = F.relu(self.fc4(x.view(x.size(0), -1)))
        return self.fc5(x)

In [None]:
import gym
import torch.optim as optim

from dqn_model import DQN
from dqn_learn import OptimizerSpec, dqn_learing
from utils.gym import get_env, get_wrapper_by_name
from utils.schedule import LinearSchedule

BATCH_SIZE = 32
GAMMA = 0.99
REPLAY_BUFFER_SIZE = 1000000
LEARNING_STARTS = 50000
LEARNING_FREQ = 4
FRAME_HISTORY_LEN = 4
TARGER_UPDATE_FREQ = 10000
LEARNING_RATE = 0.00025
ALPHA = 0.95
EPS = 0.01

def main(env, num_timesteps):

    def stopping_criterion(env):
        # notice that here t is the number of steps of the wrapped env,
        # which is different from the number of steps in the underlying env
        return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps

    optimizer_spec = OptimizerSpec(
        constructor=optim.RMSprop,
        kwargs=dict(lr=LEARNING_RATE, alpha=ALPHA, eps=EPS),
    )

    exploration_schedule = LinearSchedule(1000000, 0.1)

    dqn_learing(
        env=env,
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
    )

if __name__ == '__main__':
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0 # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)

    main(env, task.max_timesteps)