In [1]:
import numpy as np
import gym 

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt 
import re
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "0, 1"


In [15]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
cpu_device = torch.device("cpu")

no_randomness = False
load_weights = False
semi_random = False

class replay_memory(object):
    def __init__(self, size, sd, b):
        self.max_size = size
        self.storage = []
        self.cur_size = 0
        self.batch_size = b
        self.index = 0 
        
    def add(self, s,a,r,ns,d):
        if self.cur_size < self.max_size:
            self.storage.append([{"s":s, "a":a, "r":r, "ns":ns, "d":d}])
            self.cur_size += 1
        else:
            self.storage.pop(0)
            self.storage.append([{"s":s, "a":a, "r":r, "ns":ns, "d":d}])
            
    def sample(self):
        s = []
        a = []
        r = []
        ns = []
        d = []
        for i in range(self.batch_size):
            indx = torch.randint(self.cur_size, size=(1,)).numpy()[0]
            if no_randomness:
                indx = i
            if semi_random:
                indx = self.index
                if self.index < self.cur_size-1 and self.index < self.max_size-1:
                    self.index += 1
                else:
                    self.index = 0 
            s += [self.storage[indx][0]["s"]]
            a += [self.storage[indx][0]["a"]]
            r += [self.storage[indx][0]["r"]]
            ns += [self.storage[indx][0]["ns"]]
            d += [self.storage[indx][0]["d"]]
            
        return {"s":s, "a":a, "r":r, "ns":ns, "d":d}
    

def get_action_(env, epsilon, action):
    if no_randomness:
        epsilon = 0 
    if semi_random:
        x=np.squeeze(action_ts.detach().numpy())
        if np.sum(x) < 10:
            if 2*np.log(x[0]) - 5 > 3*np.log(x[1]) - 7:
                a = 1
            else:
                a = 0 
        else:
            a = action.argmax().detach().numpy()            
    else:
        rnd = torch.rand((1)).numpy()[0]
        if rnd < epsilon:
            a = torch.randint(env.action_space.n, size=(1,)).numpy()[0]
        else:
            a = action.argmax().detach().to(cpu_device).numpy()
        
    return a
    
class exploration(object):
    def __init__(self, max_,min_,num_eps):
        self.epsilon = max_
        self.min_eps = min_
        self.num_eps = num_eps
        self.eps_red = (max_ - min_)/num_eps
    
    def reduce(self):
        if (self.epsilon > self.min_eps):
            self.epsilon -= self.eps_red 
        
        
class DQN(nn.Module):

    def __init__(self, ni, no):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(ni,10)
        self.fc2 = nn.Linear(10,10)
        self.fc3 = nn.Linear(10,no)
        if no_randomness and not load_weights:
            self.fc1.weight.data.fill_(0.2)
            self.fc2.weight.data.fill_(0.2)
            self.fc3.weight.data.fill_(0.2)
            self.fc1.bias.data.fill_(0.0)
            self.fc2.bias.data.fill_(0.0)
            self.fc3.bias.data.fill_(0.0)
        elif load_weights:
            d=np.loadtxt("fc1.weight", delimiter=",")
            self.fc1.weight.data = torch.tensor(d, dtype=torch.float32)
            d=np.loadtxt("fc2.weight", delimiter=",")
            self.fc2.weight.data = torch.tensor(d, dtype=torch.float32)
            d=np.loadtxt("fc3.weight", delimiter=",")
            self.fc3.weight.data = torch.tensor(d, dtype=torch.float32)
            d=np.loadtxt("fc1.bias", delimiter=",")
            self.fc1.bias.data = torch.tensor(d, dtype=torch.float32)
            d=np.loadtxt("fc2.bias", delimiter=",")
            self.fc2.bias.data = torch.tensor(d, dtype=torch.float32)
            d=np.loadtxt("fc3.bias", delimiter=",")            
            self.fc3.bias.data = torch.tensor(d, dtype=torch.float32)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
    
def test(period, avg_q, policy_net, env):
    
    rewards = 0
    for i in range(100):
        state = env.reset()    
        done = False
        while not done:
            action = policy_net(torch.tensor(state).float().to(device))
            action = get_action_(env, 0, action)
            next_state, reward, done, _ = env.step(action)

            state = next_state
            rewards+=1
            state = next_state
    print ("Test: episode={0:d}, Q-value={1:0.2f}, reward={2:0.2f}".format(period, avg_q, rewards/100.))
    return rewards/100.
    
def get_weight_norm(net):
    grad_norm=0
    for param in net.parameters():
    #     print(param)
        grad_norm += torch.norm(param)

    return grad_norm

def get_grad_norm(net):
    grad_norm=0
    for param in net.parameters():
    #     print(param)
        grad_norm += torch.norm(param.grad)

    return grad_norm


def get_grad_list(net):
    grads=np.array([])
    for param in net.parameters():
        grads = np.concatenate((grads, param.grad.data.view(-1).detach().numpy()))
    return grads

In [16]:
torch.manual_seed(123)
env = gym.make("CartPole-v0")

model = DQN(env.observation_space.shape[0], env.action_space.n).to(device) 
policy_net = torch.nn.DataParallel(model,device_ids=[0,1],output_device=0).to(device)

target_model = DQN(env.observation_space.shape[0], env.action_space.n).to(device) 
target_net = torch.nn.DataParallel(target_model,device_ids=[0,1],output_device=0).to(device)

optimizer = optim.Adam(policy_net.parameters())
# optimizer = optim.SGD(policy_net.parameters(),lr=0.001)
# torch.nn.utils.clip_grad_norm(policy_net.parameters(),max_norm=10,norm_type=2)

In [19]:
######################################################################
# Training loop
batch_size = 128
min_replay_buffer = 1000
max_replay_buffer = 1e6

target_update = 200 
num_episodes = 200
show_detail = False
log_interval = 20 # will print the details of last xx train_steps
result = []

rbm = replay_memory(max_replay_buffer, env.observation_space, batch_size)
exp = exploration(0.9, 0.05, num_episodes)

train_step = 0
for i_episode in range(num_episodes):
     # initialize state
    state = env.reset()

    # Select and perform an action    
    # keep going until get to the goal state
    cnt = 0
    done = False
    rewards = []
    while not done:
        cnt+=1
        action_ts = policy_net(torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)).squeeze()
        action = get_action_(env, exp.epsilon, action_ts)
        next_state, reward, done, _ = env.step(action)
        rewards.append(reward)
        
        rbm.add(torch.tensor(state), torch.tensor(action), torch.tensor(reward), torch.tensor(next_state), 
               done)
            
        if show_detail:
            print (i_episode,"-",cnt,"-",rbm.cur_size, state, action_ts, action, reward, next_state, done)
        
        if rbm.cur_size >= min_replay_buffer:
            if rbm.cur_size == min_replay_buffer:
                print ("Started training")
            batch = rbm.sample()
            
            target_Q = target_net(torch.stack(batch["ns"]).float().to(device
                    )).squeeze().max(1)[0].detach()
            target = 0.99 * target_Q*(1-torch.tensor(batch["d"]).to(device).float()) + \
            torch.stack(batch["r"]).to(device)
            QValue = policy_net(torch.stack(batch["s"]).float().to(
                device)).squeeze().gather(1, torch.stack(batch["a"]).to(device).unsqueeze(1))
            
            loss_model = F.mse_loss(QValue, target.unsqueeze(1))
            if device.type == 'cuda':
                loss = torch.nn.DataParallel(loss_model, device_ids=[0,1])
            else:
                loss = loss_model

            if train_step%log_interval == 0:
                if show_detail:
                    print (i_episode, "-", train_step)
                    print (np.array(batch["d"],dtype=np.int))
                    print ([i for i in torch.stack(batch["a"]).numpy()])
                tmp = [i for i in target_Q.numpy()]
                if show_detail:
                    print ("target_Qvalue", tmp)
                tmp = [i for i in target.numpy()]
                if show_detail:
                    print ("target_value", tmp)
                tmp = [i for i in np.squeeze(QValue.detach().numpy())]
                if show_detail:
                    print ("Qvalue", tmp)
                            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_step+=1
                             
        state = next_state
        if train_step > 1 and train_step%target_update==0:
            target_net.load_state_dict(policy_net.state_dict())
            
    if train_step > 0 and i_episode%log_interval == 0:
        try:
            per = test(i_episode, QValue.mean().item(), policy_net, env)
        except:
            per = test(i_episode, 0, policy_net, env)
            
        result += [ per]
        #     print (cnt)
    exp.reduce()
        

Started training
Test: episode=60, Q-value=0.85, reward=9.35
Test: episode=80, Q-value=2.86, reward=9.26
Test: episode=100, Q-value=3.68, reward=9.40
Test: episode=120, Q-value=6.78, reward=41.44
Test: episode=140, Q-value=14.54, reward=198.07
Test: episode=160, Q-value=28.42, reward=169.63
Test: episode=180, Q-value=39.73, reward=143.91
