In [1]:
import torch
import torchvision

In [2]:
import torch.nn as nn
import torch.nn.functional as F 

In [49]:
X = torch.FloatTensor([[0 ,0, 0, 1], [1, 0, 1, 0], [0, 1, 0, 1], [ 1, 1, 1, 1], [ 1, 1, 1, 0]])
Y = torch.FloatTensor([[0], [1], [1], [0], [1]])
# Y = torch.FloatTensor([0, 1, 1, 0, 1]).reshape(5,1)

In [50]:
print(X.shape)
print(Y.shape)

torch.Size([5, 4])
torch.Size([5, 1])


In [51]:
linear1 = torch.nn.Linear(4, 2, bias=True)
linear2 = torch.nn.Linear(2, 1, bias=True)
sigmoid = torch.nn.Sigmoid()


In [52]:
model = torch.nn.Sequential(linear1, sigmoid, linear2, sigmoid)

In [53]:
criterion = torch.nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)

In [55]:
for step in range(10000):
    optimizer.zero_grad()
    
    outputs = model(X)
    
    cost = criterion(outputs, Y)
    cost.backward()
    optimizer.step()    

    if step % 1000 == 0:
        print('%3d :  %15.15f' % (step, cost.item()))

  0 :  0.000929579488002
1000 :  0.000842447276227
2000 :  0.000770144746639
3000 :  0.000709243468009
4000 :  0.000657259137370
5000 :  0.000612328934949
6000 :  0.000573127414100
7000 :  0.000538651715033
8000 :  0.000508090073708
9000 :  0.000480786024127


In [56]:
import numpy as np
import pandas as pd
pred = model(X)
y = pd.Series(Y.flatten().numpy(), name='y')
yhat = pd.Series(pred.detach().flatten().numpy(), name='yhat')
comb = pd.concat([y, yhat], axis=1)
print(comb)

     y      yhat
0  0.0  0.000644
1  1.0  0.999486
2  1.0  0.999493
3  0.0  0.000616
4  1.0  1.000000


### CartPole

- ### PangYoLap DQN

In [57]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
buffer_limit  = 50000
batch_size    = 32

class ReplayBuffer():
    def __init__(self):
        self.buffer = collections.deque(maxlen=buffer_limit)
    
    def put(self, transition):
        self.buffer.append(transition)
    
    def sample(self, n): # 버퍼에서 샘플링
        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []
        
        for transition in mini_batch:
            s, a, r, s_prime, done_mask = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
               torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
               torch.tensor(done_mask_lst)
    
    def size(self):
        return len(self.buffer)

class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)   
        self.fc3 = nn.Linear(64, 2)

    def forward(self, x): # Q Value 리턴 (음수가 될 수 도 있음)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))    
        x = self.fc3(x)
        return x
      
    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random() # 0 ~ 1 
        if coin < epsilon:
            return random.randint(0,1)
        else : 
            return out.argmax().item()
            
def train(q, q_target, memory, optimizer):
    for i in range(10):
        s, a, r, s_prime, done_mask = memory.sample(batch_size)

        q_out = q(s) # input size (32,4) return size (32,2)
        q_a = q_out.gather(1, a) # 취한 액션의 큐값만 골라냄 (32,1)
        max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
        target = r + gamma * max_q_prime * done_mask
        loss = F.smooth_l1_loss(q_a, target)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


env = gym.make('CartPole-v1')
q = Qnet()
q_target = Qnet()
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

print_interval = 50
score = 0.0  
optimizer = optim.Adam(q.parameters(), lr=learning_rate) # q_target 은 업데이트 안 함!

for n_epi in range(2500):
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s = env.reset()
    done = False

    while not done:
        a = q.sample_action(torch.from_numpy(s).float(), epsilon)      
        s_prime, r, done, info = env.step(a)
        done_mask = 0.0 if done else 1.0
        memory.put((s,a,r/100.0,s_prime, done_mask))
        s = s_prime

        score += r
        if done:
            break

    if memory.size()>2000:
        train(q, q_target, memory, optimizer)

    if n_epi%print_interval==0 and n_epi!=0:
        q_target.load_state_dict(q.state_dict()) # 타겟 네트워크 업데이트 (20 번 에피소드 마다)
        print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(n_epi, score/print_interval, memory.size(), epsilon*100))                
        
        if (score/print_interval) > 300:
            break
            
        score = 0.0
        
env.close()

n_episode :50, score : 10.2, n_buffer : 509, eps : 7.8%
n_episode :100, score : 10.2, n_buffer : 1018, eps : 7.5%
n_episode :150, score : 10.0, n_buffer : 1517, eps : 7.3%


  return torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \


n_episode :200, score : 10.0, n_buffer : 2018, eps : 7.0%
n_episode :250, score : 14.9, n_buffer : 2762, eps : 6.8%
n_episode :300, score : 23.0, n_buffer : 3913, eps : 6.5%
n_episode :350, score : 21.7, n_buffer : 4999, eps : 6.2%
n_episode :400, score : 12.9, n_buffer : 5642, eps : 6.0%
n_episode :450, score : 12.5, n_buffer : 6265, eps : 5.8%
n_episode :500, score : 12.1, n_buffer : 6870, eps : 5.5%
n_episode :550, score : 19.6, n_buffer : 7851, eps : 5.3%
n_episode :600, score : 42.3, n_buffer : 9967, eps : 5.0%
n_episode :650, score : 35.3, n_buffer : 11731, eps : 4.8%
n_episode :700, score : 91.1, n_buffer : 16288, eps : 4.5%
n_episode :750, score : 152.9, n_buffer : 23934, eps : 4.2%
n_episode :800, score : 179.5, n_buffer : 32911, eps : 4.0%
n_episode :850, score : 190.4, n_buffer : 42432, eps : 3.8%
n_episode :900, score : 197.3, n_buffer : 50000, eps : 3.5%
n_episode :950, score : 193.2, n_buffer : 50000, eps : 3.2%
n_episode :1000, score : 207.9, n_buffer : 50000, eps : 3.0%

In [59]:
path = 'C:\\Users\\shinki\\OneDrive - Novelis Inc\\Documents\\Data Science\\10. Learning\\7. RL\\q_target'
# # torch.save(q_target.state_dict(), path) # save weights only
# torch.save(q_target, path)
q_target = torch.load(path)

In [61]:
import time
for i_episode in range(5):
    observation = env.reset()
    for t in range(550):
        time.sleep(0.01)
        env.render()
        action = q_target(torch.Tensor(observation)).argmax().item() 
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()

Episode finished after 367 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps


- ### Random or Simple Policy

In [68]:
import time
import gym
env = gym.make('CartPole-v1')


for i_episode in range(5):
    observation = env.reset()
    for t in range(300):
        time.sleep(0.01)
        env.render()
#         action = env.action_space.sample()
#         action = random.randint(0,1)

        if observation[0] > 0: # if the pole is on the right side
            action = 0 #  pushing the cart to the left
        else:
            action = 1 #  pushing the cart to the right
        else:
        
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()

Episode finished after 36 timesteps
Episode finished after 53 timesteps
Episode finished after 28 timesteps
Episode finished after 39 timesteps
Episode finished after 34 timesteps


- ### Physics !!!

In [70]:
def theta_omega_policy(obs):
    theta, w = obs[2:4]
    if abs(theta) < 0.03:
        return 0 if w < 0 else 1
    else:
        return 0 if theta < 0 else 1
    
import time
import gym
env = gym.make('CartPole-v1')


for i_episode in range(5):
    observation = env.reset()
    for t in range(550):
        time.sleep(0.01)
        env.render()
        action = theta_omega_policy(observation)
        
        observation, reward, done, info = env.step(action)
        if done:
            print("Episode finished after {} timesteps".format(t+1))
            time.sleep(1)
            break
env.close()    

Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps
Episode finished after 500 timesteps


In [64]:
observation, reward, done, info = env.step(1)
print(observation, reward)

[-0.35309133  0.5595656  -0.0241827  -0.9231632 ] 1.0


In [63]:
observation = env.reset()
for i in range(20):

#     action = random.randint(0,1)
    if observation[0] > 0:
        action = 1
    else:
        action = 0
        
    observation, reward, done, info = env.step(action)
    print(observation, reward)

[-0.03732754 -0.1904975   0.0271492   0.26085016] 1.0
[-0.04113749 -0.38599628  0.03236621  0.5619711 ] 1.0
[-0.04885742 -0.58155715  0.04360563  0.8646728 ] 1.0
[-0.06048856 -0.7772446   0.06089909  1.1707411 ] 1.0
[-0.07603345 -0.97310334  0.08431391  1.4818778 ] 1.0
[-0.09549552 -1.1691467   0.11395146  1.799657  ] 1.0
[-0.11887845 -1.3653438   0.1499446   2.1254735 ] 1.0
[-0.14618532 -1.5616052   0.19245407  2.460481  ] 1.0
[-0.17741743 -1.7577648   0.2416637   2.805521  ] 1.0
[-0.21257272 -1.953561    0.2977741   3.1610403 ] 0.0
[-0.25164396 -2.1486168   0.3609949   3.5270054 ] 0.0
[-0.29461628 -2.342422    0.43153504  3.9028182 ] 0.0
[-0.34146473 -2.5343204   0.5095914   4.2872477 ] 0.0
[-0.39215112 -2.7235076   0.5953363   4.6783915 ] 0.0
[-0.4466213  -2.9090443   0.68890417  5.0736885 ] 0.0
[-0.50480217 -3.0898914   0.7903779   5.4699874 ] 0.0
[-0.5666    -3.264971   0.8997777  5.8636823] 0.0
[-0.6318994 -3.4332526  1.0170513  6.250892 ] 0.0
[-0.70056444 -3.5938635   1.1420692 