In [1]:
from collections import deque as dq
import random

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [3]:
# Modules needed to transform Mario_env to fit ML Model
from env_transform import *
# Gym is an OpenAI toolkit for RL
import gym
from gym.spaces import Box
from gym.wrappers import FrameStack
# NES Emulator for OpenAI Gym
from nes_py.wrappers import JoypadSpace
# Super Mario environment for OpenAI Gym
import gym_super_mario_bros

In [4]:
# Create the Super Mario Bros 1-1 environment and restrict the joypad so the
# agent only ever picks between two button combinations:
#   0. walk right
#   1. jump right
env = JoypadSpace(
    gym_super_mario_bros.make("SuperMarioBros-1-1-v0"),
    [["right"], ["right", "A"]],
)

# Observation wrappers (from env_transform), applied innermost first:
# SkipFrame(skip=4) -> GrayScaleObservation -> ResizeObservation(shape=84).
env = ResizeObservation(GrayScaleObservation(SkipFrame(env, skip=4)), shape=84)

# Frame stacking is currently disabled:
# env = FrameStack(env, num_stack=4)

# Manual sanity check of a single step (uncomment to inspect):
# env.reset()
# next_state, reward, done, info = env.step(action=1)
# print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

In [5]:
# Hyperparameters used to train the Q-network.
learning_rate = 5e-4     # step size handed to the optimizer
batch_size = 32          # number of transitions drawn per gradient update
gamma = 0.98             # discount factor applied to the bootstrapped value
buffer_limit = 100_000   # maximum number of transitions kept in the replay buffer

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    gpu = torch.device("cuda")
else:
    gpu = torch.device("cpu")

In [6]:
# Reinforcement learning has no ready-made training set: the agent has to act
# and accumulate its own experience. This buffer stores that experience.
class ReplayBuffer():
    """Bounded FIFO store of transitions with uniform random mini-batch sampling.

    Capacity is taken from the module-level ``buffer_limit``; because the
    storage is a ``deque(maxlen=...)``, appending to a full buffer silently
    drops the oldest transition.
    """

    def __init__(self):
        self.buffer = dq(maxlen=buffer_limit)

    def put(self, transition):
        """Append one ``(state, action, reward, next_state, done_mask)`` tuple."""
        self.buffer.append(transition)

    def sample(self, n):
        """Draw ``n`` distinct transitions at random and batch them as tensors.

        Sampling at random (instead of taking ``n`` consecutive transitions)
        avoids training on highly correlated data, which hurts learning.

        Args:
            n: number of transitions to draw.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask)`` of tensors on ``gpu``.
            ``s`` / ``s_prime`` are float tensors with a leading batch axis of
            size ``n``; ``a`` is an int64 tensor of shape ``(n, 1)`` (usable as
            a ``gather`` index); ``r`` and ``done_mask`` are float tensors of
            shape ``(n, 1)``.

        Raises:
            ValueError: propagated from ``random.sample`` when ``n`` exceeds
                the number of stored transitions.
        """
        # Local import: numpy is not imported at the top of this notebook.
        import numpy as np

        mini_batch = random.sample(self.buffer, n)
        s_lst, a_lst, r_lst, s_prime_lst, done_mask_lst = [], [], [], [], []

        for s, a, r, s_prime, done_mask in mini_batch:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            done_mask_lst.append([done_mask])

        def as_float(rows):
            # Pack the rows into ONE ndarray before handing them to torch:
            # torch.tensor(list_of_ndarrays) converts element by element and is
            # the slow path PyTorch explicitly warns about.
            return torch.as_tensor(np.asarray(rows, dtype=np.float32), device=gpu)

        return (
            as_float(s_lst),
            # Explicit int64 so the result is always a valid gather() index,
            # whatever integer type the actions were stored with.
            torch.tensor(a_lst, dtype=torch.long, device=gpu),
            as_float(r_lst),
            as_float(s_prime_lst),
            as_float(done_mask_lst),
        )

    def size(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

In [7]:
# Q-network for Mario: the network takes single-channel frames (conv1 has
# in_channels=1) and there are 2 actions, so it is a CNN with 1 input channel
# and 2 outputs.
class Qnet(nn.Module):
    """Convolutional Q-network mapping a 1-channel frame to 2 action values.

    ``fc1`` expects 5184 = 64 * 9 * 9 features, which is the size of
    ``conv2``'s output for an 84x84 input frame.
    """

    def __init__(self):
        super(Qnet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels = 1,out_channels = 32,kernel_size = 8,stride = 4)
        self.conv2 = nn.Conv2d(in_channels = 32,out_channels = 64,kernel_size = 4,stride = 2)
        # NOTE(review): conv3 is registered but never applied in forward()
        # (fc1 is sized for conv2's output). It is kept so the parameter set and
        # state_dict keys stay compatible with already-saved weights.
        self.conv3 = nn.Conv2d(in_channels = 64,out_channels = 64,kernel_size = 3,stride = 1)
        self.fc1 = nn.Linear(5184, 512)
        self.fc2 = nn.Linear(512, 2)

    def forward(self, x):
        """Return the Q-values for a batch of frames.

        Args:
            x: float tensor of frames laid out as (batch, 1, height, width);
               it is moved to ``gpu`` before use.

        Returns:
            Float tensor of shape (batch, 2), one Q-value per action.
        """
        x = x.to(gpu)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = x.view(-1,5184)  # flatten conv2's feature maps for the linear layers
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x.float()

    def sample_action(self, observation, epsilon):
        """Pick an action with an epsilon-greedy policy.

        With probability ``epsilon`` a random action (0 or 1) is returned, which
        lets the agent explore paths it has not tried yet; otherwise the action
        with the highest predicted Q-value is returned.

        Args:
            observation: a single frame batched as (1, 1, height, width).
            epsilon: exploration probability in [0, 1].

        Returns:
            The chosen action index as a Python int.
        """
        coin = random.random()
        if coin < epsilon:
            # Exploring: the network output is not needed, so skip the forward pass.
            return random.randint(0,1)
        # Acting never needs gradients; no_grad avoids building an autograd graph
        # on every environment step.
        with torch.no_grad():
            return self.forward(observation).argmax().item()

In [8]:
def train(q, q_target, memory, optimizer, n_updates=10):
    """Run several DQN gradient updates on ``q`` using replayed transitions.

    Each update draws ``batch_size`` transitions from ``memory`` and minimises
    the smooth-L1 loss between ``Q(s, a)`` and the Bellman target
    ``r + gamma * max_a' Q_target(s', a') * done_mask``.

    Args:
        q: the online network being trained.
        q_target: the target network used only to compute the bootstrap value.
        memory: object whose ``sample(n)`` returns
            ``(s, a, r, s_prime, done_mask)`` tensors.
        optimizer: optimizer over ``q``'s parameters.
        n_updates: number of mini-batch updates to perform (default 10).
    """
    for _ in range(n_updates):
        s,a,r,s_prime,done_mask = memory.sample(batch_size)

        # Q-value of the action that was actually taken in each transition.
        q_a = q(s).gather(1,a)

        # The target is a constant for this update: computing it without autograd
        # keeps backward() from propagating into q_target and leaving stale
        # gradients accumulating on its parameters.
        with torch.no_grad():
            max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
            # done_mask is 0 for terminal transitions, removing the bootstrap term.
            target = r + gamma * max_q_prime * done_mask

        loss = F.smooth_l1_loss(q_a, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# Deep Q-learning with a separate target network: q_target is held fixed
# between periodic syncs and used as a semi-constant bootstrap target.
# (This is not Double DQN: train() takes the max directly over q_target's output.)
q = Qnet().to(gpu)
q_target = Qnet().to(gpu)
q_target.load_state_dict(q.state_dict())
memory = ReplayBuffer()

print_interval = 20
score = 0.0
# Episodes accumulated into `score` since the last report. The first report
# covers episodes 0..print_interval (one more than print_interval), so the
# average must divide by the real count rather than by print_interval.
episodes_since_report = 0
optimizer = optim.Adam(q.parameters(), lr=learning_rate)

for n_epi in range(40000+1):
    print(f'epi : {n_epi}')
    epsilon = max(0.01, 0.08 - 0.01*(n_epi/200)) #Linear annealing from 8% to 1%
    s = env.reset()
    done = False

    # Play one full episode, storing every transition in the replay buffer.
    while not done:
        a = q.sample_action((s.unsqueeze(0).unsqueeze(0)).float(), epsilon)
        s_prime, r, done, info = env.step(a)
        # 0.0 on the terminal step so the bootstrap term is dropped in train().
        done_mask = 0.0 if done else 1.0
        # Rewards are scaled by 1/100 for training; `score` keeps the raw reward.
        memory.put((s.unsqueeze(0).numpy(),a,r/100.0,s_prime.unsqueeze(0).numpy(), done_mask))
        s = s_prime

        score += r
    episodes_since_report += 1

    # Random sampling needs a reasonably filled buffer, so training only starts
    # once it holds more than 2000 transitions.
    if memory.size()>2000:
        train(q, q_target, memory, optimizer)

    if n_epi%print_interval==0 and n_epi!=0:
        torch.save(q.state_dict(), 'Mario_weight.pt')
        report = "n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
            n_epi, score/episodes_since_report, memory.size(), epsilon*100)
        # Context manager guarantees the log file is closed even if the write fails.
        with open('./reward.txt', 'a') as f:
            f.write(report + "\n")
        # Periodically sync the semi-constant target network with the online one.
        q_target.load_state_dict(q.state_dict())
        print(report)
        score = 0.0
        episodes_since_report = 0
env.close()

epi : 0
epi : 1
epi : 2
epi : 3
epi : 4
epi : 5
epi : 6
epi : 7
epi : 8
epi : 9
epi : 10
epi : 11
epi : 12
epi : 13
epi : 14
epi : 15
epi : 16
epi : 17
epi : 18
epi : 19
epi : 20
n_episode :20, score : 353.9, n_buffer : 3729, eps : 7.9%
epi : 21
epi : 22
epi : 23
epi : 24
epi : 25
epi : 26
epi : 27
epi : 28
epi : 29
epi : 30
epi : 31
epi : 32
epi : 33
epi : 34
epi : 35
epi : 36
epi : 37
epi : 38
epi : 39
epi : 40
n_episode :40, score : 439.0, n_buffer : 9716, eps : 7.8%
epi : 41
epi : 42
epi : 43
epi : 44
epi : 45
epi : 46
epi : 47
epi : 48
epi : 49
epi : 50
epi : 51
epi : 52
epi : 53
epi : 54
epi : 55
epi : 56
epi : 57
epi : 58
epi : 59
epi : 60
n_episode :60, score : 445.1, n_buffer : 11450, eps : 7.7%
epi : 61
epi : 62
epi : 63
epi : 64
epi : 65
epi : 66
epi : 67
epi : 68
epi : 69
epi : 70
epi : 71
epi : 72
epi : 73
epi : 74
epi : 75
epi : 76
epi : 77
epi : 78
epi : 79
epi : 80
n_episode :80, score : 469.2, n_buffer : 19220, eps : 7.6%
epi : 81
epi : 82
epi : 83
epi : 84
epi : 85
ep

epi : 642
epi : 643
epi : 644
epi : 645
epi : 646
epi : 647
epi : 648
epi : 649
epi : 650
epi : 651
epi : 652
epi : 653
epi : 654
epi : 655
epi : 656
epi : 657
epi : 658
epi : 659
epi : 660
n_episode :660, score : 493.2, n_buffer : 100000, eps : 4.7%
epi : 661
epi : 662
epi : 663
epi : 664
epi : 665
epi : 666
epi : 667
epi : 668
epi : 669
epi : 670
epi : 671
epi : 672
epi : 673
epi : 674
epi : 675
epi : 676
epi : 677
epi : 678
epi : 679
epi : 680
n_episode :680, score : 563.1, n_buffer : 100000, eps : 4.6%
epi : 681
epi : 682
epi : 683
epi : 684
epi : 685
epi : 686
epi : 687
epi : 688
epi : 689
epi : 690
epi : 691
epi : 692
epi : 693
epi : 694
epi : 695
epi : 696
epi : 697
epi : 698
epi : 699
epi : 700
n_episode :700, score : 704.0, n_buffer : 100000, eps : 4.5%
epi : 701
epi : 702
epi : 703
epi : 704
epi : 705
epi : 706
epi : 707
epi : 708
epi : 709
epi : 710
epi : 711
epi : 712
epi : 713
epi : 714
epi : 715
epi : 716
epi : 717
epi : 718
epi : 719
epi : 720
n_episode :720, score : 562

epi : 1252
epi : 1253
epi : 1254
epi : 1255
epi : 1256
epi : 1257
epi : 1258
epi : 1259
epi : 1260
n_episode :1260, score : 438.6, n_buffer : 100000, eps : 1.7%
epi : 1261
epi : 1262
epi : 1263
epi : 1264
epi : 1265
epi : 1266
epi : 1267
epi : 1268
epi : 1269
epi : 1270
epi : 1271
epi : 1272
epi : 1273
epi : 1274
epi : 1275
epi : 1276
epi : 1277
epi : 1278
epi : 1279
epi : 1280
n_episode :1280, score : 385.6, n_buffer : 100000, eps : 1.6%
epi : 1281
epi : 1282
epi : 1283
epi : 1284
epi : 1285
epi : 1286
epi : 1287
epi : 1288
epi : 1289
epi : 1290
epi : 1291
epi : 1292
epi : 1293
epi : 1294
epi : 1295
epi : 1296
epi : 1297
epi : 1298
epi : 1299
epi : 1300
n_episode :1300, score : 389.4, n_buffer : 100000, eps : 1.5%
epi : 1301
epi : 1302
epi : 1303
epi : 1304
epi : 1305
epi : 1306
epi : 1307
epi : 1308
epi : 1309
epi : 1310
epi : 1311
epi : 1312
epi : 1313
epi : 1314
epi : 1315
epi : 1316
epi : 1317
epi : 1318
epi : 1319
epi : 1320
n_episode :1320, score : 643.4, n_buffer : 100000, eps 

epi : 1834
epi : 1835
epi : 1836
epi : 1837
epi : 1838
epi : 1839
epi : 1840
n_episode :1840, score : 640.8, n_buffer : 100000, eps : 1.0%
epi : 1841
epi : 1842
epi : 1843
epi : 1844
epi : 1845
epi : 1846
epi : 1847
epi : 1848
epi : 1849
epi : 1850
epi : 1851
epi : 1852
epi : 1853
epi : 1854
epi : 1855
epi : 1856
epi : 1857
epi : 1858
epi : 1859
epi : 1860
n_episode :1860, score : 398.4, n_buffer : 100000, eps : 1.0%
epi : 1861
epi : 1862
epi : 1863
epi : 1864
epi : 1865
epi : 1866
epi : 1867
epi : 1868
epi : 1869
epi : 1870
epi : 1871
epi : 1872
epi : 1873
epi : 1874
epi : 1875
epi : 1876
epi : 1877
epi : 1878
epi : 1879
epi : 1880
n_episode :1880, score : 669.2, n_buffer : 100000, eps : 1.0%
epi : 1881
epi : 1882
epi : 1883
epi : 1884
epi : 1885
epi : 1886
epi : 1887
epi : 1888
epi : 1889
epi : 1890
epi : 1891
epi : 1892
epi : 1893
epi : 1894
epi : 1895
epi : 1896
epi : 1897
epi : 1898
epi : 1899
epi : 1900
n_episode :1900, score : 880.6, n_buffer : 100000, eps : 1.0%
epi : 1901
epi 