In [1]:
import gym
import gym_sokoban
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image
from IPython import display

import queue
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

import warnings
warnings.filterwarnings('ignore')
import os
if not os.path.exists("model"):
    os.mkdir("model")





# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
env = gym.make('Sokoban-small-v1')

def get_screen():
    # Returned screen requested by gym is 400x600x3, but is sometimes larger
    # such as 800x1200x3. Transpose it into torch order (CHW).
    screen = env.render(mode='tiny_rgb_array').transpose(2, 0, 1)[0]
    # Cart is in the lower half, so strip off the top and bottom of the screen
    screen_height, screen_width = screen.shape
    # Convert to float, rescale, convert to torch tensor
    # (this doesn't require a copy)
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen).unsqueeze(0)
    # Resize, and add a batch dimension (BCHW)
    return screen.unsqueeze(0).to(device)


# Get screen size so that we can initialize layers correctly based on shape
init_screen = get_screen()
_, _, screen_height, screen_width = init_screen.shape

#env.reset()
#plt.figure()
#plt.imshow(get_screen().cpu().squeeze(0).permute(1, 2, 0).numpy(),
          # interpolation='none')
#plt.title('Example extracted screen')
#plt.show()

In [3]:
env = gym.make('Sokoban-small-v1')
env.reset()
get_screen().size()
#env.render(mode='tiny_rgb_array').transpose((2, 0, 1))[0]

torch.Size([1, 1, 7, 7])

In [4]:
class NN(nn.Module):

    def __init__(self, h, w):
        super(NN, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1)
        self.bn2 = nn.BatchNorm2d(32)
        #self.conv3 = nn.Conv2d(32, 32, kernel_size=3, stride=1)
        #self.bn3 = nn.BatchNorm2d(32)
       
    
        
        

        # Number of Linear input connections depends on output of conv2d layers
        # and therefore the input image size, so compute it.
        #def conv2d_size_out(size, kernel_size = 3, stride = 1):
         #   return (size - (kernel_size - 1) - 1) // stride  + 1
        #convw = conv2d_size_out(conv2d_size_out(conv2d_size_out(w)))
        #convh = conv2d_size_out(conv2d_size_out(conv2d_size_out(h)))
        #linear_input_size = convw * convh * 32
        
        
        self.fc1 = nn.Linear(3*3*32, 3*3*32)
        
        self.fc2_val = nn.Linear(3*3*32, 1)
        
        self.fc3_pol = nn.Linear(3*3*32, 8)
        self.fc4_pol_softmax = nn.Softmax()

    
        
       
        
    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x, val_bool = False):
        
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.fc1(x.view(x.size(0), -1)))
        
        # output value             
        x_val = F.relu(self.fc2_val(x.view(x.size(0), -1)))
        
        # output policy         
        x_pol = self.fc3_pol(x.view(x.size(0), -1))
        x_pol = self.fc4_pol_softmax(x_pol.view(x_pol.size(0), -1))           
    
        
        return (x_val, x_pol)
        


In [5]:
def train_model(t_max, main_model, global_t):

    thread = gym.make('Sokoban-small-v1') 


    # sync train model copy with global train model
    #train_model = main_model


    # thread-step-counter t = 0
    t = 0


    #pol_netlist[counter] = net

    # reset policy and val gradients
    optimizer.zero_grad() 

    # tstart = t
    # get state s_t
    thread.reset()

    r = []
    s = []
    a = []

    # while t < t_max and game_not_finished
    game_finished = False
    
    
    while t < t_max and not game_finished:

        # perform a_t with policy net
        #print((torch.from_numpy(thread.render('tiny_rgb_array')[:,:,0])))
        s.append(get_screen())
        #print(s[t])
        a.append(main_model(s[t])[1].max(1)[1].view(1, 1).item())

        _, reward, game_finished,_ = thread.step(a[t]+1) # mapping actions + 1 

        r.append(reward)

        


    # R = 0 if game_not_finished == True, else R = value(s_t)

    if (game_finished == True):
        R = 0
    else: 
        
        R = main_model(s[t])[0].view(1, 1).item()
        
        t = t + 1
        
    for j in range(t-1, -1, -1):
        R = r[j] + gamma*R
        val = main_model(s[j])[0]
    
        # PROBLEM: FUNKTIONIERT DAS BACKPROP RICHTIG, wegen verschiedenen branches
        loss = numpy.log(main_model(s[j])[1][a[j]-1]*(R-val))
        loss.backwards()
        optimizer.step()
        loss = np.square(val-R)
        loss.backwards()
        optimizer.step()
    
        actual_pol = main_model(s[j])[1][a[j]-1]
        criterion1 = nn.L1Loss()
        loss1 = criterion1(actual_pol*val, actual_pol*R)
        loss1.backward()
        optimizer.step()
        optimizer.zero_grad()
        criterion2 = nn.MSEloss()
        loss2 = criterion2(val, R)
        loss2.backward()
        optimizer.step()
        optimizer.zero_grad()
    # perform asynchronous updates of policy and value net using the gradients
    
    print("global training step", global_t, "finished")
            


In [6]:
def test_model(episodes):
    
    env = gym.make('Sokoban-small-v1')
    env.render('tiny_rgb_array')[:,:,0]
    
    iteration_count = 0
    reward_sum = 0
    episode_len_sum = 0
    for i_episode in range(episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen()
        current_screen = get_screen()
        for t in count():
            iteration_count += 1
            #current_screen = get_screen()
            # Select and perform an action
            action = main_model(get_screen(), val_bool = False)[1].max(1)[1].view(1, 1).item()
            
            _, reward, done, _ = env.step(action+1)
            reward_sum += reward
            if done:
                episode_len_sum += t
                break
            if iteration_count > 50:
                break
    env.close()
    average_reward = reward_sum / iteration_count
    average_ep_len = episode_len_sum / episodes
    print(average_reward, average_ep_len)
    return average_reward, average_ep_len
    
#test_model(2)

In [7]:
# HYPERPARAMETERS
env_no = 2
global_tmax = 100
t_max = 30
game_not_finished = True # vorläufig, eig soll das das gym zurückgeben
gamma = 0.99# discont rate 

# INITIALIZATION
main_model = NN(screen_height, screen_width).to(device)

optimizer = optim.RMSprop(main_model.parameters())



In [8]:
# TRAIN AND TEST LOOP
for global_t in range(global_tmax):
    train_model(t_max, main_model, global_t)

    if global_t % 10 == 0:
        test_model(20)

global training step 0 finished
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
-0.08571428571428559 0.0
global training step 1 finished
global training step 2 finished
[SOKOBAN] Retry . . .
global training step 3 finished
global training step 4 finished
[SOKOBAN] Retry . . .
global training step 5 finished
global training step 6 finished
global training step 7 finished
global training step 8 finished
global training step 9 finished
global training step 10 finished
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
[SOKOBAN] Retry . . .
-0.09999999999999987 0.0
global training step 11 finished
global training step 12 finished
[SOKOBAN] Retry . . .
global training step 13 finished
global training step 14 finished
global training step 15 finished
global training step 16 finished
global training step 17 finished
global training step 18 finished
global training step 19 finished
global training step 20 finished
-0.09999999999999987 0.0
global training step 21 finished
global trai

KeyboardInterrupt: 

In [11]:
print(main_model.parameters())

<generator object Module.parameters at 0x000001D4C545F6C8>


In [14]:
list(main_model.parameters())

[Parameter containing:
 tensor([[[[ 0.1925, -0.2914, -0.0067],
           [-0.3249, -0.1618, -0.2433],
           [ 0.2535, -0.1491,  0.0276]]],
 
 
         [[[-0.1695,  0.3124,  0.0666],
           [-0.0081,  0.3120, -0.3321],
           [ 0.1762, -0.2488, -0.3210]]],
 
 
         [[[-0.1112, -0.0478,  0.0779],
           [-0.2145,  0.0940, -0.0955],
           [-0.1347, -0.1201, -0.2593]]],
 
 
         [[[ 0.2740, -0.1930,  0.2117],
           [-0.0659, -0.2719, -0.2950],
           [-0.2541,  0.1644, -0.0018]]],
 
 
         [[[ 0.0949, -0.2039, -0.1859],
           [ 0.0058, -0.3058,  0.1397],
           [-0.0197,  0.2906, -0.0105]]],
 
 
         [[[-0.2576,  0.1217,  0.2857],
           [-0.2212, -0.2707,  0.2259],
           [-0.0015, -0.0568, -0.0387]]],
 
 
         [[[ 0.2864,  0.1605,  0.1230],
           [-0.3046,  0.0346,  0.2702],
           [-0.1151, -0.1010, -0.1690]]],
 
 
         [[[-0.0550,  0.2812, -0.0067],
           [-0.1199,  0.1951, -0.2223],
           [-0.