In [11]:
#NÃ¶tig um eigene Module richtig zu laden
import os
os.getcwd()
os.chdir("C:\\Users\\admin\\Desktop\\FML\\fml-project")
os.getcwd()

'C:\\Users\\admin\\Desktop\\FML\\fml-project'

In [18]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model

class ReplayBuffer():
    def __init__(self, mem_size, input_dims):
        self.mem_size = mem_size
        self.mem_cntr = 0
        
        self.state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_dims), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.int32)
    
    #state_ is next state
    def store_transition(self, state, action, reward, state_, done):
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1-int(done)
        self.mem_cntr += 1
        
    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        
        states = self.state_memory[batch]
        states_ =  self.new_state_memory[batch]
        actions =  self.action_memory[batch]
        rewards = self.reward_memory[batch]
        terminal = self.terminal_memory[batch]
        
        return states, actions, rewards, states_, terminal

# change this later   
def build_dqn(lr, n_actions, input_dims):
    inputs = tf.keras.Input(shape=input_dims)
    x = tf.keras.layers.Conv2D(64, 3, padding='same', activation='relu')(inputs)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    outputs = tf.keras.layers.Dense(n_actions, activation=None)(x)
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    
    model.compile(optimizer=Adam(learning_rate=lr), loss='mean_squared_error')
    return model

class Agent():
    def __init__(self, lr, gamma, n_actions, epsilon, batch_size,
                input_dims, epsilon_dec=1e-3, epsilon_end=0.01,
                mem_size=1000000, fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = epsilon_dec
        self.eps_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims)
        self.q_eval=build_dqn(lr, n_actions, input_dims)
    
    def store_transition(self, state, action, reward, new_state, done):
        self.memory.store_transition(state, action, reward, new_state, done)
    
    def choose_action(self, observation):
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(observation)
            action = np.argmax(actions)
        
        return action
    
    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            # only learn if buffer is full enough
            return
        
        states, actions, rewards, states_, dones = self.memory.sample_buffer(self.batch_size)
        
        q_eval = self.q_eval.predict(states)
        q_next = self.q_eval.predict(states_)

        q_target = np.copy(q_eval)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        q_target[batch_index, actions] = rewards + self.gamma * np.max(q_next, axis=1)*dones

        self.q_eval.train_on_batch(states, q_target)

        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min
    
    def save_model(self):
        self.q_eval.save(self.model_file)
        
    def load_model(self):
        self.q_eval = load_model(self.model_file)

In [None]:
from adapter.bomberman_adapter import BombermanEnvironment
from tf_agents.environments import tf_py_environment

env = tf_py_environment.TFPyEnvironment(BombermanEnvironment())
lr = 0.001
n_games = 500
n_actions = 6 #env.step expects an int between 0 and 5
agent = Agent(gamma=0.99, epsilon=1.0, lr=lr,
              input_dims=env.observation_spec().shape,
             n_actions=n_actions, mem_size=100000, batch_size=64,
             epsilon_end=0.01)
scores = []
eps_history=[]

for i in range(n_games):
    done = False
    score = 0
    timestep = env.reset()
    observation = timestep.observation
    turn=0
    while not done:
        action = agent.choose_action(observation)

        timestep = env.step(action)
        observation_ = timestep.observation
        reward = timestep.reward.numpy()
        done = True if timestep.step_type.numpy()[0]==2 else False
        score += reward
        agent.store_transition(observation, action, reward, observation_, done)
        observation = observation_
        agent.learn()
        turn +=1
        
    eps_history.append(agent.epsilon)
    scores.append(score)
    avg_score = np.mean(scores[-100:])
    print(f'episode: {i}, score: {score}, avg_score: {avg_score}, epsilon: {agent.epsilon}, num_turns: {turn}')

episode: 0, score: [-2.0001], avg_score: -2.0000998973846436, epsilon: 1.0, num_turns: 8
episode: 1, score: [-2.], avg_score: -2.0000500679016113, epsilon: 1.0, num_turns: 9
episode: 2, score: [-1.9997001], avg_score: -1.9999333620071411, epsilon: 1.0, num_turns: 9
episode: 3, score: [-1.9994], avg_score: -1.9998000860214233, epsilon: 1.0, num_turns: 6
episode: 4, score: [-2.0001], avg_score: -1.9998600482940674, epsilon: 1.0, num_turns: 14
episode: 5, score: [-1.9995], avg_score: -1.9998000860214233, epsilon: 1.0, num_turns: 5
episode: 6, score: [-1.89], avg_score: -1.984114408493042, epsilon: 1.0, num_turns: 9
episode: 7, score: [-1.8003], avg_score: -1.9611375331878662, epsilon: 0.991, num_turns: 12
episode: 8, score: [-1.8897], avg_score: -1.9531999826431274, epsilon: 0.985, num_turns: 6
episode: 9, score: [-1.9997001], avg_score: -1.9578500986099243, epsilon: 0.979, num_turns: 6
episode: 10, score: [-1.9997001], avg_score: -1.9616546630859375, epsilon: 0.973, num_turns: 6
episode:

episode: 80, score: [-1.9998001], avg_score: -1.9311248064041138, epsilon: 0.4379999999999995, num_turns: 5
episode: 81, score: [-1.9992], avg_score: -1.931955099105835, epsilon: 0.4329999999999995, num_turns: 5
episode: 82, score: [-1.9997001], avg_score: -1.9327712059020996, epsilon: 0.4269999999999995, num_turns: 6
episode: 83, score: [-1.9998001], avg_score: -1.9335691928863525, epsilon: 0.4219999999999995, num_turns: 5
episode: 84, score: [-1.9995], avg_score: -1.9343448877334595, epsilon: 0.4169999999999995, num_turns: 5
episode: 85, score: [-1.8897], avg_score: -1.9338256120681763, epsilon: 0.4109999999999995, num_turns: 6
episode: 86, score: [-1.8995], avg_score: -1.9334311485290527, epsilon: 0.4059999999999995, num_turns: 5
episode: 87, score: [-1.9995], avg_score: -1.9341819286346436, epsilon: 0.40099999999999947, num_turns: 5
episode: 88, score: [-1.9996], avg_score: -1.9349169731140137, epsilon: 0.39099999999999946, num_turns: 10
episode: 89, score: [-1.9984001], avg_score:

episode: 159, score: [-1.7897], avg_score: -1.9465022087097168, epsilon: 0.01, num_turns: 6
episode: 160, score: [-1.8998001], avg_score: -1.9455050230026245, epsilon: 0.01, num_turns: 5
episode: 161, score: [-1.7997], avg_score: -1.9435009956359863, epsilon: 0.01, num_turns: 6
episode: 162, score: [-1.7898], avg_score: -1.9413949251174927, epsilon: 0.01, num_turns: 5
episode: 163, score: [-1.903], avg_score: -1.9425339698791504, epsilon: 0.01, num_turns: 24
episode: 164, score: [-1.9998001], avg_score: -1.9425369501113892, epsilon: 0.01, num_turns: 5
episode: 165, score: [-1.9997001], avg_score: -1.9455389976501465, epsilon: 0.01, num_turns: 6
episode: 166, score: [-1.9995], avg_score: -1.94553804397583, epsilon: 0.01, num_turns: 5
episode: 167, score: [-1.9996], avg_score: -1.9465429782867432, epsilon: 0.01, num_turns: 7
