In [1]:
# !pip install opencv-python



In [2]:
# !pip install pillow



In [1]:
import numpy as np
import cv2
from PIL import Image
import time
import pickle
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

In [2]:
# Training-loop hyperparameters.
# NOTE(review): none of these six names is read by any cell visible in this
# notebook (the keras-rl agents below receive their own literals) — confirm
# whether they are leftovers that can be removed.
EPISODES = 30000
SHOW_EVERY = 3000

# Exploration rate and its per-episode decay factor; discount and learning rate.
epsilon = 0.6
EPS_DECAY = 0.9998
DISCOUNT = 0.95
LEARNING_RATE = 0.1

In [3]:
class Cube:
    """A one-cell entity (player, food or enemy) living on a square grid.

    Both coordinates are integers that are always kept inside
    ``[0, size - 1]``; the start position is drawn uniformly at random.
    """

    # (dx, dy) displacement for every discrete action index understood by `action`.
    _ACTION_DELTAS = {
        0: (1, 1),
        1: (-1, 1),
        2: (1, -1),
        3: (-1, -1),
        4: (0, 1),
        5: (0, -1),
        6: (1, 0),
        7: (-1, 0),
        8: (0, 0),
    }

    def __init__(self, size):
        """Place the cube on a random cell of a ``size`` x ``size`` grid."""
        self.size = size
        self.x = np.random.randint(0, self.size)
        self.y = np.random.randint(0, self.size)

    def __str__(self):
        return f'{self.x},{self.y}'

    def __sub__(self, other):
        """Return the ``(dx, dy)`` offset from ``other`` to this cube as a tuple."""
        return (self.x - other.x, self.y - other.y)

    def __eq__(self, other):
        """Two cubes are equal when they occupy the same cell."""
        return self.x == other.x and self.y == other.y

    def action(self, choise):
        """Apply discrete action ``choise`` (0-8); any other value is ignored.

        0-3 are the four diagonals, 4/5 move along y only, 6/7 along x only
        and 8 stays in place.
        """
        delta = self._ACTION_DELTAS.get(choise)
        if delta is not None:
            self.move(x=delta[0], y=delta[1])

    def move(self, x=None, y=None):
        """Shift the cube by ``(x, y)`` and clamp it to the grid.

        An axis whose argument is omitted (``None``, or the legacy ``False``)
        takes a random step drawn from {-1, 0, 1}. An explicit ``0`` means
        "do not move on this axis"; it used to be mistaken for "omitted"
        because the check was a plain truthiness test.
        """
        if x is None or x is False:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        if y is None or y is False:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # Keep the cube on the board.
        self.x = min(max(self.x, 0), self.size - 1)
        self.y = min(max(self.y, 0), self.size - 1)

In [4]:
class envCube:
    """Grid world with a player, a piece of food and an enemy.

    The interface (`reset`, `step`, `render`) follows the gym-style contract
    that the agents in this notebook are trained against. Observations are
    either a normalised RGB image of the board (`RETURN_IMAGE = True`) or a
    4-tuple of player-to-food and player-to-enemy offsets.
    """

    SIZE = 10
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)
    # Alternative for the non-image observation: OBSERVATION_SPACE_VALUES = (4,)
    ACTION_SPACE_VALUES = 9
    RETURN_IMAGE = True

    FOOD_REWARD = 25
    ENEMY_PENALITY = -300
    MOVE_PENALITY = -1

    # Colour per entity id; the colour names are as shown by cv2.imshow,
    # which reads the channels in BGR order.
    d = {
        1: (255, 0, 0),  # blue
        2: (0, 255, 0),  # green
        3: (0, 0, 255),  # red
    }

    PLAYER_N = 1
    FOOD_N = 2
    ENEMY_N = 3

    def _observe(self):
        """Build the observation for the current positions of the three cubes."""
        if self.RETURN_IMAGE:
            return np.array(self.get_image()) / 255
        return (self.player - self.food) + (self.player - self.enemy)

    def reset(self):
        """Start a new episode and return the first observation.

        Food is re-drawn until it is off the player's cell; the enemy is
        re-drawn until it is off both the player's and the food's cell.
        """
        self.player = Cube(self.SIZE)

        self.food = Cube(self.SIZE)
        while self.food == self.player:
            self.food = Cube(self.SIZE)

        self.enemy = Cube(self.SIZE)
        while self.enemy == self.player or self.enemy == self.food:
            self.enemy = Cube(self.SIZE)

        observation = self._observe()
        self.episode_step = 0
        return observation

    def step(self, action):
        """Advance one tick and return ``(observation, reward, done, info)``.

        The player applies ``action``; food and enemy then each take a random
        step. The episode ends when the player shares a cell with the food or
        the enemy, or once 200 steps have been taken.
        """
        self.episode_step += 1
        self.player.action(action)
        self.food.move()
        self.enemy.move()

        new_observation = self._observe()

        reached_food = self.player == self.food
        hit_enemy = self.player == self.enemy

        # Food takes precedence over the enemy when both share the player's cell.
        if reached_food:
            reward = self.FOOD_REWARD
        elif hit_enemy:
            reward = self.ENEMY_PENALITY
        else:
            reward = self.MOVE_PENALITY

        done = bool(reached_food or hit_enemy or self.episode_step >= 200)
        return new_observation, reward, done, {}

    def render(self, mode='human'):
        """Show the board, upscaled to 800x800, in an OpenCV window."""
        frame = self.get_image().resize((800, 800))
        cv2.imshow('Predator', np.array(frame))
        cv2.waitKey(1)

    def get_image(self):
        """Return the board as a SIZE x SIZE PIL image with one pixel per cube."""
        board = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        # Painted in this order so that, on a shared cell, the later entry wins.
        paint_order = (
            (self.food, self.FOOD_N),
            (self.player, self.PLAYER_N),
            (self.enemy, self.ENEMY_N),
        )
        for cube, colour_id in paint_order:
            board[cube.x][cube.y] = self.d[colour_id]
        return Image.fromarray(board, 'RGB')

    def get_qtable(self, qtable_name=None):
        """Return a Q-table keyed by ``(x1, y1, x2, y2)`` offset tuples.

        With ``qtable_name`` given, the table is unpickled from that file;
        otherwise every possible offset combination gets
        ``ACTION_SPACE_VALUES`` values drawn uniformly from [-5, 0).
        """
        if qtable_name is not None:
            # NOTE: pickle can execute arbitrary code; only load trusted files.
            with open(qtable_name, 'rb') as table_file:
                return pickle.load(table_file)

        offsets = range(-self.SIZE + 1, self.SIZE)
        return {
            (x1, y1, x2, y2): [np.random.uniform(-5, 0)
                               for _ in range(self.ACTION_SPACE_VALUES)]
            for x1 in offsets
            for y1 in offsets
            for x2 in offsets
            for y2 in offsets
        }

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten, Conv2D
from tensorflow.keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
# Single shared environment instance used by every training/testing cell below.
env = envCube()

In [6]:
def build_model(status,nb_actions):
    """Build the convolutional Q-network.

    Args:
        status: observation shape tuple; a leading axis of length 1 is
            prepended to form the network input shape (presumably the
            agent memory's window_length of 1 — confirm).
        nb_actions: number of discrete actions, i.e. the width of the
            linear output layer.

    Returns:
        An uncompiled Keras ``Sequential`` model.
    """
    layer_stack = [
        # Two 3x3 convolutions extract features from the board image.
        Conv2D(32, (3, 3), activation='relu', input_shape=(1,) + status),
        Conv2D(32, (3, 3), activation='relu'),
        Flatten(),
        # Small fully connected head producing one value per action.
        Dense(32, activation='relu'),
        Dense(32, activation='relu'),
        Dense(nb_actions, activation='linear'),
    ]
    return Sequential(layer_stack)

In [7]:
# Q-network sized for the image observation and the 9-action space of `env`.
model = build_model(env.OBSERVATION_SPACE_VALUES,env.ACTION_SPACE_VALUES)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() would only append a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 1, 8, 8, 32)       896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 1, 6, 6, 32)       9248      
_________________________________________________________________
flatten (Flatten)            (None, 1152)              0         
_________________________________________________________________
dense (Dense)                (None, 32)                36896     
_________________________________________________________________
dense_1 (Dense)              (None, 32)                1056      
_________________________________________________________________
dense_2 (Dense)              (None, 9)                 297       
Total params: 48,393
Trainable params: 48,393
Non-trainable params: 0
____________________________________________________

In [13]:
def build_agent(model,nb_actions):
    """Create and compile a double-DQN agent around ``model``.

    Args:
        model: Keras model returning one value per action.
        nb_actions: number of discrete actions.

    Returns:
        A compiled ``DQNAgent`` using a 50000-entry sequential replay memory
        (window length 1), a Boltzmann Q policy, 1000 warm-up steps,
        ``target_model_update=5000`` and Adam with learning rate 1e-3; mean
        absolute error is tracked as a metric.
    """
    agent = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_steps_warmup=1000,
        enable_double_dqn=True,
        target_model_update=5000,
        policy=BoltzmannQPolicy(),
    )
    agent.compile(Adam(learning_rate=1e-3), metrics=['mae'])
    return agent

In [31]:
def build_duel_agent(model,nb_actions):
    """Create and compile a dueling-network DQN agent around ``model``.

    Args:
        model: Keras model returning one value per action.
        nb_actions: number of discrete actions.

    Returns:
        A compiled ``DQNAgent`` with ``enable_dueling_network=True``
        (``dueling_type='avg'``), a 50000-entry sequential replay memory
        (window length 1), 1000 warm-up steps and Adam with learning rate
        1e-4. Exploration is epsilon-greedy with ``eps`` annealed linearly
        from 1.0 to 0.1 over 500000 steps (0.05 when testing).
    """
    exploration = LinearAnnealedPolicy(
        EpsGreedyQPolicy(),
        attr='eps',
        value_max=1.,
        value_min=.1,
        value_test=.05,
        nb_steps=500000,
    )
    agent = DQNAgent(
        model=model,
        nb_actions=nb_actions,
        memory=SequentialMemory(limit=50000, window_length=1),
        nb_steps_warmup=1000,
        enable_dueling_network=True,
        dueling_type='avg',
        policy=exploration,
    )
    agent.compile(Adam(learning_rate=1e-4))
    return agent

In [32]:
# Dueling-DQN agent built on the notebook-level `model`.
dqn_duel = build_duel_agent(model,env.ACTION_SPACE_VALUES)

In [34]:
# Checkpoint the dueling agent's weights every 10000 steps while training.
# `{step}` in the template is presumably substituted by the callback — confirm.
checkpoint_weights_filename = './models_duel/dqn_weights_{step}.h5f'
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=10000)]
# Long-running cell: one million environment steps, no rendering.
dqn_duel.fit(env, nb_steps=1000000, visualize=False, verbose=1,callbacks=callbacks)

Training for 1000000 steps ...
Interval 1 (0 steps performed)
156 episodes - episode_reward: -183.192 [-491.000, 25.000] - loss: 612.513 - mean_q: 93.782 - mean_eps: 0.990

Interval 2 (10000 steps performed)


<tensorflow.python.keras.callbacks.History at 0x7fd9a119c2e0>

In [14]:
# Double-DQN agent.
# NOTE(review): this passes the same `model` object that is also handed to
# build_duel_agent in another cell, so the two agents may end up sharing
# layers/weights — confirm, or build a fresh model per agent.
dqn = build_agent(model,env.ACTION_SPACE_VALUES)

In [15]:
# Train the double-DQN agent for 100000 environment steps without rendering.
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1)

Training for 100000 steps ...
Interval 1 (0 steps performed)
    1/10000 [..............................] - ETA: 26:03 - reward: -1.0000



137 episodes - episode_reward: -196.095 [-493.000, 24.000] - loss: 170.642 - mae: 5.091 - mean_q: -0.080

Interval 2 (10000 steps performed)
137 episodes - episode_reward: -236.255 [-484.000, 25.000] - loss: 231.204 - mae: 11.132 - mean_q: -6.692

Interval 3 (20000 steps performed)
117 episodes - episode_reward: -192.709 [-493.000, 25.000] - loss: 209.880 - mae: 14.052 - mean_q: -10.550

Interval 4 (30000 steps performed)
142 episodes - episode_reward: -184.704 [-483.000, 24.000] - loss: 201.401 - mae: 14.187 - mean_q: -11.529

Interval 5 (40000 steps performed)
138 episodes - episode_reward: -209.594 [-489.000, 25.000] - loss: 198.439 - mae: 17.624 - mean_q: -15.747

Interval 6 (50000 steps performed)
135 episodes - episode_reward: -181.407 [-480.000, 25.000] - loss: 208.235 - mae: 23.469 - mean_q: -22.667

Interval 7 (60000 steps performed)
127 episodes - episode_reward: -178.228 [-499.000, 25.000] - loss: 193.905 - mae: 25.916 - mean_q: -25.542

Interval 8 (70000 steps performed)
12

<tensorflow.python.keras.callbacks.History at 0x7fd9e2d49a00>

In [16]:
# Persist the trained double-DQN weights, replacing any existing file of that name.
dqn.save_weights('double-dqn_weights_R2.h5f', overwrite=True)

In [17]:
# Evaluate over 20 rendered episodes and report the mean episode reward.
scores = dqn.test(env, nb_episodes=20, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 20 episodes ...
Episode 1: reward: -200.000, steps: 200
Episode 2: reward: -87.000, steps: 113
Episode 3: reward: -24.000, steps: 50
Episode 4: reward: 25.000, steps: 1
Episode 5: reward: -104.000, steps: 130
Episode 6: reward: -9.000, steps: 35
Episode 7: reward: -10.000, steps: 36
Episode 8: reward: -20.000, steps: 46
Episode 9: reward: -51.000, steps: 77
Episode 10: reward: 11.000, steps: 15
Episode 11: reward: -200.000, steps: 200
Episode 12: reward: -84.000, steps: 110
Episode 13: reward: -348.000, steps: 49
Episode 14: reward: -200.000, steps: 200
Episode 15: reward: -404.000, steps: 105
Episode 16: reward: 10.000, steps: 16
Episode 17: reward: -303.000, steps: 4
Episode 18: reward: -200.000, steps: 200
Episode 19: reward: -200.000, steps: 200
Episode 20: reward: -200.000, steps: 200
-129.9


In [52]:
# Drop the agent and network from the kernel namespace.
# NOTE(review): the cells that follow still call `dqn.load_weights(...)` and
# `dqn.test(...)`; run top to bottom they would hit a NameError unless `model`
# and `dqn` are rebuilt first — confirm the intended order.
del dqn,model

In [36]:
# Restore weights from an earlier checkpoint.
# NOTE(review): the recorded run of this cell failed with
# "Layer weight shape (32, 9) not compatible with provided weight shape (32, 10)",
# i.e. the checkpoint appears to come from a 10-output network while the current
# model has 9 actions — confirm which checkpoint/action space is intended.
# NOTE(review): this cell also sits after `del dqn,model`, so `dqn` must be
# rebuilt before it can run in notebook order.
dqn.load_weights('./models/dqn_weights_imageEnv_810000.h5f')


Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fd9a044e250> and <tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fd9a06b6a90>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.convolutional.Conv2D object at 0x7fd9a06b6a90> and <tensorflow.python.keras.layers.core.Flatten object at 0x7fd9f1c0e5e0>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fd9a06d7d90> and <tensorflow.python.keras.layers.core.Dense object at 0x7fd9a06e9610>).

Two checkpoint references resolved to different objects (<tensorflow.python.keras.layers.core.Dense object at 0x7fd9a06e9610> and <tensorflow.python.keras.layers.core.Dense object at 0x7fd9a06e9c10>).


ValueError: Layer weight shape (32, 9) not compatible with provided weight shape (32, 10)

In [60]:
# Evaluate over 10 rendered episodes and report the mean episode reward.
# NOTE(review): which weights `dqn` holds here depends on cell execution order
# (the preceding load_weights cell recorded a ValueError) — confirm.
scores = dqn.test(env, nb_episodes=10, visualize=True)
print(np.mean(scores.history['episode_reward']))

Testing for 10 episodes ...




Episode 1: reward: -200.000, steps: 200
Episode 2: reward: 16.000, steps: 10
Episode 3: reward: 0.000, steps: 26
Episode 4: reward: -200.000, steps: 200
Episode 5: reward: -75.000, steps: 101
Episode 6: reward: 12.000, steps: 14
Episode 7: reward: 9.000, steps: 17
Episode 8: reward: 4.000, steps: 22
Episode 9: reward: -200.000, steps: 200
Episode 10: reward: -51.000, steps: 77
-68.5
