In [13]:
!pip install pygame
!pip install stable_baselines3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame
  Downloading pygame-2.1.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (21.8 MB)
[K     |████████████████████████████████| 21.8 MB 1.4 MB/s 
[?25hInstalling collected packages: pygame
Successfully installed pygame-2.1.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stable_baselines3
  Downloading stable_baselines3-1.6.0-py3-none-any.whl (177 kB)
[K     |████████████████████████████████| 177 kB 4.1 MB/s 
[?25hCollecting gym==0.21
  Downloading gym-0.21.0.tar.gz (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 6.5 MB/s 
Building wheels for collected packages: gym
  Building wheel for gym (setup.py) ... [?25l[?25hdone
  Created wheel for gym: filename=gym-0.21.0-py3-none-any.whl size=1616823 sha256=17a795d25ba4ec418c1626ac2d8e69265b0319a304c4e7af51d92057a487ecc0
  Stored in directo

In [28]:
import pygame
import random
import numpy as np
#np.set_printoptions(threshold=np.inf)
from pygame.surfarray import array3d
import torch
import cv2
import gym 
from gym import spaces 
from stable_baselines3 import DQN
import os
import pickle
from stable_baselines3.common.utils import polyak_update


In [15]:
# Select SDL's "dummy" video driver so pygame can open a display surface on a
# machine without a screen (e.g. a hosted notebook runtime). Keep this cell above
# any cell that calls pygame.init() / pygame.display.set_mode().
os.environ["SDL_VIDEODRIVER"] = "dummy" 

In [16]:
# Drone class
class Drone:
    """Player-controlled sprite that moves in fixed-size steps.

    A move is requested with one of the ``move_*`` methods, which only record
    a pending displacement; ``update()`` applies the pending displacement to
    the position and then clears it.

    Args:
        gameDisplay: Surface the sprite is blitted onto by ``draw()``.
        display_width: Screen width in pixels, used to scale the sprite.
        display_height: Screen height in pixels, used to scale the sprite.
        *args, **kwargs: Accepted and ignored.
    """

    def __init__(self, gameDisplay, display_width=800, display_height=600, *args, **kwargs):
        # Where to draw and how large the screen is.
        self.gameDisplay = gameDisplay
        self.display_width = display_width
        self.display_height = display_height

        # Pixels moved per requested step.
        self.drone_speed = 20

        # Nominal footprint read by DroneWars for its bounds and collision tests.
        self.drone_width = 70 # 35
        self.drone_height = 70 # 35

        # Current top-left position and the displacement waiting to be applied.
        self.x, self.y = 0, 0
        self.x_change, self.y_change = 0, 0

        # To fix up png files use: pngcrush -ow -rem allb -reduce file.png
        sprite = pygame.image.load('images/drone1.png').convert()
        sprite_size = (int(self.display_width * 0.1), int(self.display_height * 0.12))
        self.img = pygame.transform.scale(sprite, sprite_size)

    def move_left(self):
        """Queue one step towards smaller x."""
        self.x_change = -self.drone_speed

    def move_right(self):
        """Queue one step towards larger x."""
        self.x_change = self.drone_speed

    def move_up(self):
        """Queue one step towards smaller y."""
        self.y_change = -self.drone_speed

    def move_down(self):
        """Queue one step towards larger y."""
        self.y_change = self.drone_speed

    def update(self):
        """Apply the queued displacement, then clear it."""
        self.x, self.y = self.x + self.x_change, self.y + self.y_change
        self.x_change = self.y_change = 0

    def draw(self):
        """Blit the sprite at its current position."""
        position = (self.x, self.y)
        self.gameDisplay.blit(self.img, position)


In [17]:
# Obstacle class
class Obstacle:
    """Hazard that falls down the screen and can be recycled to the top.

    Args:
        gameDisplay: Surface the sprite is blitted onto by ``draw()``.
        display_width: Screen width in pixels; bounds the random spawn column
            and scales the sprite.
        display_height: Screen height in pixels, used to scale the sprite.
        *args, **kwargs: Accepted and ignored.
    """

    def __init__(self, gameDisplay, display_width=800, display_height=600, *args, **kwargs):
        self.gameDisplay = gameDisplay
        self.display_width = display_width
        self.display_height = display_height

        # Pixels travelled downwards per update() call.
        self.speed = 40

        # Nominal footprint read by DroneWars for its collision tests.
        self.height = 100 # self.display_width / 8
        self.width = 100 # self.display_width / 6

        # Spawn in a random column, just above the visible area.
        self.x = random.randrange(0, display_width)
        self.y = -100

        sprite = pygame.image.load('images/asteroid.png').convert()
        sprite_size = (int(self.display_width * 0.16), int(self.display_height * 0.2))
        self.img = pygame.transform.scale(sprite, sprite_size)

    def reset(self):
        """Move the obstacle back above the screen in a new random column."""
        self.x = random.randrange(0, self.display_width)
        self.y = -self.height

    def update(self):
        """Advance the obstacle one step down the screen."""
        self.y = self.y + self.speed

    def draw(self):
        """Blit the sprite at its current position."""
        position = (self.x, self.y)
        self.gameDisplay.blit(self.img, position)

In [19]:
# Environment class

def pre_processing(image, w=84, h=84):
    """Reduce a captured game frame to a small binary image for the network.

    The frame is cropped, resized to ``(w, h)``, converted to grayscale,
    thresholded at 127 and scaled so every value is either 0.0 or 1.0.

    Args:
        image: 3-D pixel array of the frame; ``DroneWars.step`` passes the
            result of ``pygame.surfarray.array3d``.
        w: Width handed to ``cv2.resize``.
        h: Height handed to ``cv2.resize``.

    Returns:
        ``np.float32`` array with a leading singleton axis, i.e. ``(1, 84, 84)``
        for the default ``w``/``h``.
    """
    # Drop the first 20 entries of the second axis so the score is not visible.
    cropped = image[:800, 20:, :]

    resized = cv2.resize(cropped, (w, h))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    # cv2.threshold returns (threshold_used, image); only the image is needed.
    binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)[1]

    # Add the channel axis, then normalise 0/255 to 0.0/1.0.
    frame = np.array(binary[None, :, :]).astype(np.float32)
    return frame / 255


class DroneWars(gym.Env):
    """Two-drone obstacle-dodging game exposed through the gym ``Env`` interface.

    Two drones sit near the bottom of the screen while obstacles fall from the
    top. One discrete action steers both drones at once. An episode ends when a
    drone leaves the screen or is hit by an obstacle; the game state is then
    reset inside ``step`` so the caller can simply keep stepping.

    Observations are produced by ``pre_processing``: a binary ``float32`` image
    of shape ``(1, 84, 84)`` with values in ``[0, 1]``.

    Args:
        gameDisplay: pygame display surface to draw on.
        display_width: Screen width in pixels.
        display_height: Screen height in pixels.
        clock: Optional ``pygame.time.Clock`` used to cap the frame rate. When
            ``None`` the frame rate is not limited.
        fps: Frame-rate cap passed to ``clock.tick``.
        *args, **kwargs: Accepted and ignored.
    """

    # Action id -> (drone1 direction, drone2 direction); -1 left, 0 stay, +1 right.
    _ACTION_TABLE = {
        0: (0, 0),
        1: (0, -1),
        2: (0, 1),
        3: (-1, -1),
        4: (-1, 0),
        5: (-1, 1),
        6: (1, 1),
        7: (1, 0),
        8: (1, -1),
    }

    def __init__(self, gameDisplay, display_width=800, display_height=600, clock=None, fps = 30, *args, **kwargs):
        super(DroneWars, self).__init__()
        self.gameDisplay = gameDisplay
        self.display_width = display_width
        self.display_height = display_height
        self.clock = clock
        self.fps = fps

        # Colour palette (RGB).
        self.black = (0,0,0)
        self.white = (255,255,255)
        self.dark_red = (150,0,0)
        self.green = (0,255,0)
        self.dark_green = (0,150,0)
        self.red = (255,0,0)

        # 3 choices (left / stay / right) for each of the 2 drones -> 3 * 3 = 9 actions.
        self.n_actions = 9
        self.action_space = spaces.Discrete(self.n_actions)
        # pre_processing() emits float32 values already normalised to [0, 1].
        self.observation_space = spaces.Box(low=0.0, high=1.0, shape=(1, 84, 84), dtype=np.float32)

        # Sprites are built once here and only repositioned on later resets,
        # so their images are not reloaded from disk at every game over.
        self.my_drone1 = Drone(gameDisplay, display_width, display_height)
        self.my_drone2 = Drone(gameDisplay, display_width, display_height)
        self.num_of_obstacles = 1 # number of obstacles
        self.obstacle_list = [
            Obstacle(gameDisplay, display_width, display_height)
            for _ in range(self.num_of_obstacles)
        ]

        self._font = None # created lazily by scoreboard()
        self.score = 0
        self.gameExit = False
        self._place_drones()

        pygame.display.set_caption('Drone Wars')

    def _place_drones(self):
        """Put both drones on their starting spots and clear pending moves."""
        self.my_drone1.x = self.display_width * 0.8
        self.my_drone2.x = self.display_width * 0.2
        for drone in (self.my_drone1, self.my_drone2):
            drone.y = self.display_height * 0.85
            drone.x_change = 0
            drone.y_change = 0

    def _reset_game(self):
        """Return drones, obstacles, score and the game-over flag to their start state."""
        self._place_drones()
        for obs in self.obstacle_list:
            obs.reset()
        self.score = 0
        self.gameExit = False

    def _steer(self, drone, direction):
        """Queue a left (-1) or right (+1) move on ``drone``; 0 leaves it alone."""
        if direction < 0:
            drone.move_left()
        elif direction > 0:
            drone.move_right()

    def close(self):
        """No-op: the display surface is created by the caller, not by this class."""
        pass

    def reset(self):
        """Restart the game and return the first observation.

        Returns:
            ``np.float32`` array of shape ``(1, 84, 84)`` with values in
            ``[0, 1]``, matching ``observation_space``. Note that ``step``
            wraps the same kind of array in a ``torch`` tensor.
        """
        self._reset_game()
        self.render()
        return pre_processing(array3d(pygame.display.get_surface()))

    def render(self, mode="human"):
        """Draw the current frame (background, obstacles, drones, score).

        Args:
            mode: Accepted for compatibility with ``gym.Env.render``; the
                frame is always drawn to ``gameDisplay``.
        """
        self.gameDisplay.fill(self.white) # Comment this out if using scrolBackground
        for obs in self.obstacle_list:
            obs.draw()

        self.my_drone1.draw()
        self.my_drone2.draw()

        self.scoreboard(self.score)
        pygame.display.update()

    def scoreboard(self, count):
        """Draw ``"Score: <count>"`` in the top-left corner."""
        if self._font is None:
            # Build the font once instead of on every frame.
            self._font = pygame.font.SysFont(None, 25)
        text = self._font.render("Score: "+str(count), True, self.black)
        self.gameDisplay.blit(text,(0,0))

    def out_of_bounds(self, drone, display_width, display_height):
        """Return True when the drone's footprint is not fully inside the screen."""
        outside_x = drone.x < 0 or drone.x > display_width - drone.drone_width
        outside_y = drone.y < 0 or drone.y > display_height - drone.drone_height
        return bool(outside_x or outside_y)

    def collision_multi(self, drone, obstacle_list):
        """Return True when the drone collides with any obstacle in the list."""
        return any(self.collision(drone, obs) for obs in obstacle_list)

    def collision(self, drone, obstacle):
        """Return True when the drone is considered hit by ``obstacle``.

        A hit requires the obstacle's bottom edge to be below the drone's top
        edge and either the drone's left or right edge to lie strictly inside
        the obstacle's horizontal span.
        """
        # NOTE(review): there is no lower vertical bound, so an obstacle that has
        # already fallen past the drone still counts until it is recycled.
        if not drone.y < obstacle.y + obstacle.height:
            return False

        span_start = obstacle.x
        span_end = obstacle.x + obstacle.width
        drone_left = drone.x
        drone_right = drone.x + drone.drone_width
        left_inside = span_start < drone_left < span_end
        right_inside = span_start < drone_right < span_end
        return bool(left_inside or right_inside)

    def step(self, action, record=False):
        """Advance the game by one frame.

        Args:
            action: Integer in ``[0, 8]`` selecting a (drone1, drone2) move
                pair, see ``_ACTION_TABLE``. Other integers move nothing.
            record: When True, additionally return the raw frame as a BGR
                array with its first two axes swapped.

        Returns:
            ``(observation, reward, done, info)`` or, with ``record=True``,
            ``(observation, frame, reward, done, info)``. ``observation`` is a
            ``torch`` tensor built from ``pre_processing``. ``reward`` is 0.1
            per surviving frame (plus 0.01 for action 0), 1 when an obstacle
            leaves the screen and -1 on game over. ``done`` is True on game
            over, in which case the game has already been reset.
        """
        reward = 0.1
        # Accept plain ints as well as numpy / torch scalars.
        action = int(action)

        if action == 0:
            # Small bonus for staying put.
            reward += 0.01

        drone1_dir, drone2_dir = self._ACTION_TABLE.get(action, (0, 0))
        self._steer(self.my_drone1, drone1_dir)
        self._steer(self.my_drone2, drone2_dir)

        # Apply the queued moves.
        self.my_drone1.update()
        self.my_drone2.update()

        # Move obstacles down; one that left the screen is recycled to the top
        # at a random x coordinate and scores a point.
        for obs in self.obstacle_list:
            obs.update()
        for obs in self.obstacle_list:
            if obs.y > self.display_height:
                obs.reset()
                reward = 1
                self.score += 1

        drones = (self.my_drone1, self.my_drone2)

        # Leaving the screen ends the game.
        for drone in drones:
            if self.out_of_bounds(drone, self.display_width, self.display_height):
                reward = -1
                self.gameExit = True

        # Being hit by an obstacle costs a point and ends the game.
        for drone in drones:
            if self.collision_multi(drone, self.obstacle_list):
                self.score -= 1
                reward = -1
                self.gameExit = True

        self.render()
        if self.clock is not None:
            self.clock.tick(self.fps)
        #print("clock:", self.clock.get_fps()) # Uncomment to printout actual fps

        done = self.gameExit
        if done:
            # Prepare the next episode; the frame just rendered is still the
            # one captured below.
            self._reset_game()

        state = array3d(pygame.display.get_surface())
        info = {}
        observation = torch.from_numpy(pre_processing(state))

        if record:
            frame = np.transpose(cv2.cvtColor(state, cv2.COLOR_RGB2BGR), (1, 0, 2))
            return observation, frame, reward, done, info
        return observation, reward, done, info


In [25]:
import torch.nn as nn

class DeepQNetwork(nn.Module):
    """Convolutional Q-network for stacks of 84x84 frames.

    Three convolution layers are followed by two fully connected layers that
    output one Q-value per action. The size of the first fully connected layer
    (``7 * 7 * 64``) fixes the spatial input size to 84x84.

    Args:
        in_channels: Number of stacked frames fed to the first convolution.
        n_actions: Number of Q-values produced (one per discrete action).
    """

    def __init__(self, in_channels=4, n_actions=9):
        super(DeepQNetwork, self).__init__()

        # Attribute names are part of the checkpoint format (state_dict keys).
        self.conv1 = nn.Sequential(nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(inplace=True))
        self.conv2 = nn.Sequential(nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(inplace=True))
        self.conv3 = nn.Sequential(nn.Conv2d(64, 64, kernel_size=3, stride=1), nn.ReLU(inplace=True))

        # 84 -> 20 -> 9 -> 7 pixels per side after the three convolutions.
        self.fc1 = nn.Sequential(nn.Linear(7 * 7 * 64, 512), nn.ReLU(inplace=True))
        self.fc2 = nn.Linear(512, n_actions)
        self._initialize_weights()

    def _initialize_weights(self):
        """Draw all conv/linear weights from U(-0.01, 0.01) and zero the biases."""
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.uniform_(m.weight, -0.01, 0.01)
                nn.init.constant_(m.bias, 0)

    def forward(self, input):
        """Compute Q-values for a batch of frame stacks.

        Args:
            input: Tensor of shape ``(batch, in_channels, 84, 84)``.

        Returns:
            Tensor of shape ``(batch, n_actions)``.
        """
        output = self.conv1(input)
        output = self.conv2(output)
        output = self.conv3(output)
        # Flatten everything except the batch axis for the linear layers.
        output = output.view(output.size(0), -1)
        output = self.fc1(output)
        output = self.fc2(output)

        return output

In [44]:
# Train custom Network

# TODO:
# Implement Hindsight Experience Replay or similar
# Training works well for a single drone but struggles with 2 drones

# Model parameters
# Hyper-parameters and run settings passed to train().
model_params = {
    'batch_size' : 32, # transitions sampled from replay memory per gradient step
    'optimizer' : "adam", # ["sgd", "adam"] # NOTE(review): train() always builds torch.optim.Adam and never reads this key
    'lr' : 1e-4, # optimizer learning rate
    'gamma' : 0.99, # discount factor applied to the bootstrapped next-state value
    'initial_epsilon' : 1, # exploration probability at iteration 0
    'final_epsilon' : 0.001, # exploration probability once the decay has finished
    'num_decay_iters' : 500000, # iterations over which epsilon decays linearly
    'num_iters' : 650000, # training stops once this many iterations have run
    'replay_memory_size' : 100000, # Replay memory size must not exceed available RAM # 10000 = 1Gb
    'saved_folder' : "model", # directory for checkpoints and the pickled replay memory
    'render' : True # NOTE(review): not read by train() in this notebook
}

def train(model_params):
    """Train a DQN agent on ``DroneWars``, checkpointing as it goes.

    An online network picks actions (epsilon-greedy) and is optimised with a
    Huber loss against bootstrapped targets computed by a separate target
    network. The target network is only ever changed by copying the online
    weights into it (``polyak_update`` with ``tau = 1``) every
    ``model_update_rate`` episodes.

    If a checkpoint / replay-memory pickle already exists in
    ``model_params['saved_folder']``, training resumes from it.

    Args:
        model_params: Dict with the keys ``batch_size``, ``lr``, ``gamma``,
            ``initial_epsilon``, ``final_epsilon``, ``num_decay_iters``,
            ``num_iters``, ``replay_memory_size`` and ``saved_folder``.

    Returns:
        None. Progress is printed and state is written to ``saved_folder``:
        ``drone_wars.pth`` (online weights, optimizer, iteration),
        ``drone_wars_target.pth`` (target weights, iteration) and
        ``replay_memory.pkl``.
    """
    pygame.init()
    clock = pygame.time.Clock()
    #flags = pygame.SHOWN # Use this pygame flag for training on local machine to see game rendering
    flags = pygame.HIDDEN
    gameDisplay = pygame.display.set_mode((800,600), flags)

    tau = 1 # polyak coefficient; 1 means the target becomes an exact copy
    update_starts = 1 # target syncing is allowed once this many steps have run
    model_update_rate = 5 # sync the target network every this many episodes
    save_every = 5000 # iterations between checkpoints
    updated = False # True once the target was synced for the current episode count
    episodes = 0
    rewards = []
    scores = []
    all_scores = []

    # torch.manual_seed seeds the CPU generator (used for weight init) and CUDA.
    torch.manual_seed(123)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = DeepQNetwork().to(device)
    model_target = DeepQNetwork().to(device)
    # read more https://blog.gofynd.com/building-a-deep-q-network-in-pytorch-fa1086aa5435

    # Only the online network is optimised.
    optimizer = torch.optim.Adam(model.parameters(), lr=model_params['lr'])

    os.makedirs(model_params['saved_folder'], exist_ok=True)
    checkpoint_path = os.path.join(model_params['saved_folder'], "drone_wars.pth")
    checkpoint_path_target = os.path.join(model_params['saved_folder'], "drone_wars_target.pth")
    memory_path = os.path.join(model_params['saved_folder'], "replay_memory.pkl")

    # Resume from the last checkpoint when one exists.
    if os.path.isfile(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location=device)
        step_idx = checkpoint["iter"] + 1
        model.load_state_dict(checkpoint["model_state_dict"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        if os.path.isfile(checkpoint_path_target):
            checkpoint_target = torch.load(checkpoint_path_target, map_location=device)
            model_target.load_state_dict(checkpoint_target["model_state_dict"])
        else:
            # No separate target checkpoint: start the target from the online weights.
            model_target.load_state_dict(model.state_dict())
        print("Load trained model from iteration {}".format(step_idx))
    else:
        step_idx = 0
        model_target.load_state_dict(model.state_dict())

    if os.path.isfile(memory_path):
        # NOTE(security): pickle.load executes arbitrary code from the file;
        # only resume from a replay memory this notebook wrote itself.
        with open(memory_path, "rb") as f:
            replay_memory = pickle.load(f)
        print("Load replay memory")
    else:
        replay_memory = []
    #criterion = nn.MSELoss() # Mean square error loss
    criterion = nn.SmoothL1Loss() # stablebaselines dqn is using huber loss

    env = DroneWars(gameDisplay, display_width=800, display_height=600, clock=clock, fps=200)
    n_actions = env.n_actions

    # The network expects 4 stacked frames; start by repeating the first frame.
    first_frame, _, _, _ = env.step(0)
    state = torch.cat([first_frame] * 4)[None, :, :, :] # shape (1, 4, 84, 84)

    while step_idx < model_params['num_iters']:

        # Linearly decayed exploration rate.
        epsilon = model_params['final_epsilon'] + (
                max(model_params['num_decay_iters'] - step_idx, 0)
                * (model_params['initial_epsilon'] - model_params['final_epsilon'])
                / model_params['num_decay_iters'])

        # Exploration or exploitation
        if random.random() <= epsilon:
            action = random.randint(0, n_actions - 1)
        else:
            with torch.no_grad():
                action = torch.argmax(model(state.to(device))[0]).item()

        next_frame, reward, done, _ = env.step(action)

        # Slide the frame window: drop the oldest frame, append the newest.
        next_state = torch.cat((state[0, 1:, :, :], next_frame))[None, :, :, :]

        replay_memory.append([state, action, reward, next_state, done])
        if len(replay_memory) > model_params['replay_memory_size']:
            del replay_memory[0]

        batch = random.sample(replay_memory, min(len(replay_memory), model_params['batch_size']))
        states, actions, batch_rewards, next_states, dones = zip(*batch)

        state_batch = torch.cat(states).to(device)
        next_state_batch = torch.cat(next_states).to(device)
        action_batch = torch.tensor(actions, dtype=torch.int64, device=device)
        reward_batch = torch.tensor(batch_rewards, dtype=torch.float32, device=device)
        done_batch = torch.tensor(dones, dtype=torch.float32, device=device)

        # Q(s, a) comes from the online network so that its parameters, the
        # ones the optimizer owns, receive the gradient.
        q_value = model(state_batch).gather(1, action_batch.unsqueeze(1)).squeeze(1)

        # The bootstrap target comes from the frozen target network and is
        # excluded from the graph; terminal transitions do not bootstrap.
        with torch.no_grad():
            next_q_max = model_target(next_state_batch).max(dim=1)[0]
            y_batch = reward_batch + model_params['gamma'] * next_q_max * (1.0 - done_batch)

        optimizer.zero_grad()
        loss = criterion(q_value, y_batch)
        loss.backward()
        optimizer.step()

        state = next_state

        # Keeping score list
        score = env.score
        step_idx += 1
        rewards.append(reward)
        scores.append(score)
        all_scores.append(score)

        # Increment episode if done
        if done:
            episodes += 1
            updated = False

        if step_idx < update_starts and done:
            print(f"# Collecting samples for {step_idx+1}/{update_starts} steps #")
            print(f"Episode: {episodes}")
            print()

        # Copy the online weights into the target network every
        # `model_update_rate` episodes (once per qualifying episode count).
        if (step_idx > update_starts) and (episodes % model_update_rate == 0) and not updated:
            polyak_update(model.parameters(), model_target.parameters(), tau)
            updated = True

            print("###############################")
            print("### Updating target network ###")
            print("###############################")

            print(f"Episode: {episodes}")
            print(f"Step: {step_idx+1}/{model_params['num_iters']}")
            print(f"Loss: {loss.item():.5f}")
            print(f"LR: {optimizer.param_groups[0]['lr']:.5f}")
            print(f"Epsilon: {epsilon:.4f}")
            print(f"Mean Reward: {np.mean(rewards):.4f}")
            print(f"Mean Score: {np.mean(scores):.4f}")
            print()

            rewards = []
            scores = []

        # Save model
        if (step_idx + 1) % save_every == 0:
            checkpoint = {"iter": step_idx,
                          "model_state_dict": model.state_dict(),
                          "optimizer": optimizer.state_dict()}
            torch.save(checkpoint, checkpoint_path)

            # The target network has no optimizer; only its weights are stored.
            checkpoint_target = {"iter": step_idx,
                                 "model_state_dict": model_target.state_dict()}
            torch.save(checkpoint_target, checkpoint_path_target)

            mean_score = np.mean(all_scores) if all_scores else float("nan")
            print("## Saving model. Average Score: ", mean_score)
            all_scores = [] # Reset all_scores list

            with open(memory_path, "wb") as f:
                pickle.dump(replay_memory, f, protocol=pickle.HIGHEST_PROTOCOL)




In [45]:
# Start (or resume) training with the settings in model_params. This runs until
# model_params['num_iters'] iterations are done; interrupt the kernel to stop early.
train(model_params)

###############################
### Updating target network ###
###############################
Episode: 0
Step: 3/650000
Loss: 0.00502
LR: 0.00010
Epsilon: 1.0000
Mean Reward: 0.1000
Mean Score: 0.0000

###############################
### Updating target network ###
###############################
Episode: 5
Step: 84/650000
Loss: 0.02059
LR: 0.00010
Epsilon: 0.9998
Mean Reward: 0.0444
Mean Score: 0.1605



KeyboardInterrupt: ignored