**AUTHORIZATIONS**

In [None]:
# Mount Google Drive at ./mount (Colab only); the checkpoint paths used
# further down in this notebook live under "mount/My Drive/...".
from google.colab import drive
drive.mount('./mount')

**IMPORT LIBRARIES**

In [3]:
import collections
import copy

import numpy as np
import torch
import torch.nn as nn
from torch import optim

**Parameters**

In [4]:
# Env
ROW = 10  # side length of the square board (passed to Environment)

# Q-learning
GAMMA = 0.99  # discount
BATCH_SIZE = 3  # transitions sampled from the replay buffer per training step
REPLAY_SIZE = 10000  # capacity of the replay buffer
LEARNING_RATE = 1e-4  # learning rate of the Adam optimizer
SYNC_TARGET_LOOPS = 1000  # the target net is synced every this many steps
# Training is skipped until the buffer holds this many transitions.
# NOTE(review): equal to REPLAY_SIZE, so training starts only once the buffer is full.
REPLAY_START_SIZE = 10000

# epsilon -> chance for random action; it decreases linearly from
# EPSILON_START by 1/EPSILON_DECAY_LAST_FRAME per step and is clamped at
# EPSILON_FINAL.
EPSILON_DECAY_LAST_FRAME = 10**5
EPSILON_START = 1.0
EPSILON_FINAL = 0.008

# Neural network
INPUT_SIZE = 28  # 4 wall distances + 8 apple rays + 8 body rays + 4 head dir + 4 tail dir
N_ACTIONS = 4  # up, right, down, left

**Environment (Snake game)**

In [5]:
class SnakeSensors:
    """Compute the observation features of the snake from the board.

    The board is a square ``row x row`` array indexed as ``board[y, x]``.
    ``update_sensor_board`` must be called with the current board and head
    position before any of the ``check_*``, ``all_eight_directions``,
    ``next_to_head`` or ``distance_to_walls`` methods are used.

    Args:
        row: side length of the square board.
        moves: mapping with the keys "up", "right", "down" and "left" whose
            values are the (dy, dx) step vectors of those directions.
    """

    # Order of the one-hot direction encoding.
    _DIRECTION_ORDER = ("up", "right", "down", "left")

    def __init__(self, row, moves):
        self.row = row
        self.dis = self.row - 1  # largest valid index on either axis
        self.moves = moves

        # (dy, dx) offsets of the four cells adjacent to the head.
        self.next_to_head_dir = [(1, 0), (0, 1), (-1, 0), (0, -1)]

    def update_sensor_board(self, board, snake):
        """Store the board and head position and refresh the wall distances.

        Args:
            board: 2-D array indexed as ``board[y, x]``.
            snake: (y, x) position of the snake's head.
        """
        self.board = board
        self.head_y, self.head_x = snake

        # distance to boundaries in 4 dimensions
        self.dis_y_down = self.dis - self.head_y
        self.dis_y_up = self.head_y
        self.dis_x_right = self.dis - self.head_x
        self.dis_x_left = self.head_x

    def _in_bounds(self, y, x):
        """Return True when (y, x) is a valid board coordinate."""
        return 0 <= y < self.row and 0 <= x < self.row

    def _scan(self, dy, dx, target):
        """Walk from the head along (dy, dx); return 1 if ``target`` is seen, else 0.

        The head cell itself is not inspected.
        """
        y, x = self.head_y + dy, self.head_x + dx
        while self._in_bounds(y, x):
            if self.board[y, x] == target:
                return 1
            y, x = y + dy, x + dx
        return 0

    def check_up(self, target):
        """Return 1 if ``target`` lies straight above the head, else 0."""
        return self._scan(-1, 0, target)

    def check_down(self, target):
        """Return 1 if ``target`` lies straight below the head, else 0."""
        return self._scan(1, 0, target)

    def check_right(self, target):
        """Return 1 if ``target`` lies straight to the right of the head, else 0."""
        return self._scan(0, 1, target)

    def check_left(self, target):
        """Return 1 if ``target`` lies straight to the left of the head, else 0."""
        return self._scan(0, -1, target)

    def check_right_up(self, target):
        """Return 1 if ``target`` lies on the up-right diagonal, else 0."""
        return self._scan(-1, 1, target)

    def check_right_down(self, target):
        """Return 1 if ``target`` lies on the down-right diagonal, else 0."""
        return self._scan(1, 1, target)

    def check_left_up(self, target):
        """Return 1 if ``target`` lies on the up-left diagonal, else 0."""
        return self._scan(-1, -1, target)

    def check_left_down(self, target):
        """Return 1 if ``target`` lies on the down-left diagonal, else 0."""
        return self._scan(1, -1, target)

    def all_eight_directions(self, target):
        """Return the eight ray flags, clockwise starting from "up".

        Order: up, up-right, right, down-right, down, down-left, left, up-left.
        """
        return np.array([
            self.check_up(target),
            self.check_right_up(target),
            self.check_right(target),
            self.check_right_down(target),
            self.check_down(target),
            self.check_left_down(target),
            self.check_left(target),
            self.check_left_up(target),
        ])

    def next_to_head(self, target):
        """Return flags for the four cells adjacent to the head.

        The order follows ``next_to_head_dir``. A neighbour outside the board
        yields 0. The bounds test covers the whole board, including the last
        row and column.
        """
        flags = []
        for dy, dx in self.next_to_head_dir:
            y, x = self.head_y + dy, self.head_x + dx
            hit = self._in_bounds(y, x) and self.board[y, x] == target
            flags.append(1 if hit else 0)
        # float dtype matches what repeated np.append on an empty array produced
        return np.array(flags, dtype=float)

    def distance_to_walls(self):
        """Return the [up, right, down, left] wall distances scaled by ``row - 1``.

        Values are rounded to one decimal place.
        """
        distances = np.array([self.dis_y_up, self.dis_x_right,
                              self.dis_y_down, self.dis_x_left])
        return np.round(distances / self.dis, 1)

    def _one_hot_direction(self, direction):
        """One-hot encode a (dy, dx) step; all zeros when it matches no move."""
        encoded = np.array([0, 0, 0, 0])
        for position, name in enumerate(self._DIRECTION_ORDER):
            if np.array_equal(direction, self.moves[name]):
                encoded[position] = 1
                break
        return encoded

    def get_head_direction(self, head_dir):
        """One-hot encode the head direction as [up, right, down, left].

        Returns all zeros when ``head_dir`` is None or matches no known move,
        so the result can always be concatenated into the state vector.
        """
        if head_dir is None:
            return np.array([0, 0, 0, 0])
        return self._one_hot_direction(head_dir)

    def get_tail_direction(self, snake):
        """One-hot encode the step from the tail segment to the next segment.

        Args:
            snake: array of (y, x) body segments whose first row is the tail.

        Returns:
            [up, right, down, left] one-hot array; all zeros when the snake
            has a single segment or the step matches no known move.
        """
        if len(snake) <= 1:
            return np.array([0, 0, 0, 0])
        body = np.asarray(snake)
        return self._one_hot_direction(body[1] - body[0])

### ENV
class Environment():
    """Snake game on a square board with a reset/step interface.

    The board is a ``row x row`` array indexed as ``board[y, x]``. The snake
    body is stored tail-first in ``snake_body``; its last row is the head.

    Args:
        n_row: side length of the square board.
    """

    # Action index -> key in ``moves_dir``.
    _ACTION_NAMES = ("up", "right", "down", "left")

############################### Initialize parameters ###############################
    def __init__(self, n_row):
        # row * row = board
        self.row = n_row
        # map of board
        self.blocks = {"empty": 0, "snake": 1, "apple": 2}
        # (y, x)
        self.moves_dir = {"up": np.array([-1, 0]), "right": np.array([0, 1]),
                          "down": np.array([1, 0]), "left": np.array([0, -1])}
        # List of all rewards
        self.reward_dict = {"hit self": -100, "hit boundary": -100, "eat apple": 10,
                            "step": -1, "a lot of steps": -100, "win game": 1000}
        # Number of possible actions
        self.action_space = 4
        # Set up sensors for getting state
        self.Sensors = SnakeSensors(self.row, self.moves_dir)
        # Prepare game
        self.reset()

    def second_init(self):
        """Reset the per-episode bookkeeping."""
        self.done = False           # If game is over (death)
        self.direction = None       # Direction of head (for computing state)
        self.steps = 0              # count of steps since the last apple
        self.eaten_apples = 0       # count of eaten apples
        self.info = "Unfinished"    # "Finished" once the player has won

############################### GENERATING ###############################
    def generate_grid(self):
        """Create an empty (all zeros) square board."""
        self.board = np.zeros((self.row, self.row))

    def generate_snake(self):
        """Place a one-segment snake on a random cell."""
        indices = np.random.randint(0, high=self.row, size=2)
        y, x = indices[0], indices[1]

        self.board[y, x] = self.blocks["snake"]
        self.snake_body = np.array([[y, x]])
        self.beginning_lenght = 1

    def generate_apple(self):
        """Place the apple on a random empty cell.

        When no empty cell is left, ``apple_pos`` is set to None instead of
        retrying forever.
        """
        empty_cells = np.argwhere(self.board == self.blocks["empty"])
        if len(empty_cells) == 0:
            self.apple_pos = None
            return

        y, x = empty_cells[np.random.randint(len(empty_cells))]
        self.board[y, x] = self.blocks["apple"]
        self.apple_pos = np.array([y, x])

############################### CHECK LOGIC ###############################
    def check_n_steps(self):
        """End the game when too many steps passed without eating an apple."""
        if self.steps > (self.row**2/2):
            self.done = True
            self.reward = self.reward_dict["a lot of steps"]

    def check_hit_self(self):
        """End the game when two body segments share a cell.

        The number of distinct cells must equal the expected snake length
        (starting length plus eaten apples).
        """
        self.body = set([(i[0], i[1]) for i in self.snake_body.tolist()])
        self.len_body = len(self.body)
        if len(self.body) != self.eaten_apples+self.beginning_lenght:
            self.done = True
            self.reward = self.reward_dict["hit self"]

    def check_boundaries(self, new_head):
        """End the game when ``new_head`` (y, x) lies outside the board."""
        y, x = new_head
        if y < 0 or x < 0 or y > self.row-1 or x > self.row-1:
            self.done = True
            self.reward = self.reward_dict["hit boundary"]

    def check_end_of_game(self):
        """Mark the game as won when every cell of the board holds the snake."""
        if np.all(self.board == self.blocks["snake"]):
            self.done = True
            self.reward = self.reward_dict["win game"]
            self.info = "Finished"

    def check_eaten_apple(self, head):
        """Handle the apple being eaten; return True if ``head`` is on the apple.

        On a hit the step counter restarts, a new apple is generated and the
        reward grows with the square of the current body length.
        """
        if self.apple_pos is None or not np.array_equal(head, self.apple_pos):
            return False

        self.steps = 0
        self.eaten_apples += 1
        self.generate_apple()
        self.reward = self.reward_dict["eat apple"] + len(self.snake_body)**2
        return True

    def snake_algorithm(self, new_head):
        """Advance the snake: append the new head and drop the tail unless it ate."""
        self.snake_body = np.vstack((self.snake_body, new_head))

        # if eaten apple == False; tail is deleted
        if not self.check_eaten_apple(self.snake_body[-1]):
            self.snake_body = np.delete(self.snake_body, 0, 0)

    def move(self, action):
        """Apply ``action`` (0 up, 1 right, 2 down, 3 left) to the snake.

        Raises:
            ValueError: if ``action`` is not one of 0, 1, 2, 3.
        """
        if action not in (0, 1, 2, 3):
            raise ValueError("action must be 0, 1, 2 or 3, got %r" % (action,))

        direction = self.moves_dir[self._ACTION_NAMES[int(action)]]
        self.direction = direction
        head_pos = self.snake_body[-1]
        new_head_pos = (head_pos[0]+direction[0], head_pos[1]+direction[1])

        self.check_n_steps()
        self.check_boundaries(new_head_pos)
        if not self.done:
            self.snake_algorithm(new_head_pos)
            # Checked after the move so that a self-collision ends the game
            # (and is penalised) on the transition that caused it.
            self.check_hit_self()

    def compute_state(self):
        """Build the state vector from the sensors and store it in ``self.state``."""
        self.Sensors.update_sensor_board(self.board, self.snake_body[-1])
        distance = self.Sensors.distance_to_walls()
        see_apple = self.Sensors.all_eight_directions(self.blocks["apple"])
        see_self = self.Sensors.all_eight_directions(self.blocks["snake"])
        head_dir = self.Sensors.get_head_direction(self.direction)
        tail_dir = self.Sensors.get_tail_direction(self.snake_body)
        self.state = np.concatenate((distance, see_apple, see_self, head_dir, tail_dir), axis=0)

############################### PERFORM FUNCTIONS FOR ENV ###############################
    def sample_action(self):
        """Return a uniformly random action index."""
        return np.random.choice(np.array([0, 1, 2, 3]), size=1)[0]

    def refresh_board(self):
        """Redraw the board from the snake body and the apple position."""
        self.generate_grid()
        for body in self.snake_body:
            self.board[body[0], body[1]] = self.blocks["snake"]
        # No apple is drawn when the board had no free cell left for it.
        if self.apple_pos is not None:
            self.board[self.apple_pos[0], self.apple_pos[1]] = self.blocks["apple"]

    def reset(self):
        """Start a new game and return its initial state."""
        self.second_init()

        # Generate and refresh board
        self.generate_grid()
        self.generate_snake()
        self.generate_apple()
        self.compute_state()

        return self.state

    def step(self, action):
        """Perform ``action`` and return ``(state, reward, done, info)``."""
        # Restart first so the step counter of the new game is not lost.
        if self.done:
            self.reset()

        self.reward = self.reward_dict["step"]
        self.steps += 1

        self.move(action)
        self.refresh_board()
        if not self.done:
            self.check_end_of_game()
        self.compute_state()

        return self.state, self.reward, self.done, self.info

**AGENT**

In [6]:
# One transition stored in the replay buffer: the state before the action,
# the action taken, the reward received, the done flag and the resulting state.
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class ExperienceBuffer:
    """Fixed-capacity replay buffer; the oldest entries are dropped first."""

    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

    def append(self, experience):
        """Add one transition to the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions at random.

        Returns:
            Five arrays: states, actions, rewards (float32), dones (uint8)
            and next states.
        """
        picked = np.random.choice(len(self.buffer), batch_size, replace=False)
        transitions = (self.buffer[position] for position in picked)
        states, actions, rewards, dones, next_states = zip(*transitions)

        return (
            np.array(states),
            np.array(actions),
            np.array(rewards, dtype=np.float32),
            np.array(dones, dtype=np.uint8),
            np.array(next_states),
        )

class Agent:
    """Plays the environment one step at a time and records the transitions.

    Args:
        env: environment providing ``reset()``, ``step(action)`` and
            ``sample_action()``.
        exp_buffer: buffer with an ``append`` method that receives one
            ``Experience`` per step.
    """

    def __init__(self, env, exp_buffer):
        self.env = env
        self.reset_env = env  # same object as ``env``; attribute kept for existing users
        self.exp_buffer = exp_buffer
        self.generation_count = 0  # number of finished episodes
        self._reset()

    def _reset(self):
        """Start a new episode and clear the accumulated reward."""
        self.state = self.reset_env.reset()
        self.total_reward = 0.0

    @torch.no_grad()
    def play_step(self, net, epsilon=0.0, device="cpu"):
        """Perform one epsilon-greedy step in the environment.

        Args:
            net: network mapping a batch of states to Q-values per action.
            epsilon: probability of taking a random action.
            device: device on which ``net`` is evaluated.

        Returns:
            The total reward of the episode when this step ended it,
            otherwise None.
        """
        done_reward = None

        if np.random.random() < epsilon:
            action = self.env.sample_action()
        else:
            # np.asarray instead of np.array(..., copy=False): building an
            # array from a list always needs a copy, which NumPy >= 2
            # rejects with a ValueError when copy=False is requested.
            state_a = np.asarray([self.state], dtype=np.float32)
            state_v = torch.from_numpy(state_a).to(device, dtype=torch.float32)
            q_vals_v = net(state_v)
            _, action_v = torch.max(q_vals_v, dim=1)
            action = int(action_v.item())

        # do step in the environment
        new_state, reward, is_done, _ = self.env.step(action)
        self.total_reward += reward

        exp = Experience(self.state, action, reward, is_done, new_state)
        self.exp_buffer.append(exp)
        self.state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()
            self.generation_count += 1
        return done_reward

**DQN AI**

In [7]:
class Neural_Network(nn.Module):
    """Fully connected Q-network that carries its own Adam optimizer.

    Input to NN:
        [distance to wall, see apple, see it self, head direction, tail direction] -> 28 elements
    output of NN:
        [0: up    1: right    2: down    3: left] -> 4 elements
    """

    def __init__(self, lr=LEARNING_RATE):
        super().__init__()

        # Layer widths from input to output; ReLU sits between consecutive
        # linear layers but not after the last one.
        widths = (INPUT_SIZE, 20, 12, N_ACTIONS)
        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # drop the activation that would follow the output layer

        self.model = nn.Sequential(*stack)
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, input_tensor):
        """Return the Q-values of every action for ``input_tensor``."""
        q_values = self.model(input_tensor)
        return q_values

class SaveAndLoad:
    """Persist and restore the network weights and training counters."""

    _NET_FILE = "mount/My Drive/Colab Notebooks/snake/net.dat"
    _INDEX_FILE = "mount/My Drive/Colab Notebooks/snake/index"
    _REWARDS_FILE = "mount/My Drive/Colab Notebooks/snake/total_rewards"
    _DEATHS_FILE = "mount/My Drive/Colab Notebooks/snake/count_deaths"

    @staticmethod
    def _read_array(path):
        """Load one NumPy array from ``path``."""
        with open(path, 'rb') as handle:
            return np.load(handle)

    @staticmethod
    def _write_array(path, values):
        """Store ``values`` as a NumPy array at exactly ``path``."""
        # Writing through a file object keeps the file name free of ".npy".
        with open(path, 'wb') as handle:
            np.save(handle, np.array(values))

    def load_models(self, net, target_net, agent, device):
        """Restore weights and counters from disk.

        The saved weights are loaded into ``net`` and copied to
        ``target_net``; ``agent.generation_count`` is restored as well.

        Returns:
            ``(net, target_net, index, total_rewards)``.
        """
        weights = torch.load(self._NET_FILE, map_location=torch.device(device))
        net.load_state_dict(weights)
        target_net.load_state_dict(net.state_dict())

        index = self._read_array(self._INDEX_FILE)[0]
        total_rewards = self._read_array(self._REWARDS_FILE).tolist()
        agent.generation_count = self._read_array(self._DEATHS_FILE)[0]
        return net, target_net, index, total_rewards

    def save_models(self, net, agent, index, total_rewards):
        """Write the weights of ``net`` and the training counters to disk."""
        torch.save(net.state_dict(), self._NET_FILE)
        self._write_array(self._INDEX_FILE, [index])
        self._write_array(self._REWARDS_FILE, total_rewards)
        self._write_array(self._DEATHS_FILE, [agent.generation_count])

class DQN(SaveAndLoad):
    """Deep Q-learning trainer with an online network and a target network.

    Args:
        net: online network; it must expose an ``optimizer`` attribute.
        buffer: replay buffer supporting ``len()`` and ``sample(batch_size)``.
        agent: agent that plays the environment and fills ``buffer``.
        load: when True, restore weights and counters from disk.
    """

    def __init__(self, net, buffer, agent, load=False):
        self.device = self.select_device()
        self.net = net.to(self.device)
        # nn.Module.to() returns the module itself, so the target network has
        # to be an explicit copy; otherwise both names refer to one network
        # and the periodic sync in simulate() has no effect.
        self.target_net = copy.deepcopy(self.net)
        self.buffer = buffer
        self.agent = agent

        self.epsilon = EPSILON_START
        self.lr = LEARNING_RATE

        self.second_init(load)

    def second_init(self, load):
        """Initialise the training statistics, optionally restoring them from disk."""
        # parameters
        self.best_mean_reward = None
        self.mean_reward = None
        self.finished = False
        # loading sequence
        if load:
            self.net, self.target_net, \
                self.index, self.total_rewards = \
                    self.load_models(self.net, self.target_net, self.agent, self.device)
        else:
            self.index = 0
            self.total_rewards = []

    def select_device(self):
        """Return the CUDA device when available, otherwise the CPU."""
        if torch.cuda.is_available():
            # NOTE(review): set_default_tensor_type is deprecated in recent
            # PyTorch releases; kept because it changes a process-wide default
            # that other notebook cells may rely on.
            torch.set_default_tensor_type(torch.cuda.FloatTensor)
            print("using cuda:", torch.cuda.get_device_name(0))
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def save(self):
        """Write the online network and the training counters to disk."""
        self.save_models(self.net, self.agent, self.index, self.total_rewards)

    def api(self):
        """Return board, state, reward, epsilon, mean reward, steps, generation and score."""
        return (self.agent.env.board, self.agent.env.state, self.agent.env.reward, \
                self.epsilon, self.mean_reward, \
                self.agent.env.steps, self.agent.generation_count, \
                self.agent.env.eaten_apples)

    def light_api(self):
        """Return epsilon, mean reward, steps, generation and score."""
        return (self.epsilon, self.mean_reward, \
                self.agent.env.steps, self.agent.generation_count, \
                self.agent.env.eaten_apples)

    def super_light_api(self):
        """Return the info string of the environment."""
        return (self.agent.env.info)

    def calc_loss(self, batch, device="cpu"):
        """Compute the mean squared TD error for a sampled batch.

        Args:
            batch: ``(states, actions, rewards, dones, next_states)`` arrays.
            device: device on which the tensors are placed.

        Returns:
            Scalar loss tensor.
        """
        # unpack batch
        states, actions, rewards, dones, next_states = batch

        # convert everything from batch to torch tensors and move it to device
        states_v = torch.tensor(states).to(device, dtype=torch.float32)
        next_states_v = torch.tensor(next_states).to(device, dtype=torch.float32)
        actions_v = torch.tensor(actions).to(device, dtype=torch.int64)
        rewards_v = torch.tensor(rewards).to(device, dtype=torch.float32)
        done_mask = torch.tensor(dones).to(device, dtype=torch.bool)

        # Q(s, a) of the online net for the actions that were actually taken
        state_action_values = self.net(states_v).gather(1, actions_v.unsqueeze(-1)).squeeze(-1)
        # max_a' Q_target(s', a'); terminal transitions have no future value
        next_state_values = self.target_net(next_states_v).max(1)[0]
        next_state_values[done_mask] = 0.0
        # no gradient must flow into the target network
        next_state_values = next_state_values.detach()

        expected_state_action_values = next_state_values * GAMMA + rewards_v
        # Calculate NN loss
        return nn.MSELoss()(state_action_values, expected_state_action_values)

    def simulate(self):
        """Play one environment step and, once the buffer is warm, train on one batch."""
        self.index += 1
        self.epsilon = max(EPSILON_FINAL, EPSILON_START - self.index / EPSILON_DECAY_LAST_FRAME)

        reward = self.agent.play_step(self.net, self.epsilon, device=self.device)

        # A non-None reward means an episode has just ended.
        if reward is not None:
            self.total_rewards.append(reward)
            self.mean_reward = np.mean(self.total_rewards[-100:])

            if self.best_mean_reward is None or self.best_mean_reward < self.mean_reward:
                self.save()
                self.total_rewards = self.total_rewards[-100:]

                if self.best_mean_reward is not None:
                    self.agent_info = {"Generation": self.agent.generation_count, "Mean reward": self.mean_reward, "Epsilon": self.epsilon}
                    print(self.agent_info)
                self.best_mean_reward = self.mean_reward

            # NOTE(review): play_step() resets the environment when an episode
            # ends, and that reset sets info back to "Unfinished", so this
            # condition looks unreachable as written - confirm and expose the
            # final info through the agent if a win should stop training.
            if self.agent.env.info == "Finished":
                print("Solved in %d frames!" % self.index)
                self.save()
                self.finished = True
                return

        if len(self.buffer) < REPLAY_START_SIZE:
            return

        # Periodically copy the online weights into the target network
        if self.index % SYNC_TARGET_LOOPS == 0:
            self.target_net.load_state_dict(self.net.state_dict())

        # Calculate loss of NN and train it
        self.net.optimizer.zero_grad()
        batch = self.buffer.sample(BATCH_SIZE)
        loss_t = self.calc_loss(batch, device=self.device)
        loss_t.backward()
        self.net.optimizer.step()



**Train**

In [None]:
# Build the network, environment, replay buffer, agent and trainer
# (load=False starts training from scratch instead of restoring a checkpoint).
net = Neural_Network()
env = Environment(ROW)
buffer = ExperienceBuffer(REPLAY_SIZE)
agent = Agent(env, buffer)
dqn = DQN(net, buffer, agent, load=False)

flag = True  # NOTE(review): not used anywhere in the visible code
count = 0    # iterations since the last progress report

# Run training steps until the environment reports a finished game.
while True:
    dqn.simulate()

    if dqn.super_light_api() == "Finished":
        break

    # Progress report and checkpoint: on the first iteration and then
    # every 100000 iterations.
    if count % 100000 == 0:
        epsilon, mean_reward, steps, generation, score = dqn.light_api()
        print("Generation", generation, "Mean reward", mean_reward, "Epsilon", epsilon, "Mean Reward", mean_reward)
        dqn.save()
        count = 0
    count += 1

using cuda: Tesla K80
Generation 0 Mean reward None Epsilon 0.99999 Mean Reward None
{'Generation': 2, 'Mean reward': -127.5, 'Epsilon': 0.99943}
{'Generation': 3, 'Mean reward': -122.66666666666667, 'Epsilon': 0.99929}
{'Generation': 4, 'Mean reward': -118.25, 'Epsilon': 0.99923}
{'Generation': 5, 'Mean reward': -115.6, 'Epsilon': 0.99917}
{'Generation': 6, 'Mean reward': -111.66666666666667, 'Epsilon': 0.99909}
{'Generation': 7, 'Mean reward': -110.42857142857143, 'Epsilon': 0.99905}
{'Generation': 8, 'Mean reward': -109.25, 'Epsilon': 0.99903}
{'Generation': 1749, 'Mean reward': -108.74, 'Epsilon': 0.72139}
{'Generation': 2134, 'Mean reward': -108.72, 'Epsilon': 0.66933}
{'Generation': 2145, 'Mean reward': -108.67, 'Epsilon': 0.6679200000000001}
{'Generation': 2146, 'Mean reward': -108.66, 'Epsilon': 0.66791}
{'Generation': 2147, 'Mean reward': -108.61, 'Epsilon': 0.66774}
{'Generation': 2148, 'Mean reward': -108.35, 'Epsilon': 0.66771}
{'Generation': 2149, 'Mean reward': -108.2, 'E