## Classic snake DQN
This notebook uses the DQN algorithm (with experience replay) introduced by Google DeepMind (2013) to train an agent to play the classic Snake game.
- The **DQNagent** class handles the replay memory and the two sets of networks. It implements a modified version of the architecture specified by DeepMind.
- The **snake_RL_env** module provides a custom classic Snake environment that is designed specifically for RL tasks and exposes the interface the agent interacts with.

In [1]:
import snake_RL_env
from DQN_agent import DQNagent
import gymnasium as gym
from gymnasium.wrappers import RecordVideo

In [2]:
import os
import time

import numpy as np
import tensorflow as tf

In [3]:
# Record the TensorFlow build and the GPU devices it can see, so the hardware
# context of this run is visible next to the training output.
print("Tensorflow version:", tf.__version__)
# An empty list means TensorFlow detected no GPU and will run on the CPU.
print("GPU avaible:", tf.config.list_physical_devices('GPU')  )

Tensorflow version: 2.19.0
GPU avaible: []


In [4]:
# Training configuration: everything a reader might want to tune lives here.
training_period = 50           # a video is recorded for every episode whose index is a multiple of this
num_episodes = 1000            # the training loop iterates over range(num_episodes + 1)
record_performance_every = 50  # episodes between reward-statistics snapshots / model checkpoints
min_e = 0.01                   # lower bound applied to the exploration rate e when it is decayed
total_e_decay_cycles = 1       # number of e-reset cycles; only referenced by the disabled reset logic

#### **Initialising the environment**

In [5]:
# Create the snake environment in "rgb_array" render mode and wrap it in
# RecordVideo so that every `training_period`-th episode is saved as a video
# under Classic_snake_gym_vid/ with the "training" file-name prefix.
env = RecordVideo(
    gym.make("snake_RL_env/ClassicSnake-v0", render_mode="rgb_array"),
    video_folder="Classic_snake_gym_vid",
    name_prefix="training",
    episode_trigger=lambda episode_id: episode_id % training_period == 0,
)

  logger.warn(


#### **Initialising the agent**

In [6]:
# Build the DQN agent from two properties of the underlying environment:
# its `size` attribute and the number of discrete actions. `env.unwrapped`
# reaches past the RecordVideo wrapper to the custom environment itself.
# NOTE(review): `size` is presumably the board side length -- confirm in snake_RL_env.
agent=DQNagent.agent(env.unwrapped.size, env.unwrapped.action_space.n)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Training loop: play `num_episodes + 1` episodes, store every transition in the
# agent's replay memory, train after each step, and periodically snapshot
# reward statistics and a model checkpoint.

# Accumulators for the (currently disabled) e-reset cycles: each is meant to
# collect the avg_rew / min_rew / max_rew arrays gathered during one e_decay_cycle.
AVG_REW = []
MIN_REW = []
MAX_REW = []

a = agent.env_size      # first two dimensions of the reshaped state
b = agent.stack_frame   # last dimension of the reshaped state

# Reward per episode, plus avg/min/max snapshots taken every
# `record_performance_every` episodes. Each array starts with a single 0
# placeholder, so real data begins at index 1.
rewards = np.array([0], dtype=np.float64)
avg_rew = np.array([0], dtype=np.float64)
min_rew = np.array([0], dtype=np.float64)
max_rew = np.array([0], dtype=np.float64)

e_decay_rate = .999   # multiplicative decay applied to e after every episode
e_decay_cycle = 1

# The exploration rate must live OUTSIDE the episode loop: initialising it
# inside the loop resets it to 1 every episode and cancels the decay below.
e = 1

# Make sure the checkpoint directory exists before the first save.
os.makedirs("checkpoints", exist_ok=True)

for episode in range(num_episodes + 1):
    obs1, info = env.reset()
    # Take one priming step with action 0 so two frames are available to stack.
    obs2, reward, terminate, truncate, info = env.step(0)

    st_ = np.array([obs2, obs1])  # (s_{t+1}, s_t), newest frame first
    # NOTE(review): np.array([...]) puts the frame index on the leading axis and
    # reshape() only reinterprets the buffer. If observations are (a, a) grids this
    # interleaves the two frames instead of stacking them on the last axis
    # (np.moveaxis would do that). Left unchanged because the layout the agent's
    # network expects is not visible from this notebook.
    st = st_.reshape((a, a, b))
    episode_reward = 0  # cumulative reward obtained in this episode
    # Respect the outcome of the priming step instead of stepping a finished env.
    done = terminate or truncate

    start_time = time.time()
    while not done:  # executing a single episode
        action = agent.e_greedy(st, e)
        obs, rt, terminate, truncate, info = env.step(action)
        st1_ = np.array([obs, st_[0]])  # new frame + previous newest frame
        st1 = st1_.reshape((a, a, b))
        # Only `terminate` is stored with the transition; truncation is not
        # passed to the replay memory as a terminal flag.
        agent.update_replay_memory(st, action, rt, st1, terminate)
        done = terminate or truncate
        agent.train(done)

        st = st1
        st_ = st1_
        episode_reward += rt

    end_time = time.time()

    print(f"Time: {end_time - start_time:.2f}, Episode_reward: {episode_reward:.2f}")

    e = max(min_e, e_decay_rate * e)
    # np.append returns a NEW array; the result has to be assigned back.
    rewards = np.append(rewards, episode_reward)

    if episode % record_performance_every == 0:
        Z = rewards[-record_performance_every:]
        avg_rew = np.append(avg_rew, np.mean(Z))
        min_rew = np.append(min_rew, np.min(Z))
        max_rew = np.append(max_rew, np.max(Z))

        agent.model.save(f"checkpoints/dqn_episode_{episode}.keras")
        # PENDING: code to log data in tensorboard using tf.summary

    # Disabled: periodic e reset, intended to help the agent escape local minima
    # if it ever gets stuck in one. Kept for reference.
    # if episode == e_decay_cycle * num_episodes // total_e_decay_cycles:
    #     e = 1
    #     e_decay_cycle += 1
    #     AVG_REW.append(avg_rew)
    #     MIN_REW.append(min_rew)
    #     MAX_REW.append(max_rew)
    #     rewards = np.array([0], dtype=np.float64)
    #     avg_rew = np.array([0], dtype=np.float64)
    #     min_rew = np.array([0], dtype=np.float64)
    #     max_rew = np.array([0], dtype=np.float64)
    #
    #     agent.model.save(f"checkpoints/dqn_episode_{episode}.keras")
        

Time: 0.06, Episode_reward: -0.31
Time: 0.00, Episode_reward: -0.25
Time: 0.00, Episode_reward: -1.11
Time: 0.00, Episode_reward: -1.11
Time: 0.00, Episode_reward: -0.18
Time: 0.00, Episode_reward: -1.21
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -1.28
Time: 0.00, Episode_reward: -1.26
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -1.07
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -1.22
Time: 0.00, Episode_reward: -1.05
Time: 0.00, Episode_reward: -1.15
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.05
Time: 0.00, Episode_reward: -1.17
Time: 0.00, Episode_reward: -1.12
Time: 0.00, Episode_reward: -1.17
Time: 0.00, Episode_reward: -1.09
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -1.23
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Episode_reward: -0.29
Time: 0.00, Ep