In [0]:
# Install the system packages (xvfb, python-opengl) and the Python packages
# (gym, pyvirtualdisplay) this notebook relies on.
# NOTE(review): both commands send stdout and stderr to /dev/null, so a failed
# install produces no visible output in this cell.
!apt-get install -y xvfb python-opengl > /dev/null 2>&1
!pip install gym pyvirtualdisplay > /dev/null 2>&1

import numpy as np
import matplotlib.pyplot as plt
from IPython import display as ipythondisplay


In [0]:
import numpy as np
import random
from collections import deque

import matplotlib.pyplot as plt
from PIL import Image
import imageio

import os
import gym
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import Model, Sequential, regularizers
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Input, MaxPooling2D, Dropout
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import Huber
from keras.applications import DenseNet121
from tensorflow.keras import layers


from skimage.color import rgb2gray
from skimage.transform import resize

import math

Using TensorFlow backend.


In [0]:
# --- Environment -----------------------------------------------------------
GAME_NAME = "Breakout-v4"

# Shape of one network input: four stacked 120x84 grayscale frames.
STATE_SIZE = (120, 84, 4)

# --- Training schedule -----------------------------------------------------
NUM_EPISODES = 500
STEPS_PER_EPISODE = 50000

# --- Learning hyper-parameters ---------------------------------------------
# NOTE(review): BATCH_SIZE and GAMMA are assigned again in the training cell
# further down, so the values here only apply until that cell has run.
BATCH_SIZE = 100
GAMMA = 0.99

# Starting exploration rate picked up by Agent.__init__.
EPSILON = 1
# NOTE(review): not referenced anywhere else in the visible notebook.
EPSILON_DECAY = 0.0001

# Capacity handed to the replay Memory.
EXP_MEMORY = 1_000_000

# --- Rolling frame buffers -------------------------------------------------
# Two independent 4-slot queues pre-filled with blank 120x84 frames.
# NOTE(review): neither queue is used by the visible cells.
frame_queue = deque(
    (np.zeros((120, 84), dtype=np.float64) for _ in range(4)),
    maxlen=4,
)
nframe_queue = deque(
    (np.zeros((120, 84), dtype=np.float64) for _ in range(4)),
    maxlen=4,
)

In [0]:
def to_grayscale(observe):
    """Turn a colour frame into a 120x84 grayscale array scaled back to floats.

    The frame is converted with ``rgb2gray``, resized to (120, 84), multiplied
    by 255 and cast to ``uint8`` (which drops the fractional part), then
    divided by 255 again.

    Args:
        observe: colour image accepted by ``rgb2gray``
            (presumably an HxWx3 RGB frame from the environment — TODO confirm).

    Returns:
        A (120, 84) float array; assuming ``rgb2gray``/``resize`` yield values
        in [0, 1], the result lies in [0, 1] in steps of 1/255.
    """
    gray = rgb2gray(observe)
    shrunk = resize(gray, (120, 84), mode='constant')
    # Round-trip through uint8 so the values are quantised to 256 levels.
    quantised = np.uint8(shrunk * 255)
    return quantised / 255

# def stack(img):
#     im = np.reshape(img, (120,84))
#     return np.array(im, dtype="float64")

def preprocess(img):
    """Prepare a raw frame for the network by delegating to ``to_grayscale``."""
    frame = to_grayscale(img)
    return frame

# def add_to_queue(frame):
#   frame_queue.popleft()
#   frame_queue.append(frame)


# def add_to_nqueue(frame):
#   nframe_queue.popleft()
#   nframe_queue.append(frame)

# def pre_processing(observe):
#     processed_observe = np.uint8(
#         resize(rgb2gray(observe), (120 , 84), mode='constant') * 255)
#     return processed_observe


In [0]:
class Memory:
    """Fixed-capacity replay buffer that evicts the oldest entry when full.

    Storage is a ``deque`` created with ``maxlen=max_memory``, so appending to
    a full buffer silently drops the oldest item.
    """

    def __init__(self, max_memory):
        """Create an empty buffer.

        Args:
            max_memory: maximum number of entries to retain.
        """
        # Honour the constructor argument instead of a module-level constant,
        # so the recorded capacity always matches the deque's real limit.
        self.maximum_mem = max_memory
        self._states = deque(maxlen=max_memory)

    def save_state(self, state):
        """Append one entry; the deque's ``maxlen`` handles eviction."""
        self._states.append(state)

    def get_states(self, no_samples):
        """Return a random sample of stored entries without replacement.

        Args:
            no_samples: requested sample size.

        Returns:
            A list of ``min(no_samples, num_states())`` distinct entries
            (an empty list when the buffer is empty).
        """
        sample_size = min(no_samples, len(self._states))
        return random.sample(self._states, sample_size)

    def num_states(self):
        """Return how many entries are currently stored."""
        return len(self._states)

In [0]:
class Agent:
  """Epsilon-greedy Q-learning agent backed by a small convolutional network.

  Attributes:
    input_shape: shape of one observation without the batch axis.
    output_shape: number of network outputs (one Q-value per action).
    env: environment whose ``action_space`` is sampled for random actions.
    model: compiled Keras model produced by ``build_model``.
    epsilon: probability of taking a random action in
      ``choose_action_greedy``. It starts at the module-level ``EPSILON`` and
      is never modified by this class; callers must update it to anneal
      exploration.
  """

  def __init__(self,env, InputShape, OutputShape):
    """Store the environment and build the Q-network.

    Args:
      env: environment exposing ``action_space.sample()``.
      InputShape: observation shape passed to the first layer.
      OutputShape: number of actions / output units.
    """
    self.input_shape = InputShape
    self.output_shape = OutputShape
    self.env = env
    self.model = self.build_model(InputShape, OutputShape)
    self.epsilon = EPSILON

  def build_model(self,InputShape, OutputShape):
    """Build and compile the convolutional Q-network.

    Args:
      InputShape: shape of a single observation (no batch axis).
      OutputShape: number of output units.

    Returns:
      The compiled ``Sequential`` model (MSE loss, RMSprop optimizer).
    """
    with tf.device('/device:GPU:0'):
      model = Sequential([
          Conv2D(32, kernel_size=(3, 3), activation="relu", input_shape=InputShape, padding="same"),
          Conv2D(32, kernel_size=(3, 3), activation="relu", padding="same"),
          Conv2D(64, kernel_size=(3, 3), activation="relu", padding="same"),
          Flatten(),

          Dense(128, activation="relu"),
          Dense(128, activation="relu"),
          # Linear head: the regression targets built in train() are
          # `reward + gamma * max Q`, which are neither confined to [0, 1]
          # nor sum to 1, so a softmax output could never fit them.
          Dense(OutputShape, activation="linear", kernel_regularizer=regularizers.l2(0.001)),
      ])
      # 'accuracy' is kept only because train() returns it alongside the loss;
      # NOTE(review): it is not a meaningful metric for Q-value regression.
      model.compile(loss='mse',
                optimizer=RMSprop(lr=0.00025,rho=0.95,epsilon=0.01),
                metrics=['accuracy'])
    return model

  def choose_action_greedy(self, state):
    """Pick an action epsilon-greedily.

    With probability ``self.epsilon`` a random action is sampled from the
    environment; otherwise the action with the highest predicted value is
    returned.

    Args:
      state: batched observation accepted by ``self.model.predict``.
    """
    if np.random.rand() <= self.epsilon:
        return self.env.action_space.sample()

    return self.choose_action(state)

  def choose_action(self, state):
    """Return the index of the highest predicted value for ``state``."""
    q_values = self.model.predict(state)
    return np.argmax(q_values[0])

  def train(self, primary_network, memory, batch_size=None, gamma=None):
    """Run one Q-learning update on a random batch drawn from ``memory``.

    Args:
      primary_network: unused; kept so existing callers keep working.
      memory: object providing ``num_states()`` and ``get_states(n)``. Each
        stored transition is indexable as
        ``(state, action, reward, next_state[, done])``.
      batch_size: number of transitions to sample; defaults to the
        module-level ``BATCH_SIZE`` at call time.
      gamma: discount factor; defaults to the module-level ``GAMMA`` at
        call time.

    Returns:
      ``(0, 0)`` while the memory holds fewer than ``3 * batch_size``
      transitions, otherwise the first two values returned by
      ``model.train_on_batch`` (loss and the compiled metric).
    """
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    gamma = GAMMA if gamma is None else gamma

    # Wait until the buffer has some variety before fitting anything.
    if memory.num_states() < (batch_size * 3):
        return (0,0)

    batch = memory.get_states(batch_size)
    n_samples = len(batch)

    # Size the arrays from the actual batch and the agent's own input shape
    # rather than hard-coded dimensions.
    states = np.zeros((n_samples, *self.input_shape))
    next_states = np.zeros((n_samples, *self.input_shape))
    for idx, transition in enumerate(batch):
        # Stored observations may carry a leading batch axis of size 1;
        # NumPy drops it on assignment.
        states[idx] = transition[0]
        if transition[3] is not None:
            next_states[idx] = transition[3]

    # Start from the current predictions so only the taken action's value
    # contributes to the loss.
    targets = self.model.predict(states)
    next_q = self.model.predict(next_states)

    for idx, transition in enumerate(batch):
        action, reward, next_state = transition[1], transition[2], transition[3]
        # Prefer the stored done flag; fall back to the legacy convention of
        # a None next_state for 4-element transitions.
        done = transition[4] if len(transition) > 4 else next_state is None

        if done or next_state is None:
            # Terminal transition: there is no future return to bootstrap from.
            targets[idx][action] = reward
        else:
            targets[idx][action] = reward + gamma * np.amax(next_q[idx])

    ls = self.model.train_on_batch(states, targets)

    return (ls[0],ls[1])

In [0]:
# Mount Google Drive at /content/drive (Colab-only; requires the google.colab package).
# NOTE(review): later cells save/load weights under the relative path
# "drive/My Drive/...", which presumably resolves here only when the working
# directory is /content — confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Create the environment first; everything below depends on it.
env = gym.make(GAME_NAME)

# Size of the action set reported by the environment.
num_actions = env.action_space.n

# Q-network agent over 120x84x4 inputs with one output per action.
agent = Agent(env, (120, 84, 4), num_actions)

# Replay buffer sized by the configured capacity.
memory = Memory(EXP_MEMORY)

In [0]:
# Training-time hyper-parameters.
# NOTE: GAMMA and BATCH_SIZE deliberately override the values set in the
# configuration cell; Agent.train reads them at call time.
MAX_EPSILON = 1
MIN_EPSILON = 0.01
LAM = 0.0005
GAMMA = 0.95
BATCH_SIZE = 32

RANDOM_REWARD_STD = 1.0

eps = MAX_EPSILON
render = False
double_q = False
steps = 0
for e in tqdm(range(0, NUM_EPISODES)):
    # Reset the environment and per-episode statistics.
    state = env.reset()
    cnt = 0
    total_reward = 0
    avg_loss,avg_acc = 0,0

    # Seed the frame stack with four copies of the first preprocessed frame.
    state = preprocess(state)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 120, 84, 4))

    for timestep in range(STEPS_PER_EPISODE):
        if render:
            env.render()

        # Push the annealed exploration rate into the agent; without this the
        # agent keeps its initial epsilon and never exploits the network.
        agent.epsilon = eps
        action = agent.choose_action_greedy(history)
        next_state, reward, done, info = env.step(action)

        if done:
            # `state` is already a preprocessed 2-D frame, so a zero frame of
            # that shape must not go through preprocess() again (it expects a
            # colour image).
            next_state = np.zeros(state.shape)
        else:
            next_state = preprocess(next_state)

        # Newest frame goes in channel 0; the oldest of the four is dropped.
        next_state = np.reshape([next_state], (1, 120, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)

        # NOTE(review): each stored history is a 120x84x4 float array
        # (~320 KB if float64), so a buffer anywhere near EXP_MEMORY entries
        # will not fit in RAM — consider a smaller capacity or uint8 frames.
        memory.save_state((history, action, reward, next_history, done))

        ls,acc = agent.train(agent.model, memory)
        avg_loss += ls
        avg_acc += acc
        # Count the step before the done check so the averages below use the
        # true number of steps and can never divide by zero.
        cnt += 1

        history = next_history
        total_reward += reward

        # exponentially decay the eps value
        steps += 1
        eps = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAM * steps)

        if done:
            avg_loss /= cnt
            avg_acc /= cnt
            agent.model.save("drive/My Drive/deepnetwork/weights.h5")
            print(f"Episode: {e}, Reward: {total_reward}, avg loss: {avg_loss:.3f}, avg accuracy: {avg_acc:.3f}, eps: {eps:.3f}")
            break

In [0]:
from keras.models import load_model
# Evaluate the saved network greedily and keep the most recent rendered frames.
model = load_model("drive/My Drive/deepnetwork/weights.h5")
# Actually play with the weights that were just loaded; otherwise the agent
# keeps whatever network it currently holds and `model` goes unused.
agent.model = model

total_epochs, total_penalties = 0, 0
num_of_episodes = 10  # NOTE(review): not used by the loop below, which plays 2 episodes.
env.reset()
counter = 0

# Bounded buffer: once full, appending drops the oldest frame automatically.
MAX_RECORDED_FRAMES = 1440
images = deque(maxlen=MAX_RECORDED_FRAMES)

for e in range(2):

    print("Episode >> %d" %e)
    state = env.reset()

    # Take action 1 once before playing
    # (presumably FIRE, which launches the ball in Breakout — confirm).
    state, _, _, _ = env.step(1)

    # Seed the frame stack with four copies of the first preprocessed frame.
    state = preprocess(state)
    history = np.stack((state, state, state, state), axis=2)
    history = np.reshape([history], (1, 120, 84, 4))

    epochs = 0
    penalties = 0
    reward = 0
    total_reward = 0

    done = False

    while not done:
        prev_screen = env.render(mode='rgb_array')

        action = agent.choose_action(history)

        images.append(prev_screen)

        next_state, reward, done, info = env.step(action)

        # Newest frame goes in channel 0; the oldest of the four is dropped.
        next_state = preprocess(next_state)
        next_state = np.reshape([next_state], (1, 120, 84, 1))
        next_history = np.append(next_state, history[:, :, :, :3], axis=3)

        history = next_history
        state = next_state

        if reward == -10:
            penalties += 1

        total_reward += reward
        epochs += 1
        counter += 1

    total_penalties += penalties
    total_epochs += epochs


env.close()

OSError: ignored

In [0]:
import cv2
import numpy as np
import glob
 
# Encode the recorded frames into project.mp4 at 15 frames per second.
img_array = []
size = None
for frame in images:
    # Only height/width are needed; unpacking a third value into `layers`
    # would overwrite the `layers` module imported from tensorflow.keras.
    height, width = frame.shape[:2]
    size = (width, height)
    # Frames are assumed to be RGB (as rendered with mode='rgb_array'), while
    # cv2.VideoWriter expects BGR channel order.
    img_array.append(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

if size is None:
    # Nothing was recorded, so there is no frame size to open a writer with.
    print("No frames recorded; skipping video export.")
else:
    # 'mp4v' is the fourcc that matches the .mp4 container.
    out = cv2.VideoWriter('project.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 15, size)
    try:
        for frame in img_array:
            out.write(frame)
    finally:
        # Always finalise the file, even if a write fails part-way.
        out.release()