In [1]:
# Execute the environment notebook inside this kernel's namespace. Later cells
# use ChromeDinoEnv, np and tf without importing them here, so those names
# presumably come from this run — TODO confirm.
%run chrome_dino_game_env.ipynb

pygame 2.1.0 (SDL 2.0.16, Python 3.7.13)
Hello from the pygame community. https://www.pygame.org/contribute.html
Python 3.7.13
SmallCactus1.png: <rect(0, 0, 40, 71)>
SmallCactus2.png: <rect(0, 0, 68, 71)>
SmallCactus3.png: <rect(0, 0, 105, 71)>
LargeCactus1.png: <rect(0, 0, 48, 95)>
LargeCactus2.png: <rect(0, 0, 99, 95)>
LargeCactus3.png: <rect(0, 0, 102, 95)>
Bird1.png: <rect(0, 0, 97, 68)>
Bird2.png: <rect(0, 0, 93, 62)>
DinoRun1.png: <rect(0, 0, 87, 94)>
DinoRun2.png: <rect(0, 0, 88, 94)>
DinoJump.png: <rect(0, 0, 88, 94)>
DinoDuck1.png: <rect(0, 0, 118, 60)>
DinoDuck2.png: <rect(0, 0, 116, 60)>
Cloud.png: <rect(0, 0, 84, 101)>
GameOver.png: <rect(0, 0, 386, 40)>
Reset.png: <rect(0, 0, 75, 101)>
Track.png: <rect(0, 0, 2404, 28)>


ValueError: Cell is empty

ValueError: Cell is empty

In [8]:
from keras.layers import Dense, Activation
from keras.models import Sequential, load_model

In [9]:
# Create an environment instance with default arguments so its spaces can be
# inspected before training.
env = ChromeDinoEnv()

In [10]:
# Draw one random observation to see what the agent's input vector looks like
# (the recorded output below shows a 4-element float array).
env.observation_space.sample()

array([ 474.18318284, 2302.08356864,  365.50760302, 2110.91761293])

In [11]:
class ReplayBuffer(object):
    """Fixed-capacity circular store of (state, action, reward, next_state, done) transitions.

    Once ``max_size`` transitions have been written, new ones overwrite the
    oldest slots.

    Parameters
    ----------
    max_size : int
        Number of transitions the buffer can hold.
    input_shape : int or sequence of int
        Shape of a single observation. A bare int ``n`` means a flat vector of
        length ``n``; a tuple such as ``(h, w, c)`` is also accepted.
    n_actions : int
        Width of one stored action row (number of discrete actions when
        ``discrete`` is True).
    discrete : bool, optional
        When True, actions passed to ``store_transition`` are integer indices
        and are stored one-hot encoded as ``int8``; otherwise they are stored
        as given in ``float32``.
    """

    def __init__(self, max_size, input_shape, n_actions, discrete=False):
        self.mem_size = max_size
        self.input_shape = input_shape
        self.discrete = discrete
        # Total number of transitions ever stored; NOT capped at mem_size.
        self.mem_cntr = 0

        # Normalise an int or a tuple into a tuple so both flat and
        # multi-dimensional observations can be allocated.
        obs_shape = tuple(np.atleast_1d(input_shape).tolist())
        self.state_memory = np.zeros((self.mem_size, *obs_shape))
        self.new_state_memory = np.zeros((self.mem_size, *obs_shape))

        dtype = np.int8 if self.discrete else np.float32
        self.action_memory = np.zeros((self.mem_size, n_actions), dtype=dtype)
        self.reward_memory = np.zeros(self.mem_size)
        # Holds 1 - done, i.e. 0.0 for terminal transitions and 1.0 otherwise.
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full.

        ``done`` is stored inverted (``1 - int(done)``) so it can be used
        directly as a multiplicative mask on the bootstrapped value.
        """
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.reward_memory[index] = reward
        self.terminal_memory[index] = 1 - int(done)
        if self.discrete:
            # One-hot encode the integer action index.
            actions = np.zeros(self.action_memory.shape[1])
            actions[action] = 1.0
            self.action_memory[index] = actions
        else:
            self.action_memory[index] = action
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` random transitions from the filled part of the buffer.

        Indices are drawn with replacement (``np.random.choice`` default), so
        the same transition may appear more than once in a batch.

        Returns
        -------
        tuple
            ``(states, actions, rewards, next_states, terminal)`` where
            ``terminal`` is the stored ``1 - done`` mask.
        """
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        terminal = self.terminal_memory[batch]

        return states, actions, rewards, states_, terminal
    
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims):
    """Build and compile the Q-network: two ReLU hidden layers and a linear output.

    Parameters
    ----------
    lr : float
        Learning rate for the Adam optimizer.
    n_actions : int
        Number of output units (one Q-value per action).
    input_dims : int
        Length of the flat observation vector fed to the first layer.
    fc1_dims, fc2_dims : int
        Widths of the first and second hidden layers.

    Returns
    -------
    A compiled Keras ``Sequential`` model using mean-squared-error loss.
    """
    model = Sequential([
        Dense(fc1_dims, input_shape=(input_dims, )),
        Activation('relu'),
        Dense(fc2_dims),
        Activation('relu'),
        Dense(n_actions)
    ])
    # Use `learning_rate`; the `lr` keyword is a deprecated alias that emits a
    # warning on TF 2.x and is rejected by newer Keras releases.
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='mse')

    return model

In [12]:
class Agent(object):
    """Epsilon-greedy DQN agent backed by a replay buffer and a single Q-network.

    The same network (``q_eval``) is used both for the current estimates and
    for the bootstrapped next-state values; there is no separate target
    network.

    Parameters
    ----------
    alpha : float
        Learning rate passed to ``build_dqn``.
    gamma : float
        Discount factor applied to the next-state value.
    n_actions : int
        Number of discrete actions.
    epsilon : float
        Initial probability of choosing a random action.
    batch_size : int
        Number of transitions sampled per ``learn`` call.
    input_dims : int
        Length of the observation vector.
    epsilon_dec : float, optional
        Multiplicative decay applied to epsilon after every learning step.
    epsilon_end : float, optional
        Lower bound for epsilon.
    mem_size : int, optional
        Replay buffer capacity.
    fname : str, optional
        Path used by ``save_model`` / ``load_model``.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size, input_dims,
                 epsilon_dec=0.9996, epsilon_end=0.01, mem_size=1000000,
                 fname='dqn_model.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions, discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 256, 256)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Pick an action epsilon-greedily for a single observation.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        # Add a leading batch axis: the network expects (batch, features).
        state = state[np.newaxis, :]
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            actions = self.q_eval.predict(state, verbose=0)
            action = np.argmax(actions)
        return action

    def learn(self):
        """Run one training step on a random replay batch, then decay epsilon.

        Does nothing until the buffer has received at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        # Actions are stored one-hot; recover the integer index of each row.
        # argmax avoids the int8 overflow a dot product against the action ids
        # would hit once ids exceed the int8 range.
        action_indices = np.argmax(action, axis=1)

        q_eval = self.q_eval.predict(state, verbose=0)
        q_next = self.q_eval.predict(new_state, verbose=0)

        # Only the taken action's Q-value is moved toward the TD target; all
        # other outputs keep their current prediction (zero error).
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        # `done` is the buffer's (1 - done) mask, so terminal transitions get
        # no bootstrapped term.
        q_target[batch_index, action_indices] = reward + self.gamma*np.max(q_next, axis=1)*done
        _ = self.q_eval.fit(state, q_target, verbose=0)

        # Decay epsilon but never let it fall below the configured floor, not
        # even for a single step.
        self.epsilon = max(self.epsilon * self.epsilon_dec, self.epsilon_min)

    def save_model(self):
        """Save the Q-network to ``self.model_file``."""
        self.q_eval.save(self.model_file)

    def load_model(self):
        """Replace the Q-network with the model stored at ``self.model_file``."""
        self.q_eval = load_model(self.model_file)

In [13]:
# NOTE(review): the guard compares __name__ against 'main', not '__main__', so
# the body below is skipped whenever __name__ is '__main__' (the usual value
# for top-level script/notebook code). Confirm whether this cell is meant to be
# disabled; it also relies on `gym`, which is not imported in the visible cells.
if __name__ == 'main':
    # Train a DQN agent on LunarLander-v2 (8 observation values, 4 actions).
    env = gym.make('LunarLander-v2')
    n_games = 500
    agent = Agent(gamma=0.99, epsilon=1.0, alpha=0.0005, input_dims=8, n_actions=4, mem_size=1000000, batch_size=64, epsilon_end=0.01)
    scores = []  # total reward of each episode
    eps_hist = []  # epsilon value at the end of each episode
    
    for i in range(n_games):
        done = False
        score = 0
        observation = env.reset()
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.remember(observation, action, reward, observation_, done)
            observation = observation_
            # One learning step per environment step.
            agent.learn()
        eps_hist.append(agent.epsilon)
        scores.append(score)
        
        # Mean over the most recent episodes; the slice spans up to 101 scores.
        avg_score = np.mean(scores[max(0, i-100):(i+1)])
        print('episode ', i, 'score %.2f' % score, 'average score %.2f' %avg_score)
        
        # Checkpoint every 10th episode (skipping episode 0).
        if i % 10 == 0 and i > 0:
            agent.save_model()
    

In [14]:
# Train a DQN agent on the Chrome Dino environment.
# (Earlier experiment used: env = gym.make('LunarLander-v2'))
env = ChromeDinoEnv()
n_games = 500
agent = Agent(
    gamma=0.99,
    epsilon=1.0,
    alpha=0.0005,
    input_dims=env.observation_space.shape[0],
    n_actions=env.action_space.n,
    mem_size=1000000,
    batch_size=64,
    epsilon_end=0.01,
)
scores = []  # total reward of each episode
eps_hist = []  # epsilon value at the end of each episode

for i in range(n_games):
    done = False
    score = 0
    observation = env.reset()
    while not done:
        action = agent.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        score += reward
        agent.remember(observation, action, reward, observation_, done)
        observation = observation_
        # One learning step per environment step.
        agent.learn()
    eps_hist.append(agent.epsilon)
    scores.append(score)

    # Mean over the most recent episodes; the slice spans up to 101 scores.
    avg_score = np.mean(scores[max(0, i-100):(i+1)])
    print('episode ', i, 'score %.2f' % score, 'average score %.2f' %avg_score)

    # Checkpoint every 10th episode (skipping episode 0).
    if i % 10 == 0 and i > 0:
        agent.save_model()

# The periodic checkpoint last fires at a multiple of 10, so without this the
# training done in the final few episodes would never reach disk.
agent.save_model()

2022-09-17 17:02:39.368397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super(Adam, self).__init__(name, **kwargs)


episode  0 score 149.00 average score 149.00
episode  1 score 79.00 average score 114.00
episode  2 score 259.00 average score 162.33
episode  3 score 171.00 average score 164.50
episode  4 score 800.00 average score 291.60
episode  5 score 349.00 average score 301.17
episode  6 score 174.00 average score 283.00
episode  7 score 261.00 average score 280.25
episode  8 score 171.00 average score 268.11
episode  9 score 77.00 average score 249.00
episode  10 score 175.00 average score 242.27
episode  11 score 72.00 average score 228.08
episode  12 score 80.00 average score 216.69
episode  13 score 83.00 average score 207.14
episode  14 score 84.00 average score 198.93
episode  15 score 72.00 average score 191.00
episode  16 score 79.00 average score 184.41
episode  17 score 162.00 average score 183.17
episode  18 score 262.00 average score 187.32
episode  19 score 83.00 average score 182.10
episode  20 score 72.00 average score 176.86
episode  21 score 355.00 average score 184.95
episode 

episode  179 score 81.00 average score 150.35
episode  180 score 72.00 average score 150.33
episode  181 score 74.00 average score 150.26
episode  182 score 80.00 average score 149.42
episode  183 score 162.00 average score 150.20
episode  184 score 72.00 average score 150.20
episode  185 score 170.00 average score 151.10
episode  186 score 162.00 average score 151.99
episode  187 score 260.00 average score 150.29
episode  188 score 170.00 average score 148.47
episode  189 score 262.00 average score 150.35
episode  190 score 72.00 average score 150.25
episode  191 score 164.00 average score 150.27
episode  192 score 166.00 average score 151.14
episode  193 score 171.00 average score 152.04
episode  194 score 84.00 average score 150.32
episode  195 score 72.00 average score 148.48
episode  196 score 261.00 average score 150.28
episode  197 score 169.00 average score 151.14
episode  198 score 166.00 average score 151.16
episode  199 score 85.00 average score 151.29
episode  200 score 73.

episode  356 score 174.00 average score 143.89
episode  357 score 80.00 average score 143.89
episode  358 score 80.00 average score 143.86
episode  359 score 169.00 average score 144.82
episode  360 score 80.00 average score 144.01
episode  361 score 259.00 average score 144.87
episode  362 score 169.00 average score 144.82
episode  363 score 81.00 average score 144.91
episode  364 score 81.00 average score 144.92
episode  365 score 83.00 average score 144.11
episode  366 score 81.00 average score 144.20
episode  367 score 174.00 average score 145.21
episode  368 score 80.00 average score 145.22
episode  369 score 162.00 average score 146.01
episode  370 score 173.00 average score 146.12
episode  371 score 172.00 average score 147.09
episode  372 score 441.00 average score 150.65
episode  373 score 81.00 average score 150.65
episode  374 score 170.00 average score 150.61
episode  375 score 72.00 average score 150.61
episode  376 score 75.00 average score 149.67
episode  377 score 166.0

In [48]:
# Play a single episode with on-screen rendering using the `agent` left in the
# kernel by the training cell (this cell does not create or load one itself).
env = ChromeDinoEnv(render_mode='human')
done = False
observation = env.reset()
while not done:
    # choose_action is still epsilon-greedy here, so an occasional random
    # action is possible depending on the agent's current epsilon.
    action = agent.choose_action(observation)
    observation_, reward, done, info = env.step(action)
    observation = observation_
    