In [2]:
import numpy as np
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import gym
from tensorflow.keras import models, layers
import random

2023-11-03 09:13:15.121924: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-03 09:13:15.126914: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-03 09:13:15.198406: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-03 09:13:15.198443: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-03 09:13:15.198462: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [6]:
env = gym.make("Breakout-v0", obs_type='grayscale')
env = gym.wrappers.AtariPreprocessing(env=env, frame_skip=1)
env = gym.wrappers.FrameStack(env=env, num_stack=4)

In [43]:
def build_model(action_size):
    model = models.Sequential()
    model.add(layers.Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=(84, 84, 4)))
    model.add(layers.Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.Flatten())
    model.add(layers.Dense(512, activation='relu'))
    model.add(layers.Dense(action_size, activation='linear'))
    model.compile(loss='mse', optimizer=Adam(learning_rate=0.001))
    return model


def preprocess_frame(frame):
    # Resize, normalize, etc.
    # ... your preprocessing steps here ...
    processed_frame = np.resize(frame, (84, 84))  # This is a placeholder, replace with actual preprocessing
    return processed_frame

# Initialize frame stacker
frame_stacker = deque(maxlen=4)

# Function to stack frames
def stack_frames(new_frame, is_new_episode=False, stack=frame_stacker):
    frame = preprocess_frame(new_frame)
    
    if is_new_episode:
        # Clear our stack
        stack.clear()
        # Because we're in a new episode, copy the same frame 4x
        for _ in range(4):
            stack.append(frame)
    else:
        # Append frame to deque, automatically removes the oldest frame
        stack.append(frame)
    
    # Stack the frames along the third dimension and return a new numpy array
    stacked_state = np.stack(stack, axis=2)
    return stacked_state

In [44]:
# Set parameters
N = 10000  # Replay memory capacity
M = 1000  # Number of episodes
T = 10000  # Max steps per episode
C = 1000  # Target network update frequency
epsilon = 1
epsilon_decay = 0.995
epsilon_min = 0.1
gamma = 0.9
action_size = env.action_space.n  # Number of actions
state_size = env.observation_space.shape[0]  # State size

# Initialize replay memory
D = deque(maxlen=N)


Q = build_model(action_size)
Q_hat = build_model(action_size)
Q_hat.set_weights(Q.get_weights())


In [9]:
Q.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 20, 20, 32)        8224      
                                                                 
 conv2d_1 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_2 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten (Flatten)           (None, 3136)              0         
                                                                 
 dense (Dense)               (None, 512)               1606144   
                                                                 
 dense_1 (Dense)             (None, 4)                 2052      
                                                                 
Total params: 1686180 (6.43 MB)
Trainable params: 168618

In [17]:
Q_hat.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_3 (Conv2D)           (None, 20, 20, 32)        8224      
                                                                 
 conv2d_4 (Conv2D)           (None, 9, 9, 64)          32832     
                                                                 
 conv2d_5 (Conv2D)           (None, 7, 7, 64)          36928     
                                                                 
 flatten_1 (Flatten)         (None, 3136)              0         
                                                                 
 dense_2 (Dense)             (None, 512)               1606144   
                                                                 
 dense_3 (Dense)             (None, 4)                 2052      
                                                                 
Total params: 1686180 (6.43 MB)
Trainable params: 1686

In [45]:
state = env.reset()
next_state, reward, done, _, _ = env.step(1)
np.asarray(next_state).reshape(84,84,4).shape

(84, 84, 4)

In [47]:
from tqdm import tqdm
state = np.asarray(env.reset()[0]).reshape(84,84,4)
total_reward = 0
for t in tqdm(range(T)):
    # Epsilon-greedy action selection
    if np.random.rand() <= epsilon:
        action = random.randrange(action_size)
    else:
        act_values = Q.predict(state)
        action = np.argmax(act_values[0])

    next_state, reward, done, _, _ = env.step(action)
    next_state = np.asarray(next_state).reshape(84,84,4)
    total_reward += reward

    # Store transition in D
    D.append((state, action, reward, next_state, done))

    state = next_state

    # Check if the episode is done
    if done:
        print(f"Episode: {episode}/{M}, Score: {total_reward}")
        break

    # Train using a random minibatch from D
    if len(D) > 32:
        minibatch = random.sample(D, 32)
        for w, a, r, w_next, terminal in minibatch:
            target = r
            if not terminal:
                target = (reward + gamma * np.amax(Q_hat.predict(np.expand_dims(w_next, axis=0), verbose=0)))

            target_f = Q.predict(np.expand_dims(w, axis=0), verbose=0)
            target_f[0][a] = target
            Q.fit(np.expand_dims(w, axis=0), target_f, epochs=1, verbose=0)

# Update epsilon
if epsilon > epsilon_min:
    epsilon *= epsilon_decay

# Update target network
if episode % C == 0:
    Q_hat.set_weights(Q.get_weights())

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 7/10000 [00:55<22:09:21,  7.98s/it]


KeyboardInterrupt: 