In [29]:
import gym
import tensorflow as tf
from tensorflow import keras
import numpy as np

from collections import deque # for experience replay

replay_buffer = deque(maxlen=2000) # 2000 is the maximum number of transitions we want to store

# constants / initializations
batch_size = 32
discount_factor = 0.95
optimizer = keras.optimizers.Adam(lr=1e-3)
loss_fn = keras.losses.mean_squared_error
learning_rate = 0.0001
 

env = gym.make("CartPole-v1", render_mode="rgb_array")

input_shape = [4] # == env.observation_space.shape
n_outputs = 2 # == env.action_space.n

model = keras.models.Sequential([
keras.layers.Dense(32, activation="elu", input_shape=input_shape),
keras.layers.Dense(32, activation="elu"),
keras.layers.Dense(n_outputs)
])
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer, loss="mse")


def epsilon_greedy_policy(state, epsilon=0):
    if np.random.rand() < epsilon:
        return np.random.randint(2)
    else:
        #print('state FOR TENSOR: ', state)
        #s_tensor = make_tensor(state, False)
        #data_tensor = tf.ragged.constant(state)
        #print('data_tensor: ', data_tensor)
        #Q_values = model.predict(data_tensor, verbose=0)  # outputs two Q-values
        #state = np.array(state)
        #print(state)
        #state = np.array(state)
        print('state: ', state)
        Q_values = model.predict(state[np.newaxis, :]) # outputs two Q-values
        return np.argmax(Q_values[0])
    

def sample_experiences(batch_size):
    indices = np.random.randint(len(replay_buffer), size=batch_size)
    batch = [replay_buffer[index] for index in indices]
    states, actions, rewards, next_states, dones = [
        np.array([experience[field_index] for experience in batch])
        for field_index in range(5)]
    return states, actions, rewards, next_states, dones

def make_tensor(s, list: bool):
    '''in order to be used in net.predict() method'''
    s_tensor = tf.convert_to_tensor(s)
    if list:
        return s_tensor
    return tf.expand_dims(s_tensor, 0)


def training_step(batch_size):
    experiences = sample_experiences(batch_size)
    states, actions, rewards, next_states, dones = experiences
    next_Q_values = model.predict(next_states)
    max_next_Q_values = np.max(next_Q_values, axis=1)
    target_Q_values = (rewards +
                        (1 - dones) * discount_factor * max_next_Q_values)
    mask = tf.one_hot(actions, n_outputs)
    with tf.GradientTape() as tape:
        all_Q_values = model(states)
        print('all_Q_values: ', all_Q_values)
        Q_values = tf.reduce_sum(all_Q_values * mask, axis=1, keepdims=True)
        loss = tf.reduce_mean(loss_fn(target_Q_values, Q_values))
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))





In [30]:
def play_one_step(env, state, epsilon):
    
    action = epsilon_greedy_policy(state, epsilon)
    
    next_state, reward, done, trunk, info = env.step(action)

    print(done)
   
    #position, velocity, angle, angular_vel = env.observation
    # !!!!!!!!!!!!!!!!!!!!!
    #next_state, reward, term, trunk, done, info = env.step(action=action)    

    replay_buffer.append((state, action, reward, next_state, done))
    return next_state, reward, done, info




for episode in range(600):
    obs = env.reset()
    print('episode: ', episode)
    for step in range(200):
        epsilon = 0.5 # max(1 - episode / 500, 0.01)
        obs, reward, done, info = play_one_step(env, obs, epsilon)
        if done:
            break
    if episode > 50:
        training_step(batch_size)

episode:  0
False
False
False
False
False
False
False
False
