In [11]:
import gym

env = gym.make('CartPole-v1')


In [16]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import gym

# Initialize environment
env = gym.make('CartPole-v1')

# GAIL: Generator and Discriminator Networks
class Generator(tf.keras.Model):
    def __init__(self):
        super(Generator, self).__init__()
        self.dense1 = layers.Dense(24, activation='relu')
        self.dense2 = layers.Dense(24, activation='relu')
        self.output_layer = layers.Dense(env.action_space.n, activation='softmax')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

class Discriminator(tf.keras.Model):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.dense1 = layers.Dense(24, activation='relu')
        self.dense2 = layers.Dense(24, activation='relu')
        self.output_layer = layers.Dense(1, activation='sigmoid')

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        return self.output_layer(x)

# Instantiate models
generator = Generator()
discriminator = Discriminator()

# Optimizers
gen_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
disc_optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Loss functions
cross_entropy = tf.keras.losses.BinaryCrossentropy()

# Function to generate trajectories using the generator policy
def generate_trajectory(env, policy, max_steps=200):
    state = env.reset()
    states = []
    actions = []
    for _ in range(max_steps):
        state = state.reshape(1, -1)
        action_probs = policy(state)
        action = np.argmax(action_probs.numpy())
        next_state, reward, done, _ = env.step(action)
        states.append(state)
        actions.append(action)
        state = next_state
        if done:
            break
    states = np.array(states).reshape(-1, env.observation_space.shape[0])
    actions = np.array(actions).reshape(-1, 1)
    return states, actions

# Training loops (simplified)
def train_gail(env, generator, discriminator, gen_optimizer, disc_optimizer, expert_data, epochs=1000):
    expert_states, expert_actions = zip(*expert_data)
    expert_states = np.vstack(expert_states)
    expert_labels = np.ones((expert_states.shape[0], 1))

    for epoch in range(epochs):
        # Generate trajectories using the generator policy
        generated_states, generated_actions = generate_trajectory(env, generator)
        generated_labels = np.zeros((generated_states.shape[0], 1))

        # Combine expert and generated data
        all_states = np.vstack([expert_states, generated_states])
        all_labels = np.vstack([expert_labels, generated_labels])

        print(f'Epoch {epoch}')
        print(f'all_states shape: {all_states.shape}')
        print(f'all_labels shape: {all_labels.shape}')

        # Update discriminator
        with tf.GradientTape() as tape:
            pred_labels = discriminator(all_states)
            disc_loss = cross_entropy(all_labels, pred_labels)
            print(f'Disc Loss: {disc_loss.numpy()}')

        grads = tape.gradient(disc_loss, discriminator.trainable_variables)
        if not any(grad is None for grad in grads):
            disc_optimizer.apply_gradients(zip(grads, discriminator.trainable_variables))
        else:
            print("Discriminator gradients are None")

        # Update generator
        with tf.GradientTape() as tape:
            gen_pred_labels = discriminator(generated_states)
            gen_loss = cross_entropy(tf.ones_like(gen_pred_labels), gen_pred_labels)
            print(f'Gen Loss: {gen_loss.numpy()}')

        grads = tape.gradient(gen_loss, generator.trainable_variables)
        if not any(grad is None for grad in grads):
            gen_optimizer.apply_gradients(zip(grads, generator.trainable_variables))
        else:
            print("Generator gradients are None")

        if epoch % 100 == 0:
            print(f'Epoch: {epoch}, Discriminator Loss: {disc_loss.numpy()}, Generator Loss: {gen_loss.numpy()}')

# Example expert data (to be replaced with actual expert trajectories)
expert_data = [(np.random.random((1, 4)), np.random.randint(env.action_space.n)) for _ in range(100)]

# Format expert data correctly
expert_states = np.vstack([x[0] for x in expert_data])
expert_actions = np.array([x[1] for x in expert_data]).reshape(-1, 1)
expert_data = list(zip(expert_states, expert_actions))

# Train GAIL
train_gail(env, generator, discriminator, gen_optimizer, disc_optimizer, expert_data)

# Evaluate performance (simplified)
def evaluate_policy(env, policy):
    total_reward = 0
    state = env.reset()
    done = False
    while not done:
        state = state.reshape(1, -1)
        action = np.argmax(policy(state).numpy())
        state, reward, done, _ = env.step(action)
        total_reward += reward
    return total_reward

print(f'GAIL Policy Evaluation: {evaluate_policy(env, generator)}')


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
all_states shape: (116, 4)
all_labels shape: (116, 1)
Disc Loss: 0.10822189599275589
Gen Loss: 1.9205527305603027
Generator gradients are None
Epoch 169
all_states shape: (119, 4)
all_labels shape: (119, 1)
Disc Loss: 0.1518055498600006
Gen Loss: 2.0285542011260986
Generator gradients are None
Epoch 170
all_states shape: (117, 4)
all_labels shape: (117, 1)
Disc Loss: 0.10868223756551743
Gen Loss: 2.3427393436431885
Generator gradients are None
Epoch 171
all_states shape: (109, 4)
all_labels shape: (109, 1)
Disc Loss: 0.032340168952941895
Gen Loss: 2.9699203968048096
Generator gradients are None
Epoch 172
all_states shape: (109, 4)
all_labels shape: (109, 1)
Disc Loss: 0.03497376665472984
Gen Loss: 2.8266515731811523
Generator gradients are None
Epoch 173
all_states shape: (110, 4)
all_labels shape: (110, 1)
Disc Loss: 0.02982230670750141
Gen Loss: 3.3773269653320312
Generator gradients are None
Epoch 174
all_states shape: