## Homework

In the above model for CartPole, try to modify the model to experiment new ideas. 

Requirements:
1. Use cross-entropy method. 
2. Change NN to have more or less neurons, more layers, different activation functions, different optimizer, learning rate. How does it impact the results.
3. Change BATCH_SIZE and PERCENTILE. How does it impact the results.
4. New rule in this game: you must run your same action twice in every step before the agent receives the observation and chooses next action. Modify and train your model. Show the results and discuss the difference.
5. Extend from above. Build another NN to use one observation to generate two consecutive actions. You will have 4 possible outputs: LL, LR, RL, RR. Then you will apply the two consecutive actions in the environment and then receive the last observation to predict next two consecutive actions. Train the model and show your result. 

Your grade is based on the completeness. 

Submit in BOTH ipynb and html formats. 

In [1]:
import numpy as np
import tensorflow as tf
import gym

# Set seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Function to run episodes and get total rewards
def run_episodes(model, env, num_episodes, consecutive_actions=1):
    total_rewards = []
    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        done = False

        while not done:
            # Repeat the same action for 'consecutive_actions' times
            for _ in range(consecutive_actions):
                action_prob = model.predict(np.expand_dims(state, axis=0))[0]
                action = np.random.choice(2, p=action_prob)
                state, reward, done, _ = env.step(action)
                episode_reward += reward

        total_rewards.append(episode_reward)

    return total_rewards

# Function to train the model using the cross-entropy method
def train_model(model, env, num_iterations, batch_size, percentile, consecutive_actions=1):
    for iteration in range(num_iterations):
        rewards = run_episodes(model, env, batch_size, consecutive_actions)
        elite_states, elite_actions = select_elites(rewards, percentile, consecutive_actions)

        # Flatten elite_states to fit the input shape
        elite_states_flat = np.reshape(elite_states, (elite_states.shape[0] * elite_states.shape[1], elite_states.shape[2]))

        # One-hot encode elite_actions
        elite_actions_one_hot = tf.keras.utils.to_categorical(elite_actions, num_classes=2)

        # Train the model on elite_states and elite_actions
        model.fit(elite_states_flat, elite_actions_one_hot, epochs=1, verbose=0)

        # Evaluate the model every 10 iterations
        if iteration % 10 == 0:
            total_rewards = run_episodes(model, env, 100, consecutive_actions)
            mean_reward = np.mean(total_rewards)
            print(f"Iteration: {iteration}, Mean Reward: {mean_reward}")

# Function to select elites based on rewards
def select_elites(rewards, percentile, consecutive_actions):
    reward_threshold = np.percentile(rewards, percentile)

    elite_indices = np.where(rewards >= reward_threshold)[0]
    elite_states = []
    elite_actions = []

    for idx in elite_indices:
        state, action = generate_elite_sequence(idx, consecutive_actions)
        elite_states.append(state)
        elite_actions.append(action)

    return np.array(elite_states), np.array(elite_actions)

# Function to generate elite sequence for a given episode index
def generate_elite_sequence(episode_index, consecutive_actions):
    state = env.reset()
    elite_states = [state]
    elite_actions = []

    for _ in range(consecutive_actions):
        action_prob = model.predict(np.expand_dims(state, axis=0))[0]
        action = np.random.choice(2, p=action_prob)
        state, _, _, _ = env.step(action)

        elite_states.append(state)
        elite_actions.append(action)

    return np.array(elite_states), np.array(elite_actions)

# Create CartPole environment
env = gym.make('CartPole-v1')

# 1. Default Model
model_default = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(4,)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])
optimizer_default = tf.keras.optimizers.Adam(learning_rate=0.001)
model_default.compile(optimizer=optimizer_default, loss='categorical_crossentropy', metrics=['accuracy'])

# 2. Train the model with cross-entropy method
train_model(model_default, env, num_iterations=100, batch_size=100, percentile=70, consecutive_actions=1)

# 3. Change NN architecture, activation, optimizer, and learning rate
model_custom = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='tanh', input_shape=(4,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])
optimizer_custom = tf.keras.optimizers.Adam(learning_rate=0.0005)
model_custom.compile(optimizer=optimizer_custom, loss='categorical_crossentropy', metrics=['accuracy'])

# 4. Train the custom model with cross-entropy method
train_model(model_custom, env, num_iterations=100, batch_size=100, percentile=70, consecutive_actions=1)

# 5. New rule: Run the same action twice in every step
# 6. Train the model with the new rule (consecutive_actions=2)
train_model(model_custom, env, num_iterations=100, batch_size=100, percentile=70, consecutive_actions=2)

# 7. Extend the model for two consecutive actions (LL, LR, RL, RR)
model_extended = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='tanh', input_shape=(4,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')
])
optimizer_extended = tf.keras.optimizers.Adam(learning_rate=0.0005)
model_extended.compile(optimizer=optimizer_extended, loss='categorical_crossentropy', metrics=['accuracy'])

# 8. Train the extended model
train_model(model_extended, env, num_iterations=100, batch_size=100, percentile=70, consecutive_actions=2)





ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.