In [None]:
%matplotlib inline
import gymnasium as gym
import pandas as pd

# Collect a fixed number of random-policy transitions from LunarLander into a
# DataFrame with exactly one row per environment step.
N_STEPS = 1000  # number of environment steps to record
SEED = 0        # seeds both the environment reset and the action sampler

# NOTE(review): newer Gymnasium releases may register this environment under a
# different version suffix — confirm the id against the installed version.
env = gym.make("LunarLander-v2", render_mode="rgb_array")
try:
    observation, info = env.reset(seed=SEED)
    # Seed the action space as well; otherwise the sampled actions (and so the
    # whole rollout) differ from run to run even though the reset is seeded.
    env.action_space.seed(SEED)

    # Accumulate plain records and build the frame once after the loop, rather
    # than creating and concatenating a DataFrame per step.
    records = []
    for _ in range(N_STEPS):
        # Random policy: the action is sampled without looking at the observation
        action = env.action_space.sample()
        observation, reward, terminated, truncated, info = env.step(action)

        # Store the observation vector as a single cell. Passing the array as a
        # column next to scalar values would broadcast the scalars and produce
        # one row per observation component instead of one row per step.
        records.append({'Action': action, 'Observation': observation, 'Reward': reward})

        # Start a new episode when the current one has ended
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Release the environment even if a step raised
    env.close()

df = pd.DataFrame(records, columns=['Action', 'Observation', 'Reward'])
print(df.tail())

In [None]:
import gym

# NOTE(review): `gym` here is the module bound by the `import gym` in this cell,
# not the `gymnasium as gym` alias used by the first cell — confirm which
# package is intended. This also rebinds `env`; reward_function below does not
# reference it.
env = gym.make('LunarLander-v2')

def reward_function(state, action, next_state):
    """Compute a hand-crafted reward from the state reached by a transition.

    Args:
        state: Pre-transition state. It does not enter the computation; it is
            accepted so the signature covers a full
            (state, action, next_state) transition.
        action: Either a two-element sequence, in which case the magnitude of
            its first element is penalised, or a scalar (e.g. the integer
            action of a discrete action space), in which case any non-zero
            value incurs a unit-magnitude penalty.
            NOTE(review): this assumes scalar action 0 means "do nothing" —
            confirm against the environment's action encoding.
        next_state: Eight-element sequence unpacked as
            (x, y, v_x, v_y, angle, v_angle, left_leg, right_leg).

    Returns:
        The sum of the individual reward terms described inline below.

    Raises:
        ValueError: If ``next_state`` does not have exactly eight elements, or
            a sequence ``action`` does not have exactly two elements.
    """
    # Only the components used by the reward terms are named
    _, next_y, next_v_x, next_v_y, next_angle, _, next_left_leg, next_right_leg = next_state

    # Unpacking a scalar action raises TypeError, so fall back to treating it
    # as a discrete action instead of failing on integer actions.
    try:
        engine, _ = action
    except TypeError:
        engine = 1.0 if action != 0 else 0.0

    reward = 0

    # Penalty when the vertical position is at or below zero
    if next_y <= 0:
        reward -= 100

    # Bonus while above zero height, nearly upright, and moving slowly
    if next_y > 0 and abs(next_angle) < 0.1 and abs(next_v_x) < 0.2 and abs(next_v_y) < 0.2:
        reward += 100

    # Penalty when either leg value is (near) zero.
    # NOTE(review): presumably these are leg-contact flags, so this penalises
    # any step where a leg is not touching the ground — confirm the intent.
    if next_left_leg < 0.01 or next_right_leg < 0.01:
        reward -= 10

    # Penalty proportional to the engine magnitude
    reward -= abs(engine) * 0.1

    return reward


In [None]:
import numpy as np

class Bandit:
    """Multi-armed bandit whose arms pay normally distributed rewards.

    The mean reward of every arm is drawn once, at construction time, from a
    normal distribution with mean 0 and standard deviation 1. All draws use
    NumPy's global random state.
    """

    def __init__(self, n_arms):
        """Create a bandit with ``n_arms`` arms and sample each arm's mean."""
        self.n_arms = n_arms
        self.reward_means = np.random.normal(loc=0, scale=1, size=n_arms)

    def pull_arm(self, arm):
        """Return one reward for ``arm``.

        The reward is drawn from a normal distribution centred on that arm's
        mean with standard deviation 1.
        """
        return np.random.normal(loc=self.reward_means[arm], scale=1)
