In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim

In [2]:
# Notebook-level settings.
# window_size is not read by any of the cells shown below it — presumably a
# smoothing window for plots; TODO confirm or remove.
window_size = 10
# Name of the benchmark problem; handed to setup_class() to build prob_vars.
prob = "LunarLanderContinuous"

In [3]:
import gymnasium as gym
from choose_action import choose_action_func
from state_pred_models import NextStateQuantileNetwork, quantile_loss, NextStateSinglePredNetwork, quantile_loss_median, mse_loss
from setup import setup_class
# Problem-specific settings (dimensions, horizon, action bounds, ...) for `prob`.
prob_vars = setup_class(prob)

# Quantile next-state model sized from the problem settings.
model_QRNN = NextStateQuantileNetwork(
    prob_vars.state_dim,
    prob_vars.action_dim,
    prob_vars.num_quantiles,
)

# Optimizer over the model parameters; its state is restored from the
# checkpoint in the next cell.
optimizer_QRNN = optim.Adam(params=model_QRNN.parameters(), lr=1e-3)


In [4]:
# Reload model parameters and optimizer state for the selected problem.
# The file name is derived from `prob` so the checkpoint always matches the
# network that was just built from the same problem setting.
# NOTE(review): the directory is an absolute, machine-specific path — consider
# making it relative to the project or configurable.
checkpoint_dir = 'C:\\Users\\nicle\\Desktop\\QRNN-MPC\\Files'
checkpoint_path = f'{checkpoint_dir}\\saved_model_{prob}.pth'

# map_location='cpu': the model is constructed without being moved to another
# device, so load the tensors onto the CPU even if the checkpoint was written
# from a GPU (otherwise loading fails on a CPU-only machine).
checkpoint = torch.load(checkpoint_path, map_location='cpu')
model_QRNN.load_state_dict(checkpoint['model_QRNN_state_dict'])
optimizer_QRNN.load_state_dict(checkpoint['optimizer_QRNN_state_dict'])

# After loading, switch model to evaluation mode
model_QRNN.eval()


NextStateQuantileNetwork(
  (layer1): Linear(in_features=10, out_features=256, bias=True)
  (layer2): Linear(in_features=256, out_features=256, bias=True)
  (layer3): Linear(in_features=256, out_features=88, bias=True)
)

In [5]:
# Evaluation environment: continuous-action LunarLander rendered in a window.
env = gym.make("LunarLander-v3", continuous=True, render_mode="human")
seed = 0  # passed to env.reset() by the rollout loop

# Planner options forwarded to choose_action_func; their exact meaning is
# defined in that module.
do_RS, do_QRNN_step_rnd = False, False  # if either is set, particles are re-sampled at every step
use_sampling, use_mid = False, True
use_ASGNN = model_ASN = None


In [6]:
# Evaluation rollouts: at every step choose_action_func plans over a set of
# candidate action sequences ("particles") with the loaded QRNN, the chosen
# action is executed in the environment, and the particles are carried over
# to the next step.

NUM_EVAL_EPISODES = 10

# Discrete-action problems mapped to their number of actions.
DISCRETE_ACTION_COUNTS = {"CartPole": 2, "Acrobot": 3, "MountainCar": 3, "LunarLander": 4}
# Continuous problems whose particles hold action_dim entries per planning step.
MULTI_DIM_CONTINUOUS_PROBS = {
    "PandaReacher", "PandaReacherDense", "PandaPusher",
    "MuJoCoReacher", "MuJoCoPusher", "LunarLanderContinuous",
}
# Continuous problems whose environment action is the first particle entry wrapped in a list.
SCALAR_CONTINUOUS_PROBS = {"Pendulum", "MountainCarContinuous", "Pendulum_xyomega", "InvertedPendulum"}
# Problems whose observations are dicts with 'observation' and 'desired_goal' entries.
GOAL_DICT_PROBS = {"PandaReacher", "PandaPusher", "PandaReacherDense"}


def _sample_particles(prob_vars):
    """Draw a fresh particle set for the problem described by ``prob_vars``.

    Discrete problems get integers in ``[0, n_actions)`` with shape
    ``(num_particles, horizon)``. All other problems get uniform samples
    between ``action_low`` and ``action_high``; the row length is
    ``action_dim * horizon`` for the multi-dimensional problems and
    ``horizon`` otherwise.
    """
    n_actions = DISCRETE_ACTION_COUNTS.get(prob_vars.prob)
    if n_actions is not None:
        return np.random.randint(0, n_actions, (prob_vars.num_particles, prob_vars.horizon))
    if prob_vars.prob in MULTI_DIM_CONTINUOUS_PROBS:
        row_len = prob_vars.action_dim * prob_vars.horizon
    else:
        row_len = prob_vars.horizon
    return np.random.uniform(prob_vars.action_low, prob_vars.action_high,
                             (prob_vars.num_particles, row_len))


def _shift_particles(particles, prob_vars):
    """Drop the first planned action of every particle and append a random one.

    The array is modified in place and also returned. One planning step
    occupies ``action_dim`` columns for the multi-dimensional problems and a
    single column for every other problem.
    """
    n_actions = DISCRETE_ACTION_COUNTS.get(prob_vars.prob)
    if n_actions is None and prob_vars.prob in MULTI_DIM_CONTINUOUS_PROBS:
        stride = prob_vars.action_dim
    else:
        stride = 1

    # Shift all particles to the left by one planning step.
    particles[:, :-stride] = particles[:, stride:]

    # Fill the freed last step with new random values.
    if n_actions is not None:
        tail = np.random.randint(0, n_actions, size=(particles.shape[0], 1))
    else:
        tail = np.random.uniform(prob_vars.action_low, prob_vars.action_high,
                                 size=(particles.shape[0], stride))
    particles[:, -stride:] = tail
    return particles


def _extract_state(observation, prob_vars, env, at_reset):
    """Turn a raw environment observation into the state given to the planner.

    Updates ``prob_vars.goal_state`` where the observation carries the goal:
    on every call for the dict-observation problems, and only right after a
    reset (``at_reset=True``) for MuJoCoReacher and MuJoCoPusher.
    """
    prob_name = prob_vars.prob
    if prob_name == "Pendulum":
        # The planner works on the environment's internal state rather than the
        # observation. `unwrapped` is used because gym.make() returns a wrapped
        # env and attribute access must not rely on wrapper forwarding.
        return env.unwrapped.state.copy()
    if prob_name in GOAL_DICT_PROBS:
        prob_vars.goal_state = observation['desired_goal']
        return observation['observation']
    if prob_name == "MuJoCoReacher":
        if at_reset:
            prob_vars.goal_state = np.array([observation[4], observation[5]])
        # Entries 4 and 5 (used as the goal above) are left out of the state.
        return np.array([observation[i] for i in (0, 1, 2, 3, 6, 7, 8, 9)])
    if prob_name == "MuJoCoPusher" and at_reset:
        prob_vars.goal_state = np.array([observation[20], observation[21], observation[22]])
    return observation


def _to_env_action(action, best_particle, prob_vars):
    """Convert the planner result into the action passed to ``env.step``.

    Problems that are in none of the known groups keep the action exactly as
    returned by the planner.
    """
    prob_name = prob_vars.prob
    if prob_name in SCALAR_CONTINUOUS_PROBS:
        return [best_particle[0]]
    if prob_name in MULTI_DIM_CONTINUOUS_PROBS:
        return best_particle[:prob_vars.action_dim]
    if prob_name in DISCRETE_ACTION_COUNTS:
        return int(action)
    return action


# When either flag is set the particles are re-drawn from scratch at every
# step, so carrying (shifting) them over between steps is skipped.
resample_each_step = do_RS or do_QRNN_step_rnd

try:
    for episode in range(NUM_EVAL_EPISODES):
        # NOTE(review): resetting with the same seed every episode re-seeds the
        # environment each time, so all episodes presumably start from the same
        # initial conditions — confirm this is intended. Particle sampling uses
        # the unseeded global NumPy RNG, so runs are not reproducible.
        observation, _ = env.reset(seed=seed)
        state = _extract_state(observation, prob_vars, env, at_reset=True)

        episode_reward = 0
        done = False
        actions_list = []  # action actually sent to the environment at each step
        costs = []         # best planning cost reported at each step
        # Kept for compatibility with the original cell; never populated here.
        episodic_step_rewards = []
        episodic_step_reward_values = []

        particles = _sample_particles(prob_vars)

        for step in range(prob_vars.max_steps):
            if resample_each_step:
                particles = _sample_particles(prob_vars)
            particles = np.clip(particles, prob_vars.action_low, prob_vars.action_high)

            best_particle, action, best_cost, particles = choose_action_func(
                prob_vars, state, particles, do_RS, use_sampling, use_mid, use_ASGNN,
                model_QRNN, model_ASN, episode=episode, step=step,
                goal_state=prob_vars.goal_state)
            costs.append(best_cost)

            action = _to_env_action(action, best_particle, prob_vars)

            next_observation, reward, done, truncated, info = env.step(action)
            episode_reward += reward
            actions_list.append(action)
            next_state = _extract_state(next_observation, prob_vars, env, at_reset=False)

            done = done or truncated
            if done:
                break

            if not resample_each_step:
                particles = _shift_particles(particles, prob_vars)
            particles = np.clip(particles, prob_vars.action_low, prob_vars.action_high)

            state = next_state

        # Report the return whether the episode ended by termination/truncation
        # or by exhausting max_steps.
        print("episode_reward ", episode_reward, "\n")
finally:
    # Always release the environment (and its render window), even if a rollout fails.
    env.close()
 

  actions = torch.tensor([particles_t_array], dtype=torch.float32).reshape(len(particles),action_dim)


episode_reward  -69.9561997901729 

episode_reward  -109.56748521934125 

episode_reward  -75.66602197537864 

episode_reward  -91.77735393361179 

episode_reward  -102.13487126444171 

episode_reward  -55.96271363672825 

episode_reward  -72.67716447622718 

episode_reward  -88.56976282091034 

episode_reward  -88.98717025627585 

episode_reward  -93.35881628631014 

