In [None]:
# fundamental modules
import gymnasium as gym
import matplotlib.pyplot as plt
import numpy as np
import time
import pprint
from tqdm.notebook import trange

from ddqn import DDQNAgent # type: ignore
from utils import record_videos, load_config, show_videos
from reward import _reward
from obs import observation_shape

# Running training statistics shared by the cells below.
# One entry is appended to each list per finished episode by the training loop.
(
    score_history,    # total reward of each episode
    episode_lens,     # number of steps in each episode
    avg_history,      # mean of score_history after each episode
    std_history,      # standard deviation of score_history after each episode
    avg_history_100,  # mean of the most recent 100 episode scores
) = ([], [], [], [], [])

# Threshold the 100-episode average must beat before a checkpoint is saved.
best_score = -1000.0



In [None]:
# Create the racetrack environment (frames rendered as RGB arrays) and apply
# the configuration returned by the project's load_config() helper.
data = load_config()
env = gym.make('racetrack-v0', render_mode='rgb_array')
env.configure(data)  # type: ignore

# Take one initial observation so the preprocessor can be constructed from it.
obs, info = env.reset()
done = False

# Build the observation preprocessor from the first observation/info pair and
# read back the array that will be fed to the agent.
# NOTE(review): the meaning of the third argument (2) is defined in obs.py — confirm.
proc = observation_shape(obs, info, 2)
proc.reset()

# The name `input` shadows the builtin, but it is kept because the agent
# construction cell reads `input.shape`.
input = proc.get_input()
print(input.shape)

In [None]:
# Instantiate the DDQN agent. Its observation shape is taken from the
# preprocessed input array computed in the environment-setup cell.
# NOTE(review): both `alpha` and `learning_rate` are passed — confirm in ddqn.py
# which of the two the optimizer actually uses.
agent = DDQNAgent(
    obs_shape=input.shape,
    alpha=0.001,
    learning_rate=0.0003,
    gamma=0.9,
    epsilon=1.0,
    epsilon_dec=0.9995,
    epsilon_end=0.1,
    batch_size=512,
    mem_size=100000,
    min_mem_size=600,
    replace_target=1000,
)
print("Agent is initialized.")


In [None]:
# Training loop: act with exploration, store every transition, train after each
# step, and log per-episode statistics.
for episode in trange(1000, desc='Train episodes'):

    # Start a new episode and turn the raw observation into the agent's input.
    # NOTE(review): proc.reset() is not called here, so any history kept inside
    # `proc` carries over from the previous episode — confirm this is intended.
    observation, info = env.reset()
    proc.update_input(observation, info)
    observation = proc.get_input()

    episode_reward = 0
    episode_len = 0
    done = False       # terminal flag; stored with each transition
    truncated = False  # time-limit flag; ends the episode but is not stored as terminal

    # Gymnasium requires a reset once either `terminated` or `truncated` is
    # reported, so the episode must stop on both.
    while not (done or truncated):

        # Render only every 100th episode.
        if episode % 100 == 0:
            env.render()

        action, action_index = agent.get_action(observation, deterministic=False)
        new_observation, reward, done, truncated, new_info = env.step(action=[action])

        # The environment's own reward is replaced by the project's reward
        # function, which is evaluated on the raw (unprocessed) observation.
        reward = _reward(new_info, new_observation)

        # Feed the preprocessor the info that belongs to THIS step, not the
        # stale info captured at reset time.
        proc.update_input(new_observation, new_info)
        new_observation = proc.get_input()

        episode_reward += reward  # type: ignore
        episode_len += 1

        # Leaving the road is treated as a terminal state.
        if not new_info["rewards"]["on_road_reward"]:
            done = True

        agent.remember(state=observation, action=action_index, done=done,
                       reward=reward, new_state=new_observation)
        agent.train()

        observation = new_observation

    # Per-episode bookkeeping.
    episode_lens.append(episode_len)
    score_history.append(episode_reward)

    avg_score = np.mean(score_history)
    avg_history.append(avg_score)
    std_score = np.std(score_history)
    std_history.append(std_score)

    avg_score_100 = np.mean(score_history[-100:])
    avg_history_100.append(avg_score_100)

    # Save a checkpoint whenever the 100-episode average improves. The stored
    # threshold must be the same statistic that is compared against it.
    if avg_score_100 > best_score:
        best_score = avg_score_100
        agent.save_model(episode)

    agent.tensorboard.update_stats(episode_rew=episode_reward,
                                   average_rew=avg_score,
                                   average_100_reward=avg_score_100,
                                   std_rew=std_score,
                                   epsilon=agent.epsilon,
                                   episode_len=episode_len)

    print("Last Info: ", new_info)
    print("Last Reward: ", reward)

    print('episode ', episode, 'score %.1f' % episode_reward, 'ep len', episode_len,
          'avg score %.1f' % avg_score, 'avg_score_100 %.1f' % avg_score_100,
          'std score %.1f' % std_score)

In [None]:
# Evaluation: load the saved model and run one greedy episode while recording video.
# Note that `env` is rebound to the recording wrapper and closed at the end of
# this cell, so re-running the cell wraps an already-closed environment.
env = record_videos(env)

agent.load_model()
for episode in trange(1, desc='Test episodes'):
    print("RESETTED")

    # NOTE(review): the agent was constructed and trained on proc.get_input(),
    # but this cell feeds it the raw flattened observation. Confirm both have the
    # same shape and content, or switch this cell to proc.update_input()/get_input().
    observation, info = env.reset()
    observation = observation.flatten()

    episode_reward = 0
    episode_len = 0
    done = False
    truncated = False

    # Stop on truncation as well as termination: Gymnasium requires a reset
    # once either flag is reported.
    while not (done or truncated):

        action, action_index = agent.get_action(observation, deterministic=True)
        new_observation, reward, done, truncated, new_info = env.step(action=[action])

        # The environment's reward is replaced by the project's reward function.
        reward = _reward(new_info, new_observation)

        new_observation = new_observation.flatten()

        episode_reward += reward  # type: ignore
        episode_len += 1

        # Leaving the road ends the episode.
        if not new_info["rewards"]["on_road_reward"]:
            done = True

        observation = new_observation

    # Show the final flat observation as two rows of equal length.
    print("Last Observation: ", observation.reshape((2,(len(observation)+1)//2)))
    print("Last Info: ", new_info)
    print("Last Reward: ", reward)

env.close()
show_videos()