In [4]:
!pip install flappy-bird-gymnasium
!pip install gymnasium

!pip install pyyaml

Collecting pyyaml
  Downloading PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.1 kB)
Downloading PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl (172 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m172.0/172.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: pyyaml
Successfully installed pyyaml-6.0.2


In [9]:
!pip install torch
!pip install numpy


377.85s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




384.10s - pydevd: Sending message related to process being replaced timed-out after 5 seconds




In [11]:
import torch
from torch import nn
from torch import optim
from torch.nn import functional as F

class DQN(nn.Module):
    """Small fully connected network with a single hidden layer.

    The network maps an input of width ``state_dim`` to ``action_dim``
    outputs (one per action) through one ReLU-activated hidden layer.

    Args:
        state_dim: Number of input features.
        action_dim: Number of output units.
        hidden_dim: Width of the hidden layer (defaults to 256).
    """

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super().__init__()
        # Created input-side first, so parameters are initialised fc1 then fc2.
        self.fc1 = nn.Linear(in_features=state_dim, out_features=hidden_dim)
        self.fc2 = nn.Linear(in_features=hidden_dim, out_features=action_dim)

    def forward(self, x):
        """Return ``fc2(relu(fc1(x)))`` for the input tensor ``x``."""
        hidden = self.fc1(x)
        activated = F.relu(hidden)
        return self.fc2(activated)

In [3]:
from collections import deque
import random

class ReplayBuffer():
    """Bounded FIFO container of transitions with uniform random sampling.

    Args:
        capacity: Maximum number of stored items; once full, appending
            discards the oldest entry.
        seed: Optional seed. When given, it seeds the module-level
            ``random`` generator (not a private one).
    """

    def __init__(self, capacity, seed=None):
        self.buffer = deque(maxlen=capacity)
        if seed is None:
            return
        random.seed(seed)

    def append(self, transition):
        """Store ``transition`` at the newest end of the buffer."""
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored items chosen at random."""
        picked = random.sample(self.buffer, batch_size)
        return picked

    def __len__(self):
        """Return the number of items currently stored."""
        return len(self.buffer)
    

In [16]:
import flappy_bird_gymnasium
import gymnasium
import itertools
import yaml

# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

class Agent:
    """Epsilon-greedy agent that rolls out episodes on CartPole-v1.

    Hyperparameters are read from the ``cartpole1`` section of a YAML file
    and exposed as attributes. ``run`` collects transitions into a replay
    buffer while training; no optimisation step is performed in this class.
    """

    def __init__(self, config_path='hyperparameters.yml'):
        """Load the ``cartpole1`` hyperparameter set from ``config_path``.

        Args:
            config_path: Path to the YAML file holding the hyperparameters.

        Raises:
            OSError: If the file cannot be opened.
            KeyError: If the ``cartpole1`` section or one of its keys is missing.
        """
        with open(config_path, 'r') as file:
            self.config = yaml.safe_load(file)['cartpole1']
        self.replay_buffer_size = self.config['replay_buffer_size']
        self.batch_size = self.config['batch_size']
        self.gamma = self.config['gamma']
        self.learning_rate = self.config['learning_rate']
        self.epsilon_start = self.config['epsilon_start']
        self.epsilon_end = self.config['epsilon_end']
        self.epsilon_decay = self.config['epsilon_decay']
        self.epsilon_min = self.config['epsilon_min']
        self.target_update = self.config['target_update']

    def run(self, is_training=True, render=False, max_episodes=None):
        """Roll out episodes with an epsilon-greedy policy.

        Args:
            is_training: When True, actions are explored with probability
                epsilon and every transition is appended to a replay buffer.
                When False, the policy network's argmax action is always used.
            render: When True the environment is created with
                ``render_mode="human"``; otherwise no window is opened.
            max_episodes: Optional cap on the number of episodes. ``None``
                (the default) keeps running until interrupted.

        Returns:
            A ``(reward_episode, epsilon_history)`` tuple of per-episode
            total rewards and the epsilon value after each episode. Only
            reached when ``max_episodes`` is set.
        """
        # env = gymnasium.make("FlappyBird-v0", render_mode="human", use_lidar=True)
        env = gymnasium.make("CartPole-v1", render_mode="human" if render else None)

        num_actions = env.action_space.n
        num_states = env.observation_space.shape[0]
        reward_episode = []
        epsilon_history = []
        policy_dqn = DQN(num_states, num_actions).to(device)

        if is_training:
            buffer = ReplayBuffer(capacity=self.replay_buffer_size)

        # Decay a local copy so the configured starting value is not
        # overwritten and a second call to run() starts from it again.
        epsilon = self.epsilon_start
        episodes = itertools.count() if max_episodes is None else range(max_episodes)

        try:
            for episode in episodes:
                state, _ = env.reset()
                state = torch.tensor(state, dtype=torch.float, device=device)

                episode_reward = 0.0
                done = False

                while not done:
                    if is_training and random.random() < epsilon:
                        action = env.action_space.sample()
                        action = torch.tensor(action, dtype=torch.long, device=device)
                    else:
                        with torch.no_grad():
                            action = policy_dqn(state.unsqueeze(0)).squeeze().argmax()

                    new_state, reward, terminated, truncated, _ = env.step(action.item())
                    # An episode also ends when the environment truncates it
                    # (e.g. a time limit); stepping further would require a reset.
                    done = terminated or truncated

                    episode_reward += reward

                    new_state = torch.tensor(new_state, dtype=torch.float, device=device)
                    reward = torch.tensor(reward, dtype=torch.float, device=device)

                    if is_training:
                        # Store the successor state (not the info dict) and the
                        # true `terminated` flag, so a truncated episode is not
                        # recorded as having reached a terminal state.
                        buffer.append((state, action, new_state, reward, terminated))

                    state = new_state

                reward_episode.append(episode_reward)
                epsilon = max(self.epsilon_min, epsilon * self.epsilon_decay)
                epsilon_history.append(epsilon)
        finally:
            # Release the environment (and its window) even on KeyboardInterrupt.
            env.close()

        return reward_episode, epsilon_history


# Entry point: build the agent from the YAML config, then start a rendered
# training run.
if __name__ == "__main__":
    agent = Agent(config_path="hyperparameters.yml")
    agent.run(render=True, is_training=True)

KeyboardInterrupt: 