Exploration strategies are essential in reinforcement learning to balance exploration (trying new actions) and exploitation (choosing actions with known high value).

In [None]:
import numpy as np

class ExplorationStrategy:
    """Common base for action-selection strategies.

    Stores the size of the action space; concrete strategies override
    ``get_action`` to pick an action index from a vector of Q-values.
    """

    def __init__(self, num_actions):
        """Remember how many discrete actions are available.

        Args:
            num_actions: Number of actions the strategy can choose from.
        """
        self.num_actions = num_actions

    def get_action(self, q_values):
        """Select an action for the given Q-values.

        Args:
            q_values: Estimated value of each action.

        Raises:
            NotImplementedError: Always; subclasses provide the behaviour.
        """
        message = "Subclasses should implement this method"
        raise NotImplementedError(message)

class EpsilonGreedy(ExplorationStrategy):
    """Epsilon-greedy selection: random action with probability ``epsilon``."""

    def __init__(self, epsilon, num_actions):
        """Store the exploration rate alongside the action-space size.

        Args:
            epsilon: Probability of choosing a uniformly random action.
            num_actions: Number of actions available.
        """
        super().__init__(num_actions)
        self.epsilon = epsilon

    def get_action(self, q_values):
        """Return a random action index or the index of the largest Q-value.

        Args:
            q_values: Estimated value of each action.

        Returns:
            The chosen action index.
        """
        # One uniform draw decides between exploring and exploiting.
        should_explore = np.random.rand() < self.epsilon
        if not should_explore:
            return np.argmax(q_values)
        return np.random.randint(self.num_actions)

class SoftmaxExploration(ExplorationStrategy):
    """Boltzmann (softmax) exploration over Q-values.

    Actions are sampled with probability proportional to
    ``exp(q / temperature)``.
    """

    def __init__(self, temperature, num_actions):
        """Store the softmax temperature alongside the action-space size.

        Args:
            temperature: Divisor applied to the Q-values before the softmax.
            num_actions: Number of actions available.
        """
        super().__init__(num_actions)
        self.temperature = temperature

    def get_action(self, q_values):
        """Sample an action index from the softmax distribution.

        Args:
            q_values: Estimated value of each action (array-like with
                ``num_actions`` entries).

        Returns:
            The sampled action index.
        """
        scaled = np.asarray(q_values, dtype=float) / self.temperature
        # Shifting by the maximum leaves the softmax unchanged but keeps
        # np.exp from overflowing to inf (and the ratio from becoming NaN)
        # when Q-values are large or the temperature is small.
        scaled -= np.max(scaled)
        weights = np.exp(scaled)
        probabilities = weights / np.sum(weights)
        return np.random.choice(self.num_actions, p=probabilities)

# Example Usage:

# Four actions, each with a current Q-value estimate
num_actions = 4
q_values = np.array([0.5, 1.0, 0.8, 0.3])

# Build both strategies: epsilon-greedy (epsilon=0.1) and softmax (temperature=0.5)
epsilon_greedy = EpsilonGreedy(epsilon=0.1, num_actions=num_actions)
softmax_exploration = SoftmaxExploration(temperature=0.5, num_actions=num_actions)

# Draw one action from each strategy (epsilon-greedy first, then softmax)
epsilon_greedy_action = epsilon_greedy.get_action(q_values)
softmax_action = softmax_exploration.get_action(q_values)

# Report the selected actions
print(f"Epsilon-Greedy Action: {epsilon_greedy_action}")
print(f"Softmax Action: {softmax_action}")


Epsilon-Greedy Action: 1
Softmax Action: 2


In this example, EpsilonGreedy implements the epsilon-greedy strategy, and SoftmaxExploration implements softmax exploration. You can create instances of these classes and use the get_action method to obtain the action to take based on the exploration strategy and current Q-values. Adjust the parameters like epsilon or temperature to control the level of exploration.

# DQN

Implementing a Deep Q-Network (DQN) in Python using TensorFlow involves several steps. Below is a simplified example. Note that a complete DQN implementation may involve additional features such as experience replay and target networks for stability.

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

class DQN:
    """Small fully connected Q-network built with Keras.

    Maps a state vector of length ``state_dim`` to one Q-value per action.
    """

    def __init__(self, state_dim, num_actions):
        """Store the problem dimensions and build the Keras model.

        Args:
            state_dim: Length of the state vector.
            num_actions: Number of discrete actions (network outputs).
        """
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.build_model()

    def build_model(self):
        """Create and compile the network, storing it on ``self.model``."""
        model = models.Sequential([
            layers.Dense(64, activation='relu', input_shape=(self.state_dim,)),
            layers.Dense(64, activation='relu'),
            layers.Dense(self.num_actions, activation='linear')
        ])

        model.compile(optimizer='adam', loss='mse')

        self.model = model

    def train(self, states, targets):
        """Fit the model for one epoch on the given batch.

        Args:
            states: Batch of states passed to ``model.fit`` as inputs.
            targets: Batch of target Q-value vectors.
        """
        self.model.fit(states, targets, epochs=1, verbose=0)

    def predict(self, state):
        """Return the Q-values for a single state.

        Args:
            state: One state vector (not batched).

        Returns:
            The first row of the model's prediction for the one-element batch.
        """
        # verbose=0: this is called on every environment step, and the
        # default progress bar would otherwise flood the output.
        return self.model.predict(np.array([state]), verbose=0)[0]

def epsilon_greedy(action_values, epsilon):
    """Choose an action index with the epsilon-greedy rule.

    Args:
        action_values: Estimated value of each action.
        epsilon: Probability of picking a uniformly random action.

    Returns:
        A random index into ``action_values`` with probability ``epsilon``,
        otherwise the index of the largest value.
    """
    # A single uniform draw decides between exploring and exploiting.
    explore = np.random.rand() < epsilon
    if not explore:
        return np.argmax(action_values)
    return np.random.randint(len(action_values))

# Example Usage:

# Environment parameters
state_dim = 4  # dimensionality of the state
num_actions = 2  # number of actions
epsilon = 0.1  # exploration parameter
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
max_steps_per_episode = 200  # episode length for the stand-in environment below

# Create DQN agent
dqn_agent = DQN(state_dim=state_dim, num_actions=num_actions)

# Example training loop (replace with your environment interaction)
for episode in range(1000):
    state = np.random.rand(state_dim)  # replace with your environment's initial state
    done = False
    step = 0

    while not done:
        # Choose action using epsilon-greedy strategy
        action_values = dqn_agent.predict(state)
        action = epsilon_greedy(action_values, epsilon)

        # Take action, observe next state and reward (replace with your environment's step)
        next_state = np.random.rand(state_dim)
        reward = np.random.rand()

        # The random stand-in environment never reaches a terminal state, so
        # end the episode after a fixed number of steps; without this the
        # `while not done` loop never exits. A real environment should supply
        # `done` from its own step function.
        step += 1
        done = step >= max_steps_per_episode

        # Calculate target Q-value (no bootstrapping past the end of an episode)
        if done:
            target = reward
        else:
            target = reward + gamma * np.max(dqn_agent.predict(next_state))

        # Only the taken action's Q-value is moved toward the target; the
        # other outputs keep their current predictions.
        targets = np.array(action_values, copy=True)
        targets[action] = target

        # Train the DQN agent
        dqn_agent.train(np.array([state]), targets[np.newaxis, :])

        state = next_state

    # Optionally update epsilon or other parameters during training
    epsilon *= 0.99

# Save or use the trained DQN model as needed
dqn_agent.model.save("dqn_model.h5")


Streaming output truncated to the last 5000 lines.


KeyboardInterrupt: 

Below is a simplified example of a Deep Q-Network (DQN) implemented using PyTorch. This example assumes a simple environment, and you might need to adapt it for your specific use case.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random

class DQNNetwork(nn.Module):
    """Three-layer fully connected network producing one Q-value per action."""

    def __init__(self, state_dim, num_actions):
        """Create the linear layers.

        Args:
            state_dim: Size of the input state vector.
            num_actions: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, num_actions)

    def forward(self, x):
        """Apply two ReLU-activated hidden layers and a linear output layer.

        Args:
            x: Input tensor whose last dimension is ``state_dim``.

        Returns:
            Tensor of Q-values whose last dimension is ``num_actions``.
        """
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        q_values = self.fc3(hidden)
        return q_values

class DQNAgent:
    """DQN agent with an online Q-network and a target network.

    Learns from one transition at a time; the target network is
    re-synchronised with the online network whenever an episode ends.
    """

    def __init__(self, state_dim, num_actions, epsilon=0.1, gamma=0.99, lr=0.001):
        """Build both networks, the optimizer and the loss.

        Args:
            state_dim: Size of the state vector.
            num_actions: Number of discrete actions.
            epsilon: Probability of taking a random action.
            gamma: Discount factor for the bootstrapped target.
            lr: Learning rate for the Adam optimizer.
        """
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.q_network = DQNNetwork(state_dim, num_actions)
        self.target_network = DQNNetwork(state_dim, num_actions)
        # Start the target network as an exact copy of the online network.
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()

    def select_action(self, state):
        """Pick an action with the epsilon-greedy rule.

        Args:
            state: Current state (array-like of length ``state_dim``).

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)
        with torch.no_grad():
            q_values = self.q_network(torch.as_tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def train(self, state, action, reward, next_state, done):
        """Take one gradient step on a single transition.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received.
            next_state: State observed after the action.
            done: Whether the transition ended the episode.
        """
        state_tensor = torch.as_tensor(state, dtype=torch.float32)
        next_state_tensor = torch.as_tensor(next_state, dtype=torch.float32)

        # Bootstrapped target; terminal transitions use the reward alone.
        target = float(reward)
        if not done:
            with torch.no_grad():
                next_q_max = torch.max(self.target_network(next_state_tensor)).item()
            target += self.gamma * next_q_max

        # Single forward pass. The target vector is a detached copy of the
        # prediction with only the taken action's entry replaced, so it has
        # exactly the same shape as the prediction (no broadcasting warning
        # from MSELoss) and only that entry contributes to the loss.
        predicted_q = self.q_network(state_tensor)
        target_q = predicted_q.detach().clone()
        target_q[action] = target

        self.optimizer.zero_grad()
        loss = self.loss_fn(predicted_q, target_q)
        loss.backward()
        self.optimizer.step()

        # Synchronise the target network at the end of each episode.
        if done:
            self.update_target_network()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

# Example Usage:

# Environment parameters
state_dim = 4  # dimensionality of the state
num_actions = 2  # number of actions
max_steps_per_episode = 200  # episode length for the stand-in environment below

# Create DQN agent
dqn_agent = DQNAgent(state_dim=state_dim, num_actions=num_actions)

# Example training loop (replace with your environment interaction)
for episode in range(1000):
    state = np.random.rand(state_dim)  # replace with your environment's initial state
    done = False
    step = 0

    while not done:
        # Choose action using epsilon-greedy strategy
        action = dqn_agent.select_action(state)

        # Take action, observe next state and reward (replace with your environment's step)
        next_state = np.random.rand(state_dim)
        reward = np.random.rand()

        # The random stand-in environment never reaches a terminal state, so
        # end the episode after a fixed number of steps; without this the
        # `while not done` loop never exits. A real environment should supply
        # `done` from its own step function.
        step += 1
        done = step >= max_steps_per_episode

        # Train the DQN agent on the transition, including its terminal flag
        dqn_agent.train(state, action, reward, next_state, done)

        state = next_state

    # Optionally update epsilon or other parameters during training
    dqn_agent.epsilon *= 0.99

# Save or use the trained DQN model as needed
torch.save(dqn_agent.q_network.state_dict(), "dqn_model.pth")


  return F.mse_loss(input, target, reduction=self.reduction)


The version below adds experience replay, a target network, and tunable parameters to improve stability and performance.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque

class DQNNetwork(nn.Module):
    """Fully connected Q-network: two hidden layers of 64 units, linear head."""

    def __init__(self, state_dim, num_actions):
        """Create the three linear layers.

        Args:
            state_dim: Size of the input state vector.
            num_actions: Number of outputs (one Q-value per action).
        """
        super().__init__()
        width = 64
        self.fc1 = nn.Linear(state_dim, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, num_actions)

    def forward(self, x):
        """Compute Q-values for the given state(s).

        Args:
            x: Input tensor whose last dimension is ``state_dim``.

        Returns:
            Tensor whose last dimension is ``num_actions``.
        """
        # Both hidden layers are followed by a ReLU; the head stays linear.
        for hidden_layer in (self.fc1, self.fc2):
            x = torch.relu(hidden_layer(x))
        return self.fc3(x)

class ExperienceReplayBuffer:
    """Fixed-capacity FIFO store of transitions with uniform random sampling."""

    def __init__(self, capacity):
        """Create an empty buffer.

        Args:
            capacity: Maximum number of transitions kept; once full, the
                oldest transition is dropped for each new one.
        """
        self.buffer = deque(maxlen=capacity)

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add_experience(self, experience):
        """Append one transition.

        Args:
            experience: Tuple ``(state, action, reward, next_state, done)``.
        """
        self.buffer.append(experience)

    def sample_batch(self, batch_size):
        """Draw a random batch without replacement and convert it to tensors.

        Args:
            batch_size: Number of transitions to sample.

        Returns:
            Tuple ``(states, actions, rewards, next_states, dones)`` of
            tensors; ``actions`` is ``torch.long``, the rest ``torch.float32``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                transitions (raised by ``random.sample``).
        """
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        # Stack each field into a single ndarray first: building a tensor
        # directly from a sequence of ndarrays is the slow path that PyTorch
        # warns about.
        return (
            torch.as_tensor(np.asarray(states), dtype=torch.float32),
            torch.as_tensor(np.asarray(actions), dtype=torch.long),
            torch.as_tensor(np.asarray(rewards), dtype=torch.float32),
            torch.as_tensor(np.asarray(next_states), dtype=torch.float32),
            torch.as_tensor(np.asarray(dones), dtype=torch.float32),
        )

class DQNAgent:
    """DQN agent with experience replay and a periodically synced target network."""

    def __init__(self, state_dim, num_actions, epsilon=0.1, gamma=0.99, lr=0.001, batch_size=64, target_update_freq=1000, replay_buffer_capacity=10000):
        """Build the networks, optimizer, loss and replay buffer.

        Args:
            state_dim: Size of the state vector.
            num_actions: Number of discrete actions.
            epsilon: Probability of taking a random action.
            gamma: Discount factor for the bootstrapped target.
            lr: Learning rate for the Adam optimizer.
            batch_size: Number of transitions sampled per update.
            target_update_freq: Number of ``train`` calls between target
                network synchronisations.
            replay_buffer_capacity: Maximum number of stored transitions.
        """
        self.state_dim = state_dim
        self.num_actions = num_actions
        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.target_update_freq = target_update_freq
        self.q_network = DQNNetwork(state_dim, num_actions)
        self.target_network = DQNNetwork(state_dim, num_actions)
        # Start the target network as an exact copy of the online network.
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.replay_buffer = ExperienceReplayBuffer(replay_buffer_capacity)
        self.steps = 0  # number of train() calls so far

    def select_action(self, state):
        """Pick an action with the epsilon-greedy rule.

        Args:
            state: Current state (array-like of length ``state_dim``).

        Returns:
            The chosen action index as a Python int.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.num_actions - 1)
        with torch.no_grad():
            q_values = self.q_network(torch.as_tensor(state, dtype=torch.float32))
        return torch.argmax(q_values).item()

    def train(self, state, action, reward, next_state, done):
        """Store a transition and, once enough are buffered, learn from a batch.

        Args:
            state: State in which the action was taken.
            action: Index of the action taken.
            reward: Reward received.
            next_state: State observed after the action.
            done: Whether the transition ended the episode.
        """
        self.replay_buffer.add_experience((state, action, reward, next_state, done))
        self.steps += 1

        # Learning starts only when a full batch can be sampled.
        if len(self.replay_buffer.buffer) >= self.batch_size:
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update_q_network(*batch)

        # Update target network periodically
        if self.steps % self.target_update_freq == 0:
            self.update_target_network()

    def update_q_network(self, states, actions, rewards, next_states, dones):
        """Take one gradient step on a batch of transitions.

        Args:
            states: Float tensor of shape ``(batch, state_dim)``.
            actions: Long tensor of shape ``(batch,)``.
            rewards: Float tensor of shape ``(batch,)``.
            next_states: Float tensor of shape ``(batch, state_dim)``.
            dones: Float tensor of shape ``(batch,)``; 1.0 marks a terminal
                transition.
        """
        self.optimizer.zero_grad()

        # Q-value of the action actually taken, one per sample: shape (batch,).
        taken_q = self.q_network(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            # Keep every term at shape (batch,). Mixing (batch,) with
            # (batch, 1) would broadcast the targets to (batch, batch) and
            # regress each prediction toward every sample's target.
            next_q_max = self.target_network(next_states).max(dim=1).values
            target_values = rewards + (1.0 - dones) * self.gamma * next_q_max

        loss = self.loss_fn(taken_q, target_values)
        loss.backward()
        self.optimizer.step()

    def update_target_network(self):
        """Copy the online network's weights into the target network."""
        self.target_network.load_state_dict(self.q_network.state_dict())

# Example Usage:

# Environment parameters
state_dim = 4  # dimensionality of the state
num_actions = 2  # number of actions
max_steps_per_episode = 200  # episode length for the stand-in environment below

# Create DQN agent
dqn_agent = DQNAgent(state_dim=state_dim, num_actions=num_actions)

# Example training loop (replace with your environment interaction)
for episode in range(1000):
    state = np.random.rand(state_dim)  # replace with your environment's initial state
    done = False
    step = 0

    while not done:
        # Choose action using epsilon-greedy strategy
        action = dqn_agent.select_action(state)

        # Take action, observe next state and reward (replace with your environment's step)
        next_state = np.random.rand(state_dim)
        reward = np.random.rand()

        # The random stand-in environment never reaches a terminal state, so
        # end the episode after a fixed number of steps; without this the
        # `while not done` loop never exits. A real environment should supply
        # `done` from its own step function.
        step += 1
        done = step >= max_steps_per_episode

        # Train the DQN agent on the transition, including its terminal flag
        dqn_agent.train(state, action, reward, next_state, done)

        state = next_state

    # Optionally update epsilon or other parameters during training
    dqn_agent.epsilon *= 0.99

# Save or use the trained DQN model as needed
torch.save(dqn_agent.q_network.state_dict(), "dqn_model.pth")
