<a href="https://colab.research.google.com/github/mohmmdkhosravi/git-learning/blob/main/minProject4_Khosravi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
#1

# Install the required libraries
!pip install tensorflow gym keras matplotlib

# Import libraries
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# Define the Wumpus environment
class WumpusEnv:
    """Minimal Wumpus-world grid with a fixed layout.

    Positions are ``[row, col]`` lists on a ``grid_size`` x ``grid_size``
    board. The agent starts at ``[0, 0]``; the gold, the wumpus and the single
    pit are placed at fixed cells by :meth:`reset`.

    Actions: ``0`` = up (row - 1), ``1`` = down (row + 1),
    ``2`` = left (col - 1), ``3`` = right (col + 1).
    """

    def __init__(self):
        self.grid_size = 4
        self.reset()

    def reset(self):
        """Restore the initial layout and return the agent's start position.

        Returns:
            A fresh ``[row, col]`` list. It is a copy, so later calls to
            :meth:`step` do not modify it.
        """
        self.agent_pos = [0, 0]
        self.gold_pos = [3, 3]
        self.wumpus_pos = [1, 1]
        self.pits = [[2, 2]]
        self.done = False
        return list(self.agent_pos)

    def step(self, action):
        """Move the agent one cell and score the resulting position.

        Moves that would leave the grid are clamped to the border. An action
        outside ``0..3`` leaves the agent where it is (and is still scored).

        Args:
            action: Integer action code (see class docstring).

        Returns:
            ``(position, reward, done)`` where ``position`` is a copy of the
            agent's ``[row, col]``, ``reward`` is ``100`` on the gold cell,
            ``-100`` on the wumpus or a pit, and ``-1`` otherwise.
        """
        row, col = self.agent_pos
        last = self.grid_size - 1

        if action == 0:  # up
            row = max(0, row - 1)
        elif action == 1:  # down
            row = min(last, row + 1)
        elif action == 2:  # left
            col = max(0, col - 1)
        elif action == 3:  # right
            col = min(last, col + 1)

        self.agent_pos = [row, col]

        if self.agent_pos == self.gold_pos:
            reward = 100
            self.done = True
        elif self.agent_pos == self.wumpus_pos or self.agent_pos in self.pits:
            reward = -100
            self.done = True
        else:
            reward = -1

        # Hand out a copy: returning the internal list would let the next
        # step() silently rewrite states the caller has already stored.
        return list(self.agent_pos), reward, self.done

    def render(self):
        """Print the grid as a numeric array.

        Cell codes: 1 = agent, 2 = gold, 3 = wumpus, 4 = pit, 0 = empty.
        Markers are written in that order, so when the agent shares a cell
        with another object the agent's marker is overwritten.
        """
        grid = np.zeros((self.grid_size, self.grid_size))
        grid[tuple(self.agent_pos)] = 1
        grid[tuple(self.gold_pos)] = 2
        grid[tuple(self.wumpus_pos)] = 3
        for pit in self.pits:
            grid[tuple(pit)] = 4
        print(grid)

# DQN agent hyperparameters
EPISODES = 1000  # number of training episodes run by the training loop
LEARNING_RATE = 0.001  # step size passed to the Adam optimizer
DISCOUNT_RATE = 0.95  # weight of the next state's best Q-value in the target
EXPLORATION_MAX = 1.0  # initial probability of picking a random action
EXPLORATION_MIN = 0.01  # decay stops once the exploration rate is at or below this
EXPLORATION_DECAY = 0.995  # factor applied to the exploration rate per replay() call

class DQNAgent:
    """Epsilon-greedy DQN agent backed by a small dense Q-network.

    Transitions are kept in a bounded replay memory (the oldest entries are
    dropped once 2000 are stored) and the network is trained on random
    samples drawn from it.
    """

    def __init__(self, state_size, action_size):
        """Create the agent.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.exploration_rate = EXPLORATION_MAX

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network (two 24-unit ReLU layers, linear head)."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        model.compile(loss='mse', optimizer=Adam(learning_rate=LEARNING_RATE))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Network input for one state; expected to be shaped
                ``(1, state_size)`` as produced by the training loop.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type (argmax yields a numpy integer).
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train on ``batch_size`` random transitions, then decay exploration.

        Each sampled transition is fitted individually: the Q-value of the
        taken action is moved toward ``reward`` (terminal transitions) or
        ``reward + DISCOUNT_RATE * max Q(next_state)`` (otherwise).

        Raises:
            ValueError: From ``random.sample`` if the memory holds fewer than
                ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + DISCOUNT_RATE * np.amax(next_q)
            # Keep the network's own estimates for the other actions so only
            # the taken action contributes to the loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.exploration_rate > EXPLORATION_MIN:
            self.exploration_rate *= EXPLORATION_DECAY

# Train the DQN agent on the Wumpus grid
env = WumpusEnv()
state_size = 2
action_size = 4
agent = DQNAgent(state_size, action_size)
batch_size = 32
rewards = []

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    # Cap each episode at 500 steps so a wandering agent cannot loop forever.
    for t in range(500):
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    # Record every episode, including those cut off by the step cap, so that
    # `rewards` always holds exactly one entry per episode.
    rewards.append(total_reward)
    print(f"Episode: {e+1}/{EPISODES}, Total Reward: {total_reward}")

# Plot the cumulative reward of each recorded episode.
# The x-axis is derived from `rewards` itself so the plot still works when
# fewer than EPISODES episodes were recorded.
plt.plot(range(len(rewards)), rewards)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.title('Cumulative Reward over Episodes')
plt.show()






[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode: 3/1000, Total Reward: -116
Episode: 4/1000, Total Reward: -102
Episode: 5/1000, Total Reward: -102
Episode: 6/1000, Total Reward: -101
Episode: 7/1000, Total Reward: -103
Episode: 8/1000, Total Reward: -102
Episode: 9/1000, Total Reward: -110
Episode: 10/1000, Total Reward: -117
Episode: 11/1000, Total Reward: -108
Episode: 12/1000, Total Reward: -124
Episode: 13/1000, Total Reward: -105
Episode: 14/1000, Total Reward: -101


KeyboardInterrupt: 

In [None]:
#2
import gym
import numpy as np
from collections import deque
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

# Training parameters
EPISODES = 1000  # number of training episodes run by the training loop
LEARNING_RATE = 0.001  # step size passed to the Adam optimizer
DISCOUNT_RATE = 0.99  # weight of the next state's best Q-value in the target
EXPLORATION_MAX = 1.0  # initial probability of picking a random action
EXPLORATION_MIN = 0.01  # decay stops once the exploration rate is at or below this
EXPLORATION_DECAY = 0.995  # factor applied to the exploration rate per replay() call

class DQNAgent:
    """Epsilon-greedy DQN agent backed by a small dense Q-network.

    Transitions are kept in a bounded replay memory (the oldest entries are
    dropped once 2000 are stored) and the network is trained on random
    samples drawn from it.
    """

    def __init__(self, state_size, action_size):
        """Create the agent.

        Args:
            state_size: Number of features in one state vector.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.exploration_rate = EXPLORATION_MAX

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network (two 24-unit ReLU layers, linear head)."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `lr` is a deprecated alias that newer Keras releases reject;
        # `learning_rate` is the supported keyword.
        model.compile(loss='mse', optimizer=Adam(learning_rate=LEARNING_RATE))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Network input for one state; expected to be shaped
                ``(1, state_size)`` as produced by the training loop.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.exploration_rate:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type (argmax yields a numpy integer).
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train on ``batch_size`` random transitions, then decay exploration.

        Each sampled transition is fitted individually: the Q-value of the
        taken action is moved toward ``reward`` (terminal transitions) or
        ``reward + DISCOUNT_RATE * max Q(next_state)`` (otherwise).

        Raises:
            ValueError: From ``random.sample`` if the memory holds fewer than
                ``batch_size`` transitions.
        """
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + DISCOUNT_RATE * np.amax(next_q)
            # Keep the network's own estimates for the other actions so only
            # the taken action contributes to the loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.exploration_rate > EXPLORATION_MIN:
            self.exploration_rate *= EXPLORATION_DECAY

# Train the DQN agent on LunarLander
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
batch_size = 64
rewards = []

for e in range(EPISODES):
    # Newer Gym releases return (observation, info) from reset(); older ones
    # return the observation alone. Accept both.
    reset_result = env.reset()
    state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
    state = np.reshape(state, [1, state_size])
    total_reward = 0
    # Cap each episode at 500 steps.
    for t in range(500):
        action = agent.act(state)
        # Newer Gym releases return a 5-tuple with separate terminated and
        # truncated flags; older ones return a 4-tuple with a single done flag.
        step_result = env.step(action)
        if len(step_result) == 5:
            next_state, reward, terminated, truncated, _ = step_result
            done = terminated or truncated
        else:
            next_state, reward, done, _ = step_result
        total_reward += reward
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

    # Record every episode, including those cut off by the step cap, so that
    # `rewards` always holds exactly one entry per episode.
    rewards.append(total_reward)

# Plot the cumulative reward of each recorded episode.
# The x-axis is derived from `rewards` itself so the plot still works when
# fewer than EPISODES episodes were recorded.
plt.plot(range(len(rewards)), rewards)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.show()
