In [96]:
import random
import gym
import gymnasium
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense,Dropout
from collections import deque

from tensorflow.python.ops.gen_batch_ops import batch

In [97]:
# Build the MountainCar environment; render_mode='human' asks Gymnasium to
# draw the simulation in a window.
env = gymnasium.make('MountainCar-v0', render_mode='human')

In [98]:
# Start a fresh episode. The cell displays the return value: a tuple of the
# initial observation array and an info dict (empty in the recorded output).
env.reset()

(array([-0.53141516,  0.        ], dtype=float32), {})

In [99]:
# Ask the environment to render the current frame; no output was recorded
# for this cell.
env.render()

In [100]:
# Number of components in one observation (2 in the recorded output); the
# DQN class uses this as the input width of its network.
env.observation_space.shape[0]

2

In [101]:
# Number of discrete actions (3 in the recorded output); the DQN class uses
# this as the width of its output layer.
env.action_space.n


np.int64(3)

In [102]:
# Draw one random action from the action space; DQN.act does the same thing
# on its exploration branch.
env.action_space.sample()

np.int64(1)

In [103]:
class DQN:
    """Deep Q-learning agent with an online network and a soft-updated target network.

    The agent keeps a bounded replay memory of transitions, picks actions
    epsilon-greedily from the online network (``model``), and regresses the
    online network towards targets bootstrapped from ``target_model``.
    """

    def __init__(self, env):
        """Create the agent for ``env``.

        Args:
            env: Environment exposing ``observation_space.shape`` and a
                discrete ``action_space`` with ``n`` and ``sample()``.
        """
        self.env = env
        self.memory = deque(maxlen=2000)  # oldest transitions are dropped first
        self.gamma = 0.99                 # discount applied to bootstrapped value
        self.epsilon = 1.0                # current exploration probability
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.005
        self.tau = 0.125                  # soft-update mixing factor, see target_train
        self.model = self.create_model()
        self.target_model = self.create_model()
        # Start the target network as an exact copy of the online network so
        # the first bootstrapped targets are not produced by an unrelated
        # random initialisation.
        self.target_model.set_weights(self.model.get_weights())

    def create_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model mapping one observation to one
            Q-value per action (linear output layer, MSE loss, Adam).
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.env.observation_space.shape[0], activation='relu'))
        model.add(Dense(48, activation='relu'))
        model.add(Dense(24, activation='relu'))
        # action_space.n may be a numpy integer; Dense expects a plain int.
        model.add(Dense(int(self.env.action_space.n)))
        model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: One observation; any array-like that reshapes to ``(1, -1)``.

        Returns:
            A random action with probability ``epsilon``, otherwise the index
            of the largest Q-value predicted by the online network.
        """
        if np.random.random() < self.epsilon:
            return self.env.action_space.sample()
        state = np.array(state, dtype=np.float32).reshape(1, -1)
        return np.argmax(self.model.predict(state, verbose=0))

    def replay(self, batch_size=32):
        """Fit the online network on one random minibatch from memory.

        Does nothing until at least ``batch_size`` transitions are stored.
        For each sampled transition the target for the taken action is
        ``reward`` when ``done`` is true, otherwise
        ``reward + gamma * max(Q_target(new_state))``; the other actions keep
        the online network's own predictions.

        Args:
            batch_size: Number of transitions to sample.
        """
        if len(self.memory) < batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        states = np.vstack([s[0] for s in samples])
        new_states = np.vstack([s[3] for s in samples])

        targets = self.model.predict(states, verbose=0)
        # One batched forward pass through the target network instead of one
        # predict() call per sampled transition.
        future_q = np.max(self.target_model.predict(new_states, verbose=0), axis=1)

        for i, (_, action, reward, _, done) in enumerate(samples):
            if done:
                targets[i][action] = reward
            else:
                targets[i][action] = reward + self.gamma * future_q[i]

        self.model.fit(states, targets, epochs=1, verbose=0)

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory.

        ``done`` should be true only when ``new_state`` has no successor to
        bootstrap from, because ``replay`` drops the future-value term for it.
        """
        self.memory.append([state, action, reward, new_state, done])

    def target_train(self):
        """Move the target network a fraction ``tau`` towards the online network."""
        blended = [
            online * self.tau + target * (1 - self.tau)
            for online, target in zip(self.model.get_weights(), self.target_model.get_weights())
        ]
        self.target_model.set_weights(blended)

    def save_model(self, fn):
        """Save the online network to the path ``fn``."""
        self.model.save(fn)



In [104]:
# Release the environment created earlier in the notebook before rebinding
# `env`; otherwise its render window is never cleaned up.
env.close()
env=gymnasium.make('MountainCar-v0',render_mode='human')
# NOTE(review): these two globals are not read by the DQN class (it sets its
# own self.gamma / self.epsilon in __init__) nor by the training loop below;
# kept in case later cells use them.
gamma=0.9
epsilon=0.95

In [105]:
# Number of training episodes.
trails = 100
# Upper bound on steps per episode used by the training loop; the recorded
# output shows every episode ending at step 200, before this bound is hit.
trails_len = 500
# Agent trained by the loop in the next cell.
dpn_agent = DQN(env=env)

In [None]:
# Train the agent: one replay step and one soft target update per environment
# step, with exploration decayed once per episode.
steps = []  # number of steps taken in each finished episode
batch_size = 50
for episode in range(trails):
    current_state, _ = env.reset()
    current_state = current_state.reshape(1, -1)
    steps_count = 0
    total_reward = 0
    for step in range(trails_len):
        steps_count += 1
        action = dpn_agent.act(current_state)
        new_state, reward, terminated, truncated, _ = env.step(action)
        new_state = new_state.reshape(1, -1)
        # Store only `terminated` as the done flag. A time-limit truncation
        # does not make new_state terminal, so the Q-target in replay() must
        # still bootstrap from it; storing `truncated` here would teach the
        # network that the cut-off state has no future value.
        dpn_agent.remember(current_state, action, reward, new_state, terminated)
        dpn_agent.replay(batch_size=batch_size)
        dpn_agent.target_train()
        current_state = new_state
        total_reward += reward
        print(f"\rEpisode {episode+1} - Step {steps_count}", end="")
        # The episode itself ends on either signal.
        if terminated or truncated:
            break
    steps.append(steps_count)
    print(f"Episode {episode+1}/{trails} Steps: {steps_count}  Total reward: {total_reward:.2f} Epsilon: {dpn_agent.epsilon:.2f}")
    # Checkpoint after every episode so an interrupted run keeps its progress.
    dpn_agent.save_model('parnia.keras')

    # Decay exploration once per episode, never going below the floor.
    dpn_agent.epsilon = max(dpn_agent.epsilon_min, dpn_agent.epsilon * dpn_agent.epsilon_decay)

Episode 1 - Step 200Episode 1/100 Steps: 200  Total reward: -200.00 Epsilon: 1.00
Episode 2 - Step 200Episode 2/100 Steps: 200  Total reward: -200.00 Epsilon: 0.99
Episode 3 - Step 200Episode 3/100 Steps: 200  Total reward: -200.00 Epsilon: 0.99
Episode 4 - Step 200Episode 4/100 Steps: 200  Total reward: -200.00 Epsilon: 0.99
Episode 5 - Step 200Episode 5/100 Steps: 200  Total reward: -200.00 Epsilon: 0.98
Episode 6 - Step 200Episode 6/100 Steps: 200  Total reward: -200.00 Epsilon: 0.98
Episode 7 - Step 200Episode 7/100 Steps: 200  Total reward: -200.00 Epsilon: 0.97
Episode 8 - Step 200Episode 8/100 Steps: 200  Total reward: -200.00 Epsilon: 0.97
Episode 9 - Step 200Episode 9/100 Steps: 200  Total reward: -200.00 Epsilon: 0.96
Episode 10 - Step 200Episode 10/100 Steps: 200  Total reward: -200.00 Epsilon: 0.96
Episode 11 - Step 200Episode 11/100 Steps: 200  Total reward: -200.00 Epsilon: 0.95
Episode 12 - Step 200Episode 12/100 Steps: 200  Total reward: -200.00 Epsilon: 0.95
Episode 13