In [151]:
import gymnasium
import numpy as np
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from collections import deque
import random



In [152]:
env=gymnasium.make('CartPole-v1',render_mode="human")

In [153]:
env.reset()

(array([-0.02996851, -0.02417919, -0.03962907,  0.02692981], dtype=float32),
 {})

In [154]:
env.render()

In [155]:
print(env.observation_space.shape)

(4,)


In [156]:
print(env.action_space.n)

2


In [157]:
env.action_space.sample()

np.int64(1)

In [158]:
class DQN:
    def __init__(self,env):
        self.env = env
        self.memory=deque(maxlen=2000)
        self.gamma=0.99
        self.epsilon=1.00
        self.epsilon_min=0.01
        self.epsilon_decay=0.995
        self.learning_rate=0.001
        self.tau=0.125
        self.model=self.create_model()
        self.target_model=self.create_model()
    def create_model(self):
            model = Sequential()
            input_dim = env.observation_space.shape[0]
            model.add(Dense(24, input_dim=input_dim,activation='relu'))
            model.add(Dense(34, activation='relu'))
            model.add(Dense(44, activation='relu'))
            model.add(Dense(self.env.action_space.n ,activation='linear'))
            model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=self.learning_rate))
            return model
    def act(self,state):
            if np.random.rand()<self.epsilon:
                return env.action_space.sample()
            else:
                state=np.array(state,dtype=np.float32).reshape(1,-1)
                return np.argmax(self.model.predict(state))
    def replay(self,batch_size=50):
        if len(self.memory)<batch_size:
            return

        samples = random.sample(self.memory, batch_size)
        states = np.vstack([s[0] for s in samples])
        targets = self.model.predict(states, verbose=0)
        for i, sample in enumerate(samples):
            state, action, reward, new_state, done = sample
            if done:
                targets[i][action] = reward
            else:
                Q_future=max(self.target_model.predict(new_state,verbose=0)[0])
                targets[i][action]=reward+self.gamma*Q_future

        self.model.fit(states,targets,epochs=1,verbose=0)

    def remember(self,state,action,reward,new_state,done):
        self.memory.append([state,action,reward,new_state,done])
    def target_train(self):
        weights=self.model.get_weights()
        target_weights=self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i]=weights[i]*self.tau+target_weights[i]*(1-self.tau)
        self.target_model.set_weights(target_weights)

    def save_model(self,fn):
        self.model.save(fn)



In [159]:
gamma=0.99
epsilon=0
dqn_model=DQN(env=env)

In [160]:
trails=1000
max_steps=100
batch_size=50
for episodes in range(trails):
    state,_=env.reset()
    c_state=state.reshape(1,-1)
    steps_counts=0
    total_reward=0
    for step in range(max_steps):
        steps_counts+=1
        action=dqn_model.act(c_state)
        new_state,reward,terminated,truncated,_=env.step(action)
        done=terminated or truncated
        dqn_model.remember(c_state,action,reward,new_state,done)
        c_state=new_state.reshape(1,-1).astype(np.float32)
        dqn_model.replay(batch_size=batch_size)
        dqn_model.target_train()
        total_reward+=reward
        print(f"\r Episode {episodes+1} steps {steps_counts}", end="")
        if done :
            break
    dqn_model.epsilon=max(dqn_model.epsilon_min,dqn_model.epsilon*dqn_model.epsilon_decay)
    print(f" Episode: {episodes+1}/{trails} steps: {steps_counts} total reward: {total_reward:.2f} Epsilon: {dqn_model.epsilon:.2f}")
    dqn_model.save_model("snow.keras")



 Episode 1 steps 11 Episode: 1/1000 steps: 11 total reward: 11.00 Epsilon: 0.99
 Episode 2 steps 13 Episode: 2/1000 steps: 13 total reward: 13.00 Epsilon: 0.99
 Episode 3 steps 18 Episode: 3/1000 steps: 18 total reward: 18.00 Epsilon: 0.99
 Episode 4 steps 7

ValueError: Exception encountered when calling Sequential.call().

[1mInvalid input shape for input Tensor("data:0", shape=(4,), dtype=float32). Expected shape (None, 4), but input has incompatible shape (4,)[0m

Arguments received by Sequential.call():
  • inputs=tf.Tensor(shape=(4,), dtype=float32)
  • training=False
  • mask=None
  • kwargs=<class 'inspect._empty'>