In [1]:
# use "pip install [PACKAGE_NAME]" to get any required packages you don't have

import gymnasium as gym
import numpy as np
import random
from keras import Sequential
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
import matplotlib.pyplot as plt
from keras.activations import relu, linear

In [2]:
env = gym.make('MountainCar-v0', render_mode = "human")
#env.seed(134)
np.random.seed(458)

In [3]:
class DQN:

    """ Implementation of deep q learning algorithm """

    def __init__(self, action_space, state_space):

        self.action_space = action_space
        self.state_space = state_space
        self.epsilon = 1.0
        self.gamma = .95
        self.batch_size = 64
        self.epsilon_min = .01
        self.lr = 0.001
        self.epsilon_decay = .995
        self.memory = deque(maxlen=100000)
        self.model = self.build_model()

    def build_model(self):

        model = Sequential()
        model.add(Dense(20, input_dim=self.state_space, activation=relu))
        model.add(Dense(25, activation=relu))
        model.add(Dense(self.action_space, activation=linear))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):

        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_space)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self):

        if len(self.memory) < self.batch_size:
            return

        minibatch = random.sample(self.memory, self.batch_size)
        states = np.array([i[0] for i in minibatch])
        actions = np.array([i[1] for i in minibatch])
        rewards = np.array([i[2] for i in minibatch])
        next_states = np.array([i[3] for i in minibatch])
        dones = np.array([i[4] for i in minibatch])

        states = np.squeeze(states)
        next_states = np.squeeze(next_states)

        targets = rewards + self.gamma*(np.amax(self.model.predict_on_batch(next_states), axis=1))*(1-dones)
        targets_full = self.model.predict_on_batch(states)

        ind = np.array([i for i in range(self.batch_size)])
        targets_full[[ind], [actions]] = targets

        self.model.fit(states, targets_full, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [4]:
def get_reward(state):

    if state[0] >= 0.5:
        print("Car has reached the goal")
        return 10
    if state[0] > -0.4:
        return (1+state[0])**2
    return 0

In [5]:
def train_dqn(episode):

    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    for e in range(episode):
        state = env.reset()[0] # added [0]
        state = np.reshape(state, (1, 2))
        score = 0
        max_steps = 1000
        for i in range(max_steps):
            action = agent.act(state)
            env.render()
            next_state, reward, done, _ = env.step(action)[0:4] # added [0:4]
            reward = get_reward(next_state)
            score += reward
            next_state = np.reshape(next_state, (1, 2))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        loss.append(score)
    return loss


In [6]:
def random_policy(episode, step):

    for i_episode in range(episode):
        env.reset()
        for t in range(step):
            env.render()
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
            print("Starting next episode")

In [7]:
print(env.observation_space)
print(env.action_space)
episodes = 40
loss = train_dqn(episodes)
plt.plot([i+1 for i in range(episodes)], loss)
plt.show()

Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
Discrete(3)














Car has reached the goal
episode: 1/60, score: 147.36790899230033


Car has reached the goal
episode: 2/60, score: 66.28053028187298


Car has reached the goal
episode: 3/60, score: 56.02660352010889


Car has reached the goal
episode: 4/60, score: 53.91987566158464
















Car has reached the goal
episode: 6/60, score: 82.74469812068943


Car has reached the goal
episode: 7/60, score: 64.5468310629623


Car has reached the goal
episode: 8/60, score: 70.38255698705629


Car has reached the goal
episode: 9/60, score: 74.48441312273191


Car has reached the goal
episode: 10/60, score: 67.58095077241713










Car has reached the goal
episode: 11/60, score: 229.17058786305913


Car has reached the goal
episode: 12/60, score: 59.59501962570046


Car has reached the goal
episode: 13/60, score: 59.52467757312455


Car has reached the goal
episode: 14/60, score: 61.260333086122635














Car has reached the goal
episode: 16/60, score: 57.191050757723204


Car has reached the goal
episode: 17/60, score: 59.427405188203956
Car has reached the goal
episode: 18/60, score: 62.13527559306954




Car has reached the goal
episode: 19/60, score: 118.51038436305527


Car has reached the goal
episode: 20/60, score: 70.57809099504114






Car has reached the goal
episode: 21/60, score: 181.83423611945778


Car has reached the goal
episode: 22/60, score: 60.736478778394634


Car has reached the goal
episode: 23/60, score: 64.71759969956997


Car has reached the goal
episode: 24/60, score: 81.76483689498122
Car has reached the goal
episode: 25/60, score: 58.54074024170097


Car has reached the goal
episode: 26/60, score: 57.68765032249081


Car has reached the goal
episode: 27/60, score: 60.437553712190834


Car has reached the goal
episode: 28/60, score: 65.36838695586513


Car has reached the goal
episode: 29/60, score: 72.93923129531153


Car has reached the goal
episode: 30/60, score: 65.86450659035938


Car has reached the goal
episode: 31/60, score: 71.13754041505801




Car has reached the goal
episode: 32/60, score: 158.79616941725945


Car has reached the goal
episode: 33/60, score: 60.6551683111016


Car has reached the goal
episode: 34/60, score: 66.22496876081556


Car has reached the goal
episode: 35/60, score: 62.93968949555764


Car has reached the goal
episode: 36/60, score: 58.44512887349363


KeyboardInterrupt: 

In [8]:
plt.plot([i+1 for i in range(episodes)], loss)
plt.show()

NameError: name 'loss' is not defined