# Final Agent - Cart Pole

In [40]:
import gym
import random
import numpy as np
from collections import deque
from keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
import tensorflow as tf
import pandas as pd

# Create the CartPole-v1 environment once at module level; it is handed to
# the DQNAgent constructor in the __main__ guard of this cell.
env = gym.make('CartPole-v1')

#Defining the neural network model
def NNModel(states, actions):
    """Build and compile the Q-network (one output value per action).

    Args:
        states: Shape of a single observation, given as a tuple such as
            ``(4,)``; forwarded unchanged as the ``input_shape`` of the
            first layer.
        actions: Number of discrete actions, i.e. the width of the output
            layer.

    Returns:
        The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
        A summary of the architecture is printed as a side effect.
    """
    model = Sequential()

    # Input layer
    model.add(Flatten(input_shape=states))

    # Hidden layer
    model.add(Dense(100, activation='relu'))

    # Output layer: linear activation because the outputs are regression
    # targets (Q-values), not class probabilities.
    model.add(Dense(actions, activation='linear'))

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases. No "accuracy" metric:
    # it is a classification metric and is meaningless for an MSE regression.
    model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    model.summary()
    return model

#Our agent
class DQNAgent:
    """Deep Q-learning agent with an experience-replay buffer for CartPole.

    The agent acts epsilon-greedily, stores every transition in a bounded
    replay buffer and, once the buffer holds ``train_start`` transitions,
    fits the Q-network on a random minibatch after every environment step.
    """

    def __init__(self, env):
        """Read the problem size from ``env`` and build the Q-network.

        Args:
            env: Gym environment exposing ``observation_space.shape`` and a
                discrete ``action_space.n``.
        """
        self.env = env
        self.states = env.observation_space.shape[0]
        self.actions = env.action_space.n

        # Parameters
        self.max_ep = 300                  # training episode budget
        self.memory = deque(maxlen=2000)   # replay buffer
        self.gamma = 0.95                  # discount factor
        self.eps = 1.0                     # probability of a random action
        self.eps_min = 0.01                # exploration floor
        self.eps_decay = 0.99              # multiplicative decay per stored step
        self.batch_size = 32
        self.train_start = 1000            # buffer size required before learning
        self.test_episodes = 20
        self.model_path = "cartpole-dqn.h5"

        self.model = NNModel(states=(self.states,), actions=self.actions)

    # Adding an entry to our buffer
    def saveToBuff(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once the warm-up is over.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward to learn from (possibly reshaped by the caller).
            next_state: Observation after the action.
            done: True when the transition is terminal, i.e. the target must
                not bootstrap from ``next_state``.
        """
        self.memory.append((state, action, reward, next_state, done))
        # Decreasing eps (the probability to take a random action)
        if len(self.memory) > self.train_start and self.eps > self.eps_min:
            self.eps *= self.eps_decay

    # Trains the model with experiences from memory
    def replay(self):
        """Fit the Q-network on one random minibatch from the replay buffer.

        Does nothing until the buffer holds ``train_start`` transitions.
        """
        if len(self.memory) < self.train_start:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        if not minibatch:
            return

        # Stored observations may be (1, states) rows; vstack yields one
        # (batch, states) matrix either way.
        states = np.vstack([sample[0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch], dtype=int)
        rewards = np.array([sample[2] for sample in minibatch], dtype=float)
        next_states = np.vstack([sample[3] for sample in minibatch])
        terminal = np.array([sample[4] for sample in minibatch], dtype=bool)

        # Batch prediction
        target = self.model.predict(states, verbose=0)
        target_next = self.model.predict(next_states, verbose=0)

        # Update only the Q value of the action that was taken: plain reward
        # for terminal transitions, otherwise reward + discounted best next Q.
        bootstrapped = rewards + self.gamma * np.amax(target_next, axis=1)
        rows = np.arange(len(minibatch))
        target[rows, actions] = np.where(terminal, rewards, bootstrapped)

        # Train the Neural Network with batches
        self.model.fit(states, target, batch_size=self.batch_size, verbose=0)

    def run(self):
        """Train the agent and write the model to ``model_path``.

        Training stops early, saving the model, as soon as an episode lasts
        the full time limit. If the episode budget runs out first, the latest
        network is saved instead so that ``test`` always finds a file.
        """
        max_steps = self.env._max_episode_steps
        for e in range(self.max_ep):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()

                # Pick a random action if random between 0 and 1 is smaller than eps
                if np.random.random() <= self.eps:
                    action = random.randrange(self.actions)
                else:
                    # Pick the best possible action
                    action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.states])

                # An episode that ends on its last allowed step hit the time
                # limit; the pole did not fall. Only a real failure is
                # penalised and stored as terminal, so the target for a
                # timed-out step still bootstraps from the next state.
                timed_out = done and i == max_steps - 1
                failed = done and not timed_out
                if failed:
                    reward = -100
                self.saveToBuff(state, action, reward, next_state, failed)
                state = next_state
                i += 1
                # If the CartPole is out or the time limit was reached
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.max_ep, i, self.eps))
                    # If the max score is reached
                    if i == max_steps:
                        # Saving the model
                        self.model.save(self.model_path)
                        return
                self.replay()
        # No perfect episode within the budget: keep the latest network so
        # test() does not fail on a missing file.
        self.model.save(self.model_path)

    def test(self):
        """Load the saved model, play ``test_episodes`` greedy episodes and
        print each score and the average; closes the environment afterwards.
        """
        # Loading the model
        self.model = load_model(self.model_path)
        total = 0
        for e in range(self.test_episodes):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()
                action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.states])
                i += 1
            print("episode: {}/{}, score: {}".format(e, self.test_episodes, i))
            total += i
        print("Average score over {} runs : ".format(self.test_episodes) + str(total / self.test_episodes))
        self.env.close()
                
if __name__ == "__main__":
    # Build the agent around the module-level CartPole environment,
    # train it, then evaluate the model that training saved to disk.
    agent = DQNAgent(env)
    agent.run()
    agent.test()

Model: "sequential_100"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_100 (Flatten)       (None, 4)                 0         
                                                                 
 dense_200 (Dense)           (None, 100)               500       
                                                                 
 dense_201 (Dense)           (None, 2)                 202       
                                                                 
Total params: 702
Trainable params: 702
Non-trainable params: 0
_________________________________________________________________
episode: 0/300, score: 39, e: 1.0
episode: 1/300, score: 13, e: 1.0
episode: 2/300, score: 17, e: 1.0
episode: 3/300, score: 14, e: 1.0
episode: 4/300, score: 31, e: 1.0
episode: 5/300, score: 15, e: 1.0
episode: 6/300, score: 26, e: 1.0
episode: 7/300, score: 11, e: 1.0
episode: 8/300, score: 15, e: 1.0
episode: 9/3

# Final Agent - Mountain Car

In [41]:
import gym
import random
import numpy as np
from collections import deque
from keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
import tensorflow as tf
import pandas as pd

# Create the MountainCar-v0 environment once at module level; it is handed
# to the DQNAgent constructor in the __main__ guard of this cell.
env = gym.make('MountainCar-v0')

#Defining the neural network model
def NNModel(states, actions):
    """Build and compile the Q-network (one output value per action).

    Args:
        states: Shape of a single observation, given as a tuple such as
            ``(2,)``; forwarded unchanged as the ``input_shape`` of the
            first layer.
        actions: Number of discrete actions, i.e. the width of the output
            layer.

    Returns:
        The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
        A summary of the architecture is printed as a side effect.
    """
    model = Sequential()
    # Input layer
    model.add(Flatten(input_shape=states))
    # Hidden layer
    model.add(Dense(100, activation='relu'))
    # Output layer: linear activation because the outputs are regression
    # targets (Q-values), not class probabilities.
    model.add(Dense(actions, activation='linear'))
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases. No "accuracy" metric:
    # it is a classification metric and is meaningless for an MSE regression.
    model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    model.summary()
    return model

#Our agent
class DQNAgent:
    """Deep Q-learning agent with an experience-replay buffer for MountainCar.

    The agent acts epsilon-greedily, stores every transition in a bounded
    replay buffer and, once the buffer holds ``train_start`` transitions,
    fits the Q-network on a random minibatch after every environment step.
    """

    def __init__(self, env):
        """Read the problem size from ``env`` and build the Q-network.

        Args:
            env: Gym environment exposing ``observation_space.shape`` and a
                discrete ``action_space.n``.
        """
        self.env = env
        self.states = env.observation_space.shape[0]
        self.actions = env.action_space.n

        # Parameters
        self.max_ep = 300                   # training episode budget
        self.memory = deque(maxlen=10000)   # replay buffer
        self.gamma = 0.99                   # discount factor
        self.eps = 1.0                      # probability of a random action
        self.eps_min = 0.01                 # exploration floor
        self.eps_decay = 0.999              # multiplicative decay per stored step
        self.batch_size = 64
        self.train_start = 1000             # buffer size required before learning
        self.test_episodes = 300
        self.model_path = "mountain-dqn.h5"

        self.model = NNModel(states=(self.states,), actions=self.actions)

    # Adding an entry to our buffer
    def saveToBuff(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once the warm-up is over.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward to learn from (possibly reshaped by the caller).
            next_state: Observation after the action.
            done: True when the transition is terminal, i.e. the target must
                not bootstrap from ``next_state``.
        """
        self.memory.append((state, action, reward, next_state, done))
        # Decreasing eps (the probability to take a random action)
        if len(self.memory) > self.train_start and self.eps > self.eps_min:
            self.eps *= self.eps_decay

    # Trains the model with experiences from memory
    def replay(self):
        """Fit the Q-network on one random minibatch from the replay buffer.

        Does nothing until the buffer holds ``train_start`` transitions.
        """
        if len(self.memory) < self.train_start:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        if not minibatch:
            return

        # Stored observations may be (1, states) rows; vstack yields one
        # (batch, states) matrix either way.
        states = np.vstack([sample[0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch], dtype=int)
        rewards = np.array([sample[2] for sample in minibatch], dtype=float)
        next_states = np.vstack([sample[3] for sample in minibatch])
        terminal = np.array([sample[4] for sample in minibatch], dtype=bool)

        target = self.model.predict(states, verbose=0)
        target_next = self.model.predict(next_states, verbose=0)

        # Update only the Q value of the action that was taken: plain reward
        # for terminal transitions, otherwise reward + discounted best next Q.
        bootstrapped = rewards + self.gamma * np.amax(target_next, axis=1)
        rows = np.arange(len(minibatch))
        target[rows, actions] = np.where(terminal, rewards, bootstrapped)

        # Train the Neural Network with batches
        self.model.fit(states, target, batch_size=self.batch_size, verbose=0)

    def load(self, name):
        """Replace the current network with the model stored in file ``name``."""
        self.model = load_model(name)

    def save(self, name):
        """Write the current network to file ``name``."""
        self.model.save(name)

    def run(self):
        """Train for ``max_ep`` episodes, saving the model to ``model_path``
        after every episode whose score beats the best one seen so far
        (the first episode is always saved).
        """
        max_steps = self.env._max_episode_steps
        best_score = -float("inf")
        for e in range(self.max_ep):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()

                # Pick a random action if random between 0 and 1 is smaller than eps
                if np.random.random() <= self.eps:
                    action = random.randrange(self.actions)
                else:
                    # Pick the best possible action
                    action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.states])

                # Reward shaping: every step that does not end the episode is
                # penalised with -200; the step that ends it keeps the
                # environment's own reward.
                if not done:
                    reward = -200
                # An episode that ends on its last allowed step ran out of
                # time. Only a genuine goal arrival is stored as terminal;
                # otherwise timing out would look exactly like succeeding.
                timed_out = done and i == max_steps - 1
                reached_goal = done and not timed_out
                self.saveToBuff(state, action, reward, next_state, reached_goal)
                state = next_state
                i += 1
                # If the car reached the objective or ran out of steps
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.max_ep, -i, self.eps))
                self.replay()
            # If the score reached is better than the previous best one
            if -i > best_score:
                best_score = -i
                # Saving the model
                self.save(self.model_path)

    def test(self):
        """Load the saved model, play ``test_episodes`` greedy episodes and
        print each episode's step count and the average; closes the
        environment afterwards.
        """
        # Loading the model
        self.load(self.model_path)
        total = 0
        for e in range(self.test_episodes):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()
                action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.states])
                i += 1
            print("episode: {}/{}, score: {}".format(e, self.test_episodes, i))
            total += i
        print("Score:", total / self.test_episodes)
        self.env.close()
                
if __name__ == "__main__":
    # Build the agent around the module-level MountainCar environment,
    # train it, then evaluate the model that training saved to disk.
    agent = DQNAgent(env)
    agent.run()
    agent.test()

Model: "sequential_101"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_101 (Flatten)       (None, 2)                 0         
                                                                 
 dense_202 (Dense)           (None, 100)               300       
                                                                 
 dense_203 (Dense)           (None, 3)                 303       
                                                                 
Total params: 603
Trainable params: 603
Non-trainable params: 0
_________________________________________________________________
episode: 0/300, score: -200, e: 1.0
episode: 1/300, score: -200, e: 1.0
episode: 2/300, score: -200, e: 1.0
episode: 3/300, score: -200, e: 1.0
episode: 4/300, score: -200, e: 1.0
episode: 5/300, score: -200, e: 0.82
episode: 6/300, score: -200, e: 0.67
episode: 7/300, score: -200, e: 0.55
episode: 8/300, score: -200

episode: 192/300, score: -200, e: 0.01
episode: 193/300, score: -200, e: 0.01
episode: 194/300, score: -187, e: 0.01
episode: 195/300, score: -200, e: 0.01
episode: 196/300, score: -200, e: 0.01
episode: 197/300, score: -200, e: 0.01
episode: 198/300, score: -200, e: 0.01
episode: 199/300, score: -200, e: 0.01
episode: 200/300, score: -200, e: 0.01
episode: 201/300, score: -200, e: 0.01
episode: 202/300, score: -200, e: 0.01
episode: 203/300, score: -200, e: 0.01
episode: 204/300, score: -200, e: 0.01
episode: 205/300, score: -200, e: 0.01


KeyboardInterrupt: 

# Tuning parameters

In [None]:
import gym
import random
import numpy as np
from collections import deque
from keras.models import Model, load_model
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
import tensorflow as tf
import pandas as pd

# Create the CartPole-v1 environment once at module level; the same instance
# is passed to every DQNAgent built in the __main__ guard of this cell.
env = gym.make('CartPole-v1')

# Hyper-parameter grid to try; every combination of the three lists is
# handed to DQNAgent(env, memory=..., gamma=..., batch=...).
memory=[2000,10000]                  # replay-buffer capacity (deque maxlen)
gamma=[0.95,0.96,0.97,0.98,0.99,1]   # discount factor used in the Q target
batch=[32,64]                        # minibatch size used by replay()

#Defining the neural network model
def NNModel(states, actions):
    """Build and compile the Q-network (one output value per action).

    Args:
        states: Shape of a single observation, given as a tuple such as
            ``(4,)``; forwarded unchanged as the ``input_shape`` of the
            first layer.
        actions: Number of discrete actions, i.e. the width of the output
            layer.

    Returns:
        The compiled Keras ``Sequential`` model (MSE loss, Adam optimizer).
        A summary of the architecture is printed as a side effect.
    """
    model = Sequential()

    # Input layer
    model.add(Flatten(input_shape=states))

    # Hidden layer
    model.add(Dense(100, activation='relu'))

    # Output layer: linear activation because the outputs are regression
    # targets (Q-values), not class probabilities.
    model.add(Dense(actions, activation='linear'))

    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases. No "accuracy" metric:
    # it is a classification metric and is meaningless for an MSE regression.
    model.compile(loss="mse", optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    model.summary()
    return model

#Our agent
class DQNAgent:
    """Deep Q-learning CartPole agent with tunable buffer size, discount
    factor and minibatch size.

    The agent acts epsilon-greedily, stores every transition in a bounded
    replay buffer and, once the buffer holds ``train_start`` transitions,
    fits the Q-network on a random minibatch after every environment step.
    The trained model is written to a file whose name encodes the three
    tuned parameters.
    """

    def __init__(self, env, memory, gamma, batch):
        """Read the problem size from ``env`` and build the Q-network.

        Args:
            env: Gym environment exposing ``observation_space.shape`` and a
                discrete ``action_space.n``.
            memory: Capacity of the replay buffer.
            gamma: Discount factor used in the Q-learning target.
            batch: Minibatch size used for each training step.
        """
        self.env = env
        self.states = env.observation_space.shape[0]
        self.actions = env.action_space.n

        # Parameters
        self.max_ep = 300                    # training episode budget
        self.memory = deque(maxlen=memory)   # replay buffer
        self.gamma = gamma
        self.eps = 1.0                       # probability of a random action
        self.eps_min = 0.01                  # exploration floor
        self.eps_decay = 0.99                # multiplicative decay per stored step
        self.batch_size = batch
        self.train_start = 1000              # buffer size required before learning
        self.test_episodes = 20

        # Raw parameter values, kept under their original short names.
        self.m = memory
        self.g = gamma
        self.b = batch
        # One checkpoint file per configuration, shared by run() and test().
        self.model_path = "cartpole-dqn {} {} {}.h5".format(self.m, self.g, self.b)

        self.model = NNModel(states=(self.states,), actions=self.actions)

    # Adding an entry to our buffer
    def saveToBuff(self, state, action, reward, next_state, done):
        """Store one transition and decay epsilon once the warm-up is over.

        Args:
            state: Observation before the action.
            action: Index of the action taken.
            reward: Reward to learn from (possibly reshaped by the caller).
            next_state: Observation after the action.
            done: True when the transition is terminal, i.e. the target must
                not bootstrap from ``next_state``.
        """
        self.memory.append((state, action, reward, next_state, done))
        # Decreasing eps (the probability to take a random action)
        if len(self.memory) > self.train_start and self.eps > self.eps_min:
            self.eps *= self.eps_decay

    # Trains the model with experiences from memory
    def replay(self):
        """Fit the Q-network on one random minibatch from the replay buffer.

        Does nothing until the buffer holds ``train_start`` transitions.
        """
        if len(self.memory) < self.train_start:
            return
        # Randomly sample minibatch from the memory
        minibatch = random.sample(self.memory, min(len(self.memory), self.batch_size))
        if not minibatch:
            return

        # Stored observations may be (1, states) rows; vstack yields one
        # (batch, states) matrix either way.
        states = np.vstack([sample[0] for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch], dtype=int)
        rewards = np.array([sample[2] for sample in minibatch], dtype=float)
        next_states = np.vstack([sample[3] for sample in minibatch])
        terminal = np.array([sample[4] for sample in minibatch], dtype=bool)

        # Batch prediction
        target = self.model.predict(states, verbose=0)
        target_next = self.model.predict(next_states, verbose=0)

        # Update only the Q value of the action that was taken: plain reward
        # for terminal transitions, otherwise reward + discounted best next Q.
        bootstrapped = rewards + self.gamma * np.amax(target_next, axis=1)
        rows = np.arange(len(minibatch))
        target[rows, actions] = np.where(terminal, rewards, bootstrapped)

        # Train the Neural Network with batches
        self.model.fit(states, target, batch_size=self.batch_size, verbose=0)

    def run(self):
        """Train the agent and write the model to ``model_path``.

        Training stops early, saving the model, as soon as an episode lasts
        the full time limit. If the episode budget runs out first, the latest
        network is saved instead so that ``test`` always finds a file.
        """
        max_steps = self.env._max_episode_steps
        for e in range(self.max_ep):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()

                # Pick a random action if random between 0 and 1 is smaller than eps
                if np.random.random() <= self.eps:
                    action = random.randrange(self.actions)
                else:
                    # Pick the best possible action
                    action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                next_state = np.reshape(next_state, [1, self.states])

                # An episode that ends on its last allowed step hit the time
                # limit; the pole did not fall. Only a real failure is
                # penalised and stored as terminal, so the target for a
                # timed-out step still bootstraps from the next state.
                timed_out = done and i == max_steps - 1
                failed = done and not timed_out
                if failed:
                    reward = -100
                self.saveToBuff(state, action, reward, next_state, failed)
                state = next_state
                i += 1
                # If the CartPole is out or the time limit was reached
                if done:
                    print("episode: {}/{}, score: {}, e: {:.2}".format(e, self.max_ep, i, self.eps))
                    # If the max score is reached
                    if i == max_steps:
                        # Saving the model
                        self.model.save(self.model_path)
                        return
                self.replay()
        # No perfect episode within the budget: keep the latest network so
        # test() does not fail on a missing file.
        self.model.save(self.model_path)

    def test(self):
        """Load the saved model and play ``test_episodes`` greedy episodes.

        Returns:
            The average number of steps survived per episode.
        """
        # Loading the model
        self.model = load_model(self.model_path)
        # Running sum of scores, averaged on return
        total = 0
        for e in range(self.test_episodes):
            state = np.reshape(self.env.reset(), [1, self.states])
            done = False
            i = 0
            while not done:
                # Uncomment to display the task
                # self.env.render()
                action = np.argmax(self.model.predict(state, verbose=0))
                next_state, reward, done, _ = self.env.step(action)
                state = np.reshape(next_state, [1, self.states])
                i += 1
            print("episode: {}/{}, score: {}".format(e, self.test_episodes, i))
            total += i
        return total / self.test_episodes
                
if __name__ == "__main__":
    # Grid search: train and evaluate every (memory, gamma, batch)
    # combination exactly once, testing the agent that was just trained
    # instead of retraining and building a throw-away agent.
    rows = []
    for mem_size in memory:
        for discount in gamma:
            for batch_size in batch:
                agent = DQNAgent(env, memory=mem_size, gamma=discount, batch=batch_size)
                agent.run()
                try:
                    average = agent.test()
                except (OSError, ValueError) as err:
                    # test() loads the checkpoint from disk; a configuration
                    # that left no loadable file must not abort the whole
                    # sweep, so it is reported and recorded as NaN.
                    print("No model to evaluate for memory={}, gamma={}, batch={}: {}".format(
                        mem_size, discount, batch_size, err))
                    average = float("nan")
                rows.append({"memory": mem_size, "gamma": discount, "batch": batch_size, "score avg.": average})
    # DataFrame.append was removed in pandas 2.0; build the table in one go
    # from the collected rows instead.
    scores_table = pd.DataFrame(rows, columns=["memory", "gamma", "batch", "score avg."])
    display(scores_table)