In [1]:
import math

import numpy as np

import gym
from gym import spaces
from gym.utils import seeding

In [2]:
env = gym.make('MountainCar-v0')#to make new gym environment

In [3]:
env.reset()#parameters of given environment

array([-0.40670149,  0.        ])

In [4]:
env.action_space #params of o/p or things we can do with environment

Discrete(3)

In [5]:
env.observation_space

Box(-1.2000000476837158, 0.6000000238418579, (2,), float32)

In [6]:
env.reset()
for t in range(1000):
    random_action = env.action_space.sample()
    env.step(random_action)
    env.render()
env.close()

In [12]:
for e in range(20):#episode
    observation = env.reset()
    for t in range(50):
        env.render()
        action = env.action_space.sample()
        observation,reward,done,other_info = env.step(action)
        if done:
            #game episode over 
            print("Game episode {}/{} highscore :{}".format(e,20,t))
            break
env.close()
print("All 20 episodes are over")

All 20 episodes are over


In [13]:
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

In [14]:
class Agent:
    def __init__(self,state_size,action_size): #state_size ->i/p , action_size-> o/p
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95 #0.95 is the discount factor
        self.epsilon = 1.0#explosion rate : how much to act randomly
        self.epsilon_decay = 0.995  #decaying the epsilon
        self.epsilon_min = 0.01
        self.learning_rate = 0.001
        self.model = self._create_model()
    def _create_model(self):
        model = Sequential()
        model.add(Dense(24,input_dim=self.state_size,activation='relu')) #1st Hidden Layer
        model.add(Dense(24,activation='relu')) #2nd Hidden Layer
        model.add(Dense(self.action_size,activation='linear'))
        model.compile(loss='mse',optimizer=Adam(lr=self.learning_rate))
        return model
    def remember(self,state,action,reward,next_state,done):
        self.memory.append((state,action,reward,next_state,done))
    def act(self,state):
        # Exploration vs Exploitation
        if np.random.rand()<=self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state) # predict reward value based upon current state
        return np.argmax(act_values[0]) #predict reward value based upon current
    def train(self,batch_size=32): #method that trains NN with experiences sampled from memory
        minibatch = random.sample(self.memory,batch_size)
        for state,action,reward,next_state,done in minibatch:
            
            if not done: #boolean 
                target = reward + self.gamma*np.amax(self.model.predict(next_state)[0])
            else:
                target = reward
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state,target_f,epochs=1,verbose=0) #single epoch, x =state, y = target_f, loss--> target_f - 
            
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
    
    def load(self,name):
        self.model.load_weights(name)
    def save(self,name):
        self.model.save_weights(name)
        

In [15]:
n_episodes = 50
output_dir = "car_model/"

In [16]:
agent = Agent(state_size=4,action_size=2)
done = False
state_size = 2
action_size =3
batch_size = 32

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [17]:
agent = Agent(state_size, action_size) # initialise agent
done = False
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state,[1,state_size])
    
    for time in range(5000):
        env.render()
        action = agent.act(state) #action is 0 or 1
        next_state,reward,done,other_info = env.step(action) 
        reward = reward if not done else -10
        next_state = np.reshape(next_state,[1,state_size])
        agent.remember(state,action,reward,next_state,done)
        state = next_state
        
        if done:
            print("Game Episode :{}/{}, High Score:{},Exploration Rate:{:.2}".format(e,n_episodes,time,agent.epsilon))
            break
            
    if len(agent.memory)>batch_size:
        agent.train(batch_size)
    
    if e%50==0:
        agent.save(output_dir+"weights_"+'{:04d}'.format(e)+".hdf5")
        
env.close()

Game Episode :0/50, High Score:199,Exploration Rate:1.0
Game Episode :1/50, High Score:199,Exploration Rate:0.99
Game Episode :2/50, High Score:199,Exploration Rate:0.99
Game Episode :3/50, High Score:199,Exploration Rate:0.99
Game Episode :4/50, High Score:199,Exploration Rate:0.98
Game Episode :5/50, High Score:199,Exploration Rate:0.98
Game Episode :6/50, High Score:199,Exploration Rate:0.97
Game Episode :7/50, High Score:199,Exploration Rate:0.97
Game Episode :8/50, High Score:199,Exploration Rate:0.96
Game Episode :9/50, High Score:199,Exploration Rate:0.96
Game Episode :10/50, High Score:199,Exploration Rate:0.95
Game Episode :11/50, High Score:199,Exploration Rate:0.95
Game Episode :12/50, High Score:199,Exploration Rate:0.94
Game Episode :13/50, High Score:199,Exploration Rate:0.94
Game Episode :14/50, High Score:199,Exploration Rate:0.93
Game Episode :15/50, High Score:199,Exploration Rate:0.93
Game Episode :16/50, High Score:199,Exploration Rate:0.92
Game Episode :17/50, High