In [64]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import math


In [95]:
class Agent:
    
    def __init__(self, no_state, no_actions, memory=2000, gamma=0.95,
                 max_eps=1, min_eps=0.01, decay = 0.995, lr=0.001):
        
        self.no_state = no_state
        self.no_actions = no_actions
        
        self.steps = 0
        self.memory = deque(maxlen=memory)
        self.gamma = gamma
        self.max_eps = max_eps
        self.epsilon = max_eps
        self.min_eps = min_eps
        self.epsilon_decay = decay
        self.learning_rate = lr
        
        self.model = Sequential()
        self.model.add(Dense(24, input_shape=(self.no_state,), 
                             activation='relu'))
        self.model.add(Dense(24, activation='relu'))
        self.model.add(Dense(self.no_actions, activation='linear'))
        
        opt = Adam(lr=lr)
        self.model.compile(optimizer=opt, loss='mse')
        
        
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.no_actions)
        
        else:
            percentages = self.model.predict(state)
            return np.argmax(percentages[0])           
                       
    def add_memory(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
        
#     def replay_from_memory(self, batch_size):
#         q = min(len(self.memory), batch_size)
#         choosen = random.sample(self.memory, q)
#         states, targets_f = [], []
        
        
#         for state, action, reward, next_action, Done in choosen:
#             target = reward
#             if not Done:
#                 target = reward + self.gamma * np.amax(
#                     self.model.predict(next_state)[0])
            
#             target_f = self.model.predict(state)
# #             print ("t", target_f)
#             target_f[0][action] = target
            
#             states.append(state[0])
#             targets_f.append(target_f[0])
            
#             self.model.fit(state, target_f, epochs=1, verbose=0)
        
#         history = self.model.fit(np.array(states), np.array(targets_f), 
#                                  epochs=1, verbose=0)
        
#         loss = history.history['loss'][0]
    
#         self.epsilon = self.min_eps + (self.max_eps - self.min_eps
#                     ) * math.exp(-self.epsilon_decay * self.steps)
#         self.steps += 1    
#         return loss

    def replay(self, batch_size):
        batch_size = min(batch_size, len(self.memory))
        minibatch = random.sample(self.memory, batch_size)
        states, targets_f = [], []
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target 
            # Filtering out states and targets for training
            states.append(state[0])
            targets_f.append(target_f[0])
        history = self.model.fit(np.array(states), np.array(targets_f), epochs=1, verbose=0)
        # Keeping track of loss
        loss = history.history['loss'][0]
        if self.epsilon > self.min_eps:
            self.epsilon *= self.epsilon_decay
        return loss

In [36]:
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(state_size, action_size)

In [37]:
done = False
batch_size = 32
EPISODES = 1000
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(1500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {}".format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > 0:
            loss = agent.replay(batch_size)

episode: 0/1000, score: 90, e: 0.9156971178387074
episode: 1/1000, score: 111, e: 0.820543445547202
episode: 2/1000, score: 110, e: 0.7361124866620463


KeyboardInterrupt: 

In [16]:
env = gym.make('LunarLander-v2')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(state_size, action_size)

In [23]:
state1 = env.reset() # returns 8 features
print (state)
state = np.reshape(state, [1, state_size])
print (state)

[[-0.00500565  1.4078237  -0.5070328  -0.13763143  0.00580709  0.11485048
   0.          0.        ]]
[[-0.00500565  1.4078237  -0.5070328  -0.13763143  0.00580709  0.11485048
   0.          0.        ]]


In [38]:
one = False
batch_size = 32
EPISODES = 1000
for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(1500):
        # env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.add_memory(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {}".format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > 0:
            loss = agent.replay(batch_size)

3

In [42]:
model = Sequential()
model.add(Dense(24, input_shape=(8,), 
                     activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(4, activation='linear'))

opt = Adam(lr=0.001)
model.compile(optimizer=opt, loss='mse')
        

In [51]:
probs = model.predict(state) # return 4 probabilities
print (probs)
print (probs[0])
action = np.argmax(probs[0])
action

[[-0.09844705  0.76626384  0.05284156 -0.95347285]]
[-0.09844705  0.76626384  0.05284156 -0.95347285]


1

In [52]:
next_state, reward, done, info = env.step(action)
# next state -> 8 features
# reward -> a scalar
# done -> T/F
# info -> empty dict


In [57]:
memory = deque(maxlen=20000)
memory

deque([])