In [1]:
from sklearn.datasets import load_digits
import random
import numpy as np
import pandas as pd
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import LearningRateScheduler, Callback

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load MNIST

In [2]:
digits = load_digits()

X, y = digits.data[:-1], digits.target[:-1]

In [3]:
input_dim = X.shape[1]
num_classes = np.unique(y).shape[0]
lr = 0.1

y = pd.get_dummies(y).values

# Define simple network and train it

In [4]:
def get_model(input_dim, num_classes, lr):
    
    model = Sequential()
    model.add(Dense(256, input_dim=input_dim, activation='relu'))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=SGD(lr=lr),
                  metrics=['accuracy'])
    
    return model

In [5]:
def scheduler(epoch):
    return loss_history.lr[-1]

In [67]:
class LossHistory(Callback):
    
    def __init__(self, verbose, lr=0.01):
        self.verbose = verbose
        self.losses = [np.inf]
        self.acc = [0.0]
        self.lr = [lr]
 
    def on_epoch_end(self, epoch, logs={}):
        
        self.losses.append(logs.get('loss'))
        self.acc.append(logs.get('acc'))
#        self.lr.append(scheduler(len(self.losses)))
        if epoch % 2 == 0:
            print('learning rate: {}\n'.format(np.round(self.lr[-1], 4)))
        
#        if epoch % self.verbose == 0:
#            # you can access loss, accuracy in self.params['metrics']
#            print('{} - loss: {} - acc: {}\n'.format(epoch, self.losses[-1], self.acc[-1]))

In [97]:
model = get_model(input_dim, num_classes, lr)

loss_history = LossHistory(verbose=10)
lrate = LearningRateScheduler(scheduler)
callbacks_list = [loss_history, lrate]
history = model.fit(X, y, 
                   epochs=10000, 
                   batch_size=64, 
                   callbacks=callbacks_list, 
                   verbose=0)


learning rate: 0.1
learning rate: 0.1
learning rate: 0.1


KeyboardInterrupt: 

# Define DQN

In [11]:
class DQNAgent():
    
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [84]:
def reset_env(lr):
    """
    """
    model = get_model(input_dim, num_classes, lr)
    loss_history = LossHistory(verbose=1, lr=lr)
    lrate = LearningRateScheduler(scheduler)
    return model, loss_history, lrate


def step_env(action, model, loss_history, lrate, improve_coeff=0.0, num_epochs=1):
    """
    """
    # Next state
    if action == 0:
        loss_history.lr.append(loss_history.lr[-1])    
    elif action == 1:
        loss_history.lr.append(loss_history.lr[-1] / 2.0)
    elif action == 2:
        loss_history.lr.append(loss_history.lr[-1] * 2.0)

    callbacks_list = [loss_history, lrate]
    model.fit(X, y, 
               epochs=num_epochs, 
               batch_size=64, 
               callbacks=callbacks_list, 
               verbose=2)
    next_state = loss_history.losses[-1], loss_history.acc[-1], loss_history.lr[-1], \
    loss_history.losses[-2], loss_history.acc[-2]
    # Reward
    improvement = loss_history.acc[-1] - loss_history.acc[-2]
    reward = loss_history.acc[-1] + improve_coeff * improvement
    reward = -0.1 if improvement < 0.0 else reward
    # Done
    done = loss_history.acc[-1] == 1.0 or np.isnan(loss_history.losses[-1])
    return next_state, reward, done

In [86]:
EPISODES = 100
IMPROVE_COEFF = 0.0
MAX_EPOCHS = 30
NUM_EPOCHS = 1
LR = 0.01

state_size = 5
action_size = 3
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 32

for e in range(EPISODES):
    
    state = [np.inf, 0.0, lr, np.inf, 0.0]
    state = np.reshape(state, [1, state_size])
    model, loss_history, lrate = reset_env(LR)
    
    for time in range(MAX_EPOCHS):
        
        # Agent acts, env updates
        action = agent.act(state)
        next_state, reward, done = step_env(action, model, loss_history, lrate, IMPROVE_COEFF, NUM_EPOCHS)
        next_state = np.reshape(next_state, [1, state_size])
        
        # Agent remembers
        agent.remember(state, action, reward, next_state, done)
        state = next_state

        if done:
            print("\nepisode: {}/{}, score: {}, e: {:.2}\n"
                  .format(e, EPISODES, loss_history.acc[-1], agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

learning rate: 0.01

1796/1796 - 1s - loss: 1.3418 - acc: 0.6464
learning rate: 0.005

1796/1796 - 0s - loss: 0.3712 - acc: 0.9092
learning rate: 0.005

1796/1796 - 0s - loss: 0.2648 - acc: 0.9365
learning rate: 0.0025

1796/1796 - 0s - loss: 0.2111 - acc: 0.9493
learning rate: 0.005

1796/1796 - 0s - loss: 0.1933 - acc: 0.9532
learning rate: 0.01

1796/1796 - 0s - loss: 0.2071 - acc: 0.9471
learning rate: 0.01

1796/1796 - 0s - loss: 0.1358 - acc: 0.9666
learning rate: 0.005

1796/1796 - 0s - loss: 0.1063 - acc: 0.9738
learning rate: 0.0025

1796/1796 - 0s - loss: 0.0934 - acc: 0.9827
learning rate: 0.005

1796/1796 - 0s - loss: 0.0911 - acc: 0.9816
learning rate: 0.005

1796/1796 - 0s - loss: 0.0841 - acc: 0.9850
learning rate: 0.01

1796/1796 - 0s - loss: 0.0891 - acc: 0.9794
learning rate: 0.02

1796/1796 - 0s - loss: 0.1681 - acc: 0.9532
learning rate: 0.04

1796/1796 - 0s - loss: 0.8929 - acc: 0.8207
learning rate: 0.08

1796/1796 - 0s - loss: 0.6460 - acc: 0.8435
learning rate: 

learning rate: 0.01

1796/1796 - 0s - loss: 0.1566 - acc: 0.9671
learning rate: 0.005

1796/1796 - 0s - loss: 0.1315 - acc: 0.9761
learning rate: 0.005

1796/1796 - 0s - loss: 0.1120 - acc: 0.9811
learning rate: 0.005

1796/1796 - 0s - loss: 0.1043 - acc: 0.9805
learning rate: 0.0025

1796/1796 - 0s - loss: 0.0953 - acc: 0.9839
learning rate: 0.005

1796/1796 - 0s - loss: 0.0917 - acc: 0.9827
learning rate: 0.005

1796/1796 - 0s - loss: 0.0856 - acc: 0.9839
learning rate: 0.005

1796/1796 - 0s - loss: 0.0804 - acc: 0.9855
learning rate: 0.005

1796/1796 - 0s - loss: 0.0772 - acc: 0.9844
learning rate: 0.005

1796/1796 - 0s - loss: 0.0696 - acc: 0.9916
learning rate: 0.005

1796/1796 - 0s - loss: 0.0669 - acc: 0.9905
learning rate: 0.005

1796/1796 - 0s - loss: 0.0631 - acc: 0.9911
learning rate: 0.01

1796/1796 - 0s - loss: 0.0628 - acc: 0.9894
learning rate: 0.01

1796/1796 - 0s - loss: 0.0572 - acc: 0.9905
learning rate: 0.01

1796/1796 - 0s - loss: 0.0519 - acc: 0.9911
learning rate

learning rate: 0.01

1796/1796 - 0s - loss: 0.0417 - acc: 0.9950
learning rate: 0.01

1796/1796 - 0s - loss: 0.0388 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0366 - acc: 0.9950
learning rate: 0.01

1796/1796 - 0s - loss: 0.0322 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0305 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0284 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0621 - acc: 0.9883
learning rate: 0.01

1796/1796 - 0s - loss: 0.0265 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0246 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0226 - acc: 0.9983
learning rate: 0.005

1796/1796 - 0s - loss: 0.0210 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0200 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0194 - acc: 0.9983
learning rate: 0.005

1796/1796 - 0s - loss: 0.0190 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0190 - acc: 0.9989
learning rate: 0.01



learning rate: 0.02

1796/1796 - 0s - loss: 0.0115 - acc: 0.9994
learning rate: 0.02

1796/1796 - 0s - loss: 0.0194 - acc: 0.9978
learning rate: 0.01

1796/1796 - 1s - loss: 1.1264 - acc: 0.7116
learning rate: 0.01

1796/1796 - 0s - loss: 0.3209 - acc: 0.9204
learning rate: 0.01

1796/1796 - 0s - loss: 0.2088 - acc: 0.9454
learning rate: 0.01

1796/1796 - 0s - loss: 0.1592 - acc: 0.9560
learning rate: 0.01

1796/1796 - 0s - loss: 0.1384 - acc: 0.9671
learning rate: 0.01

1796/1796 - 0s - loss: 0.1113 - acc: 0.9733
learning rate: 0.01

1796/1796 - 0s - loss: 0.0929 - acc: 0.9783
learning rate: 0.01

1796/1796 - 0s - loss: 0.0833 - acc: 0.9811
learning rate: 0.01

1796/1796 - 0s - loss: 0.0731 - acc: 0.9839
learning rate: 0.01

1796/1796 - 0s - loss: 0.0822 - acc: 0.9822
learning rate: 0.01

1796/1796 - 0s - loss: 0.0596 - acc: 0.9900
learning rate: 0.02

1796/1796 - 0s - loss: 0.0752 - acc: 0.9805
learning rate: 0.02

1796/1796 - 0s - loss: 0.0528 - acc: 0.9894
learning rate: 0.02

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.1582 - acc: 0.9621
learning rate: 0.01

1796/1796 - 0s - loss: 0.1241 - acc: 0.9722
learning rate: 0.01

1796/1796 - 0s - loss: 0.1242 - acc: 0.9666
learning rate: 0.005

1796/1796 - 0s - loss: 0.0918 - acc: 0.9794
learning rate: 0.005

1796/1796 - 0s - loss: 0.0762 - acc: 0.9866
learning rate: 0.005

1796/1796 - 0s - loss: 0.0695 - acc: 0.9878
learning rate: 0.005

1796/1796 - 0s - loss: 0.0664 - acc: 0.9878
learning rate: 0.005

1796/1796 - 0s - loss: 0.0649 - acc: 0.9866
learning rate: 0.005

1796/1796 - 0s - loss: 0.0579 - acc: 0.9905
learning rate: 0.005

1796/1796 - 0s - loss: 0.0544 - acc: 0.9933
learning rate: 0.005

1796/1796 - 0s - loss: 0.0514 - acc: 0.9939
learning rate: 0.005

1796/1796 - 0s - loss: 0.0486 - acc: 0.9955
learning rate: 0.005

1796/1796 - 0s - loss: 0.0473 - acc: 0.9950
learning rate: 0.0025

1796/1796 - 0s - loss: 0.0515 - acc: 0.9911
learning rate: 0.0012

1796/1796 - 0s - loss: 0.0422 - acc: 0.9961
learning ra

learning rate: 0.01

1796/1796 - 0s - loss: 0.0244 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0251 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0220 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0206 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0191 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.1954 - acc: 0.9699
learning rate: 0.01

1796/1796 - 0s - loss: 0.0236 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0199 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0179 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0160 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0162 - acc: 0.9989
learning rate: 0.01

1796/1796 - 1s - loss: 1.1891 - acc: 0.6927
learning rate: 0.01

1796/1796 - 0s - loss: 0.3323 - acc: 0.9115
learning rate: 0.01

1796/1796 - 0s - loss: 0.2003 - acc: 0.9527
learning rate: 0.01

1796/1796 - 0s - loss: 0.1628 - acc: 0.9577
learning rate: 0.01

1796

learning rate: 0.02

1796/1796 - 0s - loss: 0.0173 - acc: 0.9994
learning rate: 0.02

1796/1796 - 0s - loss: 0.0174 - acc: 0.9983
learning rate: 0.02

1796/1796 - 0s - loss: 0.0146 - acc: 0.9989
learning rate: 0.02

1796/1796 - 0s - loss: 0.0197 - acc: 0.9967
learning rate: 0.01

1796/1796 - 1s - loss: 1.2013 - acc: 0.6776
learning rate: 0.01

1796/1796 - 0s - loss: 0.2606 - acc: 0.9365
learning rate: 0.01

1796/1796 - 0s - loss: 0.1624 - acc: 0.9616
learning rate: 0.01

1796/1796 - 0s - loss: 0.1330 - acc: 0.9683
learning rate: 0.01

1796/1796 - 0s - loss: 0.2093 - acc: 0.9521
learning rate: 0.01

1796/1796 - 0s - loss: 0.0866 - acc: 0.9800
learning rate: 0.01

1796/1796 - 0s - loss: 0.0719 - acc: 0.9889
learning rate: 0.005

1796/1796 - 0s - loss: 0.0580 - acc: 0.9894
learning rate: 0.005

1796/1796 - 0s - loss: 0.0538 - acc: 0.9939
learning rate: 0.005

1796/1796 - 0s - loss: 0.0499 - acc: 0.9939
learning rate: 0.005

1796/1796 - 0s - loss: 0.0478 - acc: 0.9950
learning rate: 0.005


learning rate: 0.01

1796/1796 - 0s - loss: 0.1307 - acc: 0.9627
learning rate: 0.01

1796/1796 - 0s - loss: 0.1040 - acc: 0.9744
learning rate: 0.01

1796/1796 - 0s - loss: 0.0797 - acc: 0.9827
learning rate: 0.01

1796/1796 - 0s - loss: 0.0714 - acc: 0.9839
learning rate: 0.01

1796/1796 - 0s - loss: 0.0572 - acc: 0.9916
learning rate: 0.01

1796/1796 - 0s - loss: 0.0596 - acc: 0.9889
learning rate: 0.01

1796/1796 - 0s - loss: 0.0495 - acc: 0.9905
learning rate: 0.01

1796/1796 - 0s - loss: 0.0411 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0363 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0335 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0333 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0282 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0263 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0228 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0224 - acc: 0.9983
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0314 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0295 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0267 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0249 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0247 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0392 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0230 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0216 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0196 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0262 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0171 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0159 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0157 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0162 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0140 - acc: 0.9989
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0353 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0271 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0242 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0229 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0200 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0186 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0180 - acc: 0.9989
learning rate: 0.01

1796/1796 - 1s - loss: 1.0443 - acc: 0.6938
learning rate: 0.01

1796/1796 - 0s - loss: 0.2662 - acc: 0.9393
learning rate: 0.01

1796/1796 - 0s - loss: 0.1613 - acc: 0.9610
learning rate: 0.01

1796/1796 - 0s - loss: 0.1308 - acc: 0.9671
learning rate: 0.01

1796/1796 - 0s - loss: 0.2846 - acc: 0.9410
learning rate: 0.01

1796/1796 - 0s - loss: 0.0936 - acc: 0.9783
learning rate: 0.01

1796/1796 - 0s - loss: 0.1060 - acc: 0.9744
learning rate: 0.01

1796/1796 - 0s - loss: 0.0630 - acc: 0.9889
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.1297 - acc: 0.9733
learning rate: 0.01

1796/1796 - 0s - loss: 0.0984 - acc: 0.9800
learning rate: 0.01

1796/1796 - 0s - loss: 0.1406 - acc: 0.9649
learning rate: 0.01

1796/1796 - 0s - loss: 0.0995 - acc: 0.9727
learning rate: 0.01

1796/1796 - 0s - loss: 0.0663 - acc: 0.9861
learning rate: 0.01

1796/1796 - 0s - loss: 0.0573 - acc: 0.9905
learning rate: 0.01

1796/1796 - 0s - loss: 0.0531 - acc: 0.9894
learning rate: 0.01

1796/1796 - 0s - loss: 0.0439 - acc: 0.9928
learning rate: 0.01

1796/1796 - 0s - loss: 0.0394 - acc: 0.9950
learning rate: 0.01

1796/1796 - 0s - loss: 0.0368 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0344 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0318 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0285 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0264 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0277 - acc: 0.9972
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0483 - acc: 0.9933
learning rate: 0.01

1796/1796 - 0s - loss: 0.0458 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0418 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0385 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0370 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0339 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0311 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0302 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0284 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0253 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0236 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0229 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0247 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0210 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0198 - acc: 0.9978
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0277 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0252 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0233 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0222 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0224 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0195 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0197 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0167 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0169 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0154 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0188 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0142 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0132 - acc: 0.9989
learning rate: 0.01

1796/1796 - 1s - loss: 0.9773 - acc: 0.7116
learning rate: 0.01

1796/1796 - 0s - loss: 0.2974 - acc: 0.9293
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0199 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0190 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0173 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0173 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0163 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0162 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0142 - acc: 0.9994
learning rate: 0.01

1796/1796 - 1s - loss: 1.2544 - acc: 0.7055
learning rate: 0.01

1796/1796 - 0s - loss: 0.2983 - acc: 0.9243
learning rate: 0.01

1796/1796 - 0s - loss: 0.1970 - acc: 0.9516
learning rate: 0.01

1796/1796 - 0s - loss: 0.1517 - acc: 0.9638
learning rate: 0.01

1796/1796 - 0s - loss: 0.1483 - acc: 0.9616
learning rate: 0.01

1796/1796 - 0s - loss: 0.1029 - acc: 0.9766
learning rate: 0.01

1796/1796 - 0s - loss: 0.0931 - acc: 0.9788
learning rate: 0.01

1796/1796 - 0s - loss: 0.0724 - acc: 0.9844
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.1291 - acc: 0.9694
learning rate: 0.01

1796/1796 - 0s - loss: 0.1018 - acc: 0.9783
learning rate: 0.01

1796/1796 - 0s - loss: 0.0901 - acc: 0.9811
learning rate: 0.01

1796/1796 - 0s - loss: 0.0697 - acc: 0.9889
learning rate: 0.01

1796/1796 - 0s - loss: 0.0670 - acc: 0.9883
learning rate: 0.01

1796/1796 - 0s - loss: 0.0542 - acc: 0.9928
learning rate: 0.01

1796/1796 - 0s - loss: 0.0480 - acc: 0.9922
learning rate: 0.01

1796/1796 - 0s - loss: 0.0425 - acc: 0.9950
learning rate: 0.01

1796/1796 - 0s - loss: 0.0402 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0376 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0324 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0304 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0328 - acc: 0.9933
learning rate: 0.01

1796/1796 - 0s - loss: 0.0266 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0254 - acc: 0.9972
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0437 - acc: 0.9916
learning rate: 0.01

1796/1796 - 0s - loss: 0.0407 - acc: 0.9933
learning rate: 0.01

1796/1796 - 0s - loss: 0.0370 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0322 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0307 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0269 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0255 - acc: 0.9967
learning rate: 0.005

1796/1796 - 0s - loss: 0.0224 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0217 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0212 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0208 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0194 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0189 - acc: 0.9983
learning rate: 0.005

1796/1796 - 0s - loss: 0.0186 - acc: 0.9989
learning rate: 0.005

1796/1796 - 0s - loss: 0.0180 - acc: 0.9989
learning rate: 0.

learning rate: 0.01

1796/1796 - 0s - loss: 0.0236 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0211 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0194 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0196 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0179 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0163 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0153 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0147 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0140 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0131 - acc: 1.0000

episode: 69/100, score: 1.0, e: 0.01

learning rate: 0.01

1796/1796 - 1s - loss: 1.3230 - acc: 0.6676
learning rate: 0.01

1796/1796 - 0s - loss: 0.3175 - acc: 0.9226
learning rate: 0.01

1796/1796 - 0s - loss: 0.1941 - acc: 0.9605
learning rate: 0.01

1796/1796 - 0s - loss: 0.1434 - acc: 0.9694
learning rate: 0.01

1796/1796 - 0s - loss: 0.1143 

learning rate: 0.01

1796/1796 - 0s - loss: 0.3633 - acc: 0.8998
learning rate: 0.01

1796/1796 - 0s - loss: 0.2362 - acc: 0.9415
learning rate: 0.01

1796/1796 - 0s - loss: 0.1452 - acc: 0.9710
learning rate: 0.01

1796/1796 - 0s - loss: 0.1156 - acc: 0.9761
learning rate: 0.01

1796/1796 - 0s - loss: 0.0931 - acc: 0.9794
learning rate: 0.01

1796/1796 - 0s - loss: 0.0813 - acc: 0.9855
learning rate: 0.01

1796/1796 - 0s - loss: 0.0727 - acc: 0.9866
learning rate: 0.01

1796/1796 - 0s - loss: 0.1194 - acc: 0.9644
learning rate: 0.01

1796/1796 - 0s - loss: 0.0555 - acc: 0.9900
learning rate: 0.01

1796/1796 - 0s - loss: 0.0554 - acc: 0.9905
learning rate: 0.01

1796/1796 - 0s - loss: 0.0432 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0408 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0364 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0367 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0301 - acc: 0.9967
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0687 - acc: 0.9861
learning rate: 0.01

1796/1796 - 0s - loss: 0.0603 - acc: 0.9911
learning rate: 0.01

1796/1796 - 0s - loss: 0.0583 - acc: 0.9883
learning rate: 0.01

1796/1796 - 0s - loss: 0.0484 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0448 - acc: 0.9928
learning rate: 0.01

1796/1796 - 0s - loss: 0.0409 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0380 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0339 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0318 - acc: 0.9955
learning rate: 0.01

1796/1796 - 0s - loss: 0.0298 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0274 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0257 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0264 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0227 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0216 - acc: 0.9983
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0179 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0169 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0159 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0149 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0140 - acc: 0.9994
learning rate: 0.01

1796/1796 - 1s - loss: 1.0240 - acc: 0.7210
learning rate: 0.01

1796/1796 - 0s - loss: 0.2575 - acc: 0.9388
learning rate: 0.01

1796/1796 - 0s - loss: 0.1751 - acc: 0.9571
learning rate: 0.01

1796/1796 - 0s - loss: 0.2417 - acc: 0.9465
learning rate: 0.01

1796/1796 - 0s - loss: 0.1078 - acc: 0.9805
learning rate: 0.01

1796/1796 - 0s - loss: 0.1065 - acc: 0.9716
learning rate: 0.01

1796/1796 - 0s - loss: 0.0735 - acc: 0.9883
learning rate: 0.01

1796/1796 - 0s - loss: 0.0625 - acc: 0.9889
learning rate: 0.01

1796/1796 - 0s - loss: 0.0664 - acc: 0.9855
learning rate: 0.01

1796/1796 - 0s - loss: 0.1025 - acc: 0.9755
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0591 - acc: 0.9928
learning rate: 0.01

1796/1796 - 0s - loss: 0.0531 - acc: 0.9922
learning rate: 0.01

1796/1796 - 0s - loss: 0.0477 - acc: 0.9933
learning rate: 0.01

1796/1796 - 0s - loss: 0.0469 - acc: 0.9933
learning rate: 0.01

1796/1796 - 0s - loss: 0.0397 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0369 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0422 - acc: 0.9922
learning rate: 0.01

1796/1796 - 0s - loss: 0.0339 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0280 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0292 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0269 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0247 - acc: 0.9972
learning rate: 0.01

1796/1796 - 0s - loss: 0.0232 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0216 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0207 - acc: 0.9978
learning rate: 0.01

1796

learning rate: 0.01

1796/1796 - 0s - loss: 0.0342 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0325 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0302 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0287 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0260 - acc: 0.9978
learning rate: 0.01

1796/1796 - 0s - loss: 0.0263 - acc: 0.9967
learning rate: 0.01

1796/1796 - 0s - loss: 0.0230 - acc: 0.9983
learning rate: 0.01

1796/1796 - 0s - loss: 0.0217 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0206 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0200 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0183 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0175 - acc: 0.9989
learning rate: 0.01

1796/1796 - 0s - loss: 0.0167 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0162 - acc: 0.9994
learning rate: 0.01

1796/1796 - 0s - loss: 0.0146 - acc: 1.0000

episode: 91/100, score: 

learning rate: 0.01

1796/1796 - 0s - loss: 0.0132 - acc: 0.9994
learning rate: 0.01

1796/1796 - 1s - loss: 1.1596 - acc: 0.6698
learning rate: 0.01

1796/1796 - 0s - loss: 0.3187 - acc: 0.9131
learning rate: 0.01

1796/1796 - 0s - loss: 0.2108 - acc: 0.9516
learning rate: 0.01

1796/1796 - 0s - loss: 0.1527 - acc: 0.9605
learning rate: 0.01

1796/1796 - 0s - loss: 0.1159 - acc: 0.9744
learning rate: 0.01

1796/1796 - 0s - loss: 0.1025 - acc: 0.9755
learning rate: 0.01

1796/1796 - 0s - loss: 0.0821 - acc: 0.9833
learning rate: 0.01

1796/1796 - 0s - loss: 0.0716 - acc: 0.9855
learning rate: 0.01

1796/1796 - 0s - loss: 0.0648 - acc: 0.9889
learning rate: 0.01

1796/1796 - 0s - loss: 0.0626 - acc: 0.9889
learning rate: 0.01

1796/1796 - 0s - loss: 0.0501 - acc: 0.9928
learning rate: 0.01

1796/1796 - 0s - loss: 0.2104 - acc: 0.9549
learning rate: 0.01

1796/1796 - 0s - loss: 0.0450 - acc: 0.9922
learning rate: 0.01

1796/1796 - 0s - loss: 0.0401 - acc: 0.9928
learning rate: 0.01

1796

In [88]:
state = [np.inf, 0.0, lr, np.inf, 0.0]
state = np.reshape(state, [1, state_size])
model, loss_history, lrate = reset_env(LR)

for time in range(30):

    # Agent acts, env updates
    action = agent.act(state)
    next_state, reward, done = step_env(action, model, loss_history, lrate, IMPROVE_COEFF, num_epochs=1)
    next_state = np.reshape(next_state, [1, state_size])
    state = next_state

    if done:
        print("\nepisode: {}/{}, score: {}, e: {:.2}\n"
              .format(e, EPISODES, loss_history.acc[-1], agent.epsilon))
        break

learning rate: 0.01

1796/1796 - 1s - loss: 1.3629 - acc: 0.6682
learning rate: 0.01

1796/1796 - 0s - loss: 0.3310 - acc: 0.9131
learning rate: 0.01

1796/1796 - 0s - loss: 0.2711 - acc: 0.9287
learning rate: 0.01

1796/1796 - 0s - loss: 0.2029 - acc: 0.9488
learning rate: 0.01

1796/1796 - 0s - loss: 0.1125 - acc: 0.9755
learning rate: 0.01

1796/1796 - 0s - loss: 0.0924 - acc: 0.9816
learning rate: 0.01

1796/1796 - 0s - loss: 0.0844 - acc: 0.9816
learning rate: 0.01

1796/1796 - 0s - loss: 0.0696 - acc: 0.9866
learning rate: 0.01

1796/1796 - 0s - loss: 0.0649 - acc: 0.9894
learning rate: 0.01

1796/1796 - 0s - loss: 0.0515 - acc: 0.9916
learning rate: 0.01

1796/1796 - 0s - loss: 0.0453 - acc: 0.9939
learning rate: 0.01

1796/1796 - 0s - loss: 0.0429 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0396 - acc: 0.9944
learning rate: 0.01

1796/1796 - 0s - loss: 0.0373 - acc: 0.9961
learning rate: 0.01

1796/1796 - 0s - loss: 0.0321 - acc: 0.9967
learning rate: 0.01

1796

In [89]:
import matplotlib.pyplot as plt
import seaborn
import plotly

In [90]:
import plotly.graph_objects as go
import pandas as pd

fig = go.Figure([go.Scatter(x=pd.Series(range(len(loss_history.acc))), y=pd.Series(loss_history.acc))])
fig.show()
fig = go.Figure([go.Scatter(x=pd.Series(range(len(loss_history.lr))), y=pd.Series(loss_history.lr))])
fig.show()

# Define environement

In [53]:
import gym

In [54]:
class DQNAgent():
    
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()
        
    def _build_model(self):
        # Neural Net for Deep-Q learning Model
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        
    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])  # returns action
    
    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = (reward + self.gamma *
                          np.amax(self.model.predict(next_state)[0]))
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

In [56]:
EPISODES = 1000

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)
# agent.load("./save/cartpole-dqn.h5")
done = False
batch_size = 32

for e in range(EPISODES):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    for time in range(500):
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                  .format(e, EPISODES, time, agent.epsilon))
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)

episode: 0/1000, score: 16, e: 1.0
episode: 1/1000, score: 11, e: 1.0
episode: 2/1000, score: 16, e: 0.94
episode: 3/1000, score: 10, e: 0.89
episode: 4/1000, score: 55, e: 0.68
episode: 5/1000, score: 10, e: 0.64
episode: 6/1000, score: 19, e: 0.58
episode: 7/1000, score: 16, e: 0.54
episode: 8/1000, score: 11, e: 0.51
episode: 9/1000, score: 12, e: 0.48
episode: 10/1000, score: 10, e: 0.46
episode: 11/1000, score: 12, e: 0.43
episode: 12/1000, score: 9, e: 0.41
episode: 13/1000, score: 10, e: 0.39
episode: 14/1000, score: 8, e: 0.38
episode: 15/1000, score: 10, e: 0.36
episode: 16/1000, score: 8, e: 0.34
episode: 17/1000, score: 9, e: 0.33
episode: 18/1000, score: 8, e: 0.32
episode: 19/1000, score: 144, e: 0.15
episode: 20/1000, score: 125, e: 0.082
episode: 21/1000, score: 57, e: 0.062
episode: 22/1000, score: 43, e: 0.05
episode: 23/1000, score: 17, e: 0.046
episode: 24/1000, score: 50, e: 0.035
episode: 25/1000, score: 23, e: 0.032
episode: 26/1000, score: 20, e: 0.029


KeyboardInterrupt: 