In [104]:
import math
from random import randint, choice
from gym.envs.classic_control import CartPoleEnv
import tensorflow as tf
import pandas as pd
import numpy as np
import heapq
from random import random

In [109]:
T = 100
mu = 0.7
batch_size = 10
GAMMA = 0.5
learning_rate = 0.001

In [92]:
class Transaction:
    columns = ['state', 'action', 'reward', 'next_state', 'done', 'qvalue']
    actions = [[0, 1], [1, 0]]
    def __init__(self, item):
        self.item = dict(zip(Transaction.columns, item))
        
    def  __lt__(self, other):
        return self.item['id'] < other.item['id']
    
    def __repr__(self):
        return str(self.item)
    
    def value(self):
        return self.item['state'] + Transaction.actions[self.item['action']]
    
    def __getitem__(self, key):
        return self.item[key]
    
    def __setitem__(self, key, value):
        self.item[key] = value

class Memory:
    def __init__(self, max_size):
        MAX_TRANSACTION = Transaction(Transaction.columns)
        MAX_TRANSACTION.item['id'] = -1
        self.d = []
        self.id = 0
        self.max_size = max_size
        
    def insert(self, item):
        self.id += 1
        item['id'] = self.id
        #print(self.d)

        if len(self.d) == self.max_size:
            heapq.heapreplace(self.d, item)
        else:
            heapq.heappush(self.d, item)

    def batch(self, n):
        if len(self.d) < n:
            return np.array(heapq.nsmallest(n, self.d))
        else:
            return np.array(self.d)
        
def transform_to_Q_input(batch):
    return np.array([transaction.value() for transaction in batch])

def get_QValues(batch):
    return np.array([transaction['qvalue'] for transaction in batch])
    
D = Memory(2)
item1 = Transaction([np.array([0,1,2,3]), 1,0,np.array([3,2,1,0]),0,10])
item2 = Transaction([np.array([0,1,2,3]), 1,0,np.array([3,2,1,0]),0,10])
item3 = Transaction([np.array([0,1,2,3]), 1,0,np.array([3,2,1,0]),0,10])

D.insert(item1)
D.insert(item2)
D.insert(item3)

'''
print(D.d, D.batch(2), D.d)
item = Transaction([np.array([-1,1,2,3]), 1,0,np.array([3,2,1,0]),0,10])
D.insert(item)
print(D.d, D.batch(1))
'''

'\nprint(D.d, D.batch(2), D.d)\nitem = Transaction([np.array([-1,1,2,3]), 1,0,np.array([3,2,1,0]),0,10])\nD.insert(item)\nprint(D.d, D.batch(1))\n'

In [113]:
class QNetwork:
    def __init__(self, sess):
        self.state_size = 4
        self.action_size = 2
        self.input_size = self.state_size + self.action_size
        self.units = 4
        self.output_size = 1
        
        self.input = tf.placeholder(shape=(None, self.input_size), dtype=tf.float32)
        self.label = tf.placeholder(shape=(None), dtype=tf.float32)
        
        self.build_network()
        self.sess = sess
        self.sess.run(tf.global_variables_initializer())

    def build_network(self):
        self.layer_1 = tf.layers.dense(inputs=self.input, units=self.units)
        self.layer_2 = tf.layers.dense(inputs=self.layer_1, units=self.units/2)
        self.qvalue = tf.layers.dense(inputs=self.layer_2, units=self.output_size, activation=tf.nn.tanh)

        self.loss = tf.losses.mean_squared_error(labels=self.label, predictions=self.qvalue)
        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)
        self.best_action = tf.argmax(self.qvalue, axis=1)
        
    def get_Q_value(self, batch):
        return self.sess.run(self.qvalue, feed_dict={self.input: batch})
        
    def get_best_action(self, state):
        batch = [0, 0]
        batch[0] = state + [0, 1]
        batch[1] = state + [1, 0]
        #print state, np.array(batch).shape
        
        best_action = self.sess.run(self.best_action, feed_dict={self.input: batch})
        return best_action
    
    def update(self, batch):
        inputs = transform_to_Q_input(batch)
        labels = get_QValues(batch)
        print(self.sess.run([self.train_step, self.loss], feed_dict={self.input: inputs, self.label: labels}))
        

In [114]:
def add_qvalue(batch, Q, gamma):
    _batch = transform_to_Q_input(batch)
    qvalues = Q.get_Q_value(_batch)
    for qvalue,transaction in zip(qvalues, batch):
        if not transaction['done']:
            transaction['qvalue'] = transaction['reward'] + qvalue*gamma
        else:
            transaction['qvalue'] = transaction['reward']

In [116]:
D = Memory(T)
sess = tf.Session()
Q = QNetwork(sess)

env = CartPoleEnv()
#env.theta_threshold_radians = 2 * math.pi * 1/4
state = env.reset()
state = state.tolist()

for _ in range(10000):
    #env.render()
    #action = env.action_space.sample()
    if random() > mu:
        action = env.action_space.sample()
    else:
        actoin = Q.get_best_action(state)
    
    next_state, reward, done, _ = env.step(action) # take a random action
    next_state = next_state.tolist()
    
    transaction = Transaction((state, action, reward, next_state, done, -1))
    D.insert(transaction)
    state = next_state
    
    batch = D.batch(batch_size)
    add_qvalue(batch, Q, GAMMA)
    Q.update(batch)
    
    #print(transaction, batch)
    
    if done:
        state = env.reset()
        state = state.tolist()

[33mWARN: gym.spaces.Box autodetected dtype as <type 'numpy.float32'>. Please provide explicit dtype.[0m
[None, 1.0365754]
[None, 1.017228]
[None, 0.9987379]
[None, 0.98094845]
[None, 0.9637286]
[None, 0.93469125]
[None, 0.92709386]
[None, 0.916861]
[None, 0.89207685]
[None, 0.8961821]
[None, 0.8957388]
[None, 0.89223075]
[None, 0.8865821]
[None, 0.8793884]
[None, 0.87104607]
[None, 0.8589957]
[None, 0.8486682]
[None, 0.8277637]
[None, 0.8256235]
[None, 0.8248285]
[None, 0.8252383]
[None, 0.8267529]
[None, 0.8293082]
[None, 0.83286834]
[None, 0.8374181]
[None, 0.84295285]
[None, 0.8494675]
[None, 0.8600713]
[None, 0.8557909]
[None, 0.852536]
[None, 0.85025287]
[None, 0.8488841]
[None, 0.8514272]
[None, 0.8518304]
[None, 0.85055256]
[None, 0.84584236]
[None, 0.8421131]
[None, 0.83933663]
[None, 0.8374713]
[None, 0.8364682]
[None, 0.83617246]
[None, 0.83019763]
[None, 0.8266646]
[None, 0.8222031]
[None, 0.8170437]
[None, 0.8113679]
[None, 0.80531716]
[None, 0.7990012]
[None, 0.7925053]

[None, 0.25827008]
[None, 0.25825423]
[None, 0.2582552]
[None, 0.25823602]
[None, 0.25820675]
[None, 0.2581664]
[None, 0.2580522]
[None, 0.2579551]
[None, 0.2578542]
[None, 0.25505847]
[None, 0.25754642]
[None, 0.2574184]
[None, 0.257292]
[None, 0.2571672]
[None, 0.25704378]
[None, 0.25692174]
[None, 0.25680104]
[None, 0.25668174]
[None, 0.2565307]
[None, 0.2536185]
[None, 0.25347036]
[None, 0.25594422]
[None, 0.25583243]
[None, 0.25572208]
[None, 0.25561294]
[None, 0.25547263]
[None, 0.25531733]
[None, 0.25242457]
[None, 0.25227284]
[None, 0.2521506]
[None, 0.25203505]
[None, 0.25192496]
[None, 0.25181964]
[None, 0.2517519]
[None, 0.25166982]
[None, 0.25158897]
[None, 0.2515092]
[None, 0.25147235]
[None, 0.2514506]
[None, 0.25135225]
[None, 0.25123215]
[None, 0.251096]
[None, 0.2509874]
[None, 0.25088456]
[None, 0.25078732]
[None, 0.2481218]
[None, 0.24806279]
[None, 0.24802117]
[None, 0.24796002]
[None, 0.24789903]
[None, 0.24783857]
[None, 0.2477396]
[None, 0.24763262]
[None, 0.2475

[None, 0.23889199]
[None, 0.24144536]
[None, 0.24143979]
[None, 0.24141997]
[None, 0.24140346]
[None, 0.2413914]
[None, 0.24138442]
[None, 0.23885298]
[None, 0.23885562]
[None, 0.23885982]
[None, 0.24141493]
[None, 0.24139874]
[None, 0.24138147]
[None, 0.24136357]
[None, 0.24134506]
[None, 0.24132647]
[None, 0.24130785]
[None, 0.23873803]
[None, 0.23873672]
[None, 0.23873748]
[None, 0.24130888]
[None, 0.24130464]
[None, 0.24129821]
[None, 0.24127847]
[None, 0.24124038]
[None, 0.24120156]
[None, 0.24116333]
[None, 0.24112639]
[None, 0.24109189]
[None, 0.24107173]
[None, 0.23853184]
[None, 0.23853752]
[None, 0.23854233]
[None, 0.2385455]
[None, 0.23854604]
[None, 0.23852955]
[None, 0.23851052]
[None, 0.23848955]
[None, 0.23845832]
[None, 0.23842803]
[None, 0.2383989]
[None, 0.23581962]
[None, 0.23580565]
[None, 0.23579465]
[None, 0.23579572]
[None, 0.23581316]
[None, 0.23582329]
[None, 0.23583461]
[None, 0.23584662]
[None, 0.23585874]
[None, 0.23842178]
[None, 0.23841235]
[None, 0.238401

[None, 0.23647597]
[None, 0.23650134]
[None, 0.23652625]
[None, 0.23654996]
[None, 0.23910025]
[None, 0.23910971]
[None, 0.23911738]
[None, 0.23912334]
[None, 0.23912768]
[None, 0.2391307]
[None, 0.23660417]
[None, 0.23660398]
[None, 0.23660561]
[None, 0.23913643]
[None, 0.23912947]
[None, 0.23912515]
[None, 0.23912416]
[None, 0.23912698]
[None, 0.23913406]
[None, 0.23664063]
[None, 0.23663893]
[None, 0.23664515]
[None, 0.23665157]
[None, 0.23919058]
[None, 0.23918669]
[None, 0.2391769]
[None, 0.23916858]
[None, 0.2391669]
[None, 0.23917203]
[None, 0.23916967]
[None, 0.23916137]
[None, 0.23916224]
[None, 0.2391686]
[None, 0.23917334]
[None, 0.23916994]
[None, 0.23916583]
[None, 0.23916137]
[None, 0.23915708]
[None, 0.23915254]
[None, 0.23914751]
[None, 0.23914228]
[None, 0.23912978]
[None, 0.23911907]
[None, 0.23911028]
[None, 0.23910998]
[None, 0.23911092]
[None, 0.23911288]
[None, 0.24161863]
[None, 0.2391196]
[None, 0.23911268]
[None, 0.23910752]
[None, 0.23910405]
[None, 0.23910274

[None, 0.23575507]
[None, 0.2357565]
[None, 0.23575513]
[None, 0.23575391]
[None, 0.23575285]
[None, 0.23574868]
[None, 0.23574528]
[None, 0.23574293]
[None, 0.23574187]
[None, 0.23574233]
[None, 0.23575318]
[None, 0.23576485]
[None, 0.2357704]
[None, 0.23577681]
[None, 0.23329675]
[None, 0.23329927]
[None, 0.23329812]
[None, 0.23329678]
[None, 0.23329522]
[None, 0.23328975]
[None, 0.23328458]
[None, 0.23328003]
[None, 0.2332763]
[None, 0.2332737]
[None, 0.23076998]
[None, 0.23076747]
[None, 0.23076504]
[None, 0.23076536]
[None, 0.23076245]
[None, 0.23075938]
[None, 0.23326267]
[None, 0.23326269]
[None, 0.23326008]
[None, 0.2332584]
[None, 0.23325789]
[None, 0.2307643]
[None, 0.23076464]
[None, 0.2307644]
[None, 0.23076715]
[None, 0.23076685]
[None, 0.2307643]
[None, 0.23076215]
[None, 0.23076065]
[None, 0.23075984]
[None, 0.23327334]
[None, 0.23327175]
[None, 0.23327066]
[None, 0.23327047]
[None, 0.23327114]
[None, 0.23076704]
[None, 0.23076552]
[None, 0.23076455]
[None, 0.23076712]
[

[None, 0.23287252]
[None, 0.23287153]
[None, 0.23287077]
[None, 0.23286885]
[None, 0.23286739]
[None, 0.23286778]
[None, 0.23286846]
[None, 0.23287012]
[None, 0.23287171]
[None, 0.23287305]
[None, 0.2303674]
[None, 0.23036973]
[None, 0.23037192]
[None, 0.2303725]
[None, 0.23037304]
[None, 0.230375]
[None, 0.23037677]
[None, 0.23037703]
[None, 0.23037705]
[None, 0.23037681]
[None, 0.23037636]
[None, 0.23037583]
[None, 0.22787]
[None, 0.22787163]
[None, 0.22787318]
[None, 0.22787468]
[None, 0.22787613]
[None, 0.2278774]
[None, 0.22787862]
[None, 0.22787851]
[None, 0.22787854]
[None, 0.23038657]
[None, 0.23038541]
[None, 0.23038423]
[None, 0.23038296]
[None, 0.23038162]
[None, 0.23038022]
[None, 0.23037866]
[None, 0.23037697]
[None, 0.23037693]
[None, 0.23037642]
[None, 0.23287031]
[None, 0.23287]
[None, 0.2328697]
[None, 0.23286948]
[None, 0.23036626]
[None, 0.2303663]
[None, 0.2303665]
[None, 0.2303668]
[None, 0.23036718]
[None, 0.23036851]
[None, 0.2303696]
[None, 0.23287688]
[None, 0.

[None, 0.22524434]
[None, 0.2252429]
[None, 0.2252427]
[None, 0.22524256]
[None, 0.22524163]
[None, 0.22524083]
[None, 0.22524114]
[None, 0.22524069]
[None, 0.22774497]
[None, 0.22774446]
[None, 0.22774394]
[None, 0.2277434]
[None, 0.22523828]
[None, 0.22523862]
[None, 0.22523911]
[None, 0.22524029]
[None, 0.22523974]
[None, 0.22774266]
[None, 0.22774155]
[None, 0.22774065]
[None, 0.22773984]
[None, 0.22773921]
[None, 0.22773884]
[None, 0.2277387]
[None, 0.22773889]
[None, 0.23024277]
[None, 0.23024258]
[None, 0.23024273]
[None, 0.2277418]
[None, 0.22774217]
[None, 0.2277426]
[None, 0.22774306]
[None, 0.22774424]
[None, 0.22774526]
[None, 0.2302502]
[None, 0.23025039]
[None, 0.23025034]
[None, 0.2302489]
[None, 0.22774374]
[None, 0.22774355]
[None, 0.22774431]
[None, 0.22774497]
[None, 0.22774535]
[None, 0.22774556]
[None, 0.22774546]
[None, 0.22774512]
[None, 0.22774374]
[None, 0.22774278]
[None, 0.22523797]
[None, 0.2252375]
[None, 0.227738]
[None, 0.22773857]
[None, 0.22773896]
[Non

[None, 0.23266876]
[None, 0.23266816]
[None, 0.23016475]
[None, 0.23016438]
[None, 0.23016396]
[None, 0.23016353]
[None, 0.23016377]
[None, 0.23016323]
[None, 0.2301627]
[None, 0.23266077]
[None, 0.23265979]
[None, 0.23265897]
[None, 0.23265833]
[None, 0.23265862]
[None, 0.23015776]
[None, 0.2301568]
[None, 0.23015659]
[None, 0.23015645]
[None, 0.23015635]
[None, 0.23015557]
[None, 0.23015483]
[None, 0.23015425]
[None, 0.23015381]
[None, 0.2301542]
[None, 0.22765344]
[None, 0.22765352]
[None, 0.22765367]
[None, 0.22765371]
[None, 0.2276538]
[None, 0.22765394]
[None, 0.22765405]
[None, 0.2276542]
[None, 0.22765437]
[None, 0.22765455]
[None, 0.22515273]
[None, 0.22515205]
[None, 0.2276497]
[None, 0.22764912]
[None, 0.22764863]
[None, 0.22764829]
[None, 0.22764868]
[None, 0.22764917]
[None, 0.22514786]
[None, 0.22514746]
[None, 0.2251479]
[None, 0.22514834]
[None, 0.22514886]
[None, 0.22514859]
[None, 0.2276503]
[None, 0.22764932]
[None, 0.22764839]
[None, 0.22764751]
[None, 0.22764683]
[

[None, 0.2351267]
[None, 0.23512627]
[None, 0.2376271]
[None, 0.23762691]
[None, 0.23762676]
[None, 0.23762657]
[None, 0.23762637]
[None, 0.23512441]
[None, 0.23512417]
[None, 0.23512325]
[None, 0.23512219]
[None, 0.23512115]
[None, 0.23512004]
[None, 0.235119]
[None, 0.23762019]
[None, 0.23762019]
[None, 0.23762026]
[None, 0.23511928]
[None, 0.23511907]
[None, 0.2351194]
[None, 0.235119]
[None, 0.23511797]
[None, 0.2351169]
[None, 0.23511583]
[None, 0.2351148]
[None, 0.23761506]
[None, 0.23761484]
[None, 0.23761469]
[None, 0.23761456]
[None, 0.23761448]
[None, 0.23761387]
[None, 0.23761328]
[None, 0.23761274]
[None, 0.23761232]
[None, 0.23761196]
[None, 0.23761179]
[None, 0.24011303]
[None, 0.24011242]
[None, 0.23761055]
[None, 0.23760991]
[None, 0.23760986]
[None, 0.23760983]
[None, 0.23760991]
[None, 0.23760997]
[None, 0.23761001]
[None, 0.23761085]
[None, 0.23511131]
[None, 0.23511167]
[None, 0.23511197]
[None, 0.23511152]
[None, 0.23511094]
[None, 0.23510976]
[None, 0.23510854]
[N

[None, 0.23258257]
[None, 0.23258252]
[None, 0.23258246]
[None, 0.23258281]
[None, 0.23258321]
[None, 0.23258354]
[None, 0.23258384]
[None, 0.23258369]
[None, 0.23508362]
[None, 0.23508368]
[None, 0.23508383]
[None, 0.2350839]
[None, 0.2350839]
[None, 0.23508391]
[None, 0.23508386]
[None, 0.23508383]
[None, 0.23758486]
[None, 0.23508336]
[None, 0.23508295]
[None, 0.23508252]
[None, 0.23508286]
[None, 0.23508325]
[None, 0.2350836]
[None, 0.23508398]
[None, 0.23508433]
[None, 0.2350843]
[None, 0.23508418]
[None, 0.23508362]
[None, 0.23508301]
[None, 0.23508242]
[None, 0.23508213]
[None, 0.23508188]
[None, 0.23508207]
[None, 0.23508222]
[None, 0.23508248]
[None, 0.23508218]
[None, 0.23508191]
[None, 0.23508157]
[None, 0.2350812]
[None, 0.23508081]
[None, 0.23508045]
[None, 0.23508003]
[None, 0.23507965]
[None, 0.23257807]
[None, 0.23507908]
[None, 0.23507856]
[None, 0.23507808]
[None, 0.23507763]
[None, 0.23507725]
[None, 0.2350768]
[None, 0.23507695]
[None, 0.2350772]
[None, 0.23507749]


[None, 0.23005415]
[None, 0.2300542]
[None, 0.23005423]
[None, 0.2300542]
[None, 0.2300542]
[None, 0.23005413]
[None, 0.23005435]
[None, 0.23255537]
[None, 0.23255542]
[None, 0.23005484]
[None, 0.23005503]
[None, 0.23005518]
[None, 0.23005538]
[None, 0.2300555]
[None, 0.23005557]
[None, 0.23005557]
[None, 0.23255615]
[None, 0.23255622]
[None, 0.23255627]
[None, 0.23005518]
[None, 0.23005484]
[None, 0.23005447]
[None, 0.2300541]
[None, 0.2300544]
[None, 0.23005463]
[None, 0.23005497]
[None, 0.23005524]
[None, 0.23005557]
[None, 0.23005578]
[None, 0.23005596]
[None, 0.23255646]
[None, 0.23255645]
[None, 0.23255642]
[None, 0.2325566]
[None, 0.23255679]
[None, 0.23255698]
[None, 0.23255686]
[None, 0.23255698]
[None, 0.23255707]
[None, 0.2325571]
[None, 0.2325571]
[None, 0.23505744]
[None, 0.23505728]
[None, 0.23505709]
[None, 0.23505709]
[None, 0.23505703]
[None, 0.232556]
[None, 0.23255576]
[None, 0.23255557]
[None, 0.23255531]
[None, 0.23255546]
[None, 0.23255552]
[None, 0.23255557]
[Non

[None, 0.23003796]
[None, 0.23003809]
[None, 0.23253867]
[None, 0.23253867]
[None, 0.23253848]
[None, 0.23253825]
[None, 0.23253804]
[None, 0.23253787]
[None, 0.2325376]
[None, 0.2325374]
[None, 0.23253722]
[None, 0.23503745]
[None, 0.23503713]
[None, 0.23503709]
[None, 0.23503697]
[None, 0.23503691]
[None, 0.23253702]
[None, 0.23253721]
[None, 0.23253737]
[None, 0.23253755]
[None, 0.23253775]
[None, 0.23253779]
[None, 0.23503825]
[None, 0.23503825]
[None, 0.23503838]
[None, 0.23503852]
[None, 0.23503847]
[None, 0.2325379]
[None, 0.23253775]
[None, 0.2325376]
[None, 0.23253736]
[None, 0.23253702]
[None, 0.23253672]
[None, 0.23253642]
[None, 0.23253623]
[None, 0.23253608]
[None, 0.23253587]
[None, 0.23253608]
[None, 0.23253635]
[None, 0.23503721]
[None, 0.23503743]
[None, 0.23503774]
[None, 0.23503801]
[None, 0.23503835]
[None, 0.23503862]
[None, 0.23503885]
[None, 0.235039]
[None, 0.23503911]
[None, 0.23753977]
[None, 0.2375398]
[None, 0.23753987]
[None, 0.23503946]
[None, 0.2350394]
[

[None, 0.23002827]
[None, 0.23002842]
[None, 0.23002857]
[None, 0.23002851]
[None, 0.23002857]
[None, 0.23002847]
[None, 0.23002839]
[None, 0.2325288]
[None, 0.23252891]
[None, 0.23252892]
[None, 0.23252888]
[None, 0.23252891]
[None, 0.23252891]
[None, 0.23252885]
[None, 0.23252864]
[None, 0.23252861]
[None, 0.23252861]
[None, 0.23252861]
[None, 0.23252837]
[None, 0.23252839]
[None, 0.23252822]
[None, 0.23252808]
[None, 0.23252812]
[None, 0.2325283]
[None, 0.23252846]
[None, 0.23252866]
[None, 0.23252861]
[None, 0.23252876]
[None, 0.232529]
[None, 0.2325292]
[None, 0.23252942]
[None, 0.23252949]
[None, 0.23252964]
[None, 0.23002937]
[None, 0.2300295]
[None, 0.23253018]
[None, 0.23253018]
[None, 0.23253013]
[None, 0.23252998]
[None, 0.23252976]
[None, 0.23252954]
[None, 0.2325294]
[None, 0.2325292]
[None, 0.2325292]
[None, 0.2325292]
[None, 0.23502967]
[None, 0.23502964]
[None, 0.2350296]
[None, 0.23502956]
[None, 0.2350296]
[None, 0.23502983]
[None, 0.23503008]
[None, 0.23252973]
[None

[None, 0.23501995]
[None, 0.23501992]
[None, 0.23501992]
[None, 0.23502012]
[None, 0.23502035]
[None, 0.23252046]
[None, 0.23252061]
[None, 0.23252055]
[None, 0.23252046]
[None, 0.23252027]
[None, 0.23252007]
[None, 0.23251985]
[None, 0.23251961]
[None, 0.23251939]
[None, 0.2325192]
[None, 0.23251916]
[None, 0.23251913]
[None, 0.2325192]
[None, 0.23251913]
[None, 0.23251905]
[None, 0.2325189]
[None, 0.2325189]
[None, 0.23251879]
[None, 0.23251894]
[None, 0.23251909]
[None, 0.23501946]
[None, 0.23501946]
[None, 0.23251913]
[None, 0.23251912]
[None, 0.23251913]
[None, 0.23251912]
[None, 0.23251909]
[None, 0.23251909]
[None, 0.23251906]
[None, 0.23251913]
[None, 0.23251913]
[None, 0.23251909]
[None, 0.23251924]
[None, 0.23251936]
[None, 0.23251933]
[None, 0.23251928]
[None, 0.23251913]
[None, 0.23251902]
[None, 0.23251894]
[None, 0.23251887]
[None, 0.23251879]
[None, 0.2325187]
[None, 0.23251878]
[None, 0.23251887]
[None, 0.23251885]
[None, 0.23251879]
[None, 0.23251885]
[None, 0.23501936

[None, 0.23001562]
[None, 0.23001572]
[None, 0.23251599]
[None, 0.23251596]
[None, 0.23251572]
[None, 0.23251587]
[None, 0.23251596]
[None, 0.23251596]
[None, 0.23251604]
[None, 0.23251604]
[None, 0.23251595]
[None, 0.23001572]
[None, 0.23251599]
[None, 0.2325161]
[None, 0.23251611]
[None, 0.23251611]
[None, 0.23251621]
[None, 0.23251623]
[None, 0.2325163]
[None, 0.23251626]
[None, 0.23251626]
[None, 0.23001592]
[None, 0.23001581]
[None, 0.23001581]
[None, 0.23001587]
[None, 0.23251621]
[None, 0.23251636]
[None, 0.2325165]
[None, 0.23251645]
[None, 0.23251641]
[None, 0.23251638]
[None, 0.2325163]
[None, 0.23251623]
[None, 0.2325161]
[None, 0.23501591]
[None, 0.23501584]
[None, 0.23501572]
[None, 0.23501563]
[None, 0.2350155]
[None, 0.23501557]
[None, 0.23501553]
[None, 0.23501553]
[None, 0.23501556]
[None, 0.23751578]
[None, 0.23751579]
[None, 0.23501568]
[None, 0.23501563]
[None, 0.23501553]
[None, 0.23501548]
[None, 0.23501538]
[None, 0.23501533]
[None, 0.23501533]
[None, 0.23751543]

[None, 0.23751284]
[None, 0.23751284]
[None, 0.23751274]
[None, 0.2375127]
[None, 0.23751265]
[None, 0.23751254]
[None, 0.23751247]
[None, 0.2375125]
[None, 0.23751253]
[None, 0.2375126]
[None, 0.23751262]
[None, 0.2375126]
[None, 0.24001276]
[None, 0.24001284]
[None, 0.24001303]
[None, 0.24001302]
[None, 0.24001315]
[None, 0.24001329]
[None, 0.24001333]
[None, 0.24001338]
[None, 0.24001338]
[None, 0.24001333]
[None, 0.24001323]
[None, 0.23751299]
[None, 0.23751289]
[None, 0.23751289]
[None, 0.24001299]
[None, 0.24001294]
[None, 0.24001287]
[None, 0.24001269]
[None, 0.24001253]
[None, 0.23751235]
[None, 0.2375122]
[None, 0.2375122]
[None, 0.23751226]
[None, 0.23751228]
[None, 0.23751223]
[None, 0.2375122]
[None, 0.24001254]
[None, 0.24001257]
[None, 0.24001254]
[None, 0.2375125]
[None, 0.23751253]
[None, 0.23751247]
[None, 0.2375125]
[None, 0.23751245]
[None, 0.23751238]
[None, 0.23751245]
[None, 0.24001275]
[None, 0.2400128]
[None, 0.24001282]
[None, 0.24001282]
[None, 0.24001282]
[No

[None, 0.24000762]
[None, 0.24000764]
[None, 0.24000767]
[None, 0.24000767]
[None, 0.24000774]
[None, 0.24000771]
[None, 0.24000767]
[None, 0.2400076]
[None, 0.24000742]
[None, 0.2400074]
[None, 0.24000737]
[None, 0.23750761]
[None, 0.24000771]
[None, 0.24000776]
[None, 0.24000776]
[None, 0.24000786]
[None, 0.24000798]
[None, 0.24000816]
[None, 0.24000816]
[None, 0.24000818]
[None, 0.24000818]
[None, 0.24000816]
[None, 0.24000816]
[None, 0.24000816]
[None, 0.24000818]
[None, 0.24000818]
[None, 0.23750801]
[None, 0.23750795]
[None, 0.23750801]
[None, 0.23750801]
[None, 0.23750798]
[None, 0.23750795]
[None, 0.23750791]
[None, 0.23750791]
[None, 0.23750798]
[None, 0.23750795]
[None, 0.23500791]
[None, 0.23500781]
[None, 0.23500776]
[None, 0.2350078]
[None, 0.23500781]
[None, 0.23500781]
[None, 0.23500773]
[None, 0.23500757]
[None, 0.23500766]
[None, 0.23250772]
[None, 0.23250772]
[None, 0.23250777]
[None, 0.23250774]
[None, 0.23250766]
[None, 0.23250766]
[None, 0.23250769]
[None, 0.232507

[None, 0.23500493]
[None, 0.23500498]
[None, 0.23500496]
[None, 0.23500498]
[None, 0.23500498]
[None, 0.2350049]
[None, 0.23250483]
[None, 0.2325049]
[None, 0.23250495]
[None, 0.23250493]
[None, 0.23250498]
[None, 0.23250493]
[None, 0.23250493]
[None, 0.23250495]
[None, 0.23250498]
[None, 0.23000483]
[None, 0.23000494]
[None, 0.23000498]
[None, 0.2300051]
[None, 0.23000506]
[None, 0.23000503]
[None, 0.23000488]
[None, 0.23000488]
[None, 0.23000494]
[None, 0.23000498]
[None, 0.23000488]
[None, 0.23000483]
[None, 0.23000479]
[None, 0.23000488]
[None, 0.2300048]
[None, 0.23000479]
[None, 0.23000483]
[None, 0.23000494]
[None, 0.23000498]
[None, 0.23000509]
[None, 0.23000509]
[None, 0.23000517]
[None, 0.23000517]
[None, 0.23000513]
[None, 0.23250502]
[None, 0.23250501]
[None, 0.23000488]
[None, 0.23000494]
[None, 0.23000498]
[None, 0.2300049]
[None, 0.23000494]
[None, 0.23000495]
[None, 0.2300049]
[None, 0.23000488]
[None, 0.23000495]
[None, 0.23000498]
[None, 0.23000498]
[None, 0.22750498]

[None, 0.23250356]
[None, 0.23250352]
[None, 0.23250349]
[None, 0.23000337]
[None, 0.2300033]
[None, 0.23000327]
[None, 0.23000322]
[None, 0.23000322]
[None, 0.23000318]
[None, 0.23250327]
[None, 0.23250322]
[None, 0.2325032]
[None, 0.23000322]
[None, 0.23000327]
[None, 0.23000339]
[None, 0.23000333]
[None, 0.23000333]
[None, 0.23000333]
[None, 0.23250318]
[None, 0.23250313]
[None, 0.2325031]
[None, 0.23250313]
[None, 0.23000307]
[None, 0.23000312]
[None, 0.23000312]
[None, 0.2300031]
[None, 0.23000303]
[None, 0.23000306]
[None, 0.230003]
[None, 0.23000303]
[None, 0.23000306]
[None, 0.23000307]
[None, 0.23000318]
[None, 0.23000312]
[None, 0.23000318]
[None, 0.2300032]
[None, 0.2300032]
[None, 0.23000325]
[None, 0.2300033]
[None, 0.23000333]
[None, 0.23000322]
[None, 0.23000327]
[None, 0.23000322]
[None, 0.23250322]
[None, 0.23250322]
[None, 0.23250322]
[None, 0.23250318]
[None, 0.2325031]
[None, 0.23250307]
[None, 0.23250306]
[None, 0.23250303]
[None, 0.232503]
[None, 0.23500301]
[None

[None, 0.23500164]
[None, 0.23500165]
[None, 0.23500164]
[None, 0.23500165]
[None, 0.23500171]
[None, 0.23500171]
[None, 0.23500176]
[None, 0.23500171]
[None, 0.2350018]
[None, 0.2350018]
[None, 0.23500171]
[None, 0.23500168]
[None, 0.23500165]
[None, 0.23500164]
[None, 0.23500161]
[None, 0.23750167]
[None, 0.23750174]
[None, 0.23750186]
[None, 0.235002]
[None, 0.23500206]
[None, 0.23500206]
[None, 0.23500206]
[None, 0.235002]
[None, 0.235002]
[None, 0.235002]
[None, 0.23750198]
[None, 0.23750193]
[None, 0.23500192]
[None, 0.23500198]
[None, 0.2350019]
[None, 0.2350019]
[None, 0.23500186]
[None, 0.23500183]
[None, 0.23500186]
[None, 0.23500186]
[None, 0.23500186]
[None, 0.23500188]
[None, 0.23500183]
[None, 0.23500186]
[None, 0.2350019]
[None, 0.23500188]
[None, 0.23750186]
[None, 0.2350019]
[None, 0.2350019]
[None, 0.23500195]
[None, 0.23500198]
[None, 0.235002]
[None, 0.235002]
[None, 0.23500203]
[None, 0.2350019]
[None, 0.2350019]
[None, 0.23250206]
[None, 0.23250206]
[None, 0.23250

[None, 0.23250085]
[None, 0.23250085]
[None, 0.23250084]
[None, 0.23250088]
[None, 0.23250084]
[None, 0.23250084]
[None, 0.23250085]
[None, 0.23250085]
[None, 0.23250088]
[None, 0.23250091]
[None, 0.23250084]
[None, 0.23250085]
[None, 0.23250088]
[None, 0.23250088]
[None, 0.23000085]
[None, 0.23000078]
[None, 0.23000076]
[None, 0.23000073]
[None, 0.23000073]
[None, 0.23000078]
[None, 0.23000078]
[None, 0.23000082]
[None, 0.23250084]
[None, 0.23000088]
[None, 0.23000088]
[None, 0.2300009]
[None, 0.23000097]
[None, 0.23000097]
[None, 0.230001]
[None, 0.23000103]
[None, 0.230001]
[None, 0.22750098]
[None, 0.227501]
[None, 0.23000097]
[None, 0.23000097]
[None, 0.23000103]
[None, 0.23000097]
[None, 0.23000103]
[None, 0.23000105]
[None, 0.23000109]
[None, 0.22750108]
[None, 0.22750105]
[None, 0.22750108]
[None, 0.23000108]
[None, 0.23000108]
[None, 0.23000109]
[None, 0.23000108]
[None, 0.23000109]
[None, 0.23000109]
[None, 0.23000112]
[None, 0.23000108]
[None, 0.23000108]
[None, 0.23000105]


In [91]:
env.reset()
for _ in range(1000):
    #env.render()
    actoin = Q.get_best_action(state)
    
    next_state, reward, done, _ = env.step(action) # take a random action
    next_state = next_state.tolist()
    print(next_state, reward)
    
    if done:
        break

([0.03325931826711125, -0.1800216926925826, -0.012120367192087696, 0.2842336338884332], 1.0)
([0.0296588844132596, -0.37496869702879376, -0.006435694514319032, 0.5730693611773274], 1.0)
([0.022159510472683726, -0.569999824439548, 0.005025692709227517, 0.8637179128149124], 1.0)
([0.010759513983892766, -0.7651898317521835, 0.022300050965525765, 1.1579767737067883], 1.0)
([-0.004544282651150904, -0.9605952126135293, 0.04545958643966153, 1.4575676394343404], 1.0)
([-0.023756186903421492, -1.1562444676935333, 0.07461093922834834, 1.7640988470804295], 1.0)
([-0.046881076257292154, -1.352126523103784, 0.10989291616995693, 2.0790197534438115], 1.0)
([-0.07392360671936783, -1.548176824081446, 0.15147331123883317, 2.403564824903905], 1.0)
([-0.10488714320099675, -1.7442608882016808, 0.19954460773691127, 2.738686181812374], 1.0)
([-0.13977236096503037, -1.9401555403675912, 0.25431833137315873, 3.0849749992790327], 1.0)


In [29]:
action = env.action_space.sample()
print(action)
env.step(action)

0


(array([ 0.04173736, -0.17609484, -0.02005421,  0.30048321]), 1.0, False, {})