In [19]:
from sample_bot import *
import shutil
import argparse
from datetime import datetime

import socketio
import eventlet
import eventlet.wsgi
from flask import Flask

import math
from queue import Queue

import sys
sys.path.append('/home/louner/school/ml/reinforcement-learning/')
from lab import *

import tensorflow.contrib.slim as slim
from random import random, choice

from time import sleep

In [20]:
# Hyper-parameters and module-level state for the actor-critic driver.
batch_size = 100  # number of transactions requested from replay memory per training step
max_size = batch_size*10  # capacity handed to Memory(max_size=...)
t_max = 5  # number of pending transactions collected before the oldest one is labelled
reload_window = 100  # the value network is saved/reloaded every `reload_window` dashboard rounds
reward_decay = 0.99  # discount factor used when accumulating rewards in add_advantage
# NOTE(review): the three globals below are not read by the code visible in this notebook;
# ACNDrive keeps its own self.last_state / self.last_action / self.last_reward.
last_state = None
last_action = None
last_reward = None
learning_rate = 0.001  # Adam learning rate shared by the policy and value networks
# NOTE(review): ACNDrive sets its own self.mu = 0.5; this global looks unused — confirm.
mu = 0.999

In [21]:
# 41 possible steering angles: 0 plus 20 evenly spaced steps on each side,
# covering -pi/9 .. +pi/9 (the list below is the pasted output of the generator cell).
#ANGLES = [-3.141592653589793, -2.5132741228718345, -1.8849555921538759, -1.2566370614359172, -0.6283185307179586, 0, 0.6283185307179586, 1.2566370614359172, 1.8849555921538759, 2.5132741228718345, 3.141592653589793]
ANGLES = [-0.349065850398866, -0.3316125578789227, -0.31415926535897937, -0.29670597283903605, -0.2792526803190927, -0.2617993877991494, -0.24434609527920612, -0.22689280275926282, -0.20943951023931953, -0.19198621771937624, -0.17453292519943295, -0.15707963267948966, -0.13962634015954636, -0.12217304763960306, -0.10471975511965977, -0.08726646259971647, -0.06981317007977318, -0.05235987755982989, -0.03490658503988659, -0.017453292519943295, 0, 0.017453292519943295, 0.03490658503988659, 0.05235987755982989, 0.06981317007977318, 0.08726646259971647, 0.10471975511965977, 0.12217304763960306, 0.13962634015954636, 0.15707963267948966, 0.17453292519943295, 0.19198621771937624, 0.20943951023931953, 0.22689280275926282, 0.24434609527920612, 0.2617993877991494, 0.2792526803190927, 0.29670597283903605, 0.31415926535897937, 0.3316125578789227, 0.349065850398866]
#ANGLES = [0.0]
# 3 possible throttle options
THROTTLES = [0.0, 0.5, 1.0]
#THROTTLES = [0.0, 0.1]
# Cartesian product, angle-major: 41 angles x 3 throttles = 123 (angle, throttle) actions.
ACTIONS = [(angle, throttle) for angle in ANGLES for throttle in THROTTLES]

In [22]:
def send_control(steering_angle, throttle):
    """Emit a "steer" socket.io event carrying the given control values.

    Both values are converted to strings before being sent. The event is
    emitted through the module-level `sio` server with `skip_sid=True`.
    """
    payload = {
        'steering_angle': str(steering_angle),
        'throttle': str(throttle),
    }
    sio.emit("steer", data=payload, skip_sid=True)
    
def calculate_reward(steering_angle, speed):
    """Return the speed scaled by the cosine of the steering angle.

    Args:
        steering_angle: angle in radians (it is passed straight to math.cos).
        speed: numeric speed value.
    """
    heading_factor = math.cos(steering_angle)
    return speed * heading_factor

class Action:
    """One discrete control choice, looked up by its index into ACTIONS.

    Attributes:
        action_idx: the index that was passed in.
        steering_angle: first element of ACTIONS[action_idx].
        throttle: second element of ACTIONS[action_idx].
    """

    def __init__(self, action_idx):
        self.action_idx = action_idx
        chosen = ACTIONS[action_idx]
        self.steering_angle = chosen[0]
        self.throttle = chosen[1]

    def __repr__(self):
        summary = {'throttle': self.throttle, 'angle': self.steering_angle}
        return str(summary)

In [23]:
class Transaction:
    """Dict-backed record of one (state, action, reward, next_state, advantage) step.

    Values are stored in `self.item`, keyed by the names in `columns`, and can
    be read or written with subscript syntax.
    """

    columns = ['state', 'action', 'reward', 'next_state', 'advantage']

    def __init__(self, item):
        # Pair each positional value with its column name; extra values or
        # extra column names are dropped by zip.
        self.item = {name: value for name, value in zip(Transaction.columns, item)}

    def __lt__(self, other):
        # Order by time, i.e. by the 'id' entry.
        # NOTE(review): 'id' is not one of `columns`; presumably it is assigned
        # after construction (e.g. by the replay memory) — confirm.
        own_id = self.item['id']
        other_id = other.item['id']
        return own_id < other_id
        # alternative ordering by gained label:
        # return self.item['label'] < other.item['label']

    def __repr__(self):
        return str(self.item)

    def __getitem__(self, key):
        return self.item[key]

    def __setitem__(self, key, value):
        self.item[key] = value


In [24]:
# The policy and value networks differ only in their last layer.
# The common trunk built here is conv -> conv -> fully connected.
def build_network():
    """Create an image placeholder and the conv-conv-dense feature trunk.

    Everything is created inside the 'Action-Critic-Network-Common' variable
    scope with reuse=tf.AUTO_REUSE.

    Returns:
        (inputs, fc_layer): the float32 placeholder of shape
        (None, 240, 320, 3) and the 256-unit dense output tensor.
    """
    with tf.variable_scope('Action-Critic-Network-Common', reuse=tf.AUTO_REUSE):
        # Track image batch (an earlier variant used a (None, 108, 320, 3) input).
        frames = tf.placeholder(dtype=tf.float32, shape=(None, 240, 320, 3))
        conv_first = slim.conv2d(
            inputs=frames,
            num_outputs=16,
            kernel_size=(8, 8),
            stride=(8, 8),
            padding='VALID',
            activation_fn=tf.nn.elu)
        conv_second = slim.conv2d(
            inputs=conv_first,
            num_outputs=32,
            kernel_size=(4, 4),
            stride=(4, 4),
            padding='VALID',
            activation_fn=tf.nn.elu)
        flattened = slim.flatten(conv_second)
        features = tf.layers.dense(inputs=flattened, units=256, activation=tf.nn.relu)
    return frames, features

In [25]:
# P(state) = action probability
class PolicyNetwork:
    """Softmax policy head on top of the trunk returned by build_network().

    The instance owns its own tf.Session (`self.sess`) and records the loss of
    every training step in `self.metrics['loss']`.
    """

    def __init__(self):
        self.action_size = len(ACTIONS)

        # One advantage value per action; update() fills only the taken action.
        self.advantage = tf.placeholder(shape=(None, self.action_size), dtype=tf.float32)

        inputs, output = build_network()
        self.inputs = inputs

        self.metrics = {}
        self.metrics['loss'] = []

        with tf.variable_scope('policy-network'):
            self.actions_values = tf.layers.dense(output, units=self.action_size, activation=tf.nn.softmax) # batch_size, action_size
            self.best_action = tf.argmax(self.actions_values, axis=1)

            # Clip before the log: a softmax output that underflows to 0 would
            # give log(0) = -inf, and -inf * 0 (unselected action) is NaN,
            # which would poison the loss and every later gradient step.
            log_probabilities = tf.log(tf.clip_by_value(self.actions_values, 1e-10, 1.0))
            self.loss = tf.reduce_sum(log_probabilities * self.advantage)*-1
            self.train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        """Return the index of the highest-probability action for one state."""
        best_action = self.sess.run(self.best_action, feed_dict={self.inputs: [state]})[0]
        return best_action

    def update(self, batch):
        """Run one policy-gradient step on a batch of transactions.

        Each transaction must provide 'state', 'action' (index into ACTIONS)
        and 'advantage'. The advantage may be a plain number or a size-1
        array; either way it is stored as a single float so the fed matrix is
        always a dense (batch, action_size) float32 array.
        """
        states = [transaction['state'] for transaction in batch]

        # Unselected actions keep a 0 advantage; only the selected action's
        # column receives the transaction's advantage.
        advantages = np.zeros((len(states), self.action_size), dtype=np.float32)
        for row, transaction in enumerate(batch):
            value = np.asarray(transaction['advantage'], dtype=np.float32).reshape(-1)[0]
            advantages[row, transaction['action']] = value

        _, loss = self.sess.run([self.train_op, self.loss], feed_dict={self.inputs: states, self.advantage: advantages})
        self.metrics['loss'].append(loss)

In [26]:
# V(state) = state_value
class ValueNetwork:
    """Scalar state-value head on top of the trunk returned by build_network().

    Two sessions are kept: `self.sess` is trained by update(), while
    `self.target_sess` answers get_state_value(s) and is only refreshed from a
    checkpoint when reload() is called.
    """

    def __init__(self):
        self.output_size = 1
        self.model_folder = './model/value_network'

        self.rewards = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        inputs, output = build_network()

        with tf.variable_scope('value-network'):
            self.inputs = inputs
            self.state_value = tf.layers.dense(output, units=self.output_size)

            self.loss = tf.reduce_sum(tf.square(self.state_value - self.rewards))
            self.train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        self.sess = tf.Session()
        self.target_sess = tf.Session()

        # Each session initialises its own copy of the variables.
        self.sess.run(tf.global_variables_initializer())
        self.target_sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

        self.metrics = {}
        self.metrics['loss'] = []

    def get_state_values(self, states):
        """Evaluate the target session's value estimate for a batch of states."""
        state_values = self.target_sess.run(self.state_value, feed_dict={self.inputs: states})
        return state_values

    def get_state_value(self, state):
        """Evaluate a single state; returns the first row of the batch result."""
        return self.get_state_values([state])[0]

    def update(self, batch):
        """Run one regression step of the value head towards each transaction's 'reward'."""
        inputs = [transaction['state'] for transaction in batch]
        # The placeholder is (None, 1); reshape so that both scalar rewards and
        # size-1 array rewards are accepted.
        rewards = np.asarray(
            [transaction['reward'] for transaction in batch], dtype=np.float32
        ).reshape(-1, 1)

        _, loss = self.sess.run([self.train_op, self.loss], feed_dict={self.inputs: inputs, self.rewards: rewards})
        self.metrics['loss'].append(loss)

    def reload(self, model_name):
        """Checkpoint the trained session and load that checkpoint into the target session.

        Args:
            model_name: label for the checkpoint file. It is converted to a
                string and every run of characters outside [A-Za-z0-9._-] is
                replaced by '_' so that arbitrary labels (e.g. a dict repr)
                yield a safe file name inside `self.model_folder`.
        """
        import os
        import re

        # Saver.save does not create missing parent directories.
        os.makedirs(self.model_folder, exist_ok=True)
        safe_name = re.sub(r'[^A-Za-z0-9._-]+', '_', str(model_name)).strip('_') or 'checkpoint'

        # Restore exactly what was just written instead of asking the folder
        # for its latest checkpoint.
        checkpoint_path = self.saver.save(self.sess, os.path.join(self.model_folder, safe_name))
        self.saver.restore(self.target_sess, checkpoint_path)

In [27]:
# when t_max transactions are collected, calculate first transaction's reward/advantage
def add_advantage(transaction, transactions, value_network):
    """Label `transaction` with its n-step return and advantage, in place.

    The return is accumulated backwards over `transactions` with the standard
    recurrence R = r + reward_decay * R, bootstrapped from the value of the
    state that FOLLOWS the last transaction. Bootstrapping from the last
    transaction's own 'state' while also adding its reward would count that
    reward twice.

    Args:
        transaction: the entry to label (the caller passes transactions[0]);
            its 'reward' is overwritten with the n-step return and its
            'advantage' with return - V(transaction['state']).
        transactions: the window of consecutive transactions, oldest first.
        value_network: object exposing get_state_value(state).
    """
    bootstrap_state = transactions[-1]['next_state']
    n_step_return = value_network.get_state_value(bootstrap_state)

    # Walk from the newest step back to the oldest so each reward is
    # discounted by its distance from the first transaction.
    for step in reversed(transactions):
        n_step_return = step['reward'] + reward_decay * n_step_return

    transaction['reward'] = n_step_return

    baseline = value_network.get_state_value(transaction['state'])
    transaction['advantage'] = n_step_return - baseline

In [32]:
def end_and_restart_new_game():
    """Send an 'esc' key press followed by a mouse click through pyautogui.

    NOTE(review): presumably this drives the simulator's menu to restart the
    game and needs the simulator window to have focus — confirm.
    """
    # Imported here because the notebook otherwise only imports pyautogui in a
    # later scratch cell, which breaks this function on a fresh kernel.
    import pyautogui

    print('restarting...')
    pyautogui.press('esc')
    pyautogui.click()

In [33]:
class ACNDrive(AutoDrive):
    """AutoDrive subclass that picks actions with a policy network and trains
    a policy/value network pair from replayed transactions.

    State kept per instance:
        policy_network / value_network: the two networks.
        memory: replay memory (Memory, capacity `max_size`).
        transactions: pending transactions not yet labelled with an advantage.
        round_id: counter incremented on every dashboard callback.
        last_state / last_action / last_reward: what was observed/decided on
            the previous callback, used to complete a transaction once the
            next state arrives.
        mu: probability of replacing the policy's action by a random one.
    """

    def __init__(self, *args, **kwargs):
        AutoDrive.__init__(self, *args, **kwargs)

        self.policy_network = PolicyNetwork()
        self.value_network = ValueNetwork()

        self.memory = Memory(max_size=max_size)
        self.transactions = []
        self.round_id = 1

        self.last_state = None
        self.last_action = None
        self.last_reward = None

        self.mu = 0.5

    def on_dashboard(self, src_img, last_steering_angle, speed, throttle, info):
        """Handle one dashboard update: record, act, train, and label.

        Args:
            src_img: current camera frame; used directly as the state.
            last_steering_angle: forwarded to the image-based angle finder.
            speed: current speed; feeds the throttle PID and the reward.
            throttle: incoming throttle value (overridden by the chosen action).
            info: extra dashboard data; its str() labels value-network checkpoints.
        """
        # Run the image pipeline and PID controllers. Their outputs are not
        # used for control (the chosen action decides that); a failure here is
        # treated as a signal to restart the game.
        try:
            track_img = ImageProcessor.preprocess(src_img)
            current_angle = ImageProcessor.find_steering_angle_by_color(track_img, last_steering_angle, debug = self.debug)
            self._steering_pid.update(-current_angle)
            self._throttle_pid.update(speed)
        except Exception:
            # Not a bare `except:` so Ctrl-C / SystemExit can still stop the server.
            end_and_restart_new_game()

        # The next state is only known now, so complete the previous step's
        # transaction with the current frame.
        state = src_img
        if self.last_state is not None:
            transaction = Transaction([self.last_state, self.last_action, self.last_reward, state, None])
            self.transactions.append(transaction)

        # Select the action for the current state, exploring with probability mu.
        action = self.policy_network.get_action(state)

        if random() < self.mu:
            # `choice` is imported explicitly by this notebook, whereas
            # `randint` was only available if a star-import happened to
            # provide it.
            action = choice(range(len(ACTIONS)))
        action = Action(action)
        steering_angle, throttle = action.steering_angle, action.throttle

        self.last_action = action.action_idx
        self.last_reward = calculate_reward(steering_angle, speed)
        self.last_state = state

        # Execute the action.
        self._car.control(steering_angle, throttle)
        self.round_id += 1

        # Report this instance's own memory rather than a global `drive`.
        print(len(self.memory.d), len(self.transactions), self.last_reward, steering_angle)

        # Train both networks on a replay batch, when one is available.
        batch = self.memory.batch(batch_size)
        if batch.any():
            self.policy_network.update(batch)
            self.value_network.update(batch)

        # Once t_max transactions are pending, label the oldest one and move
        # it from the pending window into replay memory.
        if len(self.transactions) == t_max:
            transaction = self.transactions[0]
            add_advantage(transaction, self.transactions, self.value_network)
            self.memory.insert(transaction)
            self.transactions = self.transactions[1:]

        # Periodically checkpoint the value network and refresh its target copy.
        if self.round_id % reload_window == 0:
            #policy_network.reload() #policy network need reload ?
            self.value_network.reload(str(info))

        # Exploration decay is currently disabled:
        #self.mu *= self.mu

In [34]:
# Wire everything together. Order matters: the default graph is cleared before
# ACNDrive builds its networks, and `sio` must exist because send_control
# emits through it.
tf.reset_default_graph()

sio = socketio.Server()
record_folder = './records/'
car = Car(control_function = send_control)
# Plain AutoDrive baseline kept for reference:
#drive = AutoDrive(car, record_folder=record_folder)
drive = ACNDrive(car, record_folder=record_folder)

@sio.on('telemetry')
def telemetry(sid, dashboard):
    """Handle a 'telemetry' event.

    A truthy `dashboard` payload is forwarded to `car.on_dashboard`; an empty
    one is answered with a 'manual' event.
    """
    if not dashboard:
        sio.emit('manual', data={}, skip_sid=True)
        return
    car.on_dashboard(dashboard)

@sio.on('connect')
def connect(sid, environ):
    """Handle a 'connect' event by sending a zero steering / zero throttle command."""
    neutral_steering, neutral_throttle = 0, 0
    car.control(neutral_steering, neutral_throttle)

In [None]:
# Wrap the socket.io server around a Flask app and serve it with eventlet on
# port 4567, all interfaces. This call blocks the cell while the server runs.
app = socketio.Middleware(sio, Flask(__name__))
eventlet.wsgi.server(eventlet.listen(('', 4567)), app)

(5752) wsgi starting up on http://0.0.0.0:4567
(5752) accepted ('127.0.0.1', 59772)


0 0 0.2778903424767491 -0.22689280275926282
0 1 0.2851565626586028 -0.017453292519943295
0 2 0.2727381164006577 -0.29670597283903605
0 3 0.02382706810510278 0.3316125578789227
0 4 0.024737005022881134 -0.19198621771937624
0 5 0.024737005022881134 -0.19198621771937624
1 5 0.15980976085162385 0.06981317007977318
2 5 0.1572566747883158 -0.19198621771937624
3 5 0.15053875784990253 -0.349065850398866
4 5 0.298157239824282 -0.22689280275926282
5 5 0.34937771640820076 -0.349065850398866
6 5 0.35154380640782595 -0.3316125578789227
7 5 0.36615152256993894 -0.17453292519943295
8 5 0.4924289862421671 0.2617993877991494
9 5 0.5379175656165388 0.31415926535897937
10 5 0.5314901463165097 -0.349065850398866
11 5 0.5314901463165097 -0.349065850398866
12 5 0.6346683960788024 -0.349065850398866
13 5 0.6860695824357916 -0.349065850398866
14 5 0.7283215130946976 0.06981317007977318
15 5 0.7261004358083764 -0.10471975511965977
16 5 0.8135858710764394 -0.349065850398866
17 5 0.8135858710764394 -0.3490658503

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


29 5 1.0969971655054693 -0.349065850398866
restarting...
30 5 1.1589229092152609 -0.349065850398866
restarting...
31 5 1.221297609178979 0.13962634015954636


(5752) accepted ('127.0.0.1', 60244)
(5752) accepted ('127.0.0.1', 44262)


32 5 0.4014953104519235 -0.15707963267948966
33 5 0.38873788329897385 0.29670597283903605
34 5 0.3819850503494717 -0.349065850398866
35 5 0.023680254043804888 -0.349065850398866
36 5 0.09070235175056415 -0.19198621771937624
37 5 0.08682759816061793 -0.349065850398866
38 5 0.21979410400182395 -0.349065850398866
39 5 0.2325739236445123 -0.349065850398866
40 5 0.2325739236445123 0.349065850398866
41 5 0.21462579458750145 -0.349065850398866
42 5 0.20786000771784294 -0.349065850398866
43 5 0.20786000771784294 -0.349065850398866
44 5 0.3422360524902278 -0.349065850398866
45 5 0.3733398782382414 -0.349065850398866
46 5 0.3733398782382414 -0.349065850398866
47 5 0.5112645398548038 -0.2617993877991494
48 5 0.5598415486123554 0.3316125578789227
49 5 0.5563920007673363 0.349065850398866
50 5 0.517850273122711 -0.31415926535897937
51 5 0.5305445002755605 0.22689280275926282
52 5 0.5437537816738655 -0.05235987755982989
53 5 0.7235633180051494 -0.349065850398866
54 5 0.7235633180051494 -0.3490658503

(5752) accepted ('127.0.0.1', 56007)


62 5 0.406438088081073 0.017453292519943295
63 5 0.3819850503494717 -0.349065850398866
64 5 0.3819850503494717 -0.349065850398866
65 5 0.022082776588468845 -0.349065850398866
66 5 0.013437604477238489 -0.349065850398866
67 5 0.014193409968470905 0.12217304763960306
68 5 0.0005977168188550472 -0.08726646259971647
69 5 0.0002819077862357725 -0.349065850398866
70 5 0.0002819077862357725 -0.349065850398866
71 5 0.24029542331236406 -0.12217304763960306
72 5 0.30410924006924295 -0.15707963267948966
73 5 0.28933135793998116 -0.349065850398866
74 5 0.5911564561839718 0.06981317007977318
75 5 0.5603143079001551 -0.3316125578789227
76 5 0.5667061983836948 0.29670597283903605
77 5 0.5045288195457949 -0.22689280275926282
78 5 0.48657283904294335 -0.349065850398866
79 5 0.516538665224537 0.06981317007977318
80 5 0.7895187533450857 -0.03490658503988659
81 5 0.8272433125673297 -0.10471975511965977
82 5 0.8318 0
83 5 1.0141162763521523 -0.349065850398866
84 5 1.0935203028085614 -0.349065850398866
85 5

(5752) accepted ('127.0.0.1', 35947)


89 5 0.3960814313351981 -0.22689280275926282
90 5 0.3819850503494717 -0.349065850398866
91 5 0.406438088081073 -0.017453292519943295
92 5 0.01620393862880914 -0.24434609527920612
93 5 0.11422188760704793 0.31415926535897937
INFO:tensorflow:Restoring parameters from ./model/value_network/{'lap': 1, 'status': 0, 'elapsed': '0.060'}
94 5 0.11285708375638759 -0.349065850398866
95 5 0.26029485595769664 -0.349065850398866
96 5 0.3177100750877156 -0.349065850398866
97 5 0.33763664570052143 -0.05235987755982989
98 5 0.556141958019852 -0.06981317007977318
99 5 0.49850693532692436 -0.349065850398866
100 5 0.5304192022804656 0.017453292519943295
101 5 0.8361053952358176 0.2617993877991494
102 5 0.8133979325522823 -0.349065850398866
103 5 0.8133979325522823 -0.349065850398866
restarting...
104 5 1.1250000056048894 -0.349065850398866
restarting...
105 5 1.1550701694700385 -0.349065850398866
restarting...
106 5 1.1550701694700385 -0.349065850398866


In [1]:
# Debug probe of the replay memory. It needs `drive` from the setup cell; the
# recorded NameError comes from running it on a kernel where that cell had not run.
len(drive.memory.d), drive.memory.id, 

NameError: name 'drive' is not defined

In [None]:
%%bash
# Launch the local game start script from ../Linux/ in the background.
cd ../Linux/
sh StartGame_local.sh &

In [17]:
# Generate the symmetric steering-angle grid: 0 plus `num_steps` evenly spaced
# angles on each side, up to +/- largest_angle.
largest_angle = math.pi/9
num_steps = 20
step_angle = largest_angle/num_steps

# Integer stepping keeps the count fixed at 2*num_steps + 1. Accumulating
# `angle += step_angle` until `angle < largest_angle` fails can yield one step
# more or fewer depending on floating-point rounding.
angles = [index*step_angle for index in range(-num_steps, num_steps + 1)]
print(angles)

[-0.349065850398866, -0.3316125578789227, -0.31415926535897937, -0.29670597283903605, -0.2792526803190927, -0.2617993877991494, -0.24434609527920612, -0.22689280275926282, -0.20943951023931953, -0.19198621771937624, -0.17453292519943295, -0.15707963267948966, -0.13962634015954636, -0.12217304763960306, -0.10471975511965977, -0.08726646259971647, -0.06981317007977318, -0.05235987755982989, -0.03490658503988659, -0.017453292519943295, 0, 0.017453292519943295, 0.03490658503988659, 0.05235987755982989, 0.06981317007977318, 0.08726646259971647, 0.10471975511965977, 0.12217304763960306, 0.13962634015954636, 0.15707963267948966, 0.17453292519943295, 0.19198621771937624, 0.20943951023931953, 0.22689280275926282, 0.24434609527920612, 0.2617993877991494, 0.2792526803190927, 0.29670597283903605, 0.31415926535897937, 0.3316125578789227, 0.349065850398866]


In [15]:
# Scratch check of pyautogui: press 'esc' ten times, one second apart,
# printing the iteration index after each press.
import pyautogui
for i in range(10):
    pyautogui.press('esc')
    #pyautogui.click()
    print(i)
    sleep(1)  # `sleep` comes from `from time import sleep` in the imports cell
    
    

0
1
2
3
4
5


KeyboardInterrupt: 

In [6]:
# List the key names pyautogui exposes in KEYBOARD_KEYS (lookup for press()).
pyautogui.KEYBOARD_KEYS




['\t',
 '\n',
 '\r',
 ' ',
 '!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 '0',
 '1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 '{',
 '|',
 '}',
 '~',
 'accept',
 'add',
 'alt',
 'altleft',
 'altright',
 'apps',
 'backspace',
 'browserback',
 'browserfavorites',
 'browserforward',
 'browserhome',
 'browserrefresh',
 'browsersearch',
 'browserstop',
 'capslock',
 'clear',
 'convert',
 'ctrl',
 'ctrlleft',
 'ctrlright',
 'decimal',
 'del',
 'delete',
 'divide',
 'down',
 'end',
 'enter',
 'esc',
 'escape',
 'execute',
 'f1',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f2',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'final',
