In [None]:
%%bash
# Stop any process whose full command line matches "sh StartGame_local"
# (the launcher started by the later %%bash cell of this notebook).
pkill -f "sh StartGame_local"

In [1]:
from sample_bot import *
import shutil
import argparse
from datetime import datetime

import socketio
import eventlet
import eventlet.wsgi
from flask import Flask

import math
from queue import Queue

import sys
sys.path.append('/home/louner/school/ml/reinforcement-learning/')
from lab import *

import tensorflow.contrib.slim as slim

In [2]:
# Number of transactions requested from the replay memory per training step.
batch_size = 100
# Capacity handed to Memory(max_size=...).
max_size = batch_size*10
# n-step horizon: how many transactions are collected before the oldest one
# gets its return/advantage computed (see the comments on add_advantage).
t_max = 5
# The value network is reloaded every `reload_window` dashboard rounds.
reload_window = 100
# Discount factor applied by add_advantage.
reward_decay = 0.99
# Module-level placeholders. NOTE(review): ACNDrive keeps its own
# self.last_state / self.last_action / self.last_reward, so these look unused
# in the cells visible here - confirm before removing.
last_state = None
last_action = None
last_reward = None
# Learning rate given to the Adam optimizers of both networks.
learning_rate = 0.001

In [3]:
# 11 possible steering angles: 0 plus five evenly spaced angles (multiples of
# pi/5) on each side, covering -pi .. +pi.
ANGLES = [
    -3.141592653589793,
    -2.5132741228718345,
    -1.8849555921538759,
    -1.2566370614359172,
    -0.6283185307179586,
    0,
    0.6283185307179586,
    1.2566370614359172,
    1.8849555921538759,
    2.5132741228718345,
    3.141592653589793,
]
# 3 possible throttle options
THROTTLES = [0.0, 0.5, 1.0]
# Every (angle, throttle) combination, angle-major: 11 * 3 = 33 actions.
ACTIONS = [
    (steer, gas)
    for steer in ANGLES
    for gas in THROTTLES
]

In [4]:
def send_control(steering_angle, throttle):
    """Emit a "steer" event carrying the given command through the module-level `sio`.

    Both values are sent as strings under the keys 'steering_angle' and
    'throttle'.
    """
    payload = {
        'steering_angle': str(steering_angle),
        'throttle': str(throttle),
    }
    sio.emit("steer", data=payload, skip_sid=True)
    
def calculate_reward(steering_angle, speed):
    """Return `speed` scaled by the cosine of `steering_angle` (radians, as taken by math.cos)."""
    heading_factor = math.cos(steering_angle)
    return speed * heading_factor

class Action:
    """A discrete action, resolved from its index in the module-level ACTIONS table.

    Attributes:
        action_idx: the index that was passed in.
        steering_angle: first element of ACTIONS[action_idx].
        throttle: second element of ACTIONS[action_idx].
    """

    def __init__(self, action_idx):
        entry = ACTIONS[action_idx]
        self.action_idx = action_idx
        self.steering_angle = entry[0]
        self.throttle = entry[1]

    def __repr__(self):
        summary = {'throttle': self.throttle, 'angle': self.steering_angle}
        return str(summary)

In [5]:
class Transaction:
    """One experience record, stored as a dict keyed by `Transaction.columns`.

    The fields can be reached both by subscript (``t['state']``) and by
    attribute (``t.state``); attribute writes to a field update the underlying
    dict, so both views always agree.

    Ordering (``<``) uses the optional ``'id'`` entry when one has been stored
    in the record and otherwise falls back to creation order, so records can be
    sorted or heap-ordered without anyone having to assign an id first.
    """

    columns = ['state', 'action', 'reward', 'next_state', 'advantage']

    # Running count of created transactions; provides the creation-order fallback key.
    _created = 0

    def __init__(self, item):
        """Build the record by zipping `item` (a sequence) with `columns`."""
        # object.__setattr__ is used so the bookkeeping attributes never get
        # routed through our own field-aware __setattr__.
        object.__setattr__(self, 'item', dict(zip(Transaction.columns, item)))
        object.__setattr__(self, '_seq', Transaction._created)
        Transaction._created += 1

    def _sort_key(self):
        """Return the explicit 'id' if present, else the creation sequence number."""
        return self.item.get('id', self._seq)

    def __lt__(self, other):
        # sort by time
        return self._sort_key() < other._sort_key()

    def __repr__(self):
        return str(self.item)

    def __getitem__(self, key):
        return self.item[key]

    def __setitem__(self, key, value):
        self.item[key] = value

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails. Going through
        # __dict__ avoids infinite recursion if `item` has not been set yet
        # (e.g. an instance created with __new__).
        item = self.__dict__.get('item')
        if item is not None and name in item:
            return item[name]
        raise AttributeError(name)

    def __setattr__(self, name, value):
        # Field names (and any extra keys already stored in the record) are
        # written into the dict; everything else is a regular attribute.
        if name in Transaction.columns or name in self.__dict__.get('item', ()):
            self.item[name] = value
        else:
            object.__setattr__(self, name, value)


In [23]:
# policy & value network only different at last layer
# common part is cnn1-cnn1-fc
def build_network():
    """Build the convolutional trunk used by the policy and value heads.

    Creates a fresh float32 image placeholder of shape (None, 108, 320, 3),
    followed by two ELU conv layers (16 filters 8x8 stride 8, then 32 filters
    4x4 stride 4, both 'VALID') and a 256-unit ReLU dense layer.

    Returns:
        (inputs, fc_layer): the placeholder and the dense layer's output tensor.

    NOTE(review): the layers are created without explicit scope/name arguments;
    whether AUTO_REUSE on the enclosing scope actually makes two calls share
    weights should be confirmed against the TF 1.x variable-scope rules.
    """
    conv_options = dict(padding='VALID', activation_fn=tf.nn.elu)
    with tf.variable_scope('Action-Critic-Network-Common', reuse=tf.AUTO_REUSE):
        inputs = tf.placeholder(dtype=tf.float32, shape=(None, 108, 320, 3))
        conv_1 = slim.conv2d(inputs=inputs, num_outputs=16, kernel_size=(8, 8), stride=(8, 8), **conv_options)
        conv_2 = slim.conv2d(inputs=conv_1, num_outputs=32, kernel_size=(4, 4), stride=(4, 4), **conv_options)
        flattened = slim.flatten(conv_2)
        fc_layer = tf.layers.dense(inputs=flattened, units=256, activation=tf.nn.relu)
    return inputs, fc_layer

In [32]:
# P(state) = action probability
class PolicyNetwork:
    """Policy head: maps a state image to a softmax distribution over actions.

    The graph is built on top of build_network() and trained with the
    policy-gradient loss ``-sum(log pi(a|s) * advantage)``, where the advantage
    tensor is zero everywhere except at the action that was actually taken.
    The network owns its own tf.Session.
    """

    def __init__(self):
        # Size of the action table (11 angles x 3 throttles).
        self.action_size = 33

        self.advantage = tf.placeholder(shape=(None, self.action_size), dtype=tf.float32)

        inputs, output = build_network()
        self.inputs = inputs

        with tf.variable_scope('policy-network'):
            self.actions_values = tf.layers.dense(output, units=self.action_size, activation=tf.nn.softmax) # batch_size, action_size
            self.best_action = tf.argmax(self.actions_values, axis=1)

            # Clip before the log: a softmax output that underflows to 0 gives
            # log(0) = -inf, and -inf * 0 (unselected action) is NaN, which
            # would poison the whole loss.
            log_probabilities = tf.log(tf.clip_by_value(self.actions_values, 1e-10, 1.0))
            self.loss = tf.reduce_sum(log_probabilities * self.advantage)*-1
            self.train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

    def get_action(self, state):
        """Return the index of the highest-probability action for a single state."""
        best_action = self.sess.run(self.best_action, feed_dict={self.inputs: [state]})[0]
        return best_action

    def update(self, batch):
        """Run one training step on `batch`.

        Args:
            batch: iterable of transactions supporting subscript access to
                'state', 'action' (an action index) and 'advantage' (a scalar
                or a one-element array).

        Returns:
            Whatever ``sess.run(train_op)`` returns.
        """
        states = [transaction['state'] for transaction in batch]

        # unselected actions get 0 advantage, selected action get reward - current_state_value advantage
        advantages = np.zeros((len(batch), self.action_size), dtype=np.float32)
        for row, transaction in enumerate(batch):
            # The advantage may arrive as a shape-(1,) array (it is derived from
            # a network output); squeeze it to a plain scalar so the matrix
            # stays rectangular.
            advantages[row, transaction['action']] = float(np.squeeze(transaction['advantage']))

        return self.sess.run(self.train_op, feed_dict={self.inputs: states, self.advantage: advantages})

In [13]:
# V(state) = state_value
class ValueNetwork:
    """Value head: maps a state image to a scalar state value.

    Two sessions are kept: `sess` is the one being trained, while
    `target_sess` serves get_state_value(s) and only changes when reload()
    saves the trained weights and restores them into it.
    """

    def __init__(self):
        self.output_size = 1
        self.model_folder = './model/value_network'

        self.rewards = tf.placeholder(shape=(None, 1), dtype=tf.float32)
        inputs, output = build_network()

        with tf.variable_scope('value-network'):
            self.inputs = inputs
            self.state_value = tf.layers.dense(output, units=self.output_size)

            self.loss = tf.reduce_sum(tf.square(self.state_value - self.rewards))
            self.train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)

        self.sess = tf.Session()
        self.target_sess = tf.Session()

        self.sess.run(tf.global_variables_initializer())
        self.target_sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver()

    def get_state_values(self, states):
        """Evaluate the target copy of the network on a batch of states."""
        state_values = self.target_sess.run(self.state_value, feed_dict={self.inputs: states})
        return state_values

    def get_state_value(self, state):
        """Evaluate a single state; returns the first row of the batch result."""
        return self.get_state_values([state])[0]

    def update(self, batch):
        """Run one regression step of the state value towards each transaction's reward.

        Args:
            batch: iterable of transactions supporting subscript access to
                'state' and 'reward' (a scalar or a one-element array).

        Returns:
            Whatever ``sess.run(train_op)`` returns.
        """
        inputs = [transaction['state'] for transaction in batch]
        # The placeholder is (None, 1): flatten every reward to a scalar and
        # reshape to a column so plain floats and shape-(1,) arrays both work.
        rewards = np.asarray(
            [float(np.squeeze(transaction['reward'])) for transaction in batch],
            dtype=np.float32,
        ).reshape(-1, 1)

        return self.sess.run(self.train_op, feed_dict={self.inputs: inputs, self.rewards: rewards})

    def reload(self, model_name='value_network'):
        """Copy the trained weights into the target session via a checkpoint.

        Args:
            model_name: checkpoint file prefix inside `model_folder`. Defaults
                to 'value_network' so the method can be called without
                arguments.
        """
        import os

        # Saver.save does not create the parent directory on its own.
        os.makedirs(self.model_folder, exist_ok=True)
        self.saver.save(self.sess, '%s/%s'%(self.model_folder, model_name))
        self.saver.restore(self.target_sess, tf.train.latest_checkpoint(self.model_folder))

In [9]:
# when t_max transactions are collected, calculate first transaction's reward/advantage
def add_advantage(transaction, transactions, value_network):
    """Fill in the n-step return and advantage of `transaction` in place.

    The return is bootstrapped from the value of the state that follows the
    last buffered transaction and accumulated backwards with
    ``R <- r_t + reward_decay * R``, so the immediate reward is undiscounted
    and the bootstrap value ends up discounted by ``reward_decay ** n``.

    Args:
        transaction: record to update; its 'reward' is overwritten with the
            n-step return and its 'advantage' with ``return - V(state)``.
            Presumably the oldest entry of `transactions` - the return is
            accumulated over the whole list.
        transactions: non-empty sequence of buffered records, oldest first.
        value_network: object exposing ``get_state_value(state)``.
    """
    # Bootstrap from the state reached *after* the last buffered action; using
    # the last record's own state would count its reward on top of a value
    # that already includes it.
    bootstrap_state = transactions[-1]['next_state']
    discounted_return = value_network.get_state_value(bootstrap_state)

    for t in reversed(transactions):
        discounted_return = t['reward'] + reward_decay * discounted_return

    transaction['reward'] = discounted_return

    current_state_value = value_network.get_state_value(transaction['state'])
    transaction['advantage'] = discounted_return - current_state_value

In [43]:
class ACNDrive(AutoDrive):
    """AutoDrive variant that picks actions with a policy network and trains online.

    On every dashboard frame it:
      1. closes the previous step into a Transaction (the new frame is that
         step's next_state),
      2. asks the policy network for an action and sends it to the car,
      3. trains both networks on a batch drawn from the replay memory,
      4. once t_max transactions are buffered, computes the oldest one's
         n-step return/advantage and moves it into the replay memory,
      5. periodically refreshes the value network's target copy.
    """

    def __init__(self, *args, **kwargs):
        AutoDrive.__init__(self, *args, **kwargs)

        self.policy_network = PolicyNetwork()
        self.value_network = ValueNetwork()

        self.memory = Memory(max_size=max_size)
        # Sliding window of the most recent transactions awaiting their n-step return.
        self.transactions = []
        self.round_id = 1

        self.last_state = None
        self.last_action = None
        self.last_reward = None

    def on_dashboard(self, src_img, last_steering_angle, speed, throttle, info):
        """Handle one telemetry frame: record, act, train, and roll the n-step window."""
        # get angle to calculate reward
        track_img     = ImageProcessor.preprocess(src_img)
        current_angle = ImageProcessor.find_steering_angle_by_color(track_img, last_steering_angle, debug = self.debug)
        # NOTE(review): the PID outputs are superseded by the policy's choice
        # below; the calls are kept so the controllers' internal state keeps
        # being updated as before.
        steering_angle = self._steering_pid.update(-current_angle)
        throttle       = self._throttle_pid.update(speed)

        # update last transaction
        # because can't get next state right after executing action QQ
        state = track_img
        if self.last_state is not None:
            transaction = Transaction([self.last_state, self.last_action, self.last_reward, state, None])
            self.transactions.append(transaction)

        # select action according to current state
        action = self.policy_network.get_action(state)
        action = Action(action)
        steering_angle, throttle = action.steering_angle, action.throttle

        self.last_action = action.action_idx
        self.last_reward = calculate_reward(steering_angle, speed)
        self.last_state = state

        # execute action
        self._car.control(steering_angle, throttle)
        self.round_id += 1

        # update policy & value network
        batch = self.memory.batch(batch_size)
        if batch:
            self.policy_network.update(batch)
            self.value_network.update(batch)

        # when storing t_max transactions, update first transaction's label, pop & push it into memory
        # (t_max is the n-step horizon; max_size is the replay-memory capacity
        # and must not be used as the window length.)
        if len(self.transactions) >= t_max:
            transaction = self.transactions[0]
            add_advantage(transaction, self.transactions, self.value_network)
            self.memory.insert(transaction)
            # Drop the record that was just moved to memory from the window.
            self.transactions = self.transactions[1:]

        # reload policy & value network periodically
        if self.round_id % reload_window == 0:
            #policy_network.reload() #policy network need reload ?
            # reload() needs a checkpoint name; a fixed one is enough because
            # the checkpoint is read back immediately.
            self.value_network.reload('value_network')

In [44]:
# Clear the default TensorFlow graph before ACNDrive() builds its networks below.
tf.reset_default_graph()

# Socket.IO server used by send_control and by the event handlers registered below.
sio = socketio.Server()
record_folder = './records/'
# The car sends its commands through send_control.
car = Car(control_function = send_control)
#drive = AutoDrive(car, record_folder=record_folder)
# Driver under test: the actor-critic bot (the line above is the AutoDrive alternative, disabled).
drive = ACNDrive(car, record_folder=record_folder)

@sio.on('telemetry')
def telemetry(sid, dashboard):
    """Forward a telemetry payload to the car; emit 'manual' when the payload is empty."""
    if not dashboard:
        sio.emit('manual', data={}, skip_sid=True)
        return
    car.on_dashboard(dashboard)

@sio.on('connect')
def connect(sid, environ):
    """On a new connection, send a neutral command (steering 0, throttle 0) via the car."""
    car.control(0, 0)

# Wrap a Flask app with the Socket.IO middleware and serve it with eventlet's
# WSGI server, listening on port 4567 on all interfaces.
app = socketio.Middleware(sio, Flask(__name__))
eventlet.wsgi.server(eventlet.listen(('', 4567)), app)

(17058) wsgi starting up on http://0.0.0.0:4567
wsgi exiting
(17058) wsgi exited, is_accepting=True


In [None]:
%%bash
# Run the game launcher script from ../Linux as a background job.
cd ../Linux/
sh StartGame_local.sh &

In [10]:
# Scratch derivation of the ANGLES table: start at 0 and keep adding pi/5,
# recording each angle together with its negative, until pi is reached.
step_angle = math.pi/5.0
angle = 0
angles = [angle]

while angle < math.pi:
    angle += step_angle
    angles.extend([angle, angle*-1])

print(sorted(angles))

[-3.141592653589793, -2.5132741228718345, -1.8849555921538759, -1.2566370614359172, -0.6283185307179586, 0, 0.6283185307179586, 1.2566370614359172, 1.8849555921538759, 2.5132741228718345, 3.141592653589793]
