import json
import random as rd

import numpy as np
import pyduktape
import tensorflow as tf

from model_predictor.predictor import Predictor
from config import (n_actions, acceleration, velocity_decay, action_length,
                    env_width, env_height, num_feats, predictor_input_frames,
                    dragging_bonus)
# Reference defaults from config:
# n_actions = 6        # 1 no-op + 4 acceleration directions + 1 click
# acceleration = 1     # change in mouse speed (m/s) applied each frame;
#                      # could be smaller, e.g. 0.1 m/s per frame
# velocity_decay = 0.9 # per-frame decay of mouse velocity when not accelerating
# action_length = 5    # frames per action
# env_width = 6
# env_height = 4
# num_feats = 22
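# Weights over the predictor's 16 outputs (apparently 4 features per object,
# repeated for the 4 objects); each group of four weights sums to 1
# (1.6973 + 1.0517 + 1.7830 + 1.8112 = 6.3432), so loss_weight sums to 4.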
loss_weight = np.array([1.6973/6.3432, 1.0517/6.3432, 1.7830/6.3432, 1.8112/6.3432,
1.6973/6.3432, 1.0517/6.3432, 1.7830/6.3432, 1.8112/6.3432,
1.6973/6.3432, 1.0517/6.3432, 1.7830/6.3432, 1.8112/6.3432,
1.6973/6.3432, 1.0517/6.3432, 1.7830/6.3432, 1.8112/6.3432])
class Interaction_env:
    """Wraps the Box2D JS simulator as an RL-style environment.

    Instance variables:
      1. self.timesteps: current time step, incremented after each action;
         used to retrieve trajectory data from the JS context.
      2. self.state: latest state over the last action_length frames:
         {'last_trajectory': [action_length, 22], 'caught_any', 'vx', 'vy'}
         (vx/vy are the final mouse velocity from the last action).
      3. self.context: the JS environment.
      4. self.cond: dict of starting locations, starting velocities,
         local forces, and masses.
    """
def __init__(self, world_setup_idx=None, predictor=None, predictor_sess=None):
self.timesteps = 0
self.world_setup_idx = world_setup_idx
# Read in world_setup
with open('./js_simulator/json/world_setup.json') as data_file:
self.world_setup = json.load(data_file)
# if world_setup is None:
# self.world_setup = rd.sample(self.world_setup, 1)[0]
# else:
# # self.world_setup = self.world_setup[4] # First experiment setup: [0,3,0,3,-3,0,"B"]
# self.world_setup = self.world_setup[world_setup] # Second experiment setup: [0,0,0,0,0,0,"same"]
# Read in starting_state
with open('./js_simulator/json/starting_state.json') as data_file:
self.starting_state = json.load(data_file)
# Create js environment
self.context = pyduktape.DuktapeContext()
        # Load the js Box2D physics library
        with open("./js_simulator/js/box2d.js", 'r') as js_file:
            self.context.eval_js(js_file.read())
        # Load the world environment script
        with open("./js_simulator/js/control_world.js", 'r') as js_file:
            self.context.eval_js(js_file.read())
self.cond = {}
if predictor is None:
predictor_graph = tf.Graph()
with predictor_graph.as_default():
self.predictor = Predictor(lr=1e-4, name='predictor', num_feats=22, n_state=16, input_frames=5, train=True, batch_size=1)
self.sess = tf.InteractiveSession(graph = predictor_graph)
self.sess.run(tf.global_variables_initializer())
self.predictor.saver.restore(self.sess, "./model_predictor/checkpoints/pretrained_model_predictor.ckpt") # 1e-5 learning rate, 20 epochs
else:
self.predictor = predictor
self.sess = predictor_sess
self.trajectory_history = []
def reset(self):
if self.world_setup_idx is None:
world_setup = rd.sample(self.world_setup, 1)[0]
else:
world_setup = self.world_setup[self.world_setup_idx]
starting_state = rd.sample(self.starting_state, 1)[0]
self.trajectory_history = []
        # Set starting conditions: starting_state stores 4 values per object,
        # of which indices 0/1, 4/5, 8/9, 12/13 are the objects' (x, y) positions;
        # starting velocities are drawn uniformly from [-10, 10] m/s
        self.cond = {'sls': [{'x': starting_state[0], 'y': starting_state[1]}, {'x': starting_state[4], 'y': starting_state[5]},
                             {'x': starting_state[8], 'y': starting_state[9]}, {'x': starting_state[12], 'y': starting_state[13]}],
                     'svs': [{'x': rd.uniform(-10.0, 10.0), 'y': rd.uniform(-10.0, 10.0)}, {'x': rd.uniform(-10.0, 10.0), 'y': rd.uniform(-10.0, 10.0)},
                             {'x': rd.uniform(-10.0, 10.0), 'y': rd.uniform(-10.0, 10.0)}, {'x': rd.uniform(-10.0, 10.0), 'y': rd.uniform(-10.0, 10.0)}]
                     }
# self.cond = {'sls':[{'x':rd.uniform(0.0, 6.0), 'y':rd.uniform(0.0, 4.0)}, {'x':rd.uniform(0.0, 6.0), 'y':rd.uniform(0.0, 4.0)},
# {'x':rd.uniform(0.0, 6.0), 'y':rd.uniform(0.0, 4.0)}, {'x':rd.uniform(0.0, 6.0), 'y':rd.uniform(0.0, 4.0)}],
# 'svs':[{'x':rd.uniform(-10.0, 10.0), 'y':rd.uniform(-10.0, 10.0)}, {'x':rd.uniform(-10.0, 10.0), 'y':rd.uniform(-10.0, 10.0)},
# {'x':rd.uniform(-10.0, 10.0), 'y':rd.uniform(-10.0, 10.0)}, {'x':rd.uniform(-10.0, 10.0), 'y':rd.uniform(-10.0, 10.0)}]
# }
# Zero condition
# self.cond = {'sls':[{'x':3, 'y':2}, {'x':1, 'y':1},
# {'x':5, 'y':3}, {'x':1, 'y':3}],
# 'svs':[{'x':0, 'y':0}, {'x':0, 'y':0},
# {'x':0, 'y':0}, {'x':0, 'y':0}]
# }
        # Set local (pairwise) forces between objects, stored as an
        # upper-triangular 4x4 matrix
        self.cond['lf'] = [[0.0, float(world_setup[0]), float(world_setup[1]), float(world_setup[2])],
                           [0.0, 0.0, float(world_setup[3]), float(world_setup[4])],
                           [0.0, 0.0, 0.0, float(world_setup[5])],
                           [0.0, 0.0, 0.0, 0.0]]
# Zero condition
# self.cond['lf'] = [[0.0, 0.0, 0.0, 0.0],
# [0.0, 0.0, 0.0, 0.0],
# [0.0, 0.0, 0.0, 0.0],
# [0.0, 0.0, 0.0, 0.0]]
# Set mass
if world_setup[6]=='A':
self.cond['mass'] = [2,1,1,1]
elif world_setup[6]=='B':
self.cond['mass'] = [1,2,1,1]
else:
self.cond['mass'] = [1,1,1,1]
# Set timeout (action_length) in js simulator
self.cond['timeout']= action_length
self.context.set_globals(cond=self.cond)
        # Initial control path: hold the mouse at the environment's center (3, 2)
        # for the first action_length frames
        path = {'x': [3]*action_length, 'y': [2]*action_length, 'obj': [0]*action_length, 'click': False}
# Feed control path to js context
self.context.set_globals(control_path=path)
        # Build the js environment and get the trajectory of the first action_length frames
        trajectory = self.context.eval_js("Run();")  # JSON string, [action_length, 22]
        trajectory = json.loads(trajectory)
        self.trajectory_history.extend(trajectory)
        trajectory = np.array(trajectory)  # Convert to a NumPy array
# self.state['last_trajectory'] = trajectory
# self.state['caught_object'] = 0
self.state = {'last_trajectory':trajectory,
'caught_any':False,
'vx':0,
'vy':0}
return trajectory
    def act(self, actionID):
        # actionID takes values 0-5:
        # [0: none, 1: +x accel, 2: -x accel, 3: +y accel, 4: -y accel, 5: click]
        # (the original "up/down/right/left" labels did not match the axis
        # updates in action_generation, so the mapping is restated here)
        if_click = False
        direction = 0  # takes values [0,1,2,3,4] per the mapping above
        if actionID < 5:
            direction = actionID
        elif actionID == 5:
            if_click = True
# Generate control path from action space given actionID
control_path_ = self.action_generation(if_click, direction)
# Feed control path to js context
self.context.set_globals(control_path=control_path_)
        # Run the simulation
        trajectory = self.context.eval_js("action_forward();")  # JSON string, [action_length, 22]
        trajectory = json.loads(trajectory)
        self.trajectory_history.extend(trajectory)
        trajectory = np.array(trajectory)  # Convert to a NumPy array
reward, is_done, pretrained_loss = self.reward_cal(trajectory)
# Update self.state
self.state['last_trajectory'] = trajectory
return trajectory, reward, is_done, pretrained_loss
    # Calculate reward given the trajectory [action_length, 22] of one action
    def reward_cal(self, trajectory):
        reward = 0
        is_done = False
        # train predictor network, add reward for loss drop
        # pretrained_loss, trained_loss = self.predictor_test(trajectory)
        # reward += np.abs(pretrained_loss - trained_loss)
        # Calculate the predictor loss and add it to the reward
        predictor_loss = self.predictor_test(trajectory)
        # predictor_loss = 0
        reward += predictor_loss
        # Features 0-3 are a one-hot flag of which object the mouse has caught:
        # control_object is 0 if none, otherwise the 1-indexed id of the caught object
        control_object = np.sum(trajectory[0, :4])
        if control_object > 0:
            for i in range(4):
                if trajectory[0, i] == 1:
                    control_object = i + 1
                    break
        if (not self.state['caught_any']) and control_object == 0:
            # The agent has not caught anything yet: reward the decrease in
            # distance from the mouse to the closest object, relative to the
            # last frame of the previous action
            f0 = self.state['last_trajectory'][-1]
            min_dist = 100
            min_dist_x = 0
            min_dist_y = 0
            close_obj = 0
            mouse_x1 = f0[4]
            mouse_y1 = f0[5]
            # Find the closest object in f0 (feats 4/5 are the mouse position;
            # feats 4*obj+6 / 4*obj+7 are object obj's x/y)
            for obj in range(4):
                obj_x1 = f0[4*obj+6]
                obj_y1 = f0[4*obj+7]
dist = ((obj_x1-mouse_x1)**2+(obj_y1-mouse_y1)**2)**0.5
if dist<min_dist:
min_dist = dist
min_dist_x = abs(obj_x1-mouse_x1)
min_dist_y = abs(obj_y1-mouse_y1)
close_obj = obj
f5 = trajectory[-1]
# dist5 = ((f5[4*close_obj+6]-f5[4])**2+(f5[4*close_obj+7]-f5[5])**2)**0.5
dist5_x = abs(f5[4*close_obj+6]-f5[4])
dist5_y = abs(f5[4*close_obj+7]-f5[5])
# max possible reward is 2
if min_dist_x > 0 and min_dist_y > 0:
reward += min((max((min_dist_x - dist5_x)/min_dist_x, 0) + max((min_dist_y - dist5_y)/min_dist_y, 0)),2)
elif min_dist_x == 0 and min_dist_y > 0:
reward += min((1 + max((min_dist_y - dist5_y)/min_dist_y, 0)),2)
elif min_dist_x > 0 and min_dist_y == 0:
reward += min((max((min_dist_x - dist5_x)/min_dist_x, 0) + 1),2)
elif min_dist_x == 0 and min_dist_y == 0:
reward += 2
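            # Worked example: if the closest object starts 1.0 m away in x and
            # 0.5 m in y, and this action halves both gaps, the shaped reward is
            # max(0.5/1.0, 0) + max(0.25/0.5, 0) = 1.0 (capped at 2).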
elif (not self.state['caught_any']) and control_object != 0:
# object caught
reward += 5
reward += dragging_bonus * predictor_loss
self.state['caught_any'] = True
# is_done = True
elif self.state['caught_any'] and control_object == 0:
# released last caught object
self.state['caught_any'] = False
        elif self.state['caught_any'] and control_object != 0:
            # Add a bonus reward, proportional to the predictor loss, while
            # the agent is dragging an object
            reward += dragging_bonus * predictor_loss
return reward, is_done, predictor_loss
def predictor_test(self, trajectory):
batch_num = action_length
# sequence = trajectory.reshape((1,action_length,num_feats))
# mean_weitghted_batch_losses = np.zeros(batch_num)
# for b in range(batch_num):
# if b <= predictor_input_frames:
# inputs = np.concatenate((self.state['last_trajectory'][-predictor_input_frames+b:,:], trajectory[:b,:]), axis=0)
# else:
# inputs = trajectory[b-predictor_input_frames:b]
# # labels = sequence[:,b+self.predictor.input_frames,-16:]
# labels = trajectory[b,-16:].reshape((1,16))
# # inputs = sequence[:,b:b+self.predictor.input_frames,:]
# inputs = inputs.reshape((1,predictor_input_frames,num_feats))
# _train_step, _weitghted_batch_losses, _batch_losses = self.sess.run(
# [self.predictor.train_step, self.predictor.weitghted_batch_losses, self.predictor.batch_losses],
# {self.predictor.batch_labels: labels, self.predictor.state_t: inputs, self.predictor.loss_weight:loss_weight}
# )
# mean_weitghted_batch_losses[b] = np.sum(_weitghted_batch_losses)/4
# mean_weitghted_batch_losses = np.mean(mean_weitghted_batch_losses)
        # After the (currently disabled) training steps above, use the
        # predictor to calculate the loss on each frame of this action:
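        # For each frame b, the loss is a weighted MSE over the 16 outputs:
        #   loss_b = (1/4) * sum_k loss_weight[k] * (labels[k] - prediction[k])**2
        # Since loss_weight sums to 4 (one unit per object), dividing by 4
        # makes loss_b an average weighted loss per object.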
        mean_weighted_batch_losses = np.zeros(batch_num)
        for b in range(batch_num):
            if b < predictor_input_frames:
                # Pad the input window with the tail of the previous action's
                # trajectory. (`<` rather than the original `<=`: at
                # b == predictor_input_frames the slice [-0:] would grab the
                # entire previous trajectory and break the reshape below.)
                inputs = np.concatenate((self.state['last_trajectory'][-predictor_input_frames+b:, :], trajectory[:b, :]), axis=0)
            else:
                inputs = trajectory[b-predictor_input_frames:b]
            # labels = sequence[:,b+self.predictor.input_frames,-16:]
            labels = trajectory[b, -16:].reshape((1, 16))
            inputs = inputs.reshape((1, predictor_input_frames, num_feats))
            prediction = np.array(self.sess.run([self.predictor.prediction], {self.predictor.state_t: inputs}))
            batch_loss_ = np.reshape(np.square(np.subtract(labels, prediction)), 16)
            mean_weighted_batch_losses[b] = np.sum(np.multiply(batch_loss_, loss_weight)) / 4
        mean_weighted_batch_losses = np.mean(mean_weighted_batch_losses)
        return mean_weighted_batch_losses
    # Calculate the mouse control path over action_length frames
    # (1/60 s per frame); velocities are in m/s
def action_generation(self, if_click, direction):
control_path = {'x':[], 'y':[], 'obj':[0]*action_length, 'click':if_click}
vx = self.state['vx']
vy = self.state['vy']
last_mouse_x = self.state['last_trajectory'][-1][4]
last_mouse_y = self.state['last_trajectory'][-1][5]
        # direction takes values [0,1,2,3,4]: [none, +x, -x, +y, -y]
for _ in range(action_length):
if direction == 0:
vx *= velocity_decay
vy *= velocity_decay
elif direction == 1 or direction == 2:
vx += acceleration*(3-2*direction) # 1:acceleration*1; 2:acceleration*-1
vy *= velocity_decay
elif direction == 3 or direction == 4:
vx *= velocity_decay
vy += acceleration*(7-2*direction) # 3:acceleration*1; 4:acceleration*-1
last_mouse_x += vx/60
last_mouse_x = max(last_mouse_x, 0)
last_mouse_x = min(last_mouse_x, env_width)
last_mouse_y += vy/60
last_mouse_y = max(last_mouse_y, 0)
last_mouse_y = min(last_mouse_y, env_height)
control_path['x'].append(last_mouse_x)
control_path['y'].append(last_mouse_y)
self.state['vx'] = vx
self.state['vy'] = vy
return control_path
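    # Kinematics note: with the reference defaults above (acceleration = 1,
    # velocity_decay = 0.9), holding direction 0 for one action scales the
    # velocity by 0.9**5 ≈ 0.59; the mouse position integrates v/60 per frame
    # and is clamped to [0, env_width] x [0, env_height].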
    def destory(self):
        # Note: the 'destory'/'Destory' spelling is kept to match the function
        # name in the JS simulator
        self.context.eval_js("Destory();")
return self.trajectory_history
# with open('./active_training_data/random_data.json') as data_file:
# data = json.load(data_file)
# data.append(self.trajectory_history)
# with open('./active_training_data/random_data.json', 'w') as data_file:
# json.dump(data, data_file, indent=4)
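

# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes the JS assets and the pretrained checkpoint referenced in __init__
# exist at the paths above; the random policy is only a placeholder.
if __name__ == "__main__":
    env = Interaction_env(world_setup_idx=0)
    env.reset()
    for _ in range(10):
        action = rd.randrange(n_actions)  # pick a random action in [0, n_actions)
        trajectory, reward, is_done, predictor_loss = env.act(action)
        if is_done:
            break
    history = env.destory()
    print("collected %d frames" % len(history))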