This repository has been archived by the owner on Apr 6, 2018. It is now read-only.
/
docker_remote.py
389 lines (326 loc) · 14.7 KB
/
docker_remote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
from __future__ import absolute_import

import base64
import logging
import os
import pipes
import random
import shlex
import sys
import threading
import time
import uuid

import docker
import six.moves.urllib.parse as urlparse

from gym.utils import closer
from universe import error, utils
from universe.remotes import healthcheck, remote
from universe.remotes.compose import container, log_printer, progress_stream
# Module-level logger, named after this module per the logging convention.
logger = logging.getLogger(__name__)
# Registry of live DockerInstance objects; each instance registers itself in
# __init__ and unregisters in close() so containers can be cleaned up.
docker_closer = closer.Closer()
def random_alphanumeric(length=14):
    """Return a random alphanumeric string of the given length.

    Entropy comes from base64-encoding uuid4 bytes; non-alphanumeric
    characters ('+', '/', '=', newlines) are filtered out, so more than
    one UUID may be consumed to reach the requested length.
    """
    # base64.encodestring was deprecated and removed in Python 3.9; prefer
    # encodebytes when it exists, falling back for very old interpreters.
    encode = getattr(base64, 'encodebytes', None) or base64.encodestring
    buf = []
    while len(buf) < length:
        entropy = encode(uuid.uuid4().bytes).decode('utf-8')
        # Keep only alphanumerics (also avoids shadowing the builtin `bytes`,
        # which the original did with its temporary variable name).
        buf += [c for c in entropy if c.isalnum()]
    return ''.join(buf)[:length]
def pretty_command(command):
    """Render an argv list as a single copy-pasteable shell string.

    Each argument is shell-quoted so the result can be run by hand.
    """
    # shlex.quote is the supported replacement for pipes.quote, which was
    # deprecated and removed in Python 3.13. Quoting behavior is identical.
    return ' '.join(shlex.quote(c) for c in command)
class DockerManager(object):
    """Manages a fleet of n Docker containers running remote environments.

    On construction, containers are created (or reused), their logs are
    streamed to this process, and a healthcheck blocks until each one's
    VNC and rewarder ports accept connections.
    """
    def __init__(self, runtime, n, reuse=False, start_timeout=None):
        super(DockerManager, self).__init__()
        self.runtime = runtime

        self.supports_reconnect = False
        self.connect_vnc = True
        self.connect_rewarder = True

        self._assigner = PortAssigner(reuse=reuse)
        self._popped = False

        self.lock = threading.Lock()
        self.envs = []

        self._n = n
        # Default boot timeout scales with the number of containers.
        if start_timeout is None:
            start_timeout = 2 * self._n + 5
        self.start_timeout = start_timeout

        self._start()

    def allocate(self, handles, initial=False, params=None):
        # NOTE(review): `initial` and `params` are accepted for interface
        # compatibility with other remote managers but are unused here.
        # (params defaulted to a mutable {} in the original.)
        self._handles = handles

    def pop(self, n=None):
        """Call from main thread. Returns the list of newly-available (handle, env) pairs."""
        if self._popped:
            assert n is None
            return []
        self._popped = True

        envs = []
        for i, instance in enumerate(self.instances):
            env = remote.Remote(
                handle=self._handles[i],
                vnc_address='{}:{}'.format(instance.host, instance.vnc_port),
                vnc_password='openai',
                rewarder_address='{}:{}'.format(instance.host, instance.rewarder_port),
                rewarder_password='openai',
            )
            envs.append(env)
        return envs

    def _start(self):
        # Spawn every instance, then stream logs and wait for them to boot.
        self.instances = [DockerInstance(self._assigner, self.runtime, label=str(i)) for i in range(self._n)]
        for instance in self.instances:
            instance.start()
        self.start_logging(self.instances)
        self.healthcheck(self.instances)

    def close(self):
        """Close every managed instance (thread-safe)."""
        with self.lock:
            for instance in self.instances:
                instance.close()

    def start_logging(self, instances):
        """Stream container logs to stdout via the compose log printer."""
        containers = [instance._container for instance in instances]
        labels = [str(instance.label) for instance in instances]
        if all(instance.reusing for instance in instances):
            # All containers are being reused, so only bother showing
            # a subset of the backlog.
            tail = 0
        else:
            # At least one container is new, so just show
            # everything. It'd be nice to have finer-grained control,
            # but this would require patching the log printer.
            tail = 'all'
        log_printer.build(containers, labels, log_args={'tail': tail})

    def healthcheck(self, instances):
        """Block until each instance's VNC and rewarder ports are reachable."""
        # Use the configured timeout: the original hard-coded 30 here, which
        # made self.start_timeout (and the start_timeout constructor arg)
        # silently dead.
        healthcheck.run(
            ['{}:{}'.format(instance.assigner.info['host'], instance.vnc_port) for instance in instances],
            ['{}:{}'.format(instance.assigner.info['host'], instance.rewarder_port) for instance in instances],
            start_timeout=self.start_timeout,
        )
def get_client():
    """Build a Docker client from the standard DOCKER_* environment variables.

    Set DOCKER_HOST (and probably DOCKER_TLS_VERIFY and DOCKER_CERT_PATH) to
    connect to a docker instance through TCP; leave DOCKER_HOST unset to use
    the default, typically unix:/var/run/docker.sock.

    The returned info dict records which IP/hostname to use when connecting
    to ports on spawned containers: DOCKER_NET_HOST if set, otherwise the
    hostname part of DOCKER_HOST, otherwise localhost.

    Returns a (docker.Client, info) pair.
    """
    host = os.environ.get('DOCKER_HOST')
    net_host = os.environ.get('DOCKER_NET_HOST')
    api_version = os.environ.get('DOCKER_API_VERSION') or 'auto'

    # Decide which address spawned containers will be reachable at.
    if net_host:
        connect_host = net_host
    elif host:
        connect_host = urlparse.urlparse(host).netloc.split(':')[0]
    else:
        connect_host = 'localhost'
    info = {'host': connect_host}

    if os.environ.get('DOCKER_TLS_VERIFY') != '1':
        # Plain (non-TLS) connection.
        return docker.Client(base_url=host, version=api_version), info

    # TLS connection: pick up certs from DOCKER_CERT_PATH when provided.
    cert_path = os.environ.get('DOCKER_CERT_PATH')
    if cert_path:
        client_cert = (os.path.join(cert_path, 'cert.pem'), os.path.join(cert_path, 'key.pem'))
        ca_cert = os.path.join(cert_path, 'ca.pem')
    else:
        client_cert = ca_cert = None

    tls_config = docker.tls.TLSConfig(
        client_cert=client_cert,
        ca_cert=ca_cert,
        verify=True,
        assert_hostname=None,
    )
    return docker.Client(base_url=host, tls=tls_config, version=api_version), info
class PortAssigner(object):
    """Hands out free (vnc_port, rewarder_port) host-port pairs.

    VNC ports start at 5900; the matching rewarder port is always
    vnc_port + 10000. The current port map is read from the running
    Docker daemon.
    """
    def __init__(self, reuse=False):
        # When reuse is True, existing containers on the expected ports are
        # adopted instead of spawning new ones.
        self.reuse = reuse
        self.instance_id = 'universe-' + random_alphanumeric(length=6)
        self.client, self.info = get_client()
        self._next_port = 5900
        self._refresh_ports()

    def _refresh_ports(self):
        """Rebuild the {public_port: container_id} map from the Docker daemon."""
        ports = {}
        # Renamed from `container`, which shadowed the imported module.
        for cont in self.client.containers():
            for port in cont['Ports']:
                # {u'IP': u'0.0.0.0', u'Type': u'tcp', u'PublicPort': 5000, u'PrivatePort': 500}
                if port['Type'] == 'tcp' and 'PublicPort' in port:
                    ports[port['PublicPort']] = cont['Id']
        logger.info('Ports used: %s', list(ports))
        self._ports = ports

    def allocate_ports(self, num):
        """Return (vnc_port, rewarder_port, container_id).

        container_id is the id of an existing container to adopt (reuse
        mode), or None when a fresh container should be created. `num` is
        accepted for interface compatibility; a single vnc/rewarder pair
        is always allocated per call.

        Raises error.Error when reuse mode finds inconsistent port state.
        """
        if self.reuse and self._next_port in self._ports:
            # Reuse an existing docker container if it exists
            vnc_id = self._ports[self._next_port]
            rewarder_id = self._ports.get(self._next_port+10000)
            if (self._next_port+10000) not in self._ports:
                # (Original message reported next_port+1; the missing port is +10000.)
                raise error.Error("Port {} was allocated but {} was not. This indicates unexpected state with spun-up VNC docker instances.".format(self._next_port, self._next_port+10000))
            elif vnc_id != rewarder_id:
                # (Original message swapped the port/id format arguments.)
                raise error.Error("Port {} is exposed from {} while {} is exposed from {}. Both should come from a single Docker instance running your environment.".format(self._next_port, vnc_id, self._next_port+10000, rewarder_id))

            base = self._next_port
            self._next_port += 1
            return base, base+10000, vnc_id

        if not self.reuse:
            # Otherwise, find the lowest free pair of ports. This doesn't
            # work for the reuse case since on restart we won't remember
            # where we spun up our containers.
            while self._next_port in self._ports or (self._next_port+10000) in self._ports:
                self._next_port += 1
        # Reuse was requested but no existing container is on this port pair
        # (the original fell through and implicitly returned None here,
        # crashing the caller's 3-way unpack): allocate a fresh pair.
        base = self._next_port
        self._next_port += 1
        # And get started!
        return base, base+10000, None
class DockerInstance(object):
    """A single Docker container running a remote environment.

    Allocates a VNC/rewarder port pair from the assigner, creates (or
    adopts, in reuse mode) a container exposing those ports, and removes
    the container on close unless it was a reused one.
    """
    def __init__(self, assigner, runtime, label='main'):
        # Register for cleanup so the container is removed on interpreter exit.
        self._docker_closer_id = docker_closer.register(self)

        self.label = label
        self.assigner = assigner
        # Must be a plain string: this is passed to Docker as the container
        # name. (The original line ended with a stray trailing comma, which
        # silently made self.name a 1-element tuple.)
        self.name = '{}-{}'.format(self.assigner.instance_id, self.label)
        self.runtime = runtime

        self._container_id = None
        self._closed = False
        self._container = None

        self.host = self.assigner.info['host']
        self.vnc_port = None
        self.rewarder_port = None

        self.reusing = None
        self.started = False

    def start(self, attempts=None):
        """Spawn and start the container, retrying on port collisions.

        Raises error.Error if the container could not be started within
        the allotted attempts.
        """
        if attempts is None:
            # If we're reusing, we don't scan through ports for a free
            # one.
            if not self.assigner.reuse:
                attempts = 20
            else:
                attempts = 1

        for attempt in range(attempts):
            self._spawn()
            e = self._start()
            if e is None:
                return
            # Back off briefly and refresh the port map before retrying.
            time.sleep(random.uniform(1.0, 5.0))
            self.assigner._refresh_ports()
        raise error.Error('[{}] Could not start container after {} attempts. Last error: {}'.format(self.label, attempts, e))

    def _spawn(self):
        """Allocate ports and create the container, pulling the image if needed."""
        if self.runtime.image is None:
            raise error.Error('No image specified')
        assert self._container_id is None

        self.vnc_port, self.rewarder_port, self._container_id = self.assigner.allocate_ports(2)

        if self._container_id is not None:
            # Adopting an already-running container: nothing to create or start.
            logger.info('[%s] Reusing container %s on ports %s and %s', self.label, self._container_id[:12], self.vnc_port, self.rewarder_port)
            self.reusing = True
            self.started = True
            return

        self.reusing = False
        logger.info('[%s] Creating container: image=%s. Run the same thing by hand as: %s',
                    self.label,
                    self.runtime.image,
                    pretty_command(self.runtime.cli_command(self.vnc_port, self.rewarder_port)))

        try:
            container = self._spawn_container()
        except docker.errors.NotFound as e:
            # Looks like we need to pull the image
            assert 'No such image' in e.explanation.decode('utf-8'), 'Expected NotFound error message message to include "No such image", but it was: {}. This is probably just a bug in this assertion and the assumption was incorrect'.format(e.explanation)
            logger.info('Image %s not present locally; pulling', self.runtime.image)
            self._pull_image()
            # If we called pull_image from multiple processes (as we do with universe-starter-agent A3C)
            # these will all return at the same time. We probably all got the same port numbers before the pull started,
            # so wait a short random time and refresh our port numbers
            time.sleep(random.uniform(0.5, 2.5))
            self.assigner._refresh_ports()
            self.vnc_port, self.rewarder_port, self._container_id = self.assigner.allocate_ports(2)
            if self._container_id is not None:
                logger.info('[%s] Reusing container %s on ports %s and %s', self.label, self._container_id[:12], self.vnc_port, self.rewarder_port)
                self.reusing = True
                self.started = True
                return
            # Try spawning again.
            container = self._spawn_container()

        self._container_id = container['Id']

    def _pull_image(self):
        """Pull self.runtime.image, streaming progress to stdout.

        Returns the image digest extracted from the pull output.
        """
        output = self.client.pull(self.runtime.image, stream=True)
        return progress_stream.get_digest_from_pull(
            progress_stream.stream_output(output, sys.stdout))

    def _spawn_container(self):
        """Create (but do not start) the container, binding the VNC and
        rewarder ports and labeling it as automanaged."""
        container = self.client.create_container(
            image=self.runtime.image,
            command=self.runtime.command,
            # environment=self.runtime.environment,
            name=self.name,
            host_config=self.client.create_host_config(
                port_bindings={
                    5900: self.vnc_port,
                    15900: self.rewarder_port,
                },
                **self.runtime.host_config),
            labels={
                'com.openai.automanaged': 'true',
            }
        )
        return container

    def _start(self):
        """Start the created container.

        Returns None on success, or the docker APIError when the chosen
        host port was already taken (so the caller can retry); re-raises
        any other APIError.
        """
        # Need to start up the container!
        if not self.started:
            logger.debug('[%s] Starting container: id=%s', self.label, self._container_id)
            try:
                self.client.start(container=self._container_id)
            except docker.errors.APIError as e:
                if 'port is already allocated' in str(e.explanation):
                    logger.info('[%s] Could not start container: %s', self.label, e)
                    self._remove()
                    return e
                else:
                    raise
            else:
                self.started = True

        self._container = container.Container.from_id(self.client, self._container_id)
        return None

    def _remove(self):
        """Force-remove the container, tolerating the transient
        'device or resource busy' error."""
        logger.info("Killing and removing container: id=%s", self._container_id)
        try:
            self.client.remove_container(container=self._container_id, force=True)
        except docker.errors.APIError as e:
            # This seems to happen occasionally when we try to delete a container immediately after creating it.
            # But although we get an error trying to remove it, it usually goes away shortly
            # A typical error message is
            # Driver aufs failed to remove root filesystem 0015803583d91741d25fce28ae0ef540b436853d1c90061caacaef97e3682403: \
            # rename /var/lib/docker/aufs/mnt/69a72854511f1fbb9d7cb0ef0ce0787e573af0887c1213ba3a0c3a0cfd71efd2 \
            # /var/lib/docker/aufs/mnt/69a72854511f1fbb9d7cb0ef0ce0787e573af0887c1213ba3a0c3a0cfd71efd2-removing: \
            # device or resource busy
            # Just proceed as if it had gone away
            if 'device or resource busy' in str(e.explanation):
                logger.info("[%s] Could not remove container: %s. You can always kill all automanaged environments on this Docker daemon via: docker rm -f $(docker ps -q -a -f 'label=com.openai.automanaged=true')", self.label, e)
                self._container_id = None
                return e
            else:
                raise
        self._container_id = None

    def __del__(self):
        self.close()

    def close(self):
        """Remove the container (if we created it) and unregister from the
        closer. Idempotent."""
        if self._closed:
            return
        docker_closer.unregister(self._docker_closer_id)

        # Make sure 1. we were the ones who started it, 2. it's
        # actually been started, and 3. we're meant to kill it.
        if self._container_id and not self.assigner.reuse:
            self._remove()
        self._closed = True

    @property
    def client(self):
        # Share the assigner's Docker client rather than holding our own.
        return self.assigner.client
if __name__ == '__main__':
    # Smoke test: spin up two gym-core containers and drop into a debugger.
    logging.getLogger().setLevel(logging.INFO)
    from universe.runtimes import registration
    # docker run --name test --rm -ti -p 5900:5900 -p 15900:15900 quay.io/openai/universe.gym-core
    instance = DockerManager(
        runtime=registration.runtime_spec('gym-core'),
        n=2,
    )
    # Note: DockerManager starts its containers in __init__ (via _start);
    # the original's explicit `instance.start()` raised AttributeError
    # because no such public method exists.
    import ipdb; ipdb.set_trace()