publish.py
349 lines (304 loc) · 13.4 KB
import os
import time
import hashlib
import logging
import mimetypes
import multiprocessing
from django.conf import settings
from multiprocessing.pool import ThreadPool
from bakery import DEFAULT_GZIP_CONTENT_TYPES
from bakery.management.commands import (
    BasePublishCommand,
    get_s3_client
)
from django.core.urlresolvers import get_callable
from django.core.management.base import CommandError
logger = logging.getLogger(__name__)


class Command(BasePublishCommand):
    help = "Syncs the build directory with the Amazon S3 bucket"

    # Default permissions for the files published to S3
    DEFAULT_ACL = 'public-read'

    # Error messages we might use below
    build_missing_msg = "Build directory does not exist. Cannot publish something before you build it."
    build_unconfig_msg = "Build directory unconfigured. Set BUILD_DIR in settings.py or provide it with --build-dir"
    bucket_unconfig_msg = "Bucket unconfigured. Set AWS_BUCKET_NAME in settings.py or provide it with --aws-bucket-name"
    views_unconfig_msg = "Bakery views unconfigured. Set BAKERY_VIEWS in settings.py or provide a list as arguments."
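    # A minimal settings sketch for this command (the setting names are the
    # ones read below; the values are hypothetical):
    #
    #   BUILD_DIR = '/path/to/build'
    #   AWS_BUCKET_NAME = 'my-bucket'
    #   BAKERY_VIEWS = ['myapp.views.ExampleView']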
    def add_arguments(self, parser):
        parser.add_argument(
            "--build-dir",
            action="store",
            dest="build_dir",
            default='',
            help="Specify the path of the build directory. Will use settings.BUILD_DIR by default."
        )
        parser.add_argument(
            "--aws-bucket-name",
            action="store",
            dest="aws_bucket_name",
            default='',
            help="Specify the AWS bucket to sync with. Will use settings.AWS_BUCKET_NAME by default."
        )
        parser.add_argument(
            "--force",
            action="store_true",
            dest="force",
            default=False,
            help="Force a republish of all items in the build directory"
        )
        parser.add_argument(
            "--dry-run",
            action="store_true",
            dest="dry_run",
            default=False,
            help="Display the output of what would have been uploaded or removed, without actually publishing."
        )
        parser.add_argument(
            "--no-delete",
            action="store_true",
            dest="no_delete",
            default=False,
            help="Keep files in S3, even if they do not exist in the build directory."
        )
        parser.add_argument(
            "--no-pooling",
            action="store_true",
            dest="no_pooling",
            default=False,
            help="Run uploads one by one rather than pooling them to run concurrently."
        )
    def handle(self, *args, **options):
        """
        Sync files in the build directory to a specified S3 bucket.
        """
        # Counters and lists we use to keep tabs on progress
        self.uploaded_files = 0
        self.uploaded_file_list = []
        self.deleted_files = 0
        self.deleted_file_list = []
        self.start_time = time.time()

        # Configure all the options we're going to use
        self.set_options(options)

        # Initialize the boto connection
        logger.debug("Connecting to s3")
        if self.verbosity > 2:
            self.stdout.write("Connecting to s3")
        self.s3_client, self.s3_resource = get_s3_client()

        # Grab our bucket
        logger.debug("Retrieving bucket {}".format(self.aws_bucket_name))
        if self.verbosity > 2:
            self.stdout.write("Retrieving bucket {}".format(self.aws_bucket_name))
        self.bucket = self.s3_resource.Bucket(self.aws_bucket_name)

        # Get a list of all keys in our s3 bucket
        logger.debug("Retrieving objects now published in bucket")
        if self.verbosity > 2:
            self.stdout.write("Retrieving objects now published in bucket")
        self.s3_obj_dict = self.get_all_objects_in_bucket(
            self.aws_bucket_name,
            self.s3_client
        )

        # Get a list of all the local files in our build directory
        logger.debug("Retrieving files built locally")
        if self.verbosity > 2:
            self.stdout.write("Retrieving files built locally")
        self.local_file_list = self.get_local_file_list()

        # Sync local files with the s3 bucket
        logger.debug("Syncing local files with bucket")
        if self.verbosity > 2:
            self.stdout.write("Syncing local files with bucket")
        self.sync_with_s3()

        # Delete anything that's left in our keys dict
        if not self.dry_run and not self.no_delete:
            self.deleted_file_list = list(self.s3_obj_dict.keys())
            self.deleted_files = len(self.deleted_file_list)
            if self.deleted_files:
                logger.debug("Deleting %s keys" % self.deleted_files)
                if self.verbosity > 0:
                    self.stdout.write("Deleting %s keys" % self.deleted_files)
                self.batch_delete_s3_objects(
                    self.deleted_file_list,
                    self.aws_bucket_name
                )

        # Run any post-publish hooks on the views
        if not hasattr(settings, 'BAKERY_VIEWS'):
            raise CommandError(self.views_unconfig_msg)
        for view_str in settings.BAKERY_VIEWS:
            view = get_callable(view_str)()
            if hasattr(view, 'post_publish'):
                view.post_publish(self.bucket)

        # We're finished, print the final output
        elapsed_time = time.time() - self.start_time
        msg = "Publish completed, %d uploaded and %d deleted files in %.2f seconds" % (
            self.uploaded_files,
            self.deleted_files,
            elapsed_time
        )
        logger.info(msg)
        if self.verbosity > 0:
            self.stdout.write(msg)

        if self.dry_run:
            logger.info("Publish executed with the --dry-run option. No content was changed on S3.")
            if self.verbosity > 0:
                self.stdout.write("Publish executed with the --dry-run option. No content was changed on S3.")
    def set_options(self, options):
        """
        Configure all the many options we'll need to make this happen.
        """
        self.verbosity = int(options.get('verbosity'))

        # Will we be gzipping?
        self.gzip = getattr(settings, 'BAKERY_GZIP', False)

        # And if so, what content types will we be gzipping?
        self.gzip_content_types = getattr(
            settings,
            'GZIP_CONTENT_TYPES',
            DEFAULT_GZIP_CONTENT_TYPES
        )

        # What ACL (i.e. security permissions) will we be giving the files on S3?
        self.acl = getattr(settings, 'DEFAULT_ACL', self.DEFAULT_ACL)

        # Should we set cache-control headers?
        self.cache_control = getattr(settings, 'BAKERY_CACHE_CONTROL', {})
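        # BAKERY_CACHE_CONTROL is expected to map content types to max-age
        # values in seconds, which upload_to_s3() below renders as a
        # 'max-age=N' header. A hypothetical example:
        #
        #   BAKERY_CACHE_CONTROL = {'text/html': 300, 'application/json': 60}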
        # If the user specifies a build directory...
        if options.get('build_dir'):
            # ... validate that it is good.
            if not os.path.exists(options.get('build_dir')):
                raise CommandError(self.build_missing_msg)
            # Go ahead and use it
            self.build_dir = options.get("build_dir")
        # If the user does not specify a build dir...
        else:
            # Check if it is set in settings.py
            if not hasattr(settings, 'BUILD_DIR'):
                raise CommandError(self.build_unconfig_msg)
            # Then make sure it actually exists
            if not os.path.exists(settings.BUILD_DIR):
                raise CommandError(self.build_missing_msg)
            # Go ahead and use it
            self.build_dir = settings.BUILD_DIR

        # If the user provides a bucket name, use that.
        if options.get("aws_bucket_name"):
            self.aws_bucket_name = options.get("aws_bucket_name")
        else:
            # Otherwise try to find it in the settings
            if not hasattr(settings, 'AWS_BUCKET_NAME'):
                raise CommandError(self.bucket_unconfig_msg)
            self.aws_bucket_name = settings.AWS_BUCKET_NAME

        # If the user sets the --force option
        if options.get('force'):
            self.force_publish = True
        else:
            self.force_publish = False

        # Set the --dry-run option
        if options.get('dry_run'):
            self.dry_run = True
            if self.verbosity > 0:
                logger.info("Executing with the --dry-run option set.")
        else:
            self.dry_run = False

        self.no_delete = options.get('no_delete')
        self.no_pooling = options.get('no_pooling')
    def get_local_file_list(self):
        """
        Walk the local build directory and create a list of file paths,
        relative to the build directory, to sync with the S3 keys.
        """
        file_list = []
        for (dirpath, dirnames, filenames) in os.walk(self.build_dir):
            for fname in filenames:
                # Relative path, to sync with the S3 key
                local_key = os.path.join(
                    os.path.relpath(dirpath, self.build_dir),
                    fname
                )
                if local_key.startswith('./'):
                    local_key = local_key[2:]
                file_list.append(local_key)
        return file_list
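    # e.g. a build directory containing index.html and static/app.css
    # (hypothetical paths) yields ['index.html', 'static/app.css'].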
    def sync_with_s3(self):
        """
        Walk through our self.local_file_list and match its entries with the
        list of keys in the S3 bucket.
        """
        # Create a list to put all the files we're going to update
        self.update_list = []

        # Figure out which files need to be updated, then upload all of them
        logger.debug("Comparing {} local files with bucket".format(len(self.local_file_list)))
        if self.no_pooling:
            [self.compare_local_file(f) for f in self.local_file_list]
        else:
            cpu_count = multiprocessing.cpu_count()
            logger.debug("Pooling local file comparison on {} CPUs".format(cpu_count))
            pool = ThreadPool(processes=cpu_count)
            pool.map(self.compare_local_file, self.local_file_list)

        logger.debug("Uploading {} new or updated files to bucket".format(len(self.update_list)))
        if self.no_pooling:
            [self.upload_to_s3(*u) for u in self.update_list]
        else:
            logger.debug("Pooling s3 uploads on {} CPUs".format(cpu_count))
            pool = ThreadPool(processes=cpu_count)
            pool.map(self.pooled_upload_to_s3, self.update_list)
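    # Design note: a ThreadPool (threads, not processes) fits here because
    # the comparisons and uploads are I/O-bound rather than CPU-bound.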
    def compare_local_file(self, file_key):
        """
        Compare a local version of a file with what's already published.
        If an update is needed, add the file's key to self.update_list.
        """
        # Where is the file?
        file_path = os.path.join(self.build_dir, file_key)

        # Does it exist in our s3 object list?
        if file_key in self.s3_obj_dict:
            # If force_publish is set, we don't need to bother opening the
            # file; everything gets queued for upload.
            if self.force_publish:
                self.update_list.append((file_key, file_path))
            else:
                # Otherwise (the default), open it up and take its MD5 hexdigest
                with open(file_path, "rb") as f:
                    local_data = f.read()
                local_md5 = hashlib.md5(local_data).hexdigest()
                # Now compare it to the hexdigest of what's on s3
                s3_md5 = self.s3_obj_dict[file_key].get('ETag').strip('"')
                # If the md5 hexdigests match, do nothing
                if s3_md5 == local_md5:
                    logger.debug("{} has not changed".format(file_key))
                # If they don't match, add the file to the update list
                else:
                    logger.debug("{} has changed".format(file_key))
                    self.update_list.append((file_key, file_path))
            # Remove the file from the s3 dict; we don't need it anymore
            del self.s3_obj_dict[file_key]
        # If the file doesn't exist on s3, queue it for creation
        else:
            logger.debug("{} has been added".format(file_key))
            self.update_list.append((file_key, file_path))
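    # Note: an S3 ETag equals the object's MD5 digest only for single-part
    # uploads, so objects that were uploaded in multiple parts will always
    # look "changed" to the comparison above.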
    def pooled_upload_to_s3(self, payload):
        """
        A passthrough for our ThreadPool, because pool.map can't pass two
        arguments. All we do here is unpack the tuple into args for the
        real upload function.
        """
        self.upload_to_s3(*payload)

    def upload_to_s3(self, key, filename):
        """
        Set the content type and gzip headers if applicable
        and upload the item to S3.
        """
        extra_args = {'ACL': self.acl}

        # Guess the mimetype and add it to the headers
        content_type = mimetypes.guess_type(filename)[0]
        if content_type:
            extra_args['ContentType'] = content_type

        # Add the gzip header, if necessary
        if self.gzip and content_type in self.gzip_content_types:
            extra_args['ContentEncoding'] = 'gzip'
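            # Note: only the ContentEncoding header is set here; this assumes
            # the build step already wrote gzip-compressed bytes for these
            # content types (the BAKERY_GZIP convention).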
        # Add the cache-control header, if necessary
        if content_type in self.cache_control:
            extra_args['CacheControl'] = ''.join((
                'max-age=',
                str(self.cache_control[content_type])
            ))

        # Access the file and write its contents to S3
        if not self.dry_run:
            logger.debug("Uploading %s" % filename)
            if self.verbosity > 0:
                self.stdout.write("Uploading %s" % filename)
            s3_obj = self.s3_resource.Object(self.aws_bucket_name, key)
            s3_obj.upload_file(filename, ExtraArgs=extra_args)

        self.uploaded_files += 1
        self.uploaded_file_list.append(filename)