mrjob is now prepared if EMR ever starts using subseconds in timestamps

mochi · Jul 12, 2011 · d5eb7da
1 parent c2dcde9
commit d5eb7da
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 11 deletions.
diff --git a/mrjob/emr.py b/mrjob/emr.py
@@ -81,6 +81,10 @@
 # regex for matching job log URIs
 JOB_LOG_URI_RE = re.compile(r'^.*?/jobs/.+?_(?P<mystery_string_1>\d+)_job_(?P<timestamp>\d+)_(?P<step_num>\d+)_hadoop_streamjob(?P<mystery_string_2>\d+).jar$')
 
+# sometimes AWS gives us seconds as a decimal, which we can't parse with
+# boto.utils.ISO8601; this matches the fractional part so it can be stripped
+SUBSECOND_RE = re.compile(r'\.[0-9]+')
+
 # map from AWS region to EMR endpoint
 # see http://docs.amazonwebservices.com/ElasticMapReduce/latest/DeveloperGuide/index.html?ConceptsRequestEndpoints.html
 REGION_TO_EMR_ENDPOINT = {
@@ -121,11 +125,13 @@ def s3_key_to_uri(s3_key):
     return 's3://%s/%s' % (s3_key.bucket.name, s3_key.name)
 
 
-def _to_timestamp(iso8601_time):
+def iso8601_to_timestamp(iso8601_time):
+    iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
     return time.mktime(time.strptime(iso8601_time, boto.utils.ISO8601))
 
 
-def _to_datetime(iso8601_time):
+def iso8601_to_datetime(iso8601_time):
+    iso8601_time = SUBSECOND_RE.sub('', iso8601_time)
     return datetime.datetime.strptime(iso8601_time, boto.utils.ISO8601)
 
 
@@ -179,7 +185,7 @@ def describe_all_job_flows(emr_conn, states=None, jobflow_ids=None,
             # set created_before to be just after the start time of
             # the first job returned, to deal with job flows started
             # in the same second
-            min_create_time = min(_to_datetime(jf.creationdatetime)
+            min_create_time = min(iso8601_to_datetime(jf.creationdatetime)
                                   for jf in job_flows)
             created_before = min_create_time + datetime.timedelta(seconds=1)
             # if someone managed to start 501 job flows in the same second,
@@ -992,8 +998,8 @@ def _wait_for_job_to_complete(self):
 
                 if (hasattr(step, 'startdatetime') and
                     hasattr(step, 'enddatetime')):
-                    start_time = _to_timestamp(step.startdatetime)
-                    end_time = _to_timestamp(step.enddatetime)
+                    start_time = iso8601_to_timestamp(step.startdatetime)
+                    end_time = iso8601_to_timestamp(step.enddatetime)
                     total_step_time += end_time - start_time
 
             if not step_states:

diff --git a/mrjob/tools/emr/s3_tmpwatch.py b/mrjob/tools/emr/s3_tmpwatch.py
@@ -23,14 +23,10 @@
 except ImportError:
     boto = None
 
-from mrjob.emr import EMRJobRunner, parse_s3_uri
+from mrjob.emr import EMRJobRunner, iso8601_to_datetime, parse_s3_uri
 from mrjob.util import log_to_stream
 
 
-# sometimes S3 gives us seconds as a decimal, which we can't parse
-# with boto.utils.ISO8601
-SUBSECOND_RE = re.compile('\.[0-9]+')
-
 log = logging.getLogger('mrjob.tools.emr.s3_tmpwatch')
 
 
@@ -71,7 +67,7 @@ def s3_cleanup(glob_path, time_old, dry_run=False, conf_path=None):
         bucket = s3_conn.get_bucket(bucket_name)
 
         for key in bucket.list(key_name):
-            last_modified = datetime.strptime(SUBSECOND_RE.sub('', key.last_modified), boto.utils.ISO8601)
+            last_modified = iso8601_to_datetime(key.last_modified)
             age = datetime.utcnow() - last_modified
             if age > time_old:
                 # Delete it