## Use Boto3 to submit files for processing to an existing AWS Batch job queue

In [1]:
import boto3
import s3fs
from concurrent.futures import ThreadPoolExecutor, as_completed

# AWS Batch client; later cells use it to inspect the queue and submit jobs
client = boto3.client('batch')
# S3 filesystem handle used to list the bucket contents
fs = s3fs.S3FileSystem()

# Keep only the listing entries that contain 'yellow', as full s3:// URLs
files = ['s3://{}'.format(key)
         for key in fs.ls('nyc-tlc/trip data')
         if 'yellow' in key]
print('File count', len(files))
files[:5]

File count 102


['s3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-02.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-03.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-04.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-05.csv']

## Ensure the job queue exists

In [2]:
# Look up the queue by name; the 'jobQueues' list in the response reports its state and status
client.describe_job_queues(jobQueues=['Taxi-Data-Cleaner-Queue'])

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '369',
   'content-type': 'application/json',
   'date': 'Sun, 01 Oct 2017 12:19:12 GMT',
   'via': '1.1 fae00924177d29b933b8883256846843.cloudfront.net (CloudFront)',
   'x-amz-cf-id': 'gwbNPsaTkQywwZncQsoki6mCmFZ7IOh6bGpfNYRP9zo3c2Qv6-Xi1w==',
   'x-amzn-requestid': 'bbf779f4-a6a2-11e7-92b7-efaee1e6f4e4',
   'x-amzn-trace-id': 'sampled=0;root=1-59d0dd40-5fd965bc788fcc49cb74a9fb',
   'x-cache': 'Miss from cloudfront'},
  'HTTPStatusCode': 200,
  'RequestId': 'bbf779f4-a6a2-11e7-92b7-efaee1e6f4e4',
  'RetryAttempts': 0},
 'jobQueues': [{'computeEnvironmentOrder': [{'computeEnvironment': 'arn:aws:batch:us-east-1:755632011865:compute-environment/NYC-Taxi-Cleaning-Env',
     'order': 1}],
   'jobQueueArn': 'arn:aws:batch:us-east-1:755632011865:job-queue/Taxi-Data-Cleaner-Queue',
   'jobQueueName': 'Taxi-Data-Cleaner-Queue',
   'priority': 1,
   'state': 'ENABLED',
   'status': 'VALID',
   'statusReason'

In [3]:
def submit_job(file,
               job_queue='Taxi-Data-Cleaner-Queue',
               job_definition='Taxi-Data-Cleaner:6',
               output_bucket='s3://milesg-taxi-data-east',
               attempts=3):
    """
    Given s3://file/location, submit this to the taxi-cleaner queue

    Uses the module-level boto3 Batch ``client``.

    Parameters:
        file: S3 URL of a trip file whose name ends in 'YYYY-MM' followed by a
            four character extension, e.g.
            's3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv'.
        job_queue: name of the Batch job queue to submit to.
        job_definition: 'name:revision' of the Batch job definition to run.
        output_bucket: value passed to the container as --output-bucket.
        attempts: value for the job's retryStrategy 'attempts'.

    Returns: http response from boto3 submit_job()

    Raises: ValueError if the seven characters before the extension do not
        contain exactly one '-' (year and month cannot be unpacked).
    """
    # Pluck out year and month of this file
    year, month = file[-11:-4].split('-')

    # Build the container command as an argv list directly. Formatting a
    # comma-joined string and splitting it again would break any file path
    # that itself contains a comma into several arguments.
    command = ['python', '/workdir/core/batch_service/clean_routine.py',
               '--s3-file', file,
               '--output-bucket', output_bucket]

    # Submit job w/ boto3 client
    job = client.submit_job(jobName='TaxiData-{}-{}'.format(year, month),
                            jobQueue=job_queue,
                            jobDefinition=job_definition,
                            containerOverrides={'command': command},
                            retryStrategy={'attempts': attempts}
                           )
    return job


# Fan the submissions out over a small thread pool and report each outcome
# as soon as its call finishes (completion order, not list order)
with ThreadPoolExecutor(max_workers=10) as executor:

    # Map every pending future back to the file it was created for
    futures = dict((executor.submit(submit_job, path), path) for path in files)

    for future in as_completed(futures):
        file = futures[future]
        error = future.exception()
        if error:
            print('File: {} submitted in error: {}'.format(file, error))
        else:
            status = future.result().get('ResponseMetadata').get('HTTPStatusCode')
            print('File: {} submitted with {} response.'.format(file, status))

File: s3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-09.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-10.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-11.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-04.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-06.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2010-01.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-07.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-12.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-05.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-02.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-08.c

File: s3://nyc-tlc/trip data/yellow_tripdata_2016-07.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2017-01.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2017-02.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2016-08.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2016-12.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2016-10.csv submitted with 200 response.


In [4]:
# `future` is the loop variable left over from the submission cell, i.e. the
# last future yielded by as_completed(); display its full submit_job() response
future.result()

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '77',
   'content-type': 'application/json',
   'date': 'Sun, 01 Oct 2017 10:58:14 GMT',
   'via': '1.1 aadee5866bd4cbf7a3aa07ff8ac5a149.cloudfront.net (CloudFront)',
   'x-amz-cf-id': 'jMsE61M2mzdIxvFCsrhvXrNZO0TOI-IbG5kF3-bJx7p-3y4SPyoqmw==',
   'x-amzn-requestid': '6c170569-a697-11e7-9cc1-3389f704a5ef',
   'x-amzn-trace-id': 'sampled=0;root=1-59d0ca46-d8b7c41ba4bc2868a0ba6a37',
   'x-cache': 'Miss from cloudfront'},
  'HTTPStatusCode': 200,
  'RequestId': '6c170569-a697-11e7-9cc1-3389f704a5ef',
  'RetryAttempts': 3},
 'jobId': '486f3570-17b3-4c40-bdac-71f5c61d1bde',
 'jobName': 'TaxiData-2016-09'}