## Use Boto3 to submit files for processing to an existing AWS Batch job queue

In [8]:
import boto3
import s3fs
from concurrent.futures import ThreadPoolExecutor, as_completed

# S3 filesystem handle used for listing, plus an AWS Batch client pinned to eu-west-1
fs = s3fs.S3FileSystem()
client = boto3.client('batch', region_name='eu-west-1')

# Every listing entry under 'nyc-tlc/trip data' whose path contains 'yellow',
# turned into a full s3:// URI
files = [
    's3://' + key
    for key in fs.ls('nyc-tlc/trip data')
    if 'yellow' in key
]
print('File count', len(files))

# Only the first five files are kept for the rest of the notebook
files = files[:5]
files

File count 102


['s3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-02.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-03.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-04.csv',
 's3://nyc-tlc/trip data/yellow_tripdata_2009-05.csv']

## Ensure the job queue exists

In [7]:
# Describe the queue by name; its entry (ARN, state, status) is returned under 'jobQueues'
client.describe_job_queues(
    jobQueues=[
        'Taxi-Cleaner-Queue',
    ],
)

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '353',
   'content-type': 'application/json',
   'date': 'Wed, 06 Dec 2017 07:38:08 GMT',
   'via': '1.1 dd638904a9a9c873c81d978c8e71b733.cloudfront.net (CloudFront)',
   'x-amz-cf-id': 'qwnZG63c0pUe14oKMcaOs5x3Uqkf0JsvIsRoRYWfJwlHzVEU-b87mQ==',
   'x-amzn-requestid': '672bf32c-da58-11e7-b2a1-8d56f89b9d67',
   'x-amzn-trace-id': 'sampled=0;root=1-5a279e60-b4479471bb8d7816d1aa2f2a',
   'x-cache': 'Miss from cloudfront'},
  'HTTPStatusCode': 200,
  'RequestId': '672bf32c-da58-11e7-b2a1-8d56f89b9d67',
  'RetryAttempts': 0},
 'jobQueues': [{'computeEnvironmentOrder': [{'computeEnvironment': 'arn:aws:batch:eu-west-1:755632011865:compute-environment/Tax-Cleaner-Env',
     'order': 1}],
   'jobQueueArn': 'arn:aws:batch:eu-west-1:755632011865:job-queue/Taxi-Cleaner-Queue',
   'jobQueueName': 'Taxi-Cleaner-Queue',
   'priority': 1,
   'state': 'ENABLED',
   'status': 'VALID',
   'statusReason': 'JobQueue Healthy'}]}

In [9]:
def submit_job(file,
               output_bucket='s3://milesg-taxi-data-east',
               job_queue='Taxi-Cleaner-Queue',
               job_definition='Data-Cleaner:1'):
    """
    Given s3://file/location, submit a cleaning job for it to an AWS Batch queue
    (the taxi-cleaner queue by default).

    Parameters
    ----------
    file: string - S3 URI whose file name ends in '<YYYY>-<MM>' before the
        extension, e.g. 's3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv'
    output_bucket: string - S3 URI handed to the cleaning script as --output-bucket
    job_queue: string - name of the job queue to submit to
    job_definition: string - job definition ('name:revision') to run

    Returns
    -------
    http dict response from boto3 submit_job()

    Raises
    ------
    ValueError - if a 4-digit year and 2-digit month cannot be read from the
        end of the file name
    """
    # Pluck out year and month from the file name itself (directory and extension
    # stripped), so the result does not depend on the extension being 4 characters.
    stem = file.rsplit('/', 1)[-1].rsplit('.', 1)[0]
    year, dash, month = stem[-7:].partition('-')
    if not (dash and len(year) == 4 and len(month) == 2
            and year.isdigit() and month.isdigit()):
        raise ValueError(
            "Cannot read '<YYYY>-<MM>' from the end of file name: {!r}".format(file))

    # Build the container command as a list directly; formatting the URI into a
    # comma-joined string and splitting it would break on any URI containing ','.
    command = ['python', '/workdir/core/batch_service/clean_routine.py',
               '--s3-file', file,
               '--output-bucket', output_bucket]

    # Submit job w/ boto3 client (module-level `client`)
    job = client.submit_job(jobName='TaxiData-{}-{}'.format(year, month),
                            jobQueue=job_queue,
                            jobDefinition=job_definition,
                            containerOverrides={'command': command},
                            retryStrategy={'attempts': 3}
                           )
    return job


# Fan the submissions out over a thread pool so the HTTP calls overlap,
# then report each outcome as its future completes
with ThreadPoolExecutor(max_workers=10) as executor:

    # Map every pending future back to the file it was submitted for
    futures = {}
    for file in files:
        futures[executor.submit(submit_job, file)] = file

    # `future` stays bound after the loop to the last future that completed
    for future in as_completed(futures):
        file = futures[future]
        if future.exception():
            print('File: {} submitted in error: {}'.format(file, future.exception()))
            continue
        print('File: {} submitted with {} response.'
              .format(file, future.result().get('ResponseMetadata').get('HTTPStatusCode')))

File: s3://nyc-tlc/trip data/yellow_tripdata_2009-03.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-05.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-04.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-01.csv submitted with 200 response.
File: s3://nyc-tlc/trip data/yellow_tripdata_2009-02.csv submitted with 200 response.


In [10]:
# `future` is the loop variable left over from the as_completed loop in the
# previous cell, i.e. the last future that completed; display its full response.
future.result()

{'ResponseMetadata': {'HTTPHeaders': {'connection': 'keep-alive',
   'content-length': '77',
   'content-type': 'application/json',
   'date': 'Wed, 06 Dec 2017 07:41:03 GMT',
   'via': '1.1 11a727876922c83c000e3ada668fa181.cloudfront.net (CloudFront)',
   'x-amz-cf-id': 'GmryV-EHaIy8QrzwkXp4dgEYZiUBjxMYHA0AcPva0oVVC1ekmokR1g==',
   'x-amzn-requestid': 'c96c88c9-da58-11e7-a94d-8da7b7ad37e6',
   'x-amzn-trace-id': 'sampled=0;root=1-5a279f04-3cc6dab08fe7b84be74e1147',
   'x-cache': 'Miss from cloudfront'},
  'HTTPStatusCode': 200,
  'RequestId': 'c96c88c9-da58-11e7-a94d-8da7b7ad37e6',
  'RetryAttempts': 0},
 'jobId': '97c1ecb6-d305-47c2-a0c9-37a89f51731a',
 'jobName': 'TaxiData-2009-02'}