# Runs Sorel job

Captures job initiation information

In [1]:
import boto3
import time
import datetime
from collections import Counter

In [2]:
def logmsg(msg):
    """Write *msg* to standard output as a single line."""
    print(msg)

def logmsg01(msg):
    """Log *msg* prefixed with the current local time (YYYY-MM-DD HH:MM:SS)."""
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    logmsg(f'{stamp}:  {msg}')

In [3]:
# Number of hex digits in each job's filter key (used as the zero-pad width
# when the job keys are generated).
FILTER_SIZE = 2
# Upper bound on Glue job runs in flight at the same time.
MAX_CONCUR_JOBS = 16
# Total number of job keys to process; 256 == 16 ** FILTER_SIZE, i.e. every
# two-digit hex value from 00 to ff.
NUM_JOBS_TO_RUN = 256

# S3 bucket whose 'output' prefix is cleared before the jobs are launched.
TARG_BUCKET = 'sorel-20m-demo'

# Name of the Glue job that is started once per filter key.
JOB_NAME = 'transfer-sorel-binaries'

In [4]:
# boto3 session built from the local 'personal' credentials profile; the S3
# resource and the Glue client are both created from it.
session = boto3.Session(profile_name='personal')

Clean out the target bucket's output prefix before the run

In [5]:
def get_obj_to_delete(prefix='output'):
    """Return the collection of objects in TARG_BUCKET whose keys start with *prefix*."""
    bucket = s3.Bucket(TARG_BUCKET)
    return bucket.objects.filter(Prefix=prefix)

def num_obj(objects):
    """Return how many items iterating over *objects* yields."""
    return sum(1 for _ in objects)

# Equivalent PowerShell command:
#   ('0001', '0002', '001','002','003','3') | %{write-host $_;aws s3 --profile personal rm s3://sorel-20m-demo/output/$_/ --recursive}
s3 = session.resource('s3')

objects = get_obj_to_delete()
logmsg(f'Keys to delete:  {num_obj(objects)}')
while True:
    # The collection-level delete() issues one DeleteObjects request per page
    # of matching keys and returns a list with one response dict per request,
    # so 'Errors' has to be looked up inside each response rather than on the
    # returned list itself.
    response = objects.delete()
    if isinstance(response, dict):
        # Defensive: tolerate a single response dict as well as a list.
        response = [response]
    errors = [err for resp in (response or []) for err in resp.get('Errors', [])]
    if errors:
        logmsg('Errors encountered when performing delete')
        logmsg(errors)
        # Stop instead of retrying forever on keys that cannot be deleted.
        break
    objects = get_obj_to_delete()
    # Count once per pass: every count re-lists the prefix in S3.
    remaining = num_obj(objects)
    if remaining == 0:
        break
    logmsg(f'{remaining} remain to delete')
logmsg('Delete complete')


Keys to delete:  0
Delete complete


In [6]:
# Zero-pad width, in hex digits, of every job key.
padding = FILTER_SIZE

# One tracking record per job, keyed by the zero-padded lowercase hex form of
# its index; both fields stay None until the job is first started.
job_status = {
    format(idx, f'0{padding}x'): dict(JobRunId=None, JobRunState=None)
    for idx in range(NUM_JOBS_TO_RUN)
}
num_success = 0

In [11]:
# Glue client (us-east-1) used to start the job runs and poll their state.
glue_client = session.client('glue', region_name='us-east-1')

def run_job(filter_chars):
    """Start one run of the JOB_NAME Glue job for a single filter value.

    The new run is recorded in ``job_status[filter_chars]`` with state
    'RUNNING', its run id is logged, and the raw ``start_job_run`` response
    is returned.
    """
    job_arguments = {
        "--enable-auto-scaling": "true",
        "--enable-metrics": "true",
        "--enable-spark-ui": "true",
        "--spark-event-logs-path": 's3://aws-glue-assets-883375387566-us-east-1/sparkHistoryLogs/',
        "--enable-continuous-cloudwatch-log": "true",
        "--filter_chars": filter_chars,
    }
    response = glue_client.start_job_run(
        JobName=JOB_NAME,
        Arguments=job_arguments,
        WorkerType='G.1X',
        NumberOfWorkers=5,
    )
    run_id = response["JobRunId"]
    job_status[filter_chars] = {'JobRunId': run_id, 'JobRunState': 'RUNNING'}
    logmsg01(f'Creating job with filter_chars={filter_chars}, JobRunId={run_id}')
    return response

In [20]:
logmsg01('Process begins')
start_time = datetime.datetime.now()

# Glue run states in which a run is still in flight. A run in any of these
# states must keep being polled, must count against MAX_CONCUR_JOBS, and must
# not be started a second time.
ACTIVE_STATES = frozenset({'STARTING', 'RUNNING', 'STOPPING', 'WAITING'})

while True:
    # Refresh the state of every job that is still in flight and count them.
    num_running = 0
    for job_info in job_status.values():
        if job_info['JobRunState'] in ACTIVE_STATES:
            response = glue_client.get_job_run(JobName=JOB_NAME, RunId=job_info['JobRunId'])
            job_info['JobRunState'] = response['JobRun']['JobRunState']

        if job_info['JobRunState'] in ACTIVE_STATES:
            num_running += 1

    # Start jobs that have never run (state None) or whose last run ended
    # without success, up to MAX_CONCUR_JOBS concurrent runs.
    for filter_chars, job_info in job_status.items():
        state = job_info['JobRunState']
        if state == 'SUCCEEDED' or state in ACTIVE_STATES:
            continue
        if num_running >= MAX_CONCUR_JOBS:
            break
        # Start an instance of the job for the current parameter
        run_job(filter_chars)
        num_running += 1

    status_summary = Counter(d['JobRunState'] for d in job_status.values())

    num_success = status_summary.get('SUCCEEDED', 0)
    if num_success >= NUM_JOBS_TO_RUN:
        break

    logmsg01(status_summary)
    # Poll once a minute.
    time.sleep(60)

status_summary = Counter(d['JobRunState'] for d in job_status.values())
logmsg01(status_summary)
end_time = datetime.datetime.now()
logmsg(f'Process duration = {end_time - start_time}')
logmsg01('Process complete')


2023-03-20 04:21:49:  Process begins
2023-03-20 04:22:34:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:23:36:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:24:37:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:25:39:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:26:41:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:27:43:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:28:44:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:29:46:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:30:48:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:31:49:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:32:51:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:33:53:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:34:55:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:35:57:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:36:59:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:38:01:  Counter({None: 240, 'RUNNING': 16})
2023-03-20 04:39:03:  Counter({None

TypeError: string indices must be integers

Counter({'SUCCEEDED': 256})
2023-03-20 14:59:25:  Process complete
