# Get AWS File Names

In [2]:
import boto3

## Connect to S3

In [3]:
# Low-level S3 client; later cells use it to list the bucket's keys and to read
# the S3 endpoint URL.
conn = boto3.client('s3')

In [4]:
# Resource-style S3 handle; the download cells at the end of the notebook use it
# to get Bucket objects.
s3 = boto3.resource('s3')

## Get List of All Files

In [12]:
# Collect the key of every object in the bucket.
# A single list call returns at most 1,000 keys, so page through the results.
# A page for an empty bucket carries no 'Contents' entry, hence the .get().
mybucket = 'nutrition-podcast-audio'
all_objects_list = []
paginator = conn.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=mybucket):
    for obj in page.get('Contents', []):
        all_objects_list.append(obj['Key'])

In [13]:
all_objects_list[:3]

['.write_access_check_file.temp',
 'mp3-files-test-set/',
 'mp3-files-test-set/Episode # 11_ How Stress Makes Plants More Nutritious and How This Can Teach Us a Lesson About Stress in our Own Lives.mp3']

## Get List of mp3 File Names in Test Set Folder

In [14]:
# Key pattern used to pick out the episode audio files in the test-set folder
# (it excludes the bare 'mp3-files-test-set/' folder entry).
test_set_string = 'mp3-files-test-set/Episode'

In [15]:
# Keep only keys that begin with the test-set episode pattern. A prefix test is
# used rather than a substring test so that keys which merely contain the same
# text deeper in their path are not picked up.
test_set_list = [key for key in all_objects_list if key.startswith(test_set_string)]


In [16]:
test_set_list

['mp3-files-test-set/Episode # 11_ How Stress Makes Plants More Nutritious and How This Can Teach Us a Lesson About Stress in our Own Lives.mp3',
 'mp3-files-test-set/Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3',
 'mp3-files-test-set/Episode # 179_ If Drug Companies Could Patent This Plant Compound, It Would Be a Billion Dollar Drug.mp3',
 "mp3-files-test-set/Episode # 54_ Why you Shouldn't Place too Much Emphasis on Fasting Blood Measures.mp3"]

## Create List of Job Names based on Episode Number

In [17]:
# Take the second test-set key as a sample for working out the job-name format.
job_name_test = test_set_list[1]

In [18]:
job_name_test

'mp3-files-test-set/Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3'

In [19]:
job_name_test.split('/')

['mp3-files-test-set',
 'Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3']

In [20]:
# Drop the folder part of the key. str.lstrip() takes a *set of characters*, not
# a prefix, so it would also eat leading characters of the file name whenever
# they happen to be in that set. Taking the text after the last '/' is exact.
job_name_test.rsplit('/', 1)[-1]

'Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3'

In [21]:
job_name_test.partition("_")

('mp3-files-test-set/Episode # 16',
 '_',
 ' Seven Simple Strategies for Making Weight Loss Automatic.mp3')

In [49]:
# Split at the first underscore: head is the folder plus "Episode # N",
# tail is the episode title and extension.
head, sep, tail = job_name_test.partition("_")

In [52]:
# Drop the folder part. str.lstrip() strips any leading characters found in the
# given set rather than a literal prefix, so it can remove too much; keeping
# only the text after the last '/' is exact (and safe to re-run).
head = head.rsplit('/', 1)[-1]

In [53]:
head.replace(" ", "_")

'Episode_#_16'

In [68]:
def _job_name_from_key(key):
    """Turn an S3 key into a short job name such as 'Episode-16'.

    The folder part of the key is dropped, the text before the first
    underscore is kept, spaces are removed and '#' becomes '-'.
    """
    # rsplit instead of str.lstrip(): lstrip removes a set of characters, not a
    # prefix, and would corrupt file names starting with any of those characters.
    filename = key.rsplit('/', 1)[-1]
    head, _, _ = filename.partition("_")
    return head.replace(" ", "").replace("#", "-")


job_name_list = [_job_name_from_key(key) for key in test_set_list]



In [69]:
job_name_list

['Episode-11', 'Episode-16', 'Episode-179', 'Episode-54']

### Decided it might be better to make the job name the same as the file name (but will having ".mp3" in the job name cause problems?)

# Test Regular Transcription Job

In [26]:
import time

In [27]:
# Amazon Transcribe client, pinned to the us-west-2 region.
# NOTE(review): Transcribe presumably needs the media bucket to be in the same
# region as this client; confirm the bucket's region.
transcribe = boto3.client('transcribe', region_name="us-west-2")

In [None]:
# Build the endpoint/bucket/key URL of the first test-set file and show it.
url = '/'.join([conn.meta.endpoint_url, mybucket, test_set_list[0]])
print(url)

In [29]:
job_name = "podcast_test_3"
job_uri = url
output_bucket = "nutrition-podcast-audio"

# Start the job; output_bucket is passed as the destination for the result.
transcribe.start_transcription_job(
    TranscriptionJobName=job_name,
    Media={'MediaFileUri': job_uri},
    MediaFormat='mp3',
    LanguageCode='en-US',
    OutputBucketName=output_bucket,
)

# Poll every 5 seconds until the job reports COMPLETED or FAILED, then show the
# final job description.
terminal_states = ('COMPLETED', 'FAILED')
status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
while status['TranscriptionJob']['TranscriptionJobStatus'] not in terminal_states:
    print("Not ready yet...")
    time.sleep(5)
    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
print(status)

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'p

# Zip list of file and job names

In [79]:
test_set_list

['mp3-files-test-set/Episode # 11_ How Stress Makes Plants More Nutritious and How This Can Teach Us a Lesson About Stress in our Own Lives.mp3',
 'mp3-files-test-set/Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3',
 'mp3-files-test-set/Episode # 179_ If Drug Companies Could Patent This Plant Compound, It Would Be a Billion Dollar Drug.mp3',
 "mp3-files-test-set/Episode # 54_ Why you Shouldn't Place too Much Emphasis on Fasting Blood Measures.mp3"]

In [80]:
# One endpoint/bucket/key URL per test-set file, in the same order as test_set_list.
bucket_url = '{}/{}'.format(conn.meta.endpoint_url, mybucket)
test_uri_list = ['{}/{}'.format(bucket_url, fname) for fname in test_set_list]

In [81]:
job_name_list

['Episode-11', 'Episode-16', 'Episode-179', 'Episode-54']

In [82]:
# Pair each media URI with its job name. zip() pairs by position and stops at
# the shorter list, so both lists must have the same length and order.
file_job_names = list(zip(test_uri_list, job_name_list))

In [83]:
# Show each media URI followed by the job name it is paired with.
for uri, name in file_job_names:
    print(uri, name, sep='\n')

https://s3.amazonaws.com/nutrition-podcast-audio/mp3-files-test-set/Episode # 11_ How Stress Makes Plants More Nutritious and How This Can Teach Us a Lesson About Stress in our Own Lives.mp3
Episode-11
https://s3.amazonaws.com/nutrition-podcast-audio/mp3-files-test-set/Episode # 16_ Seven Simple Strategies for Making Weight Loss Automatic.mp3
Episode-16
https://s3.amazonaws.com/nutrition-podcast-audio/mp3-files-test-set/Episode # 179_ If Drug Companies Could Patent This Plant Compound, It Would Be a Billion Dollar Drug.mp3
Episode-179
https://s3.amazonaws.com/nutrition-podcast-audio/mp3-files-test-set/Episode # 54_ Why you Shouldn't Place too Much Emphasis on Fasting Blood Measures.mp3
Episode-54


# Bulk Transcription

In [84]:
output_bucket = "nutrition-podcast-audio"
done_states = ['COMPLETED', 'FAILED']

# Jobs are processed one at a time: each is started and then polled every
# 5 seconds until it reports COMPLETED or FAILED before the next one begins.
for job_uri, job_name in file_job_names:
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat='mp3',
        LanguageCode='en-US',
        OutputBucketName=output_bucket,
    )

    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    while status['TranscriptionJob']['TranscriptionJobStatus'] not in done_states:
        print("Not ready yet...")
        time.sleep(5)
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    # Final job description for this file.
    print(status)
    

Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
Not ready yet...
{'TranscriptionJob': {'TranscriptionJobName': 'E

In [None]:
# Download one object, reporting a missing key instead of failing.
# NOTE(review): BUCKET_NAME and KEY are not defined anywhere in this notebook;
# set them before running this cell.
# ClientError is taken from the client behind the `s3` resource because
# `botocore` is never imported here; naming `botocore.exceptions` in the except
# clause would raise NameError as soon as any error occurred.
try:
    s3.Bucket(BUCKET_NAME).download_file(KEY, 'my_local_image.jpg')
except s3.meta.client.exceptions.ClientError as e:
    if e.response['Error']['Code'] == "404":
        print("The object does not exist.")
    else:
        # Anything other than "not found" is unexpected; let it propagate.
        raise

In [None]:
import os
import boto3

# initiate s3 resource
s3 = boto3.resource('s3')

# select bucket
my_bucket = s3.Bucket('my_bucket_name')

# download every object into the current directory, mirroring the key layout
for obj in my_bucket.objects.all():
    # Keys ending in '/' are "folder" placeholders with nothing to download.
    if obj.key.endswith('/'):
        continue
    local_path = os.path.join(os.curdir, obj.key)
    # Keys may contain '/', and the matching local directories may not exist yet.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    my_bucket.download_file(obj.key, local_path)