In [5]:
# Name of the S3 bucket that holds the generated MP3 files.
# Set this to the unique bucket name you used in textToSpeechMP3.ipynb.
insert_name = 'neelmiranitest2'

In [2]:
import boto3
import time
import urllib
import json
import pandas as pd

# Client for the Amazon Transcribe API; it is passed to every upload/download call below.
# No region or credentials are given here -- presumably boto3 resolves them from the
# environment this notebook runs in (TODO confirm).
transcribe_client = boto3.client('transcribe')

def transcribe_file_up(job_name, file_uri, transcribe_client,
                       media_format='mp3', language_code='en-US'):
    """Start a transcription job for one audio file.

    A given job name can only be submitted once: the names have to be unique,
    so calling this a second time with the same ``job_name`` fails.

    Args:
        job_name: Unique name for the transcription job.
        file_uri: Location of the audio file (e.g. ``s3://bucket/key.mp3``).
        transcribe_client: Client object exposing ``start_transcription_job``.
        media_format: Audio container format of the file. Defaults to ``'mp3'``.
        language_code: Language the audio is transcribed as. Defaults to
            ``'en-US'``.

    Returns:
        Whatever ``start_transcription_job`` returns, so callers can inspect
        the newly created job if they want to.
    """
    return transcribe_client.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat=media_format,
        LanguageCode=language_code,
    )
    
def transcribe_file_down(job_name, transcribe_client, max_tries=60, wait_seconds=10):
    """Poll a transcription job and return its transcript text.

    The job is queried up to ``max_tries`` times, sleeping ``wait_seconds``
    between attempts while it is still being processed.

    Args:
        job_name: Name of the transcription job to fetch.
        transcribe_client: Client object exposing ``get_transcription_job``.
        max_tries: Maximum number of status checks before giving up.
        wait_seconds: Pause between status checks, in seconds.

    Returns:
        The transcript text of the first result once the job is completed, or
        ``None`` if the job failed or was not ready within ``max_tries`` checks.
    """
    # A bare `import urllib` does not guarantee that the `request` submodule is
    # loaded, so import what is needed explicitly.
    from urllib.request import urlopen

    for _ in range(max_tries):
        job = transcribe_client.get_transcription_job(
            TranscriptionJobName=job_name
        )['TranscriptionJob']
        job_status = job['TranscriptionJobStatus']

        if job_status == 'FAILED':
            # A failed job will not recover, so stop polling right away instead
            # of re-querying it until the tries run out.
            reason = job.get('FailureReason')
            print(f"{job_name} Failed" + (f": {reason}" if reason else ""))
            return None

        if job_status == 'COMPLETED':
            # Close the HTTP response deterministically once it has been read.
            with urlopen(job['Transcript']['TranscriptFileUri']) as response:
                data = json.loads(response.read())
            return data['results']['transcripts'][0]['transcript']

        print(f"Waiting for {job_name}, trying again")
        time.sleep(wait_seconds)  # wait and then try it again

    print(f"{job_name} was not ready after {max_tries} tries")
    return None

In [3]:
# Load the reference sentences; each transcription is later compared against these.
import pandas as pd

orig_df = pd.read_excel(
    '/home/ec2-user/SageMaker/DSC_Final_Project_Group3/Data/100Random.xlsx'
)

# Lookup from each speaker name to the language label stored alongside their rows.
lang_dict = dict(
    Joanna='English',
    Emma='British',
    Lea='French',
    Seoyeon='Korean',
    Aditi='Hindi',
    Lucia='Castilian Spanish',
)

In [7]:
#set up the (still empty) result frame; it only fixes the column layout here,
#the rows are collected in a list below and turned into a frame in one go
df = pd.DataFrame(columns = ['Name', 'Language', 'Original', 'Transcription'])
print(df)

#the prefix in front is needed to make the job names unique: change it before
#re-running this cell, because a job name that was already used is rejected
job_prefix = '1'

names = ['Joanna','Emma','Lea','Seoyeon','Aditi','Lucia']
records = [] #one dict per transcribed sentence
for name in names:
    language = lang_dict[name] #same for every sentence of this speaker
    #numbers 1 to 100, like how we named the mp3 files
    job_names = [f'{job_prefix}{name}{i}' for i in range(1, 101)]

    #code for uploading
    for i, job_name in enumerate(job_names, start=1):
        file_uri = f's3://{insert_name}/{name}{i}.mp3'
        transcribe_file_up(job_name, file_uri, transcribe_client)
    time.sleep(60) #give it time to transcribe all of them

    #the service can only do 100 at a time, so we need to download these
    #before starting the next speaker. This is why the for loops kinda repeat
    for i, job_name in enumerate(job_names, start=1):
        transcription = transcribe_file_down(job_name, transcribe_client)
        records.append({
            'Name': name,
            'Language': language,
            'Original': orig_df['Original'].iat[i-1],
            'Transcription': transcription,
        })

#DataFrame.append was removed in pandas 2.0 and copies the whole frame on every
#call, so build the frame once from the collected rows instead
df = pd.DataFrame(records, columns=df.columns)

Empty DataFrame
Columns: [Name, Language, Original, Transcription]
Index: []


In [8]:
#Cleaning
#there were major issues with the Korean transcriptions, so much so that half were blank
#.copy() makes the filtered result an independent frame, so adding the Score
#column below does not raise SettingWithCopyWarning
df = df[df.Name != 'Seoyeon'].copy()

#Analysis
from textdistance import levenshtein

def _edit_distance(row):
    """Levenshtein distance between a row's original sentence and its transcription.

    A missing transcription (e.g. None from a job that failed or never became
    ready) is scored like a blank one instead of crashing the whole apply.
    """
    transcription = row['Transcription']
    if not isinstance(transcription, str):
        transcription = ''
    return levenshtein.distance(row['Original'], transcription)

df['Score'] = df.apply(_edit_distance, axis=1)
df

Unnamed: 0,Name,Language,Original,Transcription,Score
0,Joanna,English,Gwen had her best sleep ever on her new bed of...,Gwen had her best sleep ever on her new bed of...,0
1,Joanna,English,When confronted with a rotary dial phone the t...,"When confronted with the rotary dial found, a ...",11
2,Joanna,English,There was no telling what thoughts would come ...,There was no telling what thoughts would come ...,0
3,Joanna,English,Baby wipes are made of chocolate stardust.,Baby wipes are made of chocolate stardust.,0
4,Joanna,English,It caught him off guard that space smelled of ...,It cut him off guard that space smelled of sea...,3
...,...,...,...,...,...
595,Lucia,Castilian Spanish,Love is not like pizza.,Love is not like Pizza.,1
596,Lucia,Castilian Spanish,Her scream silenced the rowdy teenagers.,First scream silenced the rowdy teenagers.,4
597,Lucia,Castilian Spanish,"The waves were crashing on the shore, it was a...","The waves were crashing on the shore, It was a...",1
598,Lucia,Castilian Spanish,Sometimes you have to just give up and win by ...,Sometimes you have to just give up and do invi...,10


In [9]:
# Save the scored comparison table as comparisonData.csv (path is relative to the working directory).
# No index= argument is passed, so the row index is written too (pandas default -- confirm if unwanted).
df.to_csv('comparisonData.csv')