# Leveraging Transcribe and Comprehend

In [120]:
# Dependency install, left disabled here; the same command runs (enabled) in the
# Twitter section of this notebook.
#!pip install twitterscraper

## Let's look at the audio files

In [None]:
# Unpack audio_files.zip; the cells that follow play and upload 19-198-0002.mp3
# from the working directory.
!unzip audio_files.zip

In [None]:
import IPython

# Build the player widget, then leave it as the cell's last expression so the
# notebook renders it inline.
clip_player = IPython.display.Audio("19-198-0002.mp3")
clip_player

## Configure transcribe service and create a transcription job

In [None]:
# Upload the sample clip to S3; the transcription job below reads it from this
# location. The bucket is hard-coded here and must match `bucket_name` in the
# next cell.
!aws s3 cp 19-198-0002.mp3 s3://dbsworkshoppedro2/

In [None]:
import boto3
import time

transcribe = boto3.client('transcribe')

# Later cells hard-code this bucket and the job name "test" (the result is
# fetched as test.json), so keep them in sync when changing these values.
bucket_name = 'dbsworkshoppedro2'
job_name = "test"
job_uri = "s3://" + bucket_name + "/19-198-0002.mp3"
output_bucket = bucket_name

try:
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': job_uri},
        MediaFormat='mp3',
        LanguageCode='en-US',
        OutputBucketName=output_bucket,
        Settings={
            'ShowSpeakerLabels': False,
            'ChannelIdentification': False
        }
    )
except transcribe.exceptions.ConflictException:
    # Transcription job names must be unique, so re-running this cell used to
    # crash here. Fall through and poll the job that already exists instead.
    print("Transcription job '" + job_name + "' already exists; reusing it.")

# Poll every 5 seconds until the job reaches a terminal state.
while True:
    status = transcribe.get_transcription_job(TranscriptionJobName=job_name)
    job_state = status['TranscriptionJob']['TranscriptionJobStatus']
    if job_state in ['COMPLETED', 'FAILED']:
        break
    print("Not ready yet...")
    time.sleep(5)
print(status)

# Stop here on failure: the following cells download and parse the transcript
# and would otherwise fail with a much less helpful error.
if job_state == 'FAILED':
    raise RuntimeError(
        "Transcription job '" + job_name + "' failed: "
        + str(status['TranscriptionJob'].get('FailureReason', 'no reason reported'))
    )

In [None]:
# Fetch the job description again, using the same `job_name` the job was started
# with instead of repeating the literal, so the two cannot drift apart.
response = transcribe.get_transcription_job(TranscriptionJobName=job_name)

In [None]:
# Display only the job description part of the response fetched in the previous cell.
transcription_job = response['TranscriptionJob']
transcription_job

## Let's look at the results of the transcription

In [None]:
# Download the transcription result into the working directory. Bucket and object
# name are hard-coded: the bucket equals the OutputBucketName passed when the job
# was started, and test.json corresponds to the job name "test".
!aws s3 cp 's3://dbsworkshoppedro2/test.json' .

In [None]:
import IPython

# Replay the source clip so it can be compared with the transcript printed below.
clip_player = IPython.display.Audio("19-198-0002.mp3")
clip_player

In [None]:
import json
from pprint import pprint

# Parse the downloaded result file; `data` is reused by the sentiment cell below.
with open('test.json') as transcript_file:
    data = json.load(transcript_file)

# The text sits in the first entry of results.transcripts.
first_transcript = data['results']['transcripts'][0]
print(first_transcript['transcript'])

## Let's analyse the transcription with Comprehend 

In [None]:
import boto3
import json

comprehend = boto3.client(service_name='comprehend')

# Analyse the transcript loaded from test.json. To try a fixed sentence instead,
# swap in a literal such as:
#text = "DBS has greatly improved from last year's results"
text = data['results']['transcripts'][0]['transcript']

print('Calling Sentiment Analysis')
sentiment_response = comprehend.detect_sentiment(Text=text, LanguageCode='en')
print(json.dumps(sentiment_response, sort_keys=True, indent=4))
print('End of Sentiment Analysis \n')

## Simple example of Comprehend entity extraction

In [None]:
import boto3
import json

comprehend = boto3.client(service_name='comprehend')

# Fixed sample sentence used to demonstrate entity extraction.
text = "As of January, DBS has outperformed all other major banks in Singapore including HSBC"

print('Calling DetectEntities')
entities_response = comprehend.detect_entities(Text=text, LanguageCode='en')
print(json.dumps(entities_response, sort_keys=True, indent=4))
print('End of DetectEntities\n')

## Query Twitter messages to run analysis (similar to call center feedback analysis)

In [None]:
# Install the scraper imported in the next cell.
# NOTE(review): the version is not pinned, so a fresh environment may pull a
# release with a different API — consider pinning.
!pip install twitterscraper

In [None]:
from twitterscraper import query_tweets

# Header is kept byte-for-byte (including the space before "tweet") so the
# column names pandas derives from this file do not change.
columnTitleRow = "id, tweet\n"

# The `with` block flushes and closes the file before the next cell reads it
# with pandas; previously the handle was left open, so buffered rows could be
# missing from disk. UTF-8 is set explicitly because tweets often contain
# non-ASCII characters and pd.read_csv decodes as UTF-8 by default.
with open('output.txt', "w", encoding='utf-8') as out_file:
    out_file.write(columnTitleRow)
    for i, tweet in enumerate(query_tweets("Amazon OR DBS", 10)):
        uid = str(i)
        # Commas are dropped and newlines flattened as before. Embedded double
        # quotes are doubled (standard CSV escaping) so they no longer end the
        # quoted field early.
        cleaned_text = (
            tweet.text
            .replace(',', '')
            .replace('\n', ' ')
            .replace('"', '""')
        )
        row = uid + ',' + '"' + cleaned_text + '"' + "\n"
        out_file.write(row)

## Output messages to txt

In [None]:
import pandas as pd

# Load the id/tweet file written by the scraping cell.
df = pd.read_csv('output.txt')

In [None]:
# Preview the first five rows (the default for head()).
df.head(5)

In [None]:
# The second column of output.txt holds the tweet text; write it out as the
# input file for the batch Comprehend job.
# NOTE(review): to_csv runs with its defaults, so the row index (and, depending
# on the pandas version, a header line) is written alongside the text — confirm
# that is intended for ONE_DOC_PER_LINE input.
tweet_column = df.iloc[:, 1]
tweet_column.to_csv('tweets.txt')

## Run batch Comprehend api calls

In [None]:
%%time
import sagemaker
from sagemaker import get_execution_role

role=get_execution_role()
print(role)
sess=sagemaker.Session()

In [None]:
output_bucket = bucket_name
s3 = boto3.resource('s3')

# Open the file in a `with` block so the handle is closed once the upload
# finishes; it was previously left open.
with open('tweets.txt', 'rb') as txt:
    uploaded_object = s3.Bucket(output_bucket).put_object(
        Key='twitterdata/input/tweets.txt',
        Body=txt,
    )

# Keep the uploaded object as the cell's displayed result, as before.
uploaded_object

In [None]:
import boto3
import json

# Client used by the batch entities-detection cells that follow.
comprehend = boto3.client('comprehend')

In [None]:

# Input: the tweets file uploaded earlier, one document per line.
# Output: written by Comprehend under the given S3 URI.
tweets_input_uri = 's3://'+bucket_name+'/twitterdata/input/tweets.txt'
results_output_uri = 's3://'+bucket_name+'/twitterdata/output/results.txt'

input_config = {
    'S3Uri': tweets_input_uri,
    'InputFormat': 'ONE_DOC_PER_LINE',
}
output_config = {'S3Uri': results_output_uri}

# Starts an asynchronous job; `response` holds the start-job reply.
response = comprehend.start_entities_detection_job(
    InputDataConfig=input_config,
    OutputDataConfig=output_config,
    DataAccessRoleArn=role,
    JobName='tweet-analysis2',
    LanguageCode='en',
)

In [None]:
!aws s3 cp s3://dbsbucketpedro/twitterdata/output/results.txt/349934754982-NER-990f64a5e84f2b7546abeae5970bd30a/output/output.tar.gz .


In [None]:
# Unpack the downloaded archive with a verbose file listing. The next cell
# expects it to have produced a file named `output`.
!tar -xvf output.tar.gz

In [None]:
# Rename the extracted file to output.json; the next cell reads it line by line as JSON.
!mv output output.json

In [None]:
import json
from pprint import pprint

# output.json holds one JSON document per line; parse each line into a dict.
with open('output.json') as results_file:
    responses = [json.loads(line) for line in results_file]


In [None]:
# Number of tweet rows, to compare with the number of Comprehend responses.
df.shape[0]

In [None]:
# Spot check: text of the second entity in the third response.
third_response = responses[2]
third_response['Entities'][1]['Text']

In [None]:
# Number of parsed Comprehend result lines.
response_count = len(responses)
response_count

In [None]:
# Spot check: text of the second entity in the second response.
second_response = responses[1]
second_response['Entities'][1]['Text']

In [None]:
# Spot check: full result record at position 39 (requires at least 40 responses).
sample_response = responses[39]
sample_response

In [None]:
# Attach one detected entity (text and confidence score) to every tweet row.
# Rows without a usable entity keep the placeholder 'Na'.
#
# The values are collected in plain lists and assigned as whole columns. The
# previous chained form `df['Org'][i] = ...` triggers SettingWithCopyWarning and
# writes nothing under pandas copy-on-write.
#
# NOTE(review): responses are matched to rows by position, and index 1 selects
# the *second* entity of each response — confirm both are intended.
org_values = []
score_values = []
for i in range(len(df)):
    org, score = 'Na', 'Na'
    try:
        entity = responses[i]['Entities'][1]
        org = entity['Text']
        score = entity['Score']
    except (IndexError, KeyError):
        # Only "no such response/entity/field" is expected here; any other
        # error now surfaces instead of being silently swallowed.
        pass
    org_values.append(org)
    score_values.append(score)

df['Org'] = org_values
df['Score'] = score_values

In [None]:
# Show the first 30 tweets together with the Org and Score columns added above.
df.head(n=30)