# AWS Comprehend Sentiment Analysis Using Python

In [1]:
import json
import os
import tarfile

import boto3
import pandas as pd
from botocore.exceptions import ClientError

In [2]:
# Larger tweet table; '\n' is forced as the only line terminator for the parser
df = pd.read_csv(filepath_or_buffer='data/final_data_clean.csv', lineterminator='\n')
# The tweet file that is uploaded to S3 and analysed further down
df2 = pd.read_csv(filepath_or_buffer='data/tweet_1k.csv')

In [4]:
# Reproducible 1,000-row random sample (fixed seed) of the larger table
df.sample(n=1000, random_state=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Verified Status,Follower Count,Location,Reply Counts,Retweet Counts,Like Count,Media,Keyword,Year
735268,735308,19660,2020-07-15 23:47:26+00:00,1283548762875375616,15 communities set records for most sunny day ...,KateReimann,False,513.0,Hawaii,0.0,0.0,1.0,,climatechange,2020
643946,643980,1336,2018-01-14 23:50:05+00:00,952689306451218432,2017 was the third warmest year on record for ...,funds4disaster,True,6101.0,"Washington, DC",0.0,1.0,2.0,,climatechange,2018
134875,134878,25277,2021-09-10 23:30:01+00:00,1436472055948587014,This building is sustainable. Part of a winery...,semodu_pr,False,8695.0,"Stuttgart, München, Birmingham",0.0,0.0,0.0,[Photo(previewUrl='https://pbs.twimg.com/media...,sustainable,2021
58431,58434,21933,2019-08-08 23:46:28+00:00,1159611868815642624,How does a university walk the talk of buildin...,UCNZ,True,10916.0,"Christchurch, Canterbury, NZ",0.0,0.0,7.0,,sustainable,2019
57338,57341,20840,2019-07-28 23:30:25+00:00,1155621561178697730,"All Natural Bamboo soap drying dish, Wooden Ba...",ArtByKeshia,False,822.0,,0.0,0.0,0.0,,sustainable,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492904,492938,7475,2021-03-16 21:18:54+00:00,1371934018896207873,ClimateEmergency auspol nswpol,XRHornsbyShire,False,1144.0,Darug & Guringai Country,0.0,0.0,0.0,,ClimateEmergency,2021
827765,827808,15414,2021-12-06 17:20:56+00:00,1467906920493191170,Another impressive record high for European ca...,KamKazemi,False,477.0,The North,0.0,0.0,0.0,,carbonemissions,2021
279332,279366,4324,2021-02-13 23:46:35+00:00,1360737162489167875,Evidence of global warming,levidacruz,False,9631.0,"Las Vegas, NV",0.0,0.0,0.0,,globalwarming,2021
821704,821747,9353,2021-08-13 06:44:20+00:00,1426072108497506307,"Que vienne faire un tour ici à Paris, avec...",fanodira,False,50.0,Paris,0.0,0.0,1.0,,carbonemissions,2021


In [36]:
# Display the tweet table that gets sent to Comprehend
df2

Unnamed: 0,Text
0,ClimateChange Antarctica ClimateCrisis I want...
1,The greenest building is (usually) the one tha...
2,"Last Thursday teachers from Makino, Campbell S..."
3,"Following Vancouver and many cities in Quebec,..."
4,Like it or not Australia has to do more on Cli...
...,...
995,One good thing about COVID19- Shutdowns and WF...
996,"Um, no. Is this how the libs are fighting glob..."
997,Join XR ClimateEmergency
998,"Hail, flames hit state of fire and ice Climate..."


## Asynchronous Processing - Scheduling an Analysis Job

#### Upload the data to S3

In [37]:
# Local source file, destination bucket and destination object key
local_file_name = 'data/tweet_1k.csv'
bucket_name  = 'pro-change-capstone-bucket'
aws_file_name = 'input-data/tweet_1k.csv'

# NOTE(review): this uploads the CSV as-is, i.e. including its header row and
# the unnamed index column; the job below is configured with one document per
# line, so those presumably end up inside the analysed documents - confirm.
s3 = boto3.client('s3')

# Copy the local file to s3://<bucket_name>/<aws_file_name>
s3.upload_file(Filename=local_file_name, Bucket=bucket_name, Key=aws_file_name)

#### Configure Sentiment Detection Job

In [19]:
# Set these values before running the program.
# The S3 locations are derived from the bucket/key used for the upload so the
# job always reads the object that was just uploaded (no duplicated literals
# that can drift apart).
input_s3_url = f's3://{bucket_name}/{aws_file_name}'
input_doc_format = 'ONE_DOC_PER_LINE'  # every line of the input file is analysed as its own document
output_s3_url = f's3://{bucket_name}/results/'
# IAM role passed to the job as DataAccessRoleArn
data_access_role_arn = 'arn:aws:iam::490318335601:role/pro-change-capstone-role'

# Set up job configuration
input_data_config = {'S3Uri': input_s3_url, 'InputFormat': input_doc_format}
output_data_config = {'S3Uri': output_s3_url}

#### Run the job

In [21]:
# Create the Comprehend client
comprehend = boto3.client('comprehend', region_name='us-west-1')

# Submit an asynchronous job to detect the sentiment of each document
job_request = {
    'InputDataConfig': input_data_config,
    'OutputDataConfig': output_data_config,
    'DataAccessRoleArn': data_access_role_arn,
    'LanguageCode': 'en',
    'JobName': 'Twitter_1K_tweets',
}
start_job_sentiment = comprehend.start_sentiment_detection_job(**job_request)

# Keep the job ID: it is needed to look up the job afterwards
job_id = start_job_sentiment['JobId']
print(f'Your Sentiment Detection JobID is: {job_id}')

InvalidRequestException: An error occurred (InvalidRequestException) when calling the StartSentimentDetectionJob operation: 

Your job runs asynchronously. **This may take several minutes to run.**
When the status turns to COMPLETED, you can retrieve the results.

In [22]:
# Retrieve information about the job - the job may take a while to run.
# Look the job up by the ID returned when it was started (not a pasted
# literal), so the status always refers to the job this notebook submitted.
describe_result = comprehend.describe_sentiment_detection_job(JobId=job_id)
job_properties = describe_result['SentimentDetectionJobProperties']
job_status = job_properties['JobStatus']
print(f'Job Status: {job_status}')
if job_status == 'FAILED':
    # .get() so a missing "Message" field prints None instead of raising KeyError
    print(f'Reason: {job_properties.get("Message")}')

InvalidRequestException: An error occurred (InvalidRequestException) when calling the DescribeSentimentDetectionJob operation: 

#### Download the Results

In [None]:
# Look up the job again and read the output location from its properties
job_description = comprehend.describe_sentiment_detection_job(JobId=job_id)
output_config = job_description['SentimentDetectionJobProperties']['OutputDataConfig']
results_S3Url = output_config['S3Uri']

# Show the output S3 URL
results_S3Url

If you don't know where your results are, get a listing of your bucket.

In [35]:
# Print the key of every object stored in a bucket
def s3_bucket_list_obj(bucket):
    """Print the key of each object in the S3 bucket named ``bucket``.

    Parameters
    ----------
    bucket : str
        Name of the bucket to list.
    """
    bucket_resource = boto3.resource('s3').Bucket(bucket)
    for s3_object in bucket_resource.objects.all():
        print(s3_object.key)

In [None]:
# Print every object key in the project bucket
s3_bucket_list_obj(bucket=bucket_name)

Download the results

In [31]:
# Give your local results file a name
results_name = 'sentiment'

local_results_filename = 'Comprehend/outputs/' + results_name + '.tar.gz'
s3_name = 's3://' + bucket_name + '/'

# Turn the S3 URI into an object key by removing only the leading
# "s3://<bucket>/" (str.replace would also remove later occurrences).
if not results_S3Url.startswith(s3_name):
    raise ValueError(f'{results_S3Url} is not inside bucket {bucket_name}')
results_aws_filename = results_S3Url[len(s3_name):]

# Make sure the local target directory exists before downloading into it
os.makedirs(os.path.dirname(local_results_filename), exist_ok=True)

# Download results
s3 = boto3.client('s3')
s3.download_file(bucket_name,
                 results_aws_filename,
                 local_results_filename)

#### Process the results

In [24]:
# Unpack a .tar.gz or .tar archive
def extract_targz(targz_file, output_path = ''):
    """Extract every member of a tar archive into ``output_path``.

    Parameters
    ----------
    targz_file : str
        Path of the archive. Names ending in "tar.gz" are opened as gzip
        compressed, names ending in "tar" as uncompressed.
    output_path : str, optional
        Directory to extract into; the empty string (default) is passed
        straight through to ``TarFile.extractall``.

    Raises
    ------
    ValueError
        If the file name has neither supported ending (previously this was
        a silent no-op, which only surfaced later as a missing file).
    """
    if targz_file.endswith("tar.gz"):
        mode = "r:gz"
    elif targz_file.endswith("tar"):
        mode = "r:"
    else:
        raise ValueError(f"Unsupported archive type: {targz_file!r}")

    # The context manager closes the archive even if extraction fails.
    # NOTE: extractall trusts the member paths in the archive; only use this
    # on archives from a trusted source.
    with tarfile.open(targz_file, mode) as tar:
        tar.extractall(path = output_path)

In [25]:
# Extract the archive that the download step saved (local_results_filename)
# instead of a separately hard-coded 'output.tar.gz'.
# The archive contains a file named 'output', which is read in the next step.
output_path = 'extracted'
extract_targz(local_results_filename, output_path)

Read the results

In [26]:
# Read the extracted output: one JSON object per line
input_file = output_path + '/output'
# `with` closes the file handle; blank lines are skipped because json.loads
# would raise on them.
with open(input_file, 'r', encoding='utf-8') as results_file:
    results = [json.loads(line) for line in results_file if line.strip()]
print('Number of records in the output:', len(results))

Number of records in the output: 1001


This is what the output looks like. Note that the records are often not in the same order in which they were sent.

In [27]:
# Peek at the first parsed record
results[0]

{'File': 'tweet_1k.csv',
 'Line': 2,
 'Sentiment': 'NEUTRAL',
 'SentimentScore': {'Mixed': 0.0005515885422937572,
  'Negative': 0.023323293775320053,
  'Neutral': 0.8814818859100342,
  'Positive': 0.09464327245950699}}

In [28]:
# This function parses sentiment data into a dataframe
def parse_sentiment(data):
    """Turn a list of sentiment records into a DataFrame indexed by line.

    Parameters
    ----------
    data : list of dict
        Parsed output records. Each record may carry the keys
        'SentimentScore' (a dict of score name -> value), 'File',
        'Sentiment' and 'Line'.

    Returns
    -------
    pandas.DataFrame
        One row per record, indexed by 'Line'. Columns are the score names
        followed by 'File' and 'Sentiment'. A record that lacks any of these
        keys gets missing values in the corresponding columns instead of
        raising KeyError (presumably relevant for lines the service could
        not analyse - confirm against the Comprehend output format).
    """
    # `or {}` covers both an absent key and an explicit null score
    frame = pd.DataFrame([item.get('SentimentScore') or {} for item in data])
    for column in ('File', 'Sentiment', 'Line'):
        frame[column] = [item.get(column) for item in data]
    return frame.set_index('Line')

In [29]:
# Build the dataframe from the parsed records,
# then order its rows by the 'Line' index
parsed_results = parse_sentiment(results)
sentiment_results = parsed_results.sort_index()

In [43]:
# Display the parsed results, ordered by line number
sentiment_results

Unnamed: 0_level_0,Mixed,Negative,Neutral,Positive,File,Sentiment
Line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0.000220,0.002068,0.995343,0.002368,tweet_1k.csv,NEUTRAL
1,0.000031,0.026165,0.773712,0.200091,tweet_1k.csv,NEUTRAL
2,0.000552,0.023323,0.881482,0.094643,tweet_1k.csv,NEUTRAL
3,0.000052,0.013647,0.706558,0.279743,tweet_1k.csv,NEUTRAL
4,0.000638,0.447723,0.530791,0.020848,tweet_1k.csv,NEUTRAL
...,...,...,...,...,...,...
996,0.000835,0.161633,0.688598,0.148934,tweet_1k.csv,NEUTRAL
997,0.005910,0.265271,0.626391,0.102428,tweet_1k.csv,NEUTRAL
998,0.000023,0.000088,0.995790,0.004099,tweet_1k.csv,NEUTRAL
999,0.000029,0.154233,0.833915,0.011822,tweet_1k.csv,NEUTRAL


#### Results Validation

Let's take a look at a record and validate that the results were similar (they won't be exactly the same), and that we sorted the dataframe correctly.

In [33]:
record_no = 230
# Tweet text: taken from the 'Text' column of the table that was analysed
# (df2). `df` has many columns, so `.loc[record_no].item()` cannot work on it.
# NOTE(review): the job output has 1001 records for a 1000-row CSV, which
# suggests the header line was analysed as Line 0; if so, Line k corresponds
# to df2 row k-1 - confirm before trusting this comparison.
tweet_text = df2.loc[record_no, 'Text']
print('TWEET TEXT:\n',
      tweet_text)
# Real Time Results (optional: one synchronous API call)
# print('\nREAL TIME RESULTS:\n')
# print(comprehend.detect_sentiment(Text=tweet_text, LanguageCode='en')['SentimentScore'])
# Job Results
print('\nASYNCHRONOUS RESULTS:')
print(sentiment_results.loc[record_no])

TWEET TEXT:
   The obvious answer is that the right wingers in both the Nats and Libs form a Conservative party. The remaining moderates form a true liberal party. Just like the 50's split for Labor into the DLP. Substitute climate change deniers for conservative catholicism . Everyone is happy!

ASYNCHRONOUS RESULTS:
Mixed            0.102015
Negative         0.479512
Neutral           0.30216
Positive         0.116313
File         tweet_1k.csv
Sentiment        NEGATIVE
Name: 230, dtype: object


#### Save the results file

In [41]:
# NOTE(review): the file name still says 'walmart' although the data come
# from tweet_1k.csv - rename if nothing downstream depends on it.
output_name = 'Comprehend/outputs/sentiment_results_walmart_1k.xlsx'
# Make sure the output directory exists before writing into it
os.makedirs(os.path.dirname(output_name), exist_ok=True)

# Attach the tweet text from the analysed table; neither loaded frame has a
# 'walmart_tweets' column. Assignment aligns df2's index with the 'Line' index.
# NOTE(review): if the CSV header was analysed as Line 0 (1001 output records
# for 1000 rows suggests so), the text is shifted by one row - confirm.
sentiment_results['Text'] = df2['Text']

# `encoding` is not passed: to_excel no longer accepts it in pandas 2.x
sentiment_results.to_excel(output_name, engine = 'xlsxwriter')