# Comprehend Organization NER on messages

In [120]:
#!pip install twitterscraper

## Simple example of Comprehend entity extraction

In [61]:
import boto3
import json

comprehend = boto3.client(service_name='comprehend', region_name='us-west-2')
text = "Netflix is dowing better than DBS"

print('Calling DetectEntities')
print(json.dumps(comprehend.detect_entities(Text=text, LanguageCode='en'), sort_keys=True, indent=4))
print('End of DetectEntities\n')

Calling DetectEntities
{
    "Entities": [
        {
            "BeginOffset": 0,
            "EndOffset": 7,
            "Score": 0.9994915723800659,
            "Text": "Netflix",
            "Type": "ORGANIZATION"
        },
        {
            "BeginOffset": 30,
            "EndOffset": 33,
            "Score": 0.5787805318832397,
            "Text": "DBS",
            "Type": "ORGANIZATION"
        }
    ],
    "ResponseMetadata": {
        "HTTPHeaders": {
            "connection": "keep-alive",
            "content-length": "208",
            "content-type": "application/x-amz-json-1.1",
            "date": "Sun, 16 Sep 2018 13:18:01 GMT",
            "x-amzn-requestid": "f01a1b91-b9b2-11e8-97de-3b872c716a91"
        },
        "HTTPStatusCode": 200,
        "RequestId": "f01a1b91-b9b2-11e8-97de-3b872c716a91",
        "RetryAttempts": 0
    }
}
End of DetectEntities



## Query twitter messages to run analysis (similar to OTP message analysis)

In [40]:
from twitterscraper import query_tweets


csv = open('output.txt', "w") 
#"w" indicates that you're writing strings to the file

columnTitleRow = "id, tweet\n"
csv.write(columnTitleRow)
i=0
for tweet in query_tweets("Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS", 40000):
    uid = str(i)
    text = tweet.text
    row = uid+','+'"'+tweet.text.replace(',','').replace('\n',' ')+'"' + "\n"
    i=i+1
    csv.write(row)

INFO: queries: ['Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2006-03-21 until:2006-11-04', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2006-11-04 until:2007-06-20', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2007-06-20 until:2008-02-03', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2008-02-03 until:2008-09-18', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2008-09-18 until:2009-05-04', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2009-05-04 until:2009-12-18', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2009-12-18 until:2010-08-03', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2010-08-03 until:2011-03-19', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2011-03-19 until:2011-11-02', 'Netflix OR Facebook OR Amazon OR HSBC OR Lazada OR Bein OR DBS since:2011-11-02 unt

## Output messages to txt

In [41]:
import pandas as pd
df=pd.read_csv('output.txt')

In [42]:
df.head()

Unnamed: 0,id,tweet
0,0,Amazon Prime you had me at the knock on my door.
1,1,Amazon just recommended Paradigms of Artificia...
2,2,I am playing with texting twitter and facebook
3,3,Now I'm wondering if these updates can be comb...
4,4,Wondering if I should contact a co-worker I ju...


In [53]:
df.iloc[:,1].to_csv('tweets.txt')

## Run batch Comprehend api calls

In [54]:
%%time
import sagemaker
from sagemaker import get_execution_role

role=get_execution_role()
print(role)
sess=sagemaker.Session()

arn:aws:iam::349934754982:role/service-role/AmazonSageMaker-ExecutionRole-20180901T102635
CPU times: user 368 ms, sys: 0 ns, total: 368 ms
Wall time: 813 ms


In [70]:
output_bucket='sgtelcodemo'
s3 = boto3.resource('s3')
txt = open('tweets.txt', 'rb')
s3.Bucket(output_bucket).put_object(Key='twitterdata/input/tweets.txt', Body=txt)

s3.Object(bucket_name='sgtelcodemo', key='twitterdata/input/tweets.txt')

In [71]:
import boto3
import json
comprehend = boto3.client(service_name='comprehend', region_name='us-west-2')

In [72]:

response = comprehend.start_entities_detection_job(
    InputDataConfig={
        'S3Uri': 's3://sgtelcodemo/twitterdata/input/tweets.txt',
        'InputFormat':'ONE_DOC_PER_LINE'
    },
    OutputDataConfig={
        'S3Uri': 's3://sgtelcodemo/twitterdata/output/results.txt'
    },
    DataAccessRoleArn=role,
    JobName='tweet-analysis',
    LanguageCode='en'
)

In [73]:
!aws s3 cp s3://sgtelcodemo/twitterdata/output/results.txt/349934754982-NER-ed6fbfa663a094cece2d816f186336ad/output/output.tar.gz .


Completed 256.0 KiB/2.7 MiB (1.4 MiB/s) with 1 file(s) remainingCompleted 512.0 KiB/2.7 MiB (2.6 MiB/s) with 1 file(s) remainingCompleted 768.0 KiB/2.7 MiB (3.9 MiB/s) with 1 file(s) remainingCompleted 1.0 MiB/2.7 MiB (5.1 MiB/s) with 1 file(s) remaining  Completed 1.2 MiB/2.7 MiB (6.3 MiB/s) with 1 file(s) remaining  Completed 1.5 MiB/2.7 MiB (7.4 MiB/s) with 1 file(s) remaining  Completed 1.8 MiB/2.7 MiB (8.5 MiB/s) with 1 file(s) remaining  Completed 2.0 MiB/2.7 MiB (9.7 MiB/s) with 1 file(s) remaining  Completed 2.2 MiB/2.7 MiB (10.8 MiB/s) with 1 file(s) remaining Completed 2.5 MiB/2.7 MiB (11.9 MiB/s) with 1 file(s) remaining Completed 2.7 MiB/2.7 MiB (12.7 MiB/s) with 1 file(s) remaining download: s3://sgtelcodemo/twitterdata/output/results.txt/349934754982-NER-ed6fbfa663a094cece2d816f186336ad/output/output.tar.gz to ./output.tar.gz


In [74]:
!tar -xvf output.tar.gz

output


In [79]:
!mv output output.json

In [91]:
import json
from pprint import pprint

responses=[]
with open('output.json') as f:
    for i in f:
        responses.append(json.loads(i))


In [105]:
len(df)

38320

In [113]:
responses[2]['Entities'][1]['Text']

'twitter'

In [106]:
len(responses)

38394

In [101]:
responses[1]['Entities'][1]['Text']

'Amazon'

In [116]:
responses[39]

{'Entities': [{'BeginOffset': 0,
   'EndOffset': 2,
   'Score': 0.8730478882789612,
   'Text': '39',
   'Type': 'QUANTITY'},
  {'BeginOffset': 7,
   'EndOffset': 13,
   'Score': 0.9843676686286926,
   'Text': 'Amazon',
   'Type': 'ORGANIZATION'},
  {'BeginOffset': 14,
   'EndOffset': 19,
   'Score': 0.9237908124923706,
   'Text': 'Prime',
   'Type': 'COMMERCIAL_ITEM'},
  {'BeginOffset': 33,
   'EndOffset': 58,
   'Score': 0.9991692304611206,
   'Text': 'http://tinyurl.com/yqjofl',
   'Type': 'OTHER'}],
 'File': 'tweets.txt',
 'Line': 39}

In [124]:
df['Org']='Na'
df['Score']='Na'
for i in range(len(df)):
    try:
        df['Org'][i]=responses[i]['Entities'][1]['Text']
        df['Score'][i]=responses[i]['Entities'][1]['Score']
    except Exception:
        pass

In [126]:
df.head(30)

Unnamed: 0,id,tweet,Org,Score
0,0,Amazon Prime you had me at the knock on my door.,Amazon,0.996473
1,1,Amazon just recommended Paradigms of Artificia...,Amazon,0.998203
2,2,I am playing with texting twitter and facebook,twitter,0.99429
3,3,Now I'm wondering if these updates can be comb...,facebook,0.975061
4,4,Wondering if I should contact a co-worker I ju...,Facebook,0.99893
5,5,sleepy from furniture moving. listening to KCR...,KCRW,0.651127
6,6,very hot inside today. uploading photos to zoo...,today,0.957422
7,7,been online window shopping at amazon all day....,amazon,0.995884
8,8,Mailing Netflix.,Netflix,0.986677
9,9,LMAO: some guy on facebook's addicted to frien...,facebook,0.997715
