# Imports

In [91]:
import sys
import os
import json
import unicodedata
from ibm_watson import NaturalLanguageUnderstandingV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
from ibm_watson.natural_language_understanding_v1 import Features, SentimentOptions, EmotionOptions

# Authentification

In [2]:
authenticator = IAMAuthenticator('ZtAL7sIQ58MB8UrjOeYR_fvko36zVObg57OQ8y3FftDn')
natural_language_understanding = NaturalLanguageUnderstandingV1(
    version='2019-07-12',
    authenticator=authenticator
)
natural_language_understanding.set_service_url('https://api.us-south.natural-language-understanding.watson.cloud.ibm.com/instances/90eb04a6-053b-4fdd-a09a-2005d9fe421b')

# Analysis

In [89]:
# writes a json to fpath
def write_json(fpath, data):
    # write json back
    mode = 'a' if os.path.exists(fpath) else 'w'
    with open(fpath, mode) as f:
        json.dump(data, f)

In [4]:
# removes control characters from a string
def remove_control_characters(s):
    return "".join(ch for ch in s if unicodedata.category(ch)[0]!="C")

In [5]:
# takes the raw content of a article (an array of text paragraph with unicode control characters) and returns a sanatized complete text
def sanatize_content(raw_content):
    text = ''
    for i in range(len(raw_content)):
        sanatized_paragraph = remove_control_characters(raw_content[i])
        text += ' ' + sanatized_paragraph

    return text

In [18]:
# takes a json of one newssource and calls sentiment and emotion analysis for each article and writes them directly to the given json file
def analyze_newssource(fpath, json):
    no_articles = 0

    for article in json:
        # skip cycle, if the article was already analyzed or the article doesn't have content
        if('watson_analysis' in article or article['content'] == [] or article['content'] == [""]):
            continue

        # print every 10 articles
        if(no_articles % 10 == 0):
            print('Analyzing article ' + str(no_articles) + ' to ' + str(no_articles + 10))
        
        no_articles += 1

        # sanatize content
        raw_content = article['content']
        sanatized_content = sanatize_content(raw_content)

        # analyze content
        sentiment, sadness, joy, fear, disgust, anger = analyze_text(sanatized_content, 0)

        # omit analysis if it yielded 'None' (has an exception analyzing)
        if(sentiment is None or sadness is None or joy is None or disgust is None or anger is None):
            continue

        # write analysis to json
        article['watson_analysis'] = {
            'sentiment' : sentiment,
            'sadness' : sadness,
            'joy' : joy,
            'fear' : fear,
            'disgust' : disgust,
            'anger' : anger,
        }

        # write json to file
        write_json(fpath, json)

    print('Analysis done. All changes written.')

In [16]:
# takes a text, requests analysis and returns sentiment and emotion results

def analyze_text(text, trial_counter):
    if(trial_counter > 5):
        print('Analysis failed 5 times. Skipping.')
        return None, None, None, None, None, None

    try:
        response = natural_language_understanding.analyze(
            text=text,
            features=Features(sentiment=SentimentOptions(), emotion=EmotionOptions())
        ).get_result()
    except Exception:
        print('Analysis exception, retrying...')
        return analyze_text(text, trial_counter + 1)

    try:
        sentiment = response['sentiment']['document']['score']
    except KeyError:
        sentiment = 'N/A'
        print('No value for sentiment.')

    try:
        sadness = response['emotion']['document']['emotion']['sadness']
        joy = response['emotion']['document']['emotion']['joy']
        fear = response['emotion']['document']['emotion']['fear']
        disgust = response['emotion']['document']['emotion']['disgust']
        anger = response['emotion']['document']['emotion']['anger']
    except KeyError:
        sadness = 'N/A'
        joy = 'N/A'
        fear = 'N/A'
        disgust = 'N/A'
        anger = 'N/A'
        print('No value for emotions.')

    return sentiment, sadness, joy, fear, disgust, anger

In [24]:
def analyze_newssources(newssources):

  for newssource in newssources:
      fpath = './dataset/cn_media_raw/' + newssource + '.json'

      # open json at fpath
      with open(fpath) as f:
        print('Opening ' + fpath)
        data = json.load(f)

      # append analysis
      print('Analyzing newssource ' + newssource)
      analyze_newssource('./dataset/cn_media_analyzed/' + newssource + ' analyzed.json', data)

# Start

In [17]:
newssources = ['People', 'Xinhua']

analyze_newssources(newssources)

Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ibm_watson/natural_language_understanding_v1.py", line 185, in analyze
    response = self.send(request)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ibm_cloud_sdk_core/base_service.py", line 228, in send
    raise ApiException(
ibm_cloud_sdk_core.api_exception.ApiException: Error: not enough text for language id, Code: 422 , X-global-transaction-id: a61e97fc2c32377e9f9205b68e7f844a

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "<ipython-input-16-e835b6b254f2>", line 9, in analyze_text
    response = natural_language_understanding.analyze(
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/ibm_watson/natural_language_understanding_v1.py", line 185, in analyze
    response = self.send(request)
  File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/i

# Postprocessing

In [96]:
# @TODO: if this goes live, we should do the postprocessing while analyzing...
def postprocess_newssources(newssources):

  for newssource in newssources:
      fpath = './dataset/us_media_analyzed/' + newssource + '.json'

      # open json at fpath
      with open(fpath) as f:
        print('Opening ' + fpath)
        data = json.load(f)

      # append analysis
      print('Postprocessing ' + newssource)
      postprocess_newssource('./dataset/us_media_analyzed_postprocessed/' + newssource + ' postprocessed.json', data)

In [81]:
def postprocess_newssource(fpath, data):
    article_count = 0
    
    postprocessed_data = []

    for article in data:
        article_count += 1
        if('watson_analysis' not in article):
            print('No analysis, deleting')
            continue
            
        analysis = article['watson_analysis']
        if(analysis['sentiment'] == 'N/A' or analysis['sadness'] == 'N/A' or analysis['joy'] == 'N/A' or analysis['fear'] == 'N/A' or analysis['disgust'] == 'N/A' or analysis['anger'] == 'N/A'):
            print('Analysis yielded N/A, deleting')
            continue

        postprocessed_data.append(article)

    write_json(fpath, postprocessed_data)

    print(article_count)

In [98]:
newssources = ['ABCNews analyzed', 'breitbart analyzed', 'CNN analyzed', 'FoxNews analyzed', 'New York Times analyzed', 'Washington Post analyzed']

postprocess_newssources(newssources)

Opening ./dataset/us_media_analyzed/ABCNews analyzed.json
Postprocessing ABCNews analyzed
238
Opening ./dataset/us_media_analyzed/breitbart analyzed.json
Postprocessing breitbart analyzed
617
Opening ./dataset/us_media_analyzed/CNN analyzed.json
Postprocessing CNN analyzed
316
Opening ./dataset/us_media_analyzed/FoxNews analyzed.json
Postprocessing FoxNews analyzed
388
Opening ./dataset/us_media_analyzed/New York Times analyzed.json
Postprocessing New York Times analyzed
479
Opening ./dataset/us_media_analyzed/Washington Post analyzed.json
Postprocessing Washington Post analyzed
No analysis, deleting
833
