## OBJECTIVE
Sentiment analysis can help us understand if a corpus is generally positive or negative.

## SETUP

In [1]:
%matplotlib inline
import nltk
import os
import re
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from nltk import tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re
import os
import json
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [2]:
tt=TweetTokenizer()

In [3]:
sid = SentimentIntensityAnalyzer()

In [4]:
[m for m in dir(sid) if not m.startswith('_')]

['lexicon',
 'lexicon_file',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [5]:
len(sid.lexicon.keys())

7502

In [6]:
vlex = list(sid.lexicon.items())
vlex[:10]

[('$:', -1.5),
 ('%)', -0.4),
 ('%-)', -1.5),
 ('&-:', -0.4),
 ('&:', -0.7),
 ("( '}{' )", 1.6),
 ('(%', -0.9),
 ("('-:", 2.2),
 ("(':", 2.3),
 ('((-:', 2.1)]

In [7]:
import os
import json
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [8]:
import json
from collections import Counter

In [9]:
NRC_VAD_lexicon = open('VADFolder/NRC-VAD-Lexicon.txt').readlines()

##  FUNCTIONS

In [10]:
%run functions.ipynb

These are my own functions that I edited, but they are also fount in the funtions folder.

In [11]:
def make_kwic2_as_text(kw, text, win=4):
    '''A basic KWIC function for a text
    
    Args:
        kw   -- string match for keyword to match for each line if it contains spaces split into list of kws
        text -- a list of tokens for the text
        
    Return:
        list of lines of form [ [left context words], kw, [right context words]]
    '''
    
    
    kw_toks = kw.split()
    kw_width = len(kw_toks)
    
    hits = []
    for i,w in enumerate(text):
        match = text[i:i+kw_width]
        if match==kw_toks:
            hits.append((' '.join(kw_toks),i))
    
    
    lines = []
    for hit in hits:
        left = text[hit[1]-win:hit[1]]
        kw = ' '.join(text[hit[1]:hit[1]+kw_width])
        right = text[hit[1]+kw_width : hit[1]+win+kw_width]
        
        
        left = ['']*(win-len(left)) + left if len(left)<win else left
        right = right+['']*(win-len(right)) if len(right)<win else right

        
        lines.append(left + right)
        
    return lines

In [12]:
def make_kwic_as_text(kw, text, win=4):
    '''A basic KWIC function for a text
    
    Args:
        kw   -- string match for keyword to match for each line
        text -- a list of tokens for the text
        
    Return:
        list of lines of form [ [left context words], kw, [right context words]]
    '''
    
    hits = [(w,i) for i,w in enumerate(text) if w==kw]
    
    lines = []
    for hit in hits:
        hidx = hit[1]
        left = text[hidx-win:hidx]
        kw = hit[0]
        right = text[hidx+1 : hidx+win+1]
        
        left = ['']*len(left)-win + left if len(left)<win else left
        
        right = right + ['']*len(left)-win  if len(right)<win else right
        
        lines.append(left + right)
        
    return lines

## LOAD DATA

In [13]:
corpus = json.load(open('data/briefing_transcripts.json'))

In [14]:
len(corpus)

53

In [15]:
apr_briefings = [item for item in corpus if item['date'].split()[0]=='Apr']
mar_briefings = [item for item in corpus if item['date'].split()[0]=='Mar']
feb_briefings = [item for item in corpus if item['date'].split()[0]=='Feb']
jan_briefings = [item for item in corpus if item['date'].split()[0]=='Jan']

In [16]:
len(apr_briefings), len(feb_briefings)

(23, 2)

In [17]:
Counter([item['date'].split()[0] for item in corpus]).most_common()

[('Mar', 26), ('Apr', 23), ('Feb', 2), ('Jan', 2)]

## TOKENIZATION

In [18]:
apr_tokens = []

# process each of the April briefings
for briefing in apr_briefings:
    
    # get the text string for the transcript of current briefing
    briefing_text = briefing['text']
    
    # get a list of tokens for this text
    tokens = tokenize(briefing_text, lowercase=True, strip_chars=to_strip)
    
    # update the list of tokens
    apr_tokens.extend(tokens)

In [19]:
jan_tokens = []
for briefing in jan_briefings:
    briefing_text = briefing['text']
    tokens = tokenize(briefing_text, lowercase=True, strip_chars=to_strip)
    jan_tokens.extend(tokens)

In [20]:
feb_tokens = []
for briefing in feb_briefings:
    briefing_text = briefing['text']
    tokens = tokenize(briefing_text, lowercase=True, strip_chars=to_strip)
    feb_tokens.extend(tokens)

In [21]:
mar_tokens = []
for briefing in mar_briefings:
    briefing_text = briefing['text']
    tokens = tokenize(briefing_text, lowercase=True, strip_chars=to_strip)
    mar_tokens.extend(tokens)

## Lexicon Sentiment Analysis

### "We Will": April

In [22]:
apr_kwic_wewill = make_kwic2_as_text("we will", apr_tokens, win=6)

In [28]:
apr_kwic_wewill_sidscore = []

for item in apr_kwic_wewill:
    for tok in item:
        score = sid.lexicon.get(tok,0)
        apr_kwic_wewill_sidscore.append(score)


In [29]:
sum(apr_kwic_wewill_sidscore)

165.9000000000001

### "We Will": March

In [30]:
mar_kwic_wewill= make_kwic2_as_text("we will", mar_tokens, win=6)

In [31]:
mar_kwic_wewill_sidscore = []
for item in mar_kwic_wewill:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    mar_kwic_wewill_sidscore.append(score)

In [32]:
sum(mar_kwic_wewill_sidscore)

13.700000000000001

More positive in April

### "He Did": April

In [33]:
apr_kwic_hedid = make_kwic2_as_text("he did", apr_tokens, win=6)

In [34]:
apr_kwic_hedid

[['back',
  'from',
  'wisconsin',
  'the',
  'first',
  'thing',
  'he',
  'called',
  'up',
  'i',
  'said',
  '“how’s'],
 ['and',
  'also',
  'on',
  '—',
  'the',
  'president:',
  'what?',
  'what?',
  'q',
  'reuters',
  'reported',
  'today'],
 ['of',
  'laboratories',
  'in',
  'each',
  'individual',
  'state',
  'raise',
  'the',
  'issue',
  'that',
  'we',
  'had'],
 ['didn’t',
  'know',
  'about',
  'it',
  'and',
  'if',
  'know',
  'about',
  'it',
  'he',
  'would’ve',
  'been'],
 ['of',
  '—',
  'of',
  'plague',
  'so',
  'what',
  'was',
  'totally',
  'inappropriate',
  'other',
  'than',
  'that'],
 ['“how',
  'did',
  'president',
  'trump',
  'do?”',
  '“oh',
  'uh',
  'terribly',
  'let’s',
  'see',
  'he',
  'did'],
 ['he',
  'did',
  'uh',
  'terribly',
  'let’s',
  'see',
  '—',
  'uh',
  'yeah',
  'he',
  'did',
  'terribly”'],
 ['see',
  'he',
  'did',
  '—',
  'uh',
  'yeah',
  'terribly”',
  '“oh”',
  'but',
  'we',
  'just',
  'got'],
 ['“mark”',
  '—',


In [35]:
apr_kwic_hedid_sidscore = []
for item in apr_kwic_hedid:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    apr_kwic_hedid_sidscore.append(score)

In [36]:
sum(apr_kwic_hedid_sidscore)

1.3

barely positive

### "He Did": March

In [41]:
mar_kwic_hedid = make_kwic2_as_text("he did", mar_tokens, win=6)
len(mar_kwic_hedid)

7

In [42]:
mar_kwic_hedid_sidscore = []
for item in mar_kwic_hedid:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    mar_kwic_hedid_sidscore.append(score)

In [43]:
sum(mar_kwic_hedid_sidscore)

0

seems weird that it is exactly 0, perhaps the lexicon didn't know these words

### "We Have": April

In [64]:
apr_kwic_wehave = make_kwic2_as_text("we have", apr_tokens, win=6)

In [65]:
apr_kwic_wehave 

[['lives',
  'for',
  'those',
  'who',
  'are',
  'infected',
  'taken',
  'unprecedented',
  'action',
  'to',
  'ensure',
  'they'],
 ['and',
  'others',
  'throughout',
  'the',
  'world',
  'because',
  'ventilators',
  'like',
  '—',
  'the',
  'job',
  'that'],
 ['molecular',
  'tests',
  'that',
  'we',
  'do',
  'today',
  'also',
  'brought',
  'up',
  'serological',
  'testing',
  'we'],
 ['make',
  'testing',
  'more',
  'convenient',
  'and',
  'easier',
  'the',
  'swabs',
  'now',
  'that',
  'are',
  'much'],
 ['entire',
  'country',
  'and',
  'lastly',
  'mr',
  'president',
  'a',
  'rather',
  'large',
  'drug',
  'development',
  'business'],
 ['be',
  'used',
  'in',
  'these',
  'testing',
  'kits',
  'about',
  '1200',
  'people',
  'in',
  'our',
  'company'],
 ['test',
  'site',
  'and',
  'since',
  'that',
  'time',
  'opened',
  'large-scale',
  'testing',
  'facilities',
  'across',
  'five'],
 ['excited',
  'with',
  'the',
  'public-private',
  'partners

In [66]:
apr_kwic_wehave_sidscore = []
for item in apr_kwic_wehave:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    apr_kwic_wehave_sidscore.append(score)

In [67]:
sum(apr_kwic_wehave_sidscore)

60.099999999999994

relatively high, positive

### "Be Tested": April

In [68]:
apr_kwic_betested = make_kwic2_as_text("be tested", apr_tokens, win=6)

In [69]:
apr_kwic_betested

[['vaccines',
  'again',
  'the',
  'vaccines',
  'have',
  'to',
  'the',
  'therapeutics',
  'are',
  'for',
  'now',
  'but'],
 ['do',
  'not',
  'have',
  'the',
  'opportunity',
  'to',
  'elsewhere',
  'and',
  'they',
  'are',
  'performing',
  'outstandingly'],
 ['right',
  'now',
  'the',
  'vaccines',
  'have',
  'to',
  'so',
  'it',
  'takes',
  'a',
  'longer',
  'period'],
 ['if',
  'you',
  'have',
  'symptoms',
  'you',
  'should',
  'and',
  'make',
  'sure',
  'that',
  'you',
  'are'],
 ['and',
  'i',
  'hope',
  'our',
  'strength',
  'will',
  'and',
  'our',
  'endurance',
  'will',
  'be',
  'tried']]

In [70]:
apr_kwic_betested_sidscore = []
for item in apr_kwic_betested:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    apr_kwic_betested_sidscore.append(score)

In [51]:
sum(apr_kwic_betested_sidscore)

0

neutral/unknown words

### "We": April

In [71]:
apr_kwic_we= make_kwic_as_text("we", apr_tokens, win=6)

In [72]:
apr_kwic_we_sidscore = []
for item in apr_kwic_we:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    apr_kwic_we_sidscore.append(score)

In [54]:
sum(apr_kwic_we_sidscore)

276.1000000000003

really high

### "We": March

In [55]:
mar_kwic_we= make_kwic_as_text("we", mar_tokens, win=6)

In [56]:
mar_kwic_we_sidscore = []
for item in mar_kwic_we:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    mar_kwic_we_sidscore.append(score)

In [57]:
sum(mar_kwic_we_sidscore)

182.6000000000002

still high

### "He": April

In [58]:
apr_kwic_he= make_kwic_as_text("he", apr_tokens, win=6)

In [59]:
apr_kwic_he_sidscore = []
for item in apr_kwic_he:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    apr_kwic_he_sidscore.append(score)

In [60]:
sum(apr_kwic_he_sidscore)

54.29999999999998

lower than we

### "He": March

In [61]:
mar_kwic_he= make_kwic_as_text("he", mar_tokens, win=6)

In [62]:
mar_kwic_he_sidscore = []
for item in mar_kwic_he:
    for tok in item:
        score = sid.lexicon.get(tok,0)
    mar_kwic_he_sidscore.append(score)

In [63]:
sum(mar_kwic_he_sidscore)

19.300000000000004

lower than he in April