In [9]:
## Madelyn -- I made some edits below to get you started -- take a look and let me know if you have any issues. -RV

## OBJECTIVE

Collocation analysis of a word lists the words that often occur near that word.

## SETUP

In [5]:
%matplotlib inline

import matplotlib.pyplot as plt
import os
import re
import pandas as pd
import math

from collections import Counter

# NOTE(review): matplotlib 3.6 deprecated the bare 'seaborn' style name in favour of
# 'seaborn-v0_8' -- confirm the installed matplotlib still accepts this name.
plt.style.use('seaborn')

import json
from collections import Counter

In [6]:
# Characters deleted from each transcript before it is split into tokens
# (passed to tokenize() as strip_chars). Because the hyphen and apostrophes are
# in this set, hyphenated words and contractions collapse into single tokens
# (e.g. 'pointofcare', 'weve').
to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*—'

## FUNCTIONS

In [7]:
def tokenize(text, lowercase=True, strip_chars=''):
    '''split a string into whitespace-delimited tokens, optionally lowercasing it and deleting unwanted characters first

    Args:
        text        -- the string to tokenize
        lowercase   -- when True, lowercase the text before anything else (default: True)
        strip_chars -- every character in this string is deleted from the text before it is split (default: empty string)

    Returns:
        list of tokens (an empty list when the text has no non-whitespace content)
    '''
    # translation table that maps each unwanted character to "delete"
    deletion_table = str.maketrans('', '', strip_chars)

    # lowercasing happens BEFORE stripping, so strip_chars is matched against the lowercased text
    normalized = text.lower() if lowercase else text

    return normalized.translate(deletion_table).split()
        

In [8]:
def collocates(tokens, kw, win=[4,4]):
    '''return the collocates in a window around every instance of a keyword

    Args:
          tokens -- a list of tokens
          kw     -- keyword string to find and get collocates for
          win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]
                    (a side whose size is less than 1 contributes nothing)

    Returns:
          a single flat list of the tokens found inside the window around each
          instance of kw; for each hit the left-side tokens come first, then the
          right-side tokens. The keyword itself is not included for its own hit.
    '''
    # NOTE: the list default is only ever read, never mutated, so sharing it across calls is safe.
    hits = [pos for pos, token in enumerate(tokens) if token == kw]

    context = []
    for hit in hits:
        if win[0] >= 1:
            # Clamp the start at 0: when the keyword is closer to the beginning of
            # the list than the window size, a negative start index would wrap
            # around to the END of the list and silently drop the left context.
            left_start = max(0, hit - win[0])
            context.extend(tokens[left_start:hit])

        if win[1] >= 1:
            # Slicing past the end of the list is already safe (it just truncates).
            context.extend(tokens[hit + 1:hit + 1 + win[1]])

    return context

In [9]:
def get_colls(texts,kw, win=[4,4]):
    '''create a collocate frequency list for instances of a kw in a collection of tokenized texts

    Args:
        texts  -- the tokenized texts: either a dict-like object whose values are
                  lists of tokens (anything with an .items() method), or a plain
                  iterable of token lists
        kw     -- keyword string to find and get collocates for
        win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]

    Returns:
        a 3-tuple (colls, kw_freq, n_tokens) where
          colls    -- a list-of-tuples, each tuple being (collocate, freq_with_kw, coll_total_freq)
          kw_freq  -- total frequency of kw across all texts (None if kw never occurs)
          n_tokens -- total number of tokens across all texts
    '''
    # The original implementation required a dict (it called texts.items()) even
    # though the docstring promised a list; both shapes are accepted now.
    if callable(getattr(texts, 'items', None)):
        token_lists = (tokens for _, tokens in texts.items())
    else:
        token_lists = texts

    word_dist = Counter()
    colls = Counter()
    for tokens in token_lists:
        word_dist.update(tokens)
        colls.update(collocates(tokens, kw, win))

    coll_table = [(str(coll), freq, word_dist[coll]) for coll, freq in colls.items()]

    return coll_table, word_dist.get(kw), sum(word_dist.values())

In [10]:
def pmi(A, B, AB, N):
    '''pointwise mutual information of two words, computed from raw frequencies

    Args:
        A   -- total frequency of word 1
        B   -- total frequency of word 2
        AB  -- frequency of word 1 and word 2 occurring together
        N   -- number of tokens in corpus/sample

    Returns:
        the PMI value   log2( N * AB / (A * B) )
    '''
    # ratio of the joint count to the product of the individual counts
    joint_over_marginals = AB / (A * B)

    return math.log2(N * joint_over_marginals)

## LOAD DATA

In [11]:
# Parse the briefing transcripts; the with-block closes the file handle as soon
# as parsing finishes (the bare open() call left it open).
with open('data/briefing_transcripts.json') as transcript_file:
    corpus = json.load(transcript_file)

In [12]:
# Split the corpus by month: a briefing belongs to a month when the first
# whitespace-separated piece of its 'date' string equals that month's abbreviation.
apr_briefings, mar_briefings, feb_briefings, jan_briefings = (
    [entry for entry in corpus if entry['date'].split()[0] == month_abbrev]
    for month_abbrev in ('Apr', 'Mar', 'Feb', 'Jan')
)

In [13]:
# Flatten all April briefings into one running list of tokens.
apr_tokens = []

for briefing in apr_briefings:
    # tokenize the transcript text of this briefing (lowercased, with the
    # characters in to_strip removed) and append its tokens to the running list
    apr_tokens += tokenize(briefing['text'], lowercase=True, strip_chars=to_strip)

# sanity check: show the first 50 tokens
print(apr_tokens[:50])

['rose', 'garden', '538', 'pm', 'edt', 'the', 'president', 'thank', 'you', 'very', 'much', 'thank', 'you', 'today', 'id', 'like', 'to', 'provide', 'you', 'with', 'an', 'update', 'in', 'our', 'war', 'against', 'the', 'coronavirus', 'thanks', 'to', 'our', 'comprehensive', 'strategy', 'and', 'extraordinary', 'devotion', 'to', 'our', 'citizens', 'weve', 'had', 'such', 'tremendous', 'support', 'all', 'over', 'we', 'continue', 'to', 'see']


In [14]:
# Flatten all January briefings into one list of tokens.
jan_tokens = [
    token
    for briefing in jan_briefings
    for token in tokenize(briefing['text'], lowercase=True, strip_chars=to_strip)
]

In [15]:
# Flatten all March briefings into one list of tokens.
mar_tokens = [
    token
    for briefing in mar_briefings
    for token in tokenize(briefing['text'], lowercase=True, strip_chars=to_strip)
]

In [16]:
# Flatten all February briefings into one list of tokens.
feb_tokens = [
    token
    for briefing in feb_briefings
    for token in tokenize(briefing['text'], lowercase=True, strip_chars=to_strip)
]

In [17]:
# apr_tokens is ALREADY one flat list of tokens, so it is counted directly --
# no loop over separate texts is needed here.
tokens = apr_tokens

# pair each token with the token that follows it to form space-joined bigrams
bigrams = ["{} {}".format(first, second) for first, second in zip(tokens, tokens[1:])]

apr_word_dist = Counter(tokens)
apr_bigram_dist = Counter(bigrams)


In [18]:
# the 50 most frequent tokens in the April briefings
apr_word_dist.most_common(50)

[('the', 18749),
 ('to', 12776),
 ('and', 12315),
 ('that', 8680),
 ('a', 7991),
 ('of', 7885),
 ('you', 6783),
 ('i', 6574),
 ('we', 6399),
 ('in', 5090),
 ('have', 4787),
 ('it', 4773),
 ('president', 3718),
 ('is', 3692),
 ('were', 3440),
 ('they', 3176),
 ('but', 3147),
 ('are', 3061),
 ('be', 2917),
 ('this', 2832),
 ('so', 2693),
 ('for', 2577),
 ('with', 2436),
 ('very', 2413),
 ('q', 2353),
 ('on', 2319),
 ('its', 2296),
 ('going', 2287),
 ('people', 2137),
 ('what', 2070),
 ('do', 1977),
 ('know', 1945),
 ('about', 1893),
 ('as', 1768),
 ('was', 1749),
 ('our', 1723),
 ('at', 1683),
 ('not', 1665),
 ('all', 1602),
 ('think', 1600),
 ('well', 1590),
 ('if', 1538),
 ('want', 1416),
 ('them', 1357),
 ('can', 1345),
 ('just', 1336),
 ('theyre', 1304),
 ('been', 1285),
 ('now', 1271),
 ('because', 1261)]

In [19]:
# the 50 most frequent bigrams (adjacent token pairs) in the April briefings
apr_bigram_dist.most_common(50)

[('the president', 2634),
 ('going to', 1867),
 ('of the', 1566),
 ('in the', 1326),
 ('we have', 1230),
 ('to be', 1210),
 ('i think', 1121),
 ('want to', 1072),
 ('you know', 1065),
 ('a lot', 955),
 ('and i', 912),
 ('to the', 818),
 ('lot of', 767),
 ('and we', 709),
 ('to do', 698),
 ('and the', 672),
 ('thank you', 661),
 ('were going', 656),
 ('on the', 638),
 ('that we', 615),
 ('have to', 601),
 ('have a', 591),
 ('with the', 583),
 ('mr president', 562),
 ('this is', 518),
 ('at the', 504),
 ('it was', 499),
 ('all of', 484),
 ('i dont', 482),
 ('you have', 464),
 ('to get', 447),
 ('that the', 443),
 ('a very', 437),
 ('for the', 437),
 ('if you', 432),
 ('i mean', 424),
 ('to have', 421),
 ('new york', 420),
 ('have been', 399),
 ('the country', 381),
 ('and they', 376),
 ('dr birx', 375),
 ('to make', 366),
 ('vice president', 366),
 ('that are', 363),
 ('we are', 363),
 ('and were', 361),
 ('able to', 360),
 ('in a', 353),
 ('go ahead', 351)]

### "Reopen": April

In [20]:
# count the tokens found in the 4 positions to the LEFT of each 'reopen'
# (window [4,0] = no right-hand context) in the April briefings
apr_colls_reopen = Counter(collocates(apr_tokens, 'reopen', [4, 0]))

In [21]:
# the 30 most frequent left-window collocates of 'reopen' (April), ranked by raw count
# NOTE(review): raw counts favour function words; pmi() is defined above but not applied in the cells visible here
apr_colls_reopen.most_common(30)

[('to', 54),
 ('we', 19),
 ('and', 11),
 ('can', 9),
 ('the', 9),
 ('you', 8),
 ('that', 7),
 ('want', 6),
 ('one', 5),
 ('states', 5),
 ('ready', 5),
 ('do', 5),
 ('be', 4),
 ('not', 4),
 ('about', 4),
 ('are', 4),
 ('is', 4),
 ('when', 4),
 ('see', 3),
 ('begin', 3),
 ('businesses', 3),
 ('it', 3),
 ('country', 3),
 ('gradually', 2),
 ('america', 2),
 ('able', 2),
 ('safely', 2),
 ('will', 2),
 ('way', 2),
 ('day', 2)]

### "Fault": March

In [22]:
# count the tokens found in the 4 positions to the LEFT of each 'fault'
# (window [4,0] = no right-hand context) in the March briefings
mar_colls_fault = Counter(collocates(mar_tokens, 'fault', [4, 0]))

In [30]:
# the 30 most frequent left-window collocates of 'fault' (March), ranked by raw count
mar_colls_fault.most_common(30)

[('not', 17),
 ('their', 16),
 ('its', 15),
 ('no', 11),
 ('nobodys', 9),
 ('it', 8),
 ('is', 6),
 ('was', 6),
 ('this', 5),
 ('through', 5),
 ('the', 4),
 ('and', 3),
 ('fault', 3),
 ('but', 3),
 ('wasnt', 3),
 ('are', 2),
 ('in', 2),
 ('to', 2),
 ('virus', 2),
 ('that', 2),
 ('thats', 2),
 ('these', 1),
 ('governors', 1),
 ('at', 1),
 ('federal', 1),
 ('governments', 1),
 ('president', 1),
 ('well', 1),
 ('country', 1),
 ('make', 1)]

No clear ownership: no single party is named as being at fault.
"nobodys" is a collocate, which means he said it is nobody's fault.

### "Test": March

In [23]:
# count the tokens found in the 4 positions to the LEFT of each 'test'
# (window [4,0] = no right-hand context) in the March briefings
mar_colls_test = Counter(collocates(mar_tokens, 'test', [4, 0]))

In [24]:
# the 30 most frequent left-window collocates of 'test' (March), ranked by raw count
mar_colls_test.most_common(30)

[('a', 139),
 ('the', 106),
 ('to', 73),
 ('have', 37),
 ('that', 34),
 ('this', 30),
 ('of', 28),
 ('get', 27),
 ('and', 26),
 ('we', 25),
 ('you', 22),
 ('can', 19),
 ('is', 18),
 ('coronavirus', 17),
 ('test', 16),
 ('it', 15),
 ('i', 15),
 ('about', 14),
 ('need', 13),
 ('not', 12),
 ('do', 12),
 ('new', 11),
 ('its', 11),
 ('dont', 10),
 ('pointofcare', 9),
 ('be', 9),
 ('who', 9),
 ('able', 8),
 ('in', 8),
 ('with', 8)]

### "Symptoms": March

In [37]:
# count the tokens found in the 4 positions to the LEFT of each 'symptoms'
# (window [4,0] = no right-hand context) in the March briefings
mar_colls_symptoms = Counter(collocates(mar_tokens, 'symptoms', [4, 0]))

In [38]:
# the 30 most frequent left-window collocates of 'symptoms' (March), ranked by raw count
mar_colls_symptoms.most_common(30)

[('have', 45),
 ('you', 20),
 ('people', 20),
 ('the', 18),
 ('that', 16),
 ('if', 13),
 ('dont', 12),
 ('who', 11),
 ('had', 11),
 ('no', 11),
 ('with', 10),
 ('are', 9),
 ('any', 8),
 ('of', 8),
 ('and', 7),
 ('mild', 7),
 ('to', 7),
 ('they', 5),
 ('or', 5),
 ('i', 5),
 ('duration', 3),
 ('fever', 3),
 ('flu', 3),
 ('for', 3),
 ('didnt', 3),
 ('through', 3),
 ('experiencing', 3),
 ('those', 3),
 ('about', 2),
 ('their', 2)]

### "Ventilators": March

In [39]:
# count the tokens found in the 4 positions to the LEFT of each 'ventilators'
# (window [4,0] = no right-hand context) in the March briefings
mar_colls_ventilators = Counter(collocates(mar_tokens, 'ventilators', [4, 0]))

In [40]:
# the 30 most frequent left-window collocates of 'ventilators' (March), ranked by raw count
mar_colls_ventilators.most_common(30)

[('of', 56),
 ('the', 35),
 ('to', 25),
 ('we', 19),
 ('have', 15),
 ('and', 15),
 ('thousands', 14),
 ('a', 14),
 ('for', 11),
 ('lot', 8),
 ('number', 8),
 ('more', 8),
 ('that', 7),
 ('or', 7),
 ('on', 7),
 ('need', 6),
 ('about', 6),
 ('many', 6),
 ('equipment', 6),
 ('doing', 5),
 ('10000', 5),
 ('numbers', 5),
 ('wants', 5),
 ('protective', 5),
 ('than', 5),
 ('masks', 5),
 ('right', 4),
 ('them', 4),
 ('including', 4),
 ('tremendous', 4)]

### "Ventilators": April

In [41]:
# count the tokens found in the 4 positions to the LEFT of each 'ventilators'
# (window [4,0] = no right-hand context) in the April briefings
apr_colls_ventilators = Counter(collocates(apr_tokens, 'ventilators', [4, 0]))

In [42]:
# the 30 most frequent left-window collocates of 'ventilators' (April), ranked by raw count
apr_colls_ventilators.most_common(30)

[('of', 113),
 ('the', 88),
 ('to', 65),
 ('have', 62),
 ('we', 48),
 ('a', 44),
 ('with', 35),
 ('on', 32),
 ('they', 32),
 ('thousands', 31),
 ('and', 30),
 ('more', 25),
 ('need', 24),
 ('about', 22),
 ('ventilators', 22),
 ('that', 21),
 ('were', 20),
 ('get', 19),
 ('lot', 19),
 ('for', 19),
 ('than', 18),
 ('you', 17),
 ('them', 15),
 ('be', 14),
 ('all', 14),
 ('some', 14),
 ('in', 14),
 ('like', 13),
 ('are', 13),
 ('it', 12)]

nothing too interesting in March and April

### "States": April

In [43]:
# count the tokens found in the 4 positions to the LEFT of each 'states'
# (window [4,0] = no right-hand context) in the April briefings
apr_colls_states = Counter(collocates(apr_tokens, 'states', [4, 0]))

In [44]:
# the 30 most frequent left-window collocates of 'states' (April), ranked by raw count
apr_colls_states.most_common(30)

[('the', 553),
 ('united', 250),
 ('of', 186),
 ('in', 175),
 ('to', 167),
 ('and', 87),
 ('some', 75),
 ('that', 74),
 ('have', 60),
 ('with', 56),
 ('those', 52),
 ('all', 50),
 ('are', 47),
 ('other', 41),
 ('for', 41),
 ('we', 40),
 ('a', 39),
 ('their', 38),
 ('you', 38),
 ('testing', 27),
 ('president', 27),
 ('but', 27),
 ('were', 25),
 ('these', 25),
 ('across', 24),
 ('up', 24),
 ('our', 23),
 ('is', 23),
 ('it', 23),
 ('different', 21)]

"different" is a collocate, which suggests that he wants to give power back to the states: it is up to each state to do what is best for it.

### "Oil": April

In [46]:
# count the tokens found in the 4 positions to the LEFT of each 'oil'
# (window [4,0] = no right-hand context) in the April briefings
apr_colls_oil = Counter(collocates(apr_tokens, 'oil', [4, 0]))

In [47]:
# the 30 most frequent left-window collocates of 'oil' (April), ranked by raw count
apr_colls_oil.most_common(30)

[('the', 36),
 ('of', 32),
 ('to', 22),
 ('on', 19),
 ('with', 14),
 ('a', 13),
 ('oil', 10),
 ('and', 7),
 ('you', 7),
 ('we', 7),
 ('now', 5),
 ('it', 5),
 ('than', 5),
 ('great', 5),
 ('theres', 5),
 ('for', 5),
 ('lot', 5),
 ('more', 5),
 ('q', 5),
 ('our', 5),
 ('know', 4),
 ('that', 4),
 ('president', 4),
 ('just', 4),
 ('from', 4),
 ('much', 4),
 ('price', 3),
 ('want', 3),
 ('have', 3),
 ('cut', 3)]

nothing too interesting

### "Available": March

In [48]:
# count the tokens found in the 4 positions to the LEFT of each 'available'
# (window [4,0] = no right-hand context) in the March briefings
mar_colls_available = Counter(collocates(mar_tokens, 'available', [4, 0]))

In [49]:
# the 30 most frequent left-window collocates of 'available' (March), ranked by raw count
mar_colls_available.most_common(30)

[('that', 43),
 ('be', 30),
 ('testing', 30),
 ('is', 29),
 ('are', 27),
 ('make', 27),
 ('to', 26),
 ('tests', 26),
 ('will', 25),
 ('and', 22),
 ('the', 21),
 ('more', 16),
 ('of', 16),
 ('those', 15),
 ('have', 12),
 ('made', 8),
 ('making', 8),
 ('now', 8),
 ('these', 7),
 ('with', 7),
 ('thats', 7),
 ('test', 7),
 ('we', 6),
 ('it', 6),
 ('they', 6),
 ('masks', 6),
 ('for', 5),
 ('this', 5),
 ('going', 5),
 ('sure', 5)]

testing, tests, masks

### "Available": April

In [50]:
# count the tokens found in the 4 positions to the LEFT of each 'available'
# (window [4,0] = no right-hand context) in the April briefings
apr_colls_available = Counter(collocates(apr_tokens, 'available', [4, 0]))

In [51]:
# the 30 most frequent left-window collocates of 'available' (April), ranked by raw count
apr_colls_available.most_common(30)

[('that', 28),
 ('be', 17),
 ('those', 14),
 ('the', 14),
 ('make', 13),
 ('is', 12),
 ('are', 11),
 ('have', 11),
 ('and', 11),
 ('to', 10),
 ('we', 10),
 ('will', 10),
 ('tests', 8),
 ('made', 7),
 ('were', 6),
 ('this', 6),
 ('of', 6),
 ('testing', 5),
 ('best', 4),
 ('information', 4),
 ('all', 4),
 ('ventilators', 4),
 ('additional', 3),
 ('million', 3),
 ('capacity', 3),
 ('also', 3),
 ('on', 3),
 ('in', 3),
 ('can', 3),
 ('beds', 3)]

testing, tests, ventilators, information, capacity