##  OBJECTIVE

Keyness analysis is a statistical measure used to identify significant differences between 2 corpuses. If certain linguistic items are key, their normalized distribution among the two texts are not the same. However, if a linguistic item has the same normalized frequencies in each text, it is not key. We can think about keys as being distinctive components of that text. Linguistic items include words, ngrams, lemmas, and POS-tags. Normalized frequency simply refers to the frequency of a linguistic item scaled to the size of the corpus.


# SETUP

In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import os
import pandas as pd
import re
import math
import random

import json
from collections import Counter


import os
import zipfile
import pandas as pd
import json

import random

import datetime as dt
import calendar

from collections import Counter
import seaborn as sn
import matplotlib.pyplot as plt

In [2]:
to_strip = '().[]!,"'

# FUNCTIONS

In [3]:
def calculate_keyness(fdist1, fdist2, fthreshold=5, keyness_threshold=6.6, top=100, print_table=True):
    '''
    '''
    
    c1size = sum(fdist1.values())
    c2size = sum(fdist2.values())

    
    kdata = []
    
    for item, freq in fdist1.items():
        if freq<fthreshold:
            continue
            
        ref_freq = fdist2.get(item,0)
        
        if ref_freq<fthreshold:
            continue
        
        
        keyness = log_likelihood(freq, c1size, ref_freq, c2size)
        
        row = {'item': item, 'freq': freq, 'ref_freq': ref_freq, 'keyness': keyness}
        
        if keyness>keyness_threshold:
        
            kdata.append(row)
        
    
    kdf = pd.DataFrame(kdata)[['item', 'freq', 'ref_freq', 'keyness']]
    
    kdf=kdf.sort_values('keyness', ascending=False)
    
    if not print_table:
        return kdf[:top]
    
    template = "{: <25}{: <10}{: <10}{:0.3f}"
    
    header = "{: <25}{: <10}{: <10}{}".format('WORD', 'Corpus Freq.', 'RC Freq.', 'Keyness')
    
    print("{}\n{}".format(header, "="*len(header)))
    
    for item, freq, ref_freq, keyness in kdf[:top].values:
        print(template.format(item, freq, ref_freq, keyness))

In [4]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''create a list of tokens from a string by splitting on whitespace and applying optional normalization 
    
    Args:
        text        -- a string object containing the text to be tokenized
        lowercase   -- should text string be normalized as lowercase (default: False)
        strip_chars -- a string indicating characters to strip out of text, e.g. punctuation (default: empty string) 
        
    Return:
        A list of tokens
    '''
    
    # create a replacement dictionary from the
    # string of characters in the **strip_chars**
    rdict = str.maketrans('','',strip_chars)
    
    if lowercase:
        text = text.lower()
    
    tokens = text.translate(rdict).split()
    
    return tokens

In [5]:
def get_ngram_tokens(tokens, n=1):
    '''create a list of n-gram tokens from a list of tokens
    
    Args:
        tokens -- a list of tokens
        n      -- the size of the window to use to build n-gram token list
        
    Returns:
        
        list of n-gram strings (whitespace separated) of length n
    '''
    new_tokens = []
    
    for i in range(len(tokens)-n+1):
        new_tokens.append(" ".join(tokens[i:i+n]))
        
    return new_tokens

In [6]:
def make_kwic(kw, text, win=4):
    '''A basic KWIC function for a text
    
    Args:
        kw   -- string match for keyword to match for each line
        text -- a list of tokens for the text
        
    Return:
        list of lines of form [ [left context words], kw, [right context words]]
    '''
    
    hits = [(w,i) for i,w in enumerate(text) if w==kw]
    
    lines = []
    for hit in hits:
        left = text[hit[1]-win:hit[1]]
        kw = text[hit[1]]
        right = text[hit[1]+1 : hit[1]+win+1]
        
        lines.append([left, kw, right])
        
    return lines

In [7]:
def log_likelihood(item_A_freq, corpus_A_size, item_B_freq, corpus_B_size):
    
    E1 = corpus_A_size*(item_A_freq+item_B_freq) / (corpus_A_size+corpus_B_size) 
    E2 = corpus_B_size*(item_A_freq+item_B_freq) / (corpus_A_size+corpus_B_size)
    
    G2 = 2*((item_A_freq*math.log(item_A_freq/E1)) + (item_B_freq*math.log(item_B_freq/E2)))
    
    sign = 1 if (item_A_freq / corpus_A_size) >= (item_B_freq / corpus_B_size) else -1
    
    return sign*G2

# LOAD DATA

In [8]:
corpus = json.load(open('data/briefing_transcripts.json'))

In [9]:
apr_briefings = [item for item in corpus if item['date'].split()[0]=='Apr']
mar_briefings = [item for item in corpus if item['date'].split()[0]=='Mar']
feb_briefings = [item for item in corpus if item['date'].split()[0]=='Feb']
jan_briefings = [item for item in corpus if item['date'].split()[0]=='Jan']

# ANALYSIS

### TOKENIZING BY MONTH

In [10]:
# set up a list for April tokens
apr_tokens = []

# process each of the April briefings
for briefing in apr_briefings:
    
    # get the text string for the transcript of current briefing
    briefing_text = briefing['text']
    
    # get a list of tokens for this text
    tokens = tokenize(briefing_text, lowercase=True, strip_chars=to_strip)
    
    # update the list of tokens
    apr_tokens.extend(tokens)

In [11]:
mar_tokens = []
for briefing in mar_briefings:
    mar_text = briefing['text']
    tokens = tokenize(mar_text, lowercase=True, strip_chars=to_strip)
    mar_tokens.extend(tokens)

In [12]:
feb_tokens = []
for briefing in jan_briefings:
    feb_text = briefing['text']
    tokens = tokenize(feb_text, lowercase=True, strip_chars=to_strip)
    feb_tokens.extend(tokens)

In [13]:
jan_tokens = []
for briefing in jan_briefings:
    jan_text = briefing['text']
    tokens = tokenize(jan_text, lowercase=True, strip_chars=to_strip)
    jan_tokens.extend(tokens)

### GENERATING FREQUENCY LISTS

In [14]:
apr_word_dist = Counter(apr_tokens)
apr_bigram_dist = Counter(get_ngram_tokens(apr_tokens, 2))
apr_trigram_dist = Counter(get_ngram_tokens(apr_tokens, 3))

In [15]:
mar_word_dist = Counter(mar_tokens)
mar_bigram_dist = Counter(get_ngram_tokens(mar_tokens, 2))
mar_trigram_dist = Counter(get_ngram_tokens(mar_tokens, 3))

In [16]:
feb_word_dist = Counter(feb_tokens)
feb_bigram_dist = Counter(get_ngram_tokens(feb_tokens, 2))
feb_trigram_dist = Counter(get_ngram_tokens(feb_tokens, 3))

In [17]:
jan_word_dist = Counter(jan_tokens)
jan_bigram_dist = Counter(get_ngram_tokens(jan_tokens, 2))
jan_trigram_dist = Counter(get_ngram_tokens(jan_tokens, 3))

In [18]:
apr_word_dist.most_common(50)

[('the', 18732),
 ('to', 12770),
 ('and', 12318),
 ('that', 8493),
 ('—', 8392),
 ('a', 7985),
 ('of', 7882),
 ('you', 6703),
 ('i', 6527),
 ('we', 6368),
 ('in', 5074),
 ('have', 4770),
 ('it', 4614),
 ('is', 3637),
 ('they', 3172),
 ('but', 3146),
 ('are', 3058),
 ('be', 2904),
 ('this', 2730),
 ('so', 2691),
 ('we’re', 2594),
 ('for', 2569),
 ('president:', 2420),
 ('with', 2415),
 ('very', 2413),
 ('q', 2357),
 ('on', 2305),
 ('going', 2279),
 ('it’s', 2215),
 ('people', 2108),
 ('what', 1985),
 ('know', 1927),
 ('do', 1924),
 ('about', 1878),
 ('as', 1768),
 ('was', 1745),
 ('our', 1709),
 ('at', 1681),
 ('not', 1661),
 ('all', 1595),
 ('think', 1595),
 ('if', 1532),
 ('want', 1412),
 ('can', 1334),
 ('just', 1334),
 ('them', 1332),
 ('they’re', 1301),
 ('been', 1280),
 ('president', 1280),
 ('because', 1262)]

In [19]:
apr_bigram_dist.most_common(50)

[('the president:', 2218),
 ('going to', 1867),
 ('of the', 1551),
 ('in the', 1321),
 ('we have', 1216),
 ('to be', 1204),
 ('i think', 1119),
 ('— the', 1086),
 ('want to', 1072),
 ('you know', 1046),
 ('a lot', 949),
 ('and i', 909),
 ('to the', 789),
 ('lot of', 767),
 ('and we', 708),
 ('to do', 680),
 ('and the', 655),
 ('thank you', 651),
 ('we’re going', 626),
 ('on the', 625),
 ('that we', 610),
 ('have to', 600),
 ('have a', 591),
 ('— and', 586),
 ('with the', 578),
 ('— i', 564),
 ('mr president', 542),
 ('this is', 511),
 ('at the', 504),
 ('it was', 492),
 ('all of', 483),
 ('i don’t', 479),
 ('you have', 452),
 ('to get', 446),
 ('a very', 437),
 ('if you', 431),
 ('for the', 426),
 ('i mean', 423),
 ('to have', 420),
 ('new york', 415),
 ('the president', 413),
 ('have been', 398),
 ('and they', 376),
 ('that the', 372),
 ('to make', 364),
 ('that are', 362),
 ('we are', 361),
 ('able to', 360),
 ('the country', 358),
 ('and we’re', 354)]

In [20]:
jan_word_dist.most_common(50)

[('the', 351),
 ('to', 194),
 ('of', 182),
 ('and', 165),
 ('that', 103),
 ('in', 94),
 ('we', 89),
 ('you', 76),
 ('is', 74),
 ('a', 71),
 ('—', 71),
 ('this', 69),
 ('have', 59),
 ('as', 58),
 ('secretary', 57),
 ('are', 56),
 ('i', 49),
 ('china', 40),
 ('with', 40),
 ('so', 39),
 ('for', 38),
 ('be', 37),
 ('united', 32),
 ('states', 32),
 ('will', 31),
 ('health', 30),
 ('dr', 30),
 ('they', 30),
 ('at', 30),
 ('from', 30),
 ('on', 29),
 ('our', 29),
 ('can', 28),
 ('about', 26),
 ('risk', 25),
 ('public', 25),
 ('it', 25),
 ('we’re', 24),
 ('people', 24),
 ('but', 23),
 ('by', 22),
 ('what', 22),
 ('us', 21),
 ('american', 20),
 ('cases', 20),
 ('thank', 19),
 ('an', 19),
 ('low', 19),
 ('been', 19),
 ('virus', 19)]

In [21]:
mar_word_dist.most_common(50)

[('the', 12986),
 ('to', 8990),
 ('and', 8302),
 ('that', 5967),
 ('of', 5452),
 ('a', 5006),
 ('—', 4837),
 ('we', 4445),
 ('you', 4347),
 ('i', 3937),
 ('in', 3291),
 ('have', 2885),
 ('it', 2623),
 ('is', 2379),
 ('are', 2296),
 ('be', 2269),
 ('this', 2095),
 ('for', 2000),
 ('but', 1956),
 ('with', 1870),
 ('we’re', 1830),
 ('very', 1785),
 ('they', 1726),
 ('so', 1614),
 ('on', 1549),
 ('people', 1483),
 ('going', 1478),
 ('president:', 1441),
 ('as', 1392),
 ('it’s', 1364),
 ('q', 1300),
 ('all', 1281),
 ('about', 1218),
 ('do', 1212),
 ('our', 1200),
 ('will', 1166),
 ('president', 1165),
 ('what', 1146),
 ('know', 1137),
 ('at', 1116),
 ('not', 1024),
 ('think', 1021),
 ('want', 966),
 ('can', 952),
 ('been', 944),
 ('was', 931),
 ('if', 929),
 ('just', 854),
 ('thank', 806),
 ('now', 799)]

In [22]:
jan_size = len(jan_tokens)
apr_size = len(apr_tokens)

### LOOKING AT KEYNESS

In [23]:
calculate_keyness(apr_word_dist, jan_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
it                       4614      25        35.510
i                        6527      49        27.876
it’s                     2215      8         26.482
—                        8392      71        26.144
a                        7985      71        21.273
q                        2357      12        19.714
think                    1595      7         15.861
know                     1927      10        15.723
they’re                  1301      5         14.733
but                      3146      23        14.276
do                       1924      11        13.699
going                    2279      15        12.815
don’t                    1175      5         12.066
because                  1262      6         11.479
was                      1745      11        10.607
very                     2413      19        9.168
all                      1595      12        6.778
they                     3172      30        6.770


Not much that is distinct to April/ that isn't discussed in January

In [24]:
calculate_keyness(jan_word_dist, apr_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
secretary                57        261       191.082
risk                     25        51        116.845
china                    40        279       105.908
united                   32        262       76.228
low                      19        68        71.505
screening                11        6         71.220
public                   25        168       67.744
chinese                  11        12        61.629
14                       12        25        55.689
department               16        76        52.654
health                   30        364       52.440
travel                   13        46        49.169
security                 11        48        37.762
homeland                 7         10        36.515
azar                     7         11        35.522
citizens                 11        61        33.343
steps                    8         23        32.992
transmission             7         15        32.175
entry 

In [25]:
calculate_keyness(mar_word_dist, feb_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
it                       2623      25        23.387
it’s                     1364      8         22.649
i                        3937      49        19.281
a                        5006      71        15.866
think                    1021      7         14.466
—                        4837      71        13.330
very                     1785      19        12.849
q                        1300      12        12.249
all                      1281      12        11.772
do                       1212      11        11.746
going                    1478      15        11.725
know                     1137      10        11.588
but                      1956      23        11.169
don’t                    707       5         9.669
president                1165      12        8.974
they’re                  680       5         8.878
because                  711       6         7.716
we’re                    1830      24        7.647
that          

In [26]:
calculate_keyness(feb_word_dist, mar_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
secretary                57        201       173.522
china                    40        204       98.025
united                   32        158       80.083
azar:                    16        22        71.966
risk                     25        113       66.116
low                      19        66        58.300
novel                    10        6         55.800
assistant                9         6         49.109
14                       12        26        45.787
public                   25        192       45.100
department               16        71        42.789
security                 11        26        40.481
chinese                  11        33        36.334
health                   30        350       35.500
screening                11        35        35.300
states                   32        406       34.134
quarantine               10        30        33.031
airports                 9         23        32.017
asymptom

In [27]:
calculate_keyness(mar_word_dist, apr_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
vice                     508       378       107.722
will                     1166      1196      84.206
thank                    806       789       71.857
available                187       102       71.496
travel                   119       46        69.674
tested                   155       80        64.244
president                1165      1280      58.227
risk                     113       51        55.773
meeting                  98        41        52.781
night                    106       49        50.814
elderly                  47        8         48.634
symptoms                 91        40        46.341
legislation              46        10        41.916
house                    165       115       40.576
senate                   57        18        40.132
commercial               93        47        39.693
fault                    41        8         39.653
children                 48        13        37.997
private 

In [28]:
calculate_keyness(apr_word_dist, mar_word_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
he                       960       357       96.248
states                   1014      406       80.691
oil                      119       10        69.635
military                 175       28        67.322
—                        8392      4837      66.669
ventilators              478       165       58.554
see                      927       397       57.636
jersey                   158       27        57.216
farmers                  91        9         49.017
they                     3172      1726      47.421
antibody                 95        12        43.997
it                       4614      2623      43.561
million                  484       188       42.536
program                  127       24        41.555
machines                 83        10        39.692
metro                    98        15        39.175
infrastructure           65        6         36.318
reopen                   72        8         36.253
banks    

In [29]:
calculate_keyness(apr_bigram_dist, mar_bigram_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
new jersey               154       27        54.403
the president:           2218      1164      45.084
— the                    1086      524       38.455
he said                  122       26        34.781
should have              79        12        31.783
states that              102       21        30.315
to open                  91        18        28.323
the states               200       66        27.301
can see                  93        22        23.112
what he                  48        6         22.391
opening up               48        6         22.391
more than                301       124       21.682
new orleans              50        7         21.479
it was                   492       229       21.096
and new                  65        13        19.979
it and                   244       97        19.889
to reopen                43        6         18.519
have enough              39        5         17.875
did the  

In [30]:
calculate_keyness(mar_bigram_dist, apr_bigram_dist, top=50)

WORD                     Corpus Freq.RC Freq.  Keyness
thank you                707       651       80.132
vice president:          273       199       60.758
the vice                 345       278       59.631
mr vice                  116       53        56.411
the president            451       413       52.144
be tested                40        5         46.974
the risk                 44        7         46.944
vice president           225       170       45.978
will be                  360       339       37.523
south korea              64        27        34.152
the fda                  100       60        32.787
american public          61        26        32.179
the senate               46        15        31.531
commercial labs          43        13        31.355
the house                53        21        30.255
let me                   188       157       29.345
with regard              46        17        28.155
regard to                46        17        28.155
of washin

In [31]:
calculate_keyness(jan_bigram_dist, apr_bigram_dist, top=50)

WORD                     Corpus Freq.RC Freq.  Keyness
the risk                 17        7         115.148
united states            32        238       81.321
14 days                  12        13        67.351
the chinese              10        6         63.663
the united               27        247       59.414
in china                 10        9         58.633
department of            14        46        54.672
mr secretary             11        26        48.850
homeland security        7         7         40.057
american public          9         26        37.048
the department           10        38        36.641
to china                 8         19        35.466
public health            13        87        35.321
of homeland              6         6         34.334
risk of                  6         10        29.917
a public                 5         5         28.612
of the                   53        1551      28.341
from china               8         33        28.218
dr redfi

In [32]:
calculate_keyness(apr_bigram_dist, jan_bigram_dist)

WORD                     Corpus Freq.RC Freq.  Keyness
going to                 1867      11        12.686
to be                    1204      6         10.343
i think                  1119      6         8.734


### Initial Notes

* "the risk" was the most key bigram, meaning The White House used this pair of words far more often in January than April. This could mean they were trying to explain the new, groundshaking 
* When we compare the recent corpora of April Briefings to older ones like January, it is clear that there are not that many words that are key. What can we learn from this? It means that other words used in April speeches have been used before. It is largely a reiteration of the same information. This is not, however, the case when we compare speeches from January to April. This means that the speeches are not using the same words and delivering the same information throughout the months. Rather, speeches in January had a number of words that were unique to January, that the White House phased out of usage over time. The most notable instance is with the bigram "the risk". For a further examination of what this means, I will consider a concordance analysis into this and other top words.
* Looking at words from March to April, March was far more focused on combating the coronavirus itself. Within it's top 20 key words there were "travel","tested","risk", "symptoms", "elderly", "legislation","house" and "senate". None of these words were numbers.
* Looking from April to March, the top 20 key words included "he" being the most key word. I anticipate this is in reference to NY Govornor Andrew Cuomo, who Donald Trump made a big enemy of. other words include "states", "oil", "antibody", "infrastructure" and "reopen". 7 of these words were numbers.
* From March to April, he shifts ownership of the problem. In march he uses we, while in April he uses They and He.
* "in fact" and "you see" as language used to correct things


### Looking at Trigrams

In [33]:
calculate_keyness(mar_trigram_dist, apr_trigram_dist, top=30)

WORD                     Corpus Freq.RC Freq.  Keyness
the vice president:      273       198       61.466
mr vice president        112       52        53.401
the american public      59        25        31.330
thank you the            67        33        29.609
you the vice             33        8         28.144
with regard to           45        17        26.959
vice president: well     49        21        25.689
for the american         37        13        23.748
spread of the            38        14        23.328
q mr vice                29        8         22.657
the spread of            44        20        21.532
the risk of              23        5         20.958
the private sector       30        10        20.180
you mr vice              47        24        19.775
president: thank you     117       96        19.222
personal protective equipment46        24        18.776
health and safety        25        8         17.410
let me say               29        12        15.808
as th

In [34]:
calculate_keyness(apr_trigram_dist, mar_trigram_dist, top=30)

WORD                     Corpus Freq.RC Freq.  Keyness
— the president:         708       297       47.696
you can see              68        15        18.564
take a look              70        16        18.237
the president: and       100       31        15.656
the president: you       57        13        14.894
i — i                    37        6         14.064
in the back              53        12        13.984
to get back              59        15        13.184
they want to             106       37        12.623
to be a                  86        28        12.107
want to see              31        5         11.841
— and you                31        5         11.841
you have a               85        28        11.651
the president: i         214       96        10.934
what they’re doing       34        7         10.105
that dr birx             31        6         9.884
the greatest economy     31        6         9.884
some of the              147       62        9.701
go ahead q  