## 1. OBJECTIVE

Concordance analysis allows us to see a given word or bigram in context to provide more insight about the meaning. 

## 2. SETUP

In [1]:
import json
from collections import Counter

In [2]:
# import modules

import os
import random
import re
import string

import json
from collections import Counter

In [3]:
### PARAMETERS

# Characters deleted from each transcript before it is split into tokens
# (passed to tokenize() as strip_chars in the loading cells).
# NOTE(review): the en dash is listed here but the em dash does not appear to
# be, which would explain the stand-alone dash tokens in the outputs -- confirm.
to_strip = ',.\xa0:-()\';$"/?][!`Ą@Ś§¨’–“”…ï‘>&\\%˝˘*'

## 3. FUNCTIONS

In [4]:
def tokenize(text, lowercase=True, strip_chars=''):
    '''Split a string into whitespace-delimited tokens.

    The text is optionally lowercased first, then every character listed in
    strip_chars is deleted, and finally the result is split on whitespace.

    Args:
        text        -- the string to tokenize
        lowercase   -- lowercase the text before any other processing (default: True)
        strip_chars -- characters to delete from the text before splitting (default: empty string)

    Returns:
        list of tokens
    '''
    source = text.lower() if lowercase else text

    # a translation table with only a "delete" set removes those characters
    deletion_table = str.maketrans('', '', strip_chars)

    return source.translate(deletion_table).split()

In [5]:
def make_kwic(kw, text, win=4):
    '''A basic single-word KWIC (keyword in context) function for a text

    Args:
        kw   -- the token to search for (compared for equality with each token)
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side of the keyword (default: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]]
        Both context lists always hold exactly win items; hits close to the
        start or end of the text are padded with empty strings.
    '''
    lines = []
    for idx, token in enumerate(text):
        if token != kw:
            continue

        # clamp the start at 0: a negative slice start would count from the end
        # of the list and drop the real left context
        left = text[max(0, idx - win):idx]
        right = text[idx + 1:idx + 1 + win]

        # pad on the outer side so the keyword stays aligned
        # (multiplying a list by a number <= 0 gives an empty list)
        left = [''] * (win - len(left)) + left
        right = right + [''] * (win - len(right))

        lines.append([left, token, right])

    return lines

In [6]:
#multi-word KWIC function put together by Prof. O'Donnell
def make_kwic2(kw, text, win=4):
    '''A basic KWIC (keyword in context) function that also handles multi-word keywords

    Args:
        kw   -- keyword to match; if it contains spaces it is split into a
                sequence of tokens that must appear consecutively in text
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side of the keyword (default: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]]
        Both context lists always hold exactly win items; hits close to the
        start or end of the text are padded with empty strings.
        An empty (or whitespace-only) keyword yields an empty list.
    '''
    kw_toks = kw.split()
    kw_width = len(kw_toks)

    # an empty keyword would otherwise "match" at every position of the text
    if kw_width == 0:
        return []

    kw_label = ' '.join(kw_toks)

    lines = []
    for start in range(len(text) - kw_width + 1):
        end = start + kw_width
        if text[start:end] != kw_toks:
            continue

        # clamp the start at 0: a negative slice start would count from the end
        # of the list and drop the real left context
        left = text[max(0, start - win):start]
        right = text[end:end + win]

        # pad on the outer side so the keyword stays aligned
        left = [''] * (win - len(left)) + left
        right = right + [''] * (win - len(right))

        lines.append([left, kw_label, right])

    return lines

In [7]:
def print_kwic(kwic, win=None):
    '''Print a KWIC object, one concordance line per row

    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to show on each side; if None, the
                length of the first line's left context is used

    The left context is right-aligned in a column win*8 characters wide so
    that the keywords line up. Nothing is printed for an empty kwic.
    '''
    if kwic:
        width = len(kwic[0][0]) if win is None else win

        for entry in kwic:
            # keep the words nearest the keyword on both sides
            left_text = ' '.join(entry[0][-width:])
            right_text = ' '.join(entry[2][:width])
            print("{: >{}}  {}  {}".format(left_text, width*8, entry[1], right_text))

In [8]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments

    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- one positional argument, or a list of them, of form side-pos,
                  e.g. L1, R3, L4 (default: None). L1/R1 is the token directly
                  next to the keyword, whatever the size of the context window.

    Returns:
        kwic sorted for each positional argument in reverse, i.e. ['R1','L1'] sorts first by L1 and then R1
        The list is sorted in place and the same list object is returned.
        The order argument itself is left untouched.

    Raises:
        ValueError -- if a positional argument is not of the form L<n> or R<n> with n >= 1
    '''
    if order is None:
        return kwic

    # work on a copy so the caller's list is never reversed behind their back
    terms = list(order) if isinstance(order, (list, tuple)) else [order]

    # validate everything up front so a bad term cannot leave kwic half sorted
    parsed_terms = []
    for sort_term in terms:
        parsed = re.fullmatch(r'([LR])([1-9][0-9]*)', sort_term) if isinstance(sort_term, str) else None
        if parsed is None:
            raise ValueError('invalid sort position {!r}: expected L<n> or R<n>'.format(sort_term))
        parsed_terms.append((parsed.group(1), int(parsed.group(2))))

    # list.sort is stable, so sorting by the least significant key first and the
    # most significant key last yields the combined ordering
    for side, distance in reversed(parsed_terms):
        ctx_idx = 0 if side == 'L' else 2
        # left contexts are counted back from the keyword, hence the negative
        # index; this works for any window size
        tok_idx = -distance if side == 'L' else distance - 1

        def context_token(line):
            context = line[ctx_idx]
            # positions outside the available context sort as an empty string
            if -len(context) <= tok_idx < len(context):
                return context[tok_idx]
            return ''

        kwic.sort(key=context_token)

    return kwic

I adapted `make_kwic2` (and `make_kwic`) into the `*_as_text` variants below, which return only the context words around each hit so that the results can be used for sentiment analysis.

In [9]:
def make_kwic2_as_text(kw, text, win=4):
    '''KWIC variant returning only the context words around each (multi-word) keyword hit

    Args:
        kw   -- keyword to match; if it contains spaces it is split into a
                sequence of tokens that must appear consecutively in text
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side of the keyword (default: 4)

    Return:
        list with one flat list per hit: the left context words followed by the
        right context words (the keyword itself is not included). Each entry
        holds exactly 2*win items; hits close to the start or end of the text
        are padded with empty strings.
        An empty (or whitespace-only) keyword yields an empty list.
    '''
    kw_toks = kw.split()
    kw_width = len(kw_toks)

    # an empty keyword would otherwise "match" at every position of the text
    if kw_width == 0:
        return []

    lines = []
    for start in range(len(text) - kw_width + 1):
        end = start + kw_width
        if text[start:end] != kw_toks:
            continue

        # clamp the start at 0: a negative slice start would count from the end
        # of the list and drop the real left context
        left = text[max(0, start - win):start]
        right = text[end:end + win]

        left = [''] * (win - len(left)) + left
        right = right + [''] * (win - len(right))

        lines.append(left + right)

    return lines

In [10]:
def make_kwic_as_text(kw, text, win=4):
    '''KWIC variant returning only the context words around each single-word keyword hit

    Args:
        kw   -- the token to search for (compared for equality with each token)
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side of the keyword (default: 4)

    Return:
        list with one flat list per hit: the left context words followed by the
        right context words (the keyword itself is not included). Each entry
        holds exactly 2*win items; hits close to the start or end of the text
        are padded with empty strings.
    '''
    lines = []
    for idx, token in enumerate(text):
        if token != kw:
            continue

        # clamp the start at 0: a negative slice start would count from the end
        # of the list and drop the real left context
        left = text[max(0, idx - win):idx]
        right = text[idx + 1:idx + 1 + win]

        # (multiplying a list by a number <= 0 gives an empty list)
        left = [''] * (win - len(left)) + left
        right = right + [''] * (win - len(right))

        lines.append(left + right)

    return lines

## 4. LOAD DATA

In [11]:
# Load the briefing transcripts. The context manager closes the file handle as
# soon as parsing finishes instead of leaving it open until garbage collection.
# Later cells index each item of corpus with the keys 'date' and 'text'.
with open('data/briefing_transcripts.json') as corpus_file:
    corpus = json.load(corpus_file)

In [12]:
# Split the corpus by month: a briefing belongs to a month when the first
# whitespace-separated piece of its 'date' value equals the month abbreviation.
apr_briefings, mar_briefings, feb_briefings, jan_briefings = (
    [item for item in corpus if item['date'].split()[0] == month]
    for month in ('Apr', 'Mar', 'Feb', 'Jan')
)

In [13]:
# Collect the tokens of every April briefing into one flat list
apr_tokens = []

for briefing in apr_briefings:

    # transcript text of the briefing currently being processed
    briefing_text = briefing['text']

    # lowercase it, delete the characters in to_strip and split on whitespace
    tokens = tokenize(briefing_text, strip_chars=to_strip, lowercase=True)

    # add this briefing's tokens to the running list
    apr_tokens += tokens

In [14]:
# Collect the tokens of every January briefing into one flat list
jan_tokens = []
for briefing in jan_briefings:
    briefing_text = briefing['text']
    # lowercase it, delete the characters in to_strip and split on whitespace
    tokens = tokenize(briefing_text, strip_chars=to_strip, lowercase=True)
    jan_tokens += tokens

In [15]:
# Collect the tokens of every March briefing into one flat list
mar_tokens = []
for briefing in mar_briefings:
    briefing_text = briefing['text']
    # lowercase it, delete the characters in to_strip and split on whitespace
    tokens = tokenize(briefing_text, strip_chars=to_strip, lowercase=True)
    mar_tokens += tokens

### He Did: April

In [16]:
# Concordance lines for the bigram "he did" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_hedid = make_kwic2("he did", apr_tokens, win=6)

In [17]:
# Number of "he did" hits found in the April tokens.
len(apr_kwic_hedid)

40

In [18]:
# Sort the lines on the 6th token to the right of the keyword (R6) and print them.
# sort_kwic sorts apr_kwic_hedid in place.
print_kwic(sort_kwic(apr_kwic_hedid, order= ['R6']))

                 you that sir the president yeah  he did  say that actually russia talked about
                        few people can do it and  he did  it well and then he became
                      didnt know about it and if  he did  know about it he wouldve been
                        so he didnt go crazy but  he did  say — q your ban bought
                  happened there and i had heard  he did  because he didnt want to cause
                   how did president trump do oh  he did  uh terribly lets see he did
               ban bought you time the president  he did  call me xenophobic q what did
                        i didnt speak to him but  he did  that i think just to end
        — and tremendous intelligence doing what  he did  it was not that easy for
                            see he did — uh yeah  he did  terribly oh but we just got
        of laboratories in each individual state  he did  raise the issue that we had
             terminate — the president i thought

He uses "he did" to refer to a variety of different subjects.

### Should Have: March

In [18]:
# Concordance lines for the bigram "should have" in the March tokens,
# with 6 context tokens on each side of the keyword.
mar_kwic_shouldhave = make_kwic2("should have", mar_tokens, win=6)

In [19]:
# Sort the lines on the 6th token to the right of the keyword (R6) and print them.
# sort_kwic sorts mar_kwic_shouldhave in place.
print_kwic(sort_kwic(mar_kwic_shouldhave, order= ['R6']))

        focus on critical infrastructure jobs we  should have  that guidance before the president and
               that were right here because they  should have  never allowed it to happen but
             cdcapproved tests are moving out we  should have  2500 kits out before the end
               american — i think every american  should have  a grateful heart first and foremost
            talking about the ventilators but he  should have  ordered the ventilators and he had
                         to all of them but they  should have  told us about this and i
               you did all those divisions italy  should have  close to 400000 deaths theyre not
                     they knew about it and they  should have  told us we could have saved
                next italy the president well he  should have  — you know the hospital systems
                       up to half of the country  should have  caution before traveling and if thats
                 epidemic the way people sa

Fewer instances in March; April offered more opportunity for reflection.

### Should Have: April

In [20]:
# Concordance lines for the bigram "should have" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_shouldhave = make_kwic2("should have", apr_tokens, win=6)

In [21]:
# Display the raw KWIC structure: each entry is
# [left context tokens, keyword, right context tokens].
apr_kwic_shouldhave

[[['him', 'the', 'president', 'you', '—', 'you'],
  'should have',
  ['no', 'complaints', 'please', 'q', 'thank', 'you']],
 [['the', 'world', 'and', 'we', 'think', 'that'],
  'should have',
  ['happened', 'so', 'well', 'let', 'you', 'know']],
 [['at', 'a', 'group', 'of', 'people', 'that'],
  'should have',
  ['stopped', 'it', 'at', 'the', 'source', 'but']],
 [['one', 'day', 'because', 'of', 'something', 'that'],
  'should have',
  ['never', 'been', 'allowed', 'to', 'happen', 'we']],
 [['the', 'media', 'would', 'say', 'oh', 'you'],
  'should have',
  ['done', 'two', 'for', 'each', 'person', 'no']],
 [['it', 'death', 'that', 'was', 'unnecessary', 'it'],
  'should have',
  ['never', 'happened', 'it', 'should', 'have', 'never']],
 [['it', 'should', 'have', 'never', 'happened', 'it'],
  'should have',
  ['never', 'left', 'that', 'little', 'area', 'where']],
 [['is', 'what', 'every', 'department', 'of', 'health'],
  'should have',
  ['because', 'when', 'you', 'go', 'to', 'that']],
 [['things

In [22]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_shouldhave in place.
print_kwic(sort_kwic(apr_kwic_shouldhave , order=['R1']))

              have infrastructure and i think we  should have  a payroll tax moratorium because thats
          worth of our agricultural product that  should have  a huge impact on our farmers
                         but on many of them you  should have  a picture on your — on
                           — it should be and it  should have  always been and ive always said
         theres other tests that other americans  should have  and i think this has really
              is what every department of health  should have  because when you go to that
               have been looking into china they  should have  been looking into china as an
       angry because information about the virus  should have  been told to us earlier and
                situation that was caused — that  should have  been solved long ago it could
                      and i was angry because it  should have  been told to us it should
                         have been told to us it  should have  been told to u

On the rare occasions where he says "I", he says "Maybe I should have" or "some people think I should have".

### You See: April

In [24]:
# Concordance lines for the bigram "you see" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_yousee= make_kwic2("you see", apr_tokens, win=6)

In [25]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_yousee in place.
print_kwic(sort_kwic(apr_kwic_yousee  , order=['R1']))

                   low end you were shocked when  you see  100 and 120000 and 200000 people
        president theyre very sobering yeah when  you see  100000 people thats a — and
                     new jersey — 29 people when  you see  173 people died in new york
                     even thought of it and then  you see  6 million people unemployed unemployment numbers
unemployed unemployment numbers get released and  you see  6 million people and its an
                can businesses expect to have do  you see  a phase five a phase six
                 your question q mr secretary do  you see  a need for a phase four
           the president was talking about where  you see  a flattening out of cases and
                         at a very low level and  you see  a couple at a higher level
                      you look at that graph and  you see  all of the — the bumps
                         — they were tough — but  you see  all of those levels you know
                      and a

no clear trend

In [26]:
# Concordance lines for "he didnt" in the April tokens, 6 context tokens per side.
# The keyword is written without an apostrophe because to_strip removes
# apostrophes from the text before tokenization.
apr_kwic_hedidnt= make_kwic2("he didnt", apr_tokens, win=6)

In [27]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_hedidnt in place.
print_kwic(sort_kwic(apr_kwic_hedidnt , order=['R1']))

                      the fake news is saying oh  he didnt  act fast enough well you remember
                        — we talked to him today  he didnt  bring that up today q havent
                    the president no he didnt no  he didnt  defy me at all thats your
                   me at all thats your language  he didnt  defy me q well i mean
                  would say wheres the 11th time  he didnt  do his job trump didnt do
                       he was supposed to do and  he didnt  do very well with it but
                rights activist but he said that  he didnt  do that and he was —
                   was — the president you think  he didnt  do it are you on his
                          go crazy like he has —  he didnt  even know what the hell the
                       you dont need the ban you  he didnt  go crazy like he has —
                             ban was but he — so  he didnt  go crazy but he did say
                       that he wished he did but  he didnt  g

no clear pattern

### We will: March

In [28]:
# Concordance lines for the bigram "we will" in the March tokens,
# with 6 context tokens on each side of the keyword.
mar_kwic_wewill= make_kwic2("we will", mar_tokens, win=6)

In [29]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts mar_kwic_wewill in place.
print_kwic(sort_kwic(mar_kwic_wewill , order=['R1']))

                       of our people i know that  we will  achieve victory and quickly return to
                    fault we want to protect and  we will  all of the things that a
                   to making that data public so  we will  all know dr birx were committed
               something that i hope — hopefully  we will  all have made the right moves
                   earth and in the coming weeks  we will  all have to make changes and
                    than we were even before and  we will  also have apparatus in place that
                    on a program to address that  we will  also be working with small businesses
                   put in a great healthcare and  we will  always — i will say this
                        is yes and the answer is  we will  always maintain a solvent social security
                 listens to the governors and so  we will  assess at the end of the
                will only do it with preexisting  we will  back preexisting conditions okay 

### We will: April

In [30]:
# Concordance lines for the bigram "we will" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_wewill= make_kwic2("we will", apr_tokens, win=6)

In [31]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_wewill in place.
print_kwic(sort_kwic(apr_kwic_wewill, order=['R1']))

      need certain authorities from the treasury  we will  accommodate that so were — were
                   anthony the 100000 and i hope  we will  all right so today the department
                   if they need to remain closed  we will  allow them to do that and
               the last four weeks is incredible  we will  also continue to expand our testing
                   lee zeldin out in long island  we will  also be delivering another 200000 n95
        address volatility in global oil markets  we will  always protect our servicemen and servicewomen
                   ship we said if its necessary  we will  and we did there were military
                      going down — which i think  we will  and i feel certain that we
                      long that will be and then  we will  assess it and then we will
                  to have in place and hopefully  we will  at the time that we then
     deposit information and within several days  we will  automatically deposit the mo

### We're working: March

In [32]:
# Concordance lines for "were working" in the March tokens, 6 context tokens per side.
# to_strip removes apostrophes before tokenization, so this keyword matches
# "we're working" as well as the past-tense "were working".
mar_kwic_wereworking= make_kwic2("were working", mar_tokens, win=6)

In [33]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts mar_kwic_wereworking in place.
print_kwic(sort_kwic(mar_kwic_wereworking , order=['R1']))

                        i want to tell you folks  were working  247 on this were going to
                      for the people of new york  were working  along with him and then i
             the patients can be protected again  were working  also with logistics and if you
      likewise building hospitals in los angeles  were working  also — the state of washington
                         wont do well at all but  were working  also with the hotel industry but
          medical supplies from around the world  were working  as we speak on an airlift
                       i want to assure you that  were working  as quickly as we can i
             working very closely with italy and  were working  closely with spain too which is
            under the authority of each governor  were working  constantly with the department of defense
             but to your very important question  were working  day by day with the largest
                      and in fact were working —  were working

### We're working: April

In [34]:
# Concordance lines for "were working" in the April tokens, 6 context tokens per side.
# to_strip removes apostrophes before tokenization, so this keyword matches
# "we're working" as well as the past-tense "were working".
apr_kwic_wereworking= make_kwic2("were working", apr_tokens, win=6)

In [35]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_wereworking in place.
print_kwic(sort_kwic(apr_kwic_wereworking , order=['R1']))

   15 days the presidents coronavirus guidelines  were working  and thats precisely why president trump
                    stone unturned as i said and  were working  around the clock to develop these
                            a day now but were —  were working  around the clock to scale up
          today as an antimalaria medication but  were working  around the country and internationally to
                america and to stay open america  were working  around the clock to continually expand
undertested underserved and minority communities and  were working  closely with partners and states to
             were working with the governors and  were working  closely with the governors the relationship
                        to be a real focal point  were working  closely with governor gretchen whitmer as
                    birx yeah thank you so while  were working  diligently in the midst of the
          million people were killed in addition  were working  directly with hospi

### Have Enough: April

In [54]:
# Concordance lines for the bigram "have enough" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_haveenough = make_kwic2("have enough", apr_tokens, win=6)

In [55]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_haveenough in place.
print_kwic(sort_kwic(apr_kwic_haveenough, order=['R1']))

           governors are still wondering do they  have enough  bed space and whats important here
                      does not mean that we dont  have enough  food in this country to feed
                           20 people a day now i  have enough  for thenext three days if that
               have enough ventilators yes do we  have enough  hospital beds yes weve built 20000
               weve built 20000 hospital beds we  have enough  hospital beds go ahead jim lets
                 them because they say they dont  have enough  illinois said they didnt have enough
                   some of these questions do we  have enough  masks no do we have enough
                   president but you asked do we  have enough  masks yes q we hear from
                         say that you said do we  have enough  masks yes q does the country
                  dont have enough tests we dont  have enough  masks the president the governor —
                      pandemic why is it we dont  have 

### In Fact: April

In [23]:
# Concordance lines for the bigram "in fact" in the April tokens,
# with 6 context tokens on each side of the keyword.
apr_kwic_infact = make_kwic2("in fact", apr_tokens, win=6)

In [24]:
# Sort the lines on the first token to the right of the keyword (R1) and print them.
# sort_kwic sorts apr_kwic_infact in place.
print_kwic(sort_kwic(apr_kwic_infact, order=['R1']))

                to the indian health service and  in fact  1800 members of the public health
             people dont need a pointofcare test  in fact  a pointofcare test does not —
          new cases and the hospitalizations are  in fact  a reflection of the results of
                      the us developed a plan if  in fact  a second wave of the virus
              back q inaudible mr president that  in fact  are you suggesting — the president
           the nation literally by the thousands  in fact  as we stand here today 576
            soon would that happen the president  in fact  certain hospitals — yeah certain hosp
              beginning we appreciate it so much  in fact  dr fauci is here maybe i
                      sense of deaths a bad week  in fact  every day there seems to be
      country have no reported coronavirus cases  in fact  half of the states in america
                 tremendous — hes a tremendous —  in fact  he recently got the presidential medal
       

### Oil: April

In [25]:
# Concordance lines for the single word "oil" in the April tokens (single-word
# make_kwic rather than make_kwic2), with 6 context tokens on each side.
apr_kwic_oil = make_kwic("oil", apr_tokens, win=6)

In [26]:
# Sort the lines on the 6th token to the right of the keyword (R6) and print them.
# sort_kwic sorts apr_kwic_oil in place.
print_kwic(sort_kwic(apr_kwic_oil , order= ['R6']))

                 please q thank you mr president  oil  is trading today at about 23
         our reserves with this very inexpensive  oil  nobody thought theyd ever see a
         to let american producers store surplus  oil  that can be sold at a
                 water is far more valuable than  oil  theres so much oil its a
             for national security and also will  oil  companies fit into that at all
                          now there was a lot of  oil  but it was very controllable all
                       in order to agree to that  oil  deal the president yeah q and
                         because it has a lot of  oil  underneath — oil and gas and
              ever the airlines were doing great  oil  was doing great — oil and
                           so there was a lot of  oil  production to start off with and
                   the market so theres too much  oil  theres a glut and these are
                    a great oil industry and the  oil  industry is being rava

Trump is bailing out the oil industry.