In [1]:
##################################################################
# In this Notebook I aggregated all my functions
#################################################################

In [2]:
import json
import math
import re
from collections import Counter

import numpy as np
import pandas as pd

In [3]:
def calculate_keyness(fdist1, fdist2, fthreshold=5, keyness_threshold=6.6, top=100, print_table=True):
    '''create a keyness comparison table from two frequency lists

    Args:
        fdist1            -- mapping of item -> frequency for the first corpus (printed as Corpus A)
        fdist2            -- mapping of item -> frequency for the second corpus (printed as Corpus B)
        fthreshold        -- minimum frequency an item needs in BOTH corpora to be scored (default: 5)
        keyness_threshold -- only items with keyness strictly greater than this are kept (default: 6.6)
        top               -- maximum number of rows returned/printed (default: 100)
        print_table       -- if True print a formatted table, otherwise return a DataFrame (default: True)

    Returns:
        when print_table is False: a DataFrame with columns item, freq, ref_freq, keyness,
        sorted by descending keyness and limited to the first `top` rows (empty if nothing qualifies);
        when print_table is True: None (the table is printed instead)
    '''
    columns = ['item', 'freq', 'ref_freq', 'keyness']

    c1size = sum(fdist1.values())
    c2size = sum(fdist2.values())

    kdata = []
    for item, freq in fdist1.items():
        if freq < fthreshold:
            continue

        ref_freq = fdist2.get(item, 0)
        if ref_freq < fthreshold:
            continue

        keyness = log_likelihood(freq, c1size, ref_freq, c2size)
        if keyness > keyness_threshold:
            kdata.append({'item': item, 'freq': freq, 'ref_freq': ref_freq, 'keyness': keyness})

    # naming the columns up front keeps the frame well-formed when kdata is empty;
    # selecting them afterwards raised KeyError whenever no item passed the filters
    kdf = pd.DataFrame(kdata, columns=columns).sort_values('keyness', ascending=False)

    if not print_table:
        return kdf[:top]

    # frequency columns are 16 wide so the 14-character header labels no longer run together
    template = "{: <25}{: <16}{: <16}{:0.3f}"
    header = "{: <25}{: <16}{: <16}{}".format('WORD', 'Corpus A Freq.', 'Corpus B Freq.', 'Keyness')

    print("{}\n{}".format(header, "="*len(header)))

    for item, freq, ref_freq, keyness in kdf[:top].values:
        print(template.format(item, freq, ref_freq, keyness))

In [4]:
def log_likelihood(item_A_freq, corpus_A_size, item_B_freq, corpus_B_size):
    '''calculate the signed log likelihood score for a comparison between the frequency of two items

    Args:
        item_A_freq   -- observed frequency of the item in corpus A
        corpus_A_size -- total size of corpus A
        item_B_freq   -- observed frequency of the item in corpus B
        corpus_B_size -- total size of corpus B

    Returns:
        G2 = 2 * sum(O * ln(O / E)) over both corpora, made negative when the item's
        relative frequency is lower in corpus A than in corpus B

    Raises:
        ZeroDivisionError if either corpus size is 0
    '''
    total_freq = item_A_freq + item_B_freq
    total_size = corpus_A_size + corpus_B_size

    # expected frequencies if the item were spread proportionally to corpus size
    E1 = corpus_A_size * total_freq / total_size
    E2 = corpus_B_size * total_freq / total_size

    def _term(observed, expected):
        # O*ln(O/E) tends to 0 as O -> 0, so a zero count contributes nothing;
        # evaluating it directly would raise "math domain error" on log(0)
        if observed > 0:
            return observed * math.log(observed / expected)
        return 0.0

    G2 = 2 * (_term(item_A_freq, E1) + _term(item_B_freq, E2))

    sign = 1 if (item_A_freq / corpus_A_size) >= (item_B_freq / corpus_B_size) else -1

    return sign * G2

In [5]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''split a string into whitespace-delimited tokens, with optional normalization

    Args:
        text        -- the string to tokenize
        lowercase   -- lowercase the text before stripping/splitting (default: False)
        strip_chars -- characters to delete from the text, e.g. punctuation (default: empty string)

    Return:
        A list of tokens
    '''

    # translation table that deletes every character listed in strip_chars
    deletion_table = str.maketrans('', '', strip_chars)

    # lowercasing happens before deletion, so strip_chars is matched against the lowercased text
    normalized = text.lower() if lowercase else text

    return normalized.translate(deletion_table).split()

In [6]:
def get_ngram_tokens(tokens, n=1):
    '''build the list of n-grams found by sliding a window of size n over a token list

    Args:
        tokens -- a list of tokens
        n      -- window size used to build each n-gram (default: 1)

    Returns:
        a list of n-gram strings, each made of n tokens joined by a single space;
        the input list itself is returned unchanged when n is below 2 or larger than len(tokens)
    '''
    total = len(tokens)

    # nothing to combine: window is a single token, or wider than the text
    if not (2 <= n <= total):
        return tokens

    return [" ".join(tokens[start:start + n]) for start in range(total - n + 1)]

In [7]:
def make_kwic(kw, text, win=4):
    '''A basic KWIC function for a text

    Args:
        kw   -- string match for keyword to match for each line
        text -- a list of tokens for the text
        win  -- number of context tokens to keep on each side of the keyword (default: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]];
        both context lists always have exactly win entries, padded with '' at the
        outer edge when the keyword is closer than win tokens to the start or end of text
    '''
    lines = []

    for pos, token in enumerate(text):
        if token != kw:
            continue

        # clamp the start at 0: a negative start index would be read as an offset from the
        # END of the list, which silently dropped the left context of hits near the start
        left = list(text[max(0, pos - win):pos])
        right = list(text[pos + 1:pos + win + 1])

        # [''] * k is an empty list for k <= 0, so full-width contexts are left untouched
        left = [''] * (win - len(left)) + left
        right = right + [''] * (win - len(right))

        lines.append([left, token, right])

    return lines

In [8]:
def print_kwic(kwic, win=None):
    '''print each line of a KWIC list with the left context right-aligned

    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to show per side; if None, the length of the
                first line's left context is used

    Prints one line per KWIC entry; the left context is right-justified in a field
    of win*10 characters. Nothing is printed for an empty kwic.
    '''
    if not kwic:
        return

    # default window: however many left-context words the first line carries
    width = len(kwic[0][0]) if win is None else win

    for entry in kwic:
        left_text = ' '.join(entry[0][-width:])
        right_text = ' '.join(entry[2][:width])
        print("{: >{}}  {}  {}".format(left_text, width*10, entry[1], right_text))

In [9]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments

    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- a single term or a list/tuple of terms of form side-pos, e.g. L1, R3, L4
                  (default: None); position 1 is the token adjacent to the keyword.
                  Terms that are not L1-L4 or R1-R4 are ignored.

    Returns:
        kwic sorted for each positional argument in reverse, i.e. ['R1','L1'] sorts first by L1
        and then R1, so the first term ends up as the primary key.
        The kwic list is sorted in place and the same list object is returned;
        order is never modified.
    '''
    if order is None:
        return kwic

    # accept a bare string as a single term; copy everything else so that the
    # caller's list is not reversed behind its back
    terms = [order] if isinstance(order, str) else list(order)

    for sort_term in reversed(terms):
        # fullmatch rejects trailing junk such as 'L12' that a prefix match would let through
        if not re.fullmatch('[LR][1-4]', sort_term):
            continue

        distance = int(sort_term[1])

        if sort_term[0] == 'L':
            # count from the end of the left context so L1 is the word next to the
            # keyword whatever the window size (the old 3-pos only worked for a window of 4)
            kwic.sort(key=lambda line: line[0][-distance])
        else:
            kwic.sort(key=lambda line: line[2][distance - 1])

    return kwic

In [10]:
def collocates(tokens, kw, win=[4,4]):
    '''return the collocates in a window around a given keyword

    Args:
          tokens -- a list of tokens
          kw     -- keyword string to find and get collocates for
          win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]
                    (a side with a value below 1 contributes no collocates)

    Returns:
          a single flat list with the tokens that fall inside the window around every
          instance of keyword in tokens (left context first, then right, hit by hit)
    '''
    # win is only read, never modified, so the shared default list is safe
    left_size, right_size = win[0], win[1]

    context = []
    for pos, token in enumerate(tokens):
        if token != kw:
            continue

        if left_size >= 1:
            # clamp at 0: a negative start would wrap to the end of the list and
            # lose the left collocates of hits near the beginning of the text
            context.extend(tokens[max(0, pos - left_size):pos])

        if right_size >= 1:
            context.extend(tokens[pos + 1:pos + right_size + 1])

    return context

In [11]:
def get_colls(texts,kw, win=[4,4]):
    '''create a collocate frequency list for instances of a kw in a collection of texts

    Args:
        texts  -- tokenized texts, either as a mapping of text id -> list of tokens
                  or as a plain list of token lists
        kw     -- keyword string to find and get collocates for
        win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]

    Returns:
        a 3-tuple (colls, kw_freq, total_tokens) where
          colls        -- a list-of-tuples where each tuple is (collocate, freq_with_kw, coll_total_freq)
          kw_freq      -- total frequency of kw across all texts, or None if kw never occurs
          total_tokens -- total number of tokens across all texts
    '''
    # anything with .items() is handled exactly as before; a plain list of token
    # lists (what the docstring always promised) is now accepted as well
    pairs = texts.items() if hasattr(texts, 'items') else enumerate(texts)

    word_dist = Counter()
    colls = Counter()
    for _, tokens in pairs:
        word_dist.update(tokens)
        colls.update(collocates(tokens, kw, win))

    coll_rows = [(str(coll), freq, word_dist[coll]) for coll, freq in colls.items()]

    return coll_rows, word_dist.get(kw), sum(word_dist.values())

In [12]:
def load_tweets(tfile):
    '''load tweets from a file that holds one JSON document per line

    Args:
        tfile -- path of the file to read (decoded as UTF-8)

    Returns:
        a list with the decoded JSON value of every parseable line, in file order;
        lines that are not valid JSON (including blank lines) are skipped
    '''
    tweets = []

    # the context manager closes the handle even if reading fails part-way
    with open(tfile, encoding='utf-8') as handle:
        for line in handle:
            try:
                tweets.append(json.loads(line))
            except ValueError:
                # best-effort load: json.JSONDecodeError is a ValueError; skip the bad
                # line but let unrelated errors (e.g. KeyboardInterrupt) propagate
                continue

    return tweets

In [13]:
def process_tweet(tweet):
    '''tokenize a tweet and annotate it in place with averaged VAD scores

    Relies on two names defined elsewhere in the notebook: the tokenizer `tt`
    (its .tokenize method is called on the tweet text) and the lexicon `NRC_VAD`
    (looked up by lowercased token; each entry is read for keys 'V', 'A' and 'D').

    Args:
        tweet -- tweet record; the text is read from tweet['text'], falling back to
                 tweet['content'] when there is no 'text' key

    Keys written to tweet:
        tokens    -- result of tt.tokenize on the tweet text
        VAD_toks  -- one dict per token found in NRC_VAD: a copy of its lexicon entry plus
                     'tok' holding the token as it appeared in the tweet
        Valence, Arousal, Dominance -- mean 'V', 'A', 'D' over VAD_toks, or 0 if VAD_toks is empty

    Returns:
        None; tweet is modified in place
    '''
    # only a missing 'text' key triggers the fallback; the previous bare except also
    # re-ran the whole routine on 'content' after ANY failure, masking the real error
    try:
        text = tweet['text']
    except KeyError:
        text = tweet['content']

    toks = tt.tokenize(text)
    tweet['tokens'] = toks

    tweet['VAD_toks'] = []
    tweet['Valence'] = 0
    tweet['Dominance'] = 0
    tweet['Arousal'] = 0

    for t in toks:
        key = t.lower()
        if key not in NRC_VAD:
            continue

        # copy the lexicon entry: writing 'tok' into the shared entry changed NRC_VAD itself
        # and made every occurrence of a word (in every tweet) point at the same dict
        scores = dict(NRC_VAD[key])
        scores['tok'] = t

        tweet['Valence'] += scores['V']
        tweet['Arousal'] += scores['A']
        tweet['Dominance'] += scores['D']

        tweet['VAD_toks'].append(scores)

    # turn the sums into means; with no scored tokens the dimensions stay at 0
    scored = len(tweet['VAD_toks'])
    if scored > 0:
        for dimension in ('Valence', 'Arousal', 'Dominance'):
            tweet[dimension] /= scored
        

In [14]:
def pmi(A, B, AB, N):
    '''pointwise mutual information of a word pair from its joint and individual frequencies

    Args:
        A   -- total frequency of word 1
        B   -- total frequency of word 2
        AB  -- frequency of word 1 and word 2 together
        N   -- number of tokens in corpus/sample

    Returns:
        the PMI value   log2( N * AB / (A * B) )
    '''
    product_of_totals = A * B
    ratio = AB / product_of_totals
    return math.log2(N * ratio)

In [15]:
def average(lst):
    '''return the arithmetic mean of the values in lst, rounded to 2 decimal places

    Raises ZeroDivisionError when lst is empty.
    '''
    total = sum(lst)
    count = len(lst)
    return round(total / count, 2)

In [16]:
def fill_missing_range(df, field, range_from, range_to, range_step=1, fill_with=0):
    '''make sure df has one row for every value of a numeric range, filling the gaps

    Args:
        df         -- the DataFrame to complete
        field      -- name of the column that should cover the range
        range_from -- first value of the range (inclusive)
        range_to   -- end of the range (exclusive, as in np.arange)
        range_step -- distance between consecutive range values (default: 1)
        fill_with  -- value written into every missing cell of the result (default: 0)

    Returns:
        a new DataFrame sorted by field with a fresh 0..n-1 index. Because the range is
        right-merged onto df, rows of df whose field value is not in the range are dropped,
        and fill_with replaces ALL missing values in the result, not only those of added rows.
    '''
    full_range = pd.DataFrame({field: np.arange(range_from, range_to, range_step)})

    return (
        df.merge(full_range, how='right', on=field)
        .sort_values(by=field)
        # drop=True discards the old index directly; materialising it as an 'index'
        # column first raised ValueError when df already had a column called 'index'
        .reset_index(drop=True)
        .fillna(fill_with)
    )