In [1]:
import tweepy
import configparser
import os
import json
import GetOldTweets3 as got
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import string
import random
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
import csv
import seaborn as sn
import graphviz
import math

from collections import Counter

In [2]:
def calculate_keyness(fdist1, fdist2, fthreshold=5, keyness_threshold=6.6, top=100, print_table=True):
    '''create a keyness comparison table from two frequency lists

    Args:
        fdist1            -- mapping of item -> frequency for corpus A
        fdist2            -- mapping of item -> frequency for corpus B (the reference corpus)
        fthreshold        -- minimum frequency an item needs in BOTH corpora to be scored (default: 5)
        keyness_threshold -- only items scoring strictly above this value are kept (default: 6.6)
        top               -- maximum number of rows to return or print (default: 100)
        print_table       -- print a text table when True, return a DataFrame when False (default: True)

    Returns:
        When print_table is False: a DataFrame with the columns item, freq, ref_freq and
        keyness, sorted by descending keyness and cut to the first `top` rows. The frame is
        empty (but still has those columns) when nothing passes the thresholds.
        When print_table is True: None; the table is written to stdout.
    '''
    columns = ['item', 'freq', 'ref_freq', 'keyness']

    c1size = sum(fdist1.values())
    c2size = sum(fdist2.values())

    kdata = []

    for item, freq in fdist1.items():
        if freq < fthreshold:
            continue

        ref_freq = fdist2.get(item, 0)
        if ref_freq < fthreshold:
            continue

        keyness = log_likelihood(freq, c1size, ref_freq, c2size)

        if keyness > keyness_threshold:
            kdata.append({'item': item, 'freq': freq, 'ref_freq': ref_freq, 'keyness': keyness})

    # Passing `columns` to the constructor keeps the frame well-formed when kdata is
    # empty; selecting the columns afterwards raised KeyError in that case.
    kdf = pd.DataFrame(kdata, columns=columns).sort_values('keyness', ascending=False)

    if not print_table:
        return kdf[:top]

    # The frequency columns are 16 wide so the header labels ('Corpus A Freq.' is
    # 14 characters) no longer run into each other and stay aligned with the rows.
    template = "{: <25}{: <16}{: <16}{:0.3f}"
    header = "{: <25}{: <16}{: <16}{}".format('WORD', 'Corpus A Freq.', 'Corpus B Freq.', 'Keyness')

    print("{}\n{}".format(header, "="*len(header)))

    for item, freq, ref_freq, keyness in kdf[:top].values:
        print(template.format(item, freq, ref_freq, keyness))

In [3]:
def log_likelihood(item_A_freq, corpus_A_size, item_B_freq, corpus_B_size):
    '''calculate the signed log likelihood (G2) score for the frequency of an item in two corpora

    Args:
        item_A_freq   -- frequency of the item in corpus A
        corpus_A_size -- total number of tokens in corpus A
        item_B_freq   -- frequency of the item in corpus B
        corpus_B_size -- total number of tokens in corpus B

    Returns:
        G2, positive when the item's relative frequency in corpus A is greater than or
        equal to its relative frequency in corpus B, negative otherwise.

    Raises:
        ZeroDivisionError if either corpus size is 0.
    '''
    total_freq = item_A_freq + item_B_freq
    total_size = corpus_A_size + corpus_B_size

    # expected frequencies if the item were spread evenly over both corpora
    E1 = corpus_A_size * total_freq / total_size
    E2 = corpus_B_size * total_freq / total_size

    def _term(observed, expected):
        # An observed frequency of 0 contributes nothing (0 * ln 0 is taken as 0);
        # evaluating math.log(0) directly would raise ValueError.
        if observed > 0:
            return observed * math.log(observed / expected)
        return 0.0

    G2 = 2 * (_term(item_A_freq, E1) + _term(item_B_freq, E2))

    sign = 1 if (item_A_freq / corpus_A_size) >= (item_B_freq / corpus_B_size) else -1

    return sign * G2

In [4]:
def tokenize(text, lowercase=False, strip_chars=''):
    '''split a string into whitespace-delimited tokens, with optional normalization

    Args:
        text        -- the string to tokenize
        lowercase   -- lowercase the text before splitting (default: False)
        strip_chars -- every character in this string is deleted from the text
                       before splitting, e.g. punctuation (default: empty string)

    Return:
        A list of tokens
    '''
    # translation table that maps each character of strip_chars to "delete"
    deletion_table = str.maketrans('', '', strip_chars)

    source = text.lower() if lowercase else text

    return source.translate(deletion_table).split()

In [5]:
def get_ngram_tokens(tokens, n=1):
    '''build the list of n-grams found in a list of tokens

    Args:
        tokens -- a list of tokens
        n      -- the window size used to build each n-gram

    Returns:
        a list of n-gram strings, each made of n tokens joined by a single space;
        the input list itself is returned unchanged when n is below 2 or larger
        than the number of tokens
    '''
    if not (2 <= n <= len(tokens)):
        return tokens

    last_start = len(tokens) - n

    return [" ".join(tokens[start:start + n]) for start in range(last_start + 1)]

In [6]:
def make_kwic(kw, text, win=4):
    '''A basic KWIC function for a text

    Args:
        kw   -- keyword; every token equal to it produces one KWIC line
        text -- a list of tokens for the text
        win  -- number of context tokens kept on each side (default: 4)

    Return:
        list of lines of form [ [left context words], kw, [right context words]]
        where both context lists always have exactly `win` entries; positions that
        fall outside the text are filled with empty strings
    '''
    lines = []

    for pos, token in enumerate(text):
        if token == kw:
            # Clamp the start at 0: a negative start would make the slice wrap around
            # to the end of the text and drop (or corrupt) the left context of any
            # hit closer than `win` tokens to the beginning.
            left = list(text[max(0, pos - win):pos])
            right = list(text[pos + 1:pos + win + 1])

            # pad to a fixed width so every line has the same number of columns
            left = [''] * (win - len(left)) + left
            right = right + [''] * (win - len(right))

            lines.append([left, token, right])

    return lines

In [7]:
def plot_shared_vocab(cdf, start=5, end=100):
    '''Draw the words of a comparison table as text on a tweets-vs-articles plane

    Args:
        cdf   -- DataFrame with the columns word, Tweets_percent and Articles_percent;
                 a text_size column is added to (or overwritten on) cdf in place
        start -- first row position to draw (default: 5)
        end   -- row position at which to stop drawing (default: 100)

    Each word in rows start:end is placed at (Tweets_percent*.8, Articles_percent*1.2)
    with a font size proportional to its min-max scaled combined percentage. A
    diagonal line from the origin to the largest plotted percentage is drawn and
    the figure is shown.
    '''

    def scaler(values):
        '''min-max scale values into the range 0..1'''
        vmin = min(values)
        vmax = max(values)
        span = vmax - vmin

        if span == 0:
            # every value is identical: dividing by the span would raise
            # ZeroDivisionError, so give all words the same mid-scale size
            return [0.5 for _ in values]

        return [(v - vmin) / span for v in values]

    # Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and later
    # releases dropped the old names; use whichever spelling this install provides.
    style_name = next((name for name in ('seaborn-paper', 'seaborn-v0_8-paper')
                       if name in plt.style.available), 'default')

    with plt.style.context(style_name):

        plt.figure(figsize=(18,9))

        ax_max = cdf.iloc[start:end][['Tweets_percent','Articles_percent']].max().max()

        cdf['text_size'] = scaler(cdf.Tweets_percent.add(cdf.Articles_percent))

        for row in cdf[start:end].itertuples():
            plt.text(row.Tweets_percent*.8, row.Articles_percent*1.2, row.word,
                     color='#0000FF',
                     size=140*row.text_size,
                     ha='center', va='center', alpha=0.25)

        plt.axis([0, ax_max, 0, ax_max])
        plt.plot((0,ax_max),(0,ax_max))

        plt.xticks([])
        plt.yticks([])
        plt.xlabel('Use in Tweets (% of tokens)', fontsize=20)
        plt.ylabel('Use in Articles (% of tokens)', fontsize=20)

        plt.show()

In [8]:
def print_kwic(kwic, win=None):
    '''Print a KWIC object, one line per hit

    Args:
        kwic -- a list of KWIC lines of the form [ [left words], kw, [right words]]
        win  -- number of context words to show on each side; when None, the length
                of the first line's left context is used

    Prints each line with the left context right-aligned in a field of win*10
    characters, followed by the keyword and the right context. Nothing is printed
    for an empty kwic.
    '''
    if not kwic:
        return

    width = len(kwic[0][0]) if win is None else win

    for line in kwic:
        left_text = ' '.join(line[0][-width:])
        keyword = line[1]
        right_text = ' '.join(line[2][:width])

        print("{: >{}}  {}  {}".format(left_text, width*10, keyword, right_text))

In [9]:
def sort_kwic(kwic, order=None):
    ''' sort a kwic list using the passed positional arguments

    Args:
        kwic   -- a list of lists [ [left tokens], kw, [right tokens]]
        order  -- one positional argument, or a list of them, of form side-pos,
                  e.g. L1, R3, L4, where the position counts outwards from the
                  keyword (default: None)

    Returns:
        the same kwic list, sorted in place. The terms are applied in reverse with
        a stable sort, i.e. ['R1','L1'] sorts first by L1 and then by R1, so R1 is
        the primary key and L1 breaks ties. Terms that are not of the form side-pos
        are skipped. The `order` list passed in is left unmodified.

    Raises:
        IndexError if a position lies outside the context window of a line.
    '''
    if order is None:
        return kwic

    # copy instead of reversing the caller's list in place
    terms = [order] if isinstance(order, str) else list(order)

    for sort_term in reversed(terms):
        # anchored at the end so that strings such as 'L1x' are rejected
        if not re.match(r'[LR][1-9][0-9]*\Z', sort_term):
            # skip invalid terms; falling through used to sort by a bogus position
            continue

        distance = int(sort_term[1:])

        if sort_term[0] == 'L':
            # Ln is the n-th token counting back from the keyword, i.e. index -n;
            # this works for any window size, not only a window of 4
            kwic.sort(key=lambda line: line[0][-distance])
        else:
            kwic.sort(key=lambda line: line[2][distance - 1])

    return kwic

In [10]:
def get_collocates(kw,token_dict, span=4):
    ''' Create a frequency list of the collocates of a specified keyword in a corpus of texts

    Args:
        kw         -- keyword to use as center of analysis
        token_dict -- corpus in the form of a dictionary of tokenized texts where a text is a list of tokens
        span       -- number of tokens on each side of the keyword to count as collocates (default: 4)

    Returns:

        Frequency list of collocates in a Counter object
    '''
    counts = Counter()

    for tokens in token_dict.values():
        for i, token in enumerate(tokens):
            if token == kw:
                # left window: clamp at 0 so a hit near the start of the text does
                # not produce a negative slice start that wraps around
                counts.update(tokens[max(0, i - span):i])
                # right window: the end is relative to the hit (i + span + 1); the
                # previous end of span + 1 was an absolute position in the text
                counts.update(tokens[i + 1:i + span + 1])

    return counts

In [11]:
def collocates(tokens, kw, win=[4,4]):
    '''return the collocates in a window around a given keyword

    Args:
          tokens -- a list of tokens
          kw     -- keyword string to find and get collocates for
          win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]
                    (only read, never modified)

    Returns:
          a single flat list holding, for each instance of the keyword in tokens,
          its left context followed by its right context
    '''
    left_span, right_span = win[0], win[1]

    context = []
    for hit, token in enumerate(tokens):
        if token == kw:
            if left_span >= 1:
                # clamp at 0: a negative start would wrap around to the end of the
                # list and lose the left context of hits near the beginning
                context.extend(tokens[max(0, hit - left_span):hit])
            if right_span >= 1:
                context.extend(tokens[hit + 1:hit + right_span + 1])

    return context

In [12]:
def get_colls(texts,kw, win=[4,4]):
    '''build collocate frequencies for a keyword across a collection of texts

    Args:
        texts  -- a mapping whose values are tokenized texts (lists of tokens)
        kw     -- keyword string to find and get collocates for
        win    -- a list of number of tokens to left (index 0) and right (index 1) to use; default: [4,4]

    Returns:
        a 3-tuple of
          - a list-of-tuples where each tuple is (collocate, freq_with_kw, coll_total_freq)
          - the total frequency of kw over all texts (None when kw never occurs)
          - the total number of tokens over all texts
    '''
    vocabulary = Counter()
    neighbours = Counter()

    for _text_id, tokens in texts.items():
        vocabulary.update(tokens)
        neighbours.update(collocates(tokens,kw, win))

    rows = []
    for collocate, joint_freq in neighbours.items():
        rows.append((str(collocate), joint_freq, vocabulary[collocate]))

    return rows, vocabulary.get(kw), sum(vocabulary.values())

In [13]:
def plot_collocates(kw, collocate_list, num=20, show_freq=False, title=None, threshold=1):
    ''' Build a graphviz graph linking a keyword to its most frequent collocates

    Args:
        kw              -- keyword placed (upper-cased) at the center of the graph
        collocate_list  -- Counter object of collocate frequencies
        num             -- how many of the most common collocates to consider [default=20]
        show_freq       -- label each edge with its frequency True/False [default=False]
        title           -- string to use as a title for the plot [default=None]
        threshold       -- minimum frequency a collocate needs to get an edge [default=1]

    Returns:
        the graphviz.Graph object; the edge pen width is the base-2 logarithm of
        the collocate frequency
    '''
    graph = graphviz.Graph(engine='neato')
    graph.attr('graph', overlap='scalexy', size="6,6")

    if title:
        graph.attr('graph', label=title, labelloc='t', fontsize='20')

    for item, freq in collocate_list.most_common(num):
        if freq < threshold:
            continue

        edge_label = str(freq) if show_freq else None
        graph.edge(kw.upper(), item, penwidth=str(math.log(freq,2)), label=edge_label)

    return graph

In [14]:
def compare_items(dist1, dist2, dist3, dist4, items, scaling=10000):
    ''' given four Counter objects compare the frequency and relative frequency of list of items

    Args:
        dist1    -- Counter frequency list object
        dist2    -- Counter frequency list object
        dist3    -- Counter frequency list object
        dist4    -- Counter frequency list object
        items    -- list of string items to look up in each distribution (missing items count as 0)
        scaling  -- normalization factor, e.g. per 10,000 words (default: 10000)

    Returns:

        list of tuples of form
            (item, item_freq_dist1, norm_item_freq_dist1, item_freq_dist2, norm_item_freq_dist2,
                   item_freq_dist3, norm_item_freq_dist3, item_freq_dist4, norm_item_freq_dist4)
        where each normalized frequency is freq / size_of_that_dist * scaling, or 0.0
        when that distribution is empty
    '''
    dists = (dist1, dist2, dist3, dist4)
    # each distribution is normalized by ITS OWN size (dist1 used to be divided by
    # the size of dist2)
    sizes = [sum(dist.values()) for dist in dists]

    item_comparison = []

    for item in items:
        row = [item]
        for dist, size in zip(dists, sizes):
            freq = dist.get(item, 0)
            row.append(freq)
            # an empty distribution has no meaningful rate; report 0.0 instead of
            # raising ZeroDivisionError
            row.append(freq / size * scaling if size else 0.0)
        item_comparison.append(tuple(row))

    return item_comparison

In [15]:
def compare_2items(dist1, dist2, items, scaling=10000):
    ''' given two Counter objects compare the frequency and relative frequency of list of items

    Args:
        dist1    -- Counter frequency list object
        dist2    -- Counter frequency list object
        items    -- list of string items to look up in dist1 and dist2 (missing items count as 0)
        scaling  -- normalization factor, e.g. per 10,000 words (default: 10000)

    Returns:

        list of tuples of form
            (item, item_freq_dist1, norm_item_freq_dist1, item_freq_dist2, norm_item_freq_dist2)
        where each normalized frequency is freq / size_of_that_dist * scaling, or 0.0
        when that distribution is empty
    '''
    # dist1 is normalized by its own size (it used to be divided by the size of dist2)
    dist1_size = sum(dist1.values())
    dist2_size = sum(dist2.values())

    def _normalize(freq, size):
        # an empty distribution has no meaningful rate; report 0.0 instead of
        # raising ZeroDivisionError
        return freq / size * scaling if size else 0.0

    item_comparison = []

    for item in items:
        d1_freq = dist1.get(item, 0)
        d2_freq = dist2.get(item, 0)

        item_comparison.append((item,
                                d1_freq, _normalize(d1_freq, dist1_size),
                                d2_freq, _normalize(d2_freq, dist2_size)))

    return item_comparison

In [16]:
def comparison_plot(comparison_data, label1='corpus 1', label2='corpus 2',label3='corpus 3',label4='corpus 4'):
    ''' draw a grouped barplot of the relative frequencies of items in four corpora

    Args:
        comparison_data --  list of tuples produced by the compare_items() function;
                            positions 0, 2, 4, 6 and 8 of each tuple are used (the
                            item and its four normalized frequencies)
        label1          --  legend label for first corpus (default: corpus 1)
        label2          --  legend label for second corpus (default: corpus 2)
        label3          --  legend label for third corpus (default: corpus 3)
        label4          --  legend label for fourth corpus (default: corpus 4)

    Produces a Seaborn barplot
    '''
    plt.figure(figsize=(12,6))

    # keep the item plus the normalized frequency of each corpus
    wide = pd.DataFrame(comparison_data)[[0,2,4,6,8]]
    wide.columns = ['item', label1, label2, label3, label4]

    # one row per (item, corpus) pair, as needed for the hue grouping
    tidy = wide.melt(id_vars=['item'])
    tidy.columns = ['item', 'corpus', 'frequency']

    sn.barplot(x='item', y='frequency', hue='corpus', data=tidy)
    plt.show()

In [17]:
def comparison2_plot(comparison_data, label1='corpus 1', label2='corpus 2'):
    ''' draw a paired barplot of the relative frequencies of items in two corpora

    Args:
        comparison_data --  list of tuples of the form built by compare_2items();
                            positions 0, 2 and 4 of each tuple are used (the item
                            and its two normalized frequencies)
        label1          --  legend label for first corpus (default: corpus 1)
        label2          --  legend label for second corpus (default: corpus 2)

    Produces a Seaborn barplot
    '''
    plt.figure(figsize=(12,6))

    # keep the item plus the normalized frequency of each corpus
    wide = pd.DataFrame(comparison_data)[[0,2,4]]
    wide.columns = ['item', label1, label2]

    # one row per (item, corpus) pair, as needed for the hue grouping
    tidy = wide.melt(id_vars=['item'])
    tidy.columns = ['item', 'corpus', 'frequency']

    sn.barplot(x='item', y='frequency', hue='corpus', data=tidy)
    plt.show()

In [18]:
def jsonconverter(o):
    '''convert datetime objects to their string form, for use as a json `default` hook

    Args:
        o -- the object to convert

    Returns:
        str(o) when o is a datetime.datetime instance, otherwise None
    '''
    # Import the module under a private alias: the notebook's import cell runs
    # `import datetime` followed by `from datetime import datetime`, so the global
    # name `datetime` is the class and `datetime.datetime` raises AttributeError.
    import datetime as _datetime

    if isinstance(o, _datetime.datetime):
        return o.__str__()

    return None

In [19]:
def download_query_tweets(query, date_since, date_until, max=100000):
    '''download tweets matching a search query with GetOldTweets3

    Args:
        query      -- search string passed to setQuerySearch
        date_since -- value passed to setSince
        date_until -- value passed to setUntil
        max        -- value passed to setMaxTweets (default: 100000)

    Returns:
        a list with the attribute dictionary (__dict__) of every tweet returned
    '''
    print(f"Downloading tweets for query: '{query}' from {date_since} to {date_until}")

    # build the search criteria one setter at a time
    criteria = got.manager.TweetCriteria()
    criteria = criteria.setQuerySearch(query)
    criteria = criteria.setSince(date_since)
    criteria = criteria.setUntil(date_until)
    criteria = criteria.setMaxTweets(max)

    fetched = got.manager.TweetManager.getTweets(criteria)

    return [vars(tweet) for tweet in fetched]

In [20]:
def download_user_tweets(username, user_query, date_since, date_until):
    '''download the tweets of one user that match a search query with GetOldTweets3

    Args:
        username   -- account name passed to setUsername
        user_query -- search string passed to setQuerySearch
        date_since -- value passed to setSince
        date_until -- value passed to setUntil

    Returns:
        a list with the attribute dictionary (__dict__) of every tweet returned

    No setMaxTweets limit is applied here.
    '''
    # Use the `user_query` parameter: the body previously referred to `query`, which
    # is not defined in this function and so raised NameError (or silently picked up
    # an unrelated global of that name).
    print(f"Downloading for {username} and {user_query}")
    tweetCriteria = got.manager.TweetCriteria().setUsername(username)\
                                               .setQuerySearch(user_query)\
                                               .setSince(date_since)\
                                               .setUntil(date_until)

    tweets = got.manager.TweetManager.getTweets(tweetCriteria)
    list_of_tweets = [tweet.__dict__ for tweet in tweets]
    return list_of_tweets

In [21]:
def load_tweets(tf):
    '''load tweets from a file holding one JSON document per line

    Args:
        tf -- path of the file to read

    Returns:
        a list with the decoded object of every non-blank line, in file order
    '''
    tweets = []
    # the with-block closes the file even when a line fails to parse
    with open(tf) as handle:
        for line in handle:
            # skip blank lines (e.g. a trailing newline) instead of failing on them
            if line.strip():
                tweets.append(json.loads(line))
    return tweets

In [22]:
def DictListUpdate(lis1, lis2):
    '''append every element of lis1 to lis2

    Args:
        lis1 -- iterable of elements to add
        lis2 -- list that is extended in place

    Returns:
        lis2 (the same object), now ending with the elements of lis1
    '''
    # extend() also copes with lis1 and lis2 being the same list; appending to a
    # list while iterating over it never terminates
    lis2.extend(lis1)
    return lis2