In [1]:
import sys
import json
import nltk
import operator
from collections import Counter

In [2]:
def read_tweets_from_json(filename):
    """Return the "text" field of every tweet stored in a JSON file.

    The file is opened as UTF-8 and parsed with json.load; the parsed value is
    iterated and the "text" entry of each element is collected, in file order.

    Args:
        filename: path of the JSON file to read.

    Returns:
        A list holding the "text" value of each element.
    """
    with open(filename, encoding="utf-8") as handle:
        records = json.load(handle)
    # json.load has already consumed the whole file, so the records can be
    # walked after the handle is closed.
    return [record["text"] for record in records]

In [3]:
# Load the text of every tweet stored in the Trump JSON file (path is relative to this notebook).
trump_tweets=read_tweets_from_json("../data/trump_tweets.json")

In [4]:
# Load the text of every tweet stored in the AOC JSON file (path is relative to this notebook).
aoc_tweets=read_tweets_from_json("../data/aoc_tweets.json")

Explore your assumptions about the words you think will most distinguish the tweets of Donald Trump from those of Alexandria Ocasio-Cortez.  Before looking at the data, what words do you think will be comparatively distinctive of each?  (If you're not familiar with either, see http://twitter.com/realDonaldTrump and http://twitter.com/AOC).

In [5]:
def convert_tweets_to_tokens(tweets):
    """Tokenize every tweet and return all tokens as a single flat list.

    Each tweet is passed to nltk.casual_tokenize, and the resulting tokens are
    concatenated in the order the tweets were given.

    Args:
        tweets: iterable of tweet texts.

    Returns:
        A list containing the tokens of all tweets, in input order.
    """
    return [token for tweet in tweets for token in nltk.casual_tokenize(tweet)]

In [6]:
def get_counts(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: iterable of hashable tokens.

    Returns:
        A collections.Counter mapping each distinct token to its number of
        occurrences, with keys in first-seen order.
    """
    # Passing an iterator (rather than the object itself) makes Counter tally
    # the elements one by one for every kind of iterable, mappings included.
    return Counter(iter(tokens))

The $\chi^2$ test, as used in the comparison of different texts, is designed to measure how statistically significant the distribution of counts in a 2x2 contingency table is.  Use the following function to analyze the difference between these accounts.  How do the most distinct terms comport with your assumptions?

In [7]:
def chi_square(one_counts, two_counts, one_label="@realdonaldtrump", two_label="@AOC", top_n=20):
    """Print the words that most distinguish two corpora under the chi-square test.

    For every word in the combined vocabulary a 2x2 contingency table is built
    (this word vs. all other tokens, corpus one vs. corpus two) and scored with
    the closed-form chi-square statistic for 2x2 tables.  Each word is then
    attributed to the corpus in which its relative frequency is strictly
    higher (ties go to corpus two), and the `top_n` highest-scoring words of
    each corpus are printed as "word<TAB>score" under the corpus heading.

    Args:
        one_counts: mapping from token to its count in the first corpus
            (a Counter or a plain dict).
        two_counts: mapping from token to its count in the second corpus.
        one_label: heading printed above the first corpus' words.
        two_label: heading printed above the second corpus' words.
        top_n: how many words to print per corpus (None prints all of them).

    Returns:
        None; the report is written to standard output.
    """
    # Float totals keep all of the arithmetic below in floating point.
    one_sum = float(sum(one_counts.values()))
    two_sum = float(sum(two_counts.values()))
    N = one_sum + two_sum

    # A dict (not a set) preserves first-seen order, so words with equal
    # scores are always reported in the same order.
    vocab = dict.fromkeys(one_counts)
    vocab.update(dict.fromkeys(two_counts))

    vals = {}
    for word in vocab:
        # .get() lets plain dicts work for words missing from one corpus.
        O11 = one_counts.get(word, 0)
        O12 = two_counts.get(word, 0)
        O21 = one_sum - O11
        O22 = two_sum - O12

        # We'll use the simpler form given in Manning and Schuetze (1999)
        # for 2x2 contingency tables:
        # https://nlp.stanford.edu/fsnlp/promo/colloc.pdf, equation 5.7
        denominator = (O11 + O12)*(O11 + O21)*(O12 + O22)*(O21 + O22)

        if denominator:
            vals[word] = (N*(O11*O22 - O12*O21)**2)/denominator
        else:
            # A zero marginal (an empty corpus, or one word making up every
            # token) leaves the statistic undefined; report no association
            # instead of raising ZeroDivisionError.
            vals[word] = 0.0

    # sorted() is stable, so ties keep their vocabulary order.
    ranked = sorted(vals, key=vals.get, reverse=True)

    one = []
    two = []
    for word in ranked:
        # Relative frequencies; an empty corpus contributes a rate of zero.
        one_rate = one_counts.get(word, 0)/one_sum if one_sum else 0.0
        two_rate = two_counts.get(word, 0)/two_sum if two_sum else 0.0
        if one_rate > two_rate:
            one.append(word)
        else:
            two.append(word)

    print("%s:\n" % (one_label,))
    for word in one[:top_n]:
        print("%s\t%s" % (word, vals[word]))

    print("\n\n%s:\n" % (two_label,))
    for word in two[:top_n]:
        print("%s\t%s" % (word, vals[word]))

In [8]:
# Flatten the Trump tweets into one token list, then tally how often each token occurs.
trump_tokens=convert_tweets_to_tokens(trump_tweets)
trump_counts=get_counts(trump_tokens)

In [9]:
# Flatten the AOC tweets into one token list, then tally how often each token occurs.
aoc_tokens=convert_tweets_to_tokens(aoc_tweets)
aoc_counts=get_counts(aoc_tokens)

In [10]:
# Print the highest-scoring chi-square terms for each account (Trump counts first, AOC counts second).
chi_square(trump_counts, aoc_counts)

@realdonaldtrump:

"	1843.3462598546037
@realDonaldTrump	767.7537107486318
!	735.1458768455142
.	390.8452989655345
Trump	308.2324244351674
will	226.22255210586596
great	206.309973781387
Donald	139.34687488005184
Obama	122.4401090728926
Thanks	118.68753314790392
be	108.05517114062634
...	106.28152288766938
Great	103.51347969059988
he	101.40623092074443
President	79.85272402251856
#Trump2016	74.21019140598298
president	71.74551195557245
?	71.28235249685538
his	69.37430185956471
U	68.78604706272642


@AOC:

…	15795.91275829964
@Ocasio2018	6518.920393680039
RT	5536.225035994314
💜	2091.116009234117
’	1632.664834479263
🏽	1459.8766408560216
*	989.3264413370716
Queens	947.1505645422372
Bronx	925.0762605949463
+	792.9379998245004
Ocasio-Cortez	747.8298934010594
Alexandria	712.0431240696781
@AOC	668.5093331893148
️	615.1176007845903
💪	607.8691457792726
Ocasio	600.5212766556089
s	523.3983763090894
re	522.0381636819615
progressive	508.1585138436886
Crowley	496.83364760794115


We saw earlier that $\chi^2$ is not a perfect estimator since it doesn't account for the burstiness of language (the tendency of mentions of the same word to clump together in a text).  Do you expect this to still hold on Twitter?  Why or why not?  How are the differences identified by the $\chi^2$ test similar to those identified by the Mann-Whitney test?