In [1]:
import sys, operator, math, nltk
from collections import Counter
import pandas as pd
import os
import numpy as np
import re
from scipy.stats import mannwhitneyu
import os

In [2]:
# Intermediate-data directory: <two levels above the current working dir>/Data/Intermediate Data/
# Built with os.path.join instead of hard-coded "\\" separators so the path also
# resolves on non-Windows systems. The final "" component keeps the trailing
# separator, because later cells append bare file names to this string.
inpath2 = os.path.join(
    os.path.dirname(os.path.dirname(os.getcwd())),
    "Data",
    "Intermediate Data",
    "",
)

In [3]:
# Load the combined CSV from the intermediate-data directory into a DataFrame.
all_files_csv = inpath2 + 'final_all_files.csv'
df = pd.read_csv(all_files_csv)

In [4]:
# Keep only the rows whose candidate is 'trump'. The explicit .copy() makes the
# subset an independent frame, so adding the 'rally' column below is an ordinary
# column assignment and does not raise pandas' SettingWithCopyWarning (which
# fires when assigning into a filtered slice of another frame).
df = df[df['candidate'] == 'trump'].copy()
# Binary indicator column: 1 where event_type is 'rally', 0 otherwise.
df['rally'] = np.where(df['event_type'] == 'rally', 1, 0)

In [5]:
# Split the filtered frame on the rally indicator. The un-suffixed frames keep
# only the 'text' column; the "2" variants also keep 'title'.
is_rally = df['rally'] == 1
is_not_rally = df['rally'] == 0

trump_rallies = df.loc[is_rally, ['text']]
trump_rallies2 = df.loc[is_rally, ['title', 'text']]
trump_not_rallies = df.loc[is_not_rally, ['text']]
trump_not_rallies2 = df.loc[is_not_rally, ['title', 'text']]

In [6]:
# Dump each single-column text frame to a file in the intermediate-data
# directory, without the index and without a header row.
for _frame, _out_name in (
    (trump_rallies, 'trump_rallies.txt'),
    (trump_not_rallies, 'trump_not_rallies.txt'),
):
    _frame.to_csv(inpath2 + _out_name, index=False, header=False, quoting=None)

In [7]:
# Token pattern with two capture groups:
#   group 1: a word, optionally continued by hyphen-joined words (e.g. "well-known")
#   group 2: a word, optionally continued by apostrophe-joined parts
# Written as a raw string so "\w" reaches the regex engine directly instead of
# relying on Python's deprecated pass-through of invalid string escapes; the
# compiled pattern text is unchanged.
# NOTE: both alternatives start with \w+, and the first alternative is tried
# first, so group 2 never captures anything; a contraction such as "don't" is
# therefore split into "don" and "t".
look_for = re.compile(r"(\w+(?:-\w+)*)|(\w+(?:'\w+)*)")
# NLTK's English stop-word list, held as a set for fast membership tests.
# (Needs the NLTK 'stopwords' corpus to be available locally.)
english_stop_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_list)

In [8]:
def read_and_tokenize(filename):
    """Read a UTF-8 text file and return its lower-cased, stop-word-free tokens.

    Every line is tokenized with ``nltk.tokenize.regexp_tokenize`` using the
    module-level ``look_for`` pattern. Each match is indexed with ``[0]``
    (the pattern's first capture group), lower-cased, and kept only if it is
    not in the module-level ``stop_words`` set.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        All kept tokens, in file order, across every line of the file.
    """
    kept = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            for match in nltk.tokenize.regexp_tokenize(raw_line, look_for):
                word = match[0].lower()
                # Stop words are filtered after lower-casing.
                if word not in stop_words:
                    kept.append(word)
    return kept

In [9]:
# Read the exported text files back as token lists.
# Note: this rebinds trump_rallies / trump_not_rallies from the DataFrames built
# in an earlier cell to plain lists of tokens.
rallies_txt = inpath2 + 'trump_rallies.txt'
trump_rallies = read_and_tokenize(rallies_txt)

not_rallies_txt = inpath2 + 'trump_not_rallies.txt'
trump_not_rallies = read_and_tokenize(not_rallies_txt)

In [10]:
def get_counts(tokens):
    """Return a ``Counter`` mapping each token to its number of occurrences.

    ``tokens`` is iterated once; keys appear in first-seen order.
    """
    # iter() makes Counter tally the iterated elements one by one, even when
    # ``tokens`` is a mapping (which Counter would otherwise read as counts).
    return Counter(iter(tokens))

In [11]:
def _log_odds_scores(one_counts, two_counts, alpha_w):
    """Score each word by its smoothed log-odds difference between two corpora.

    For a word with counts y1 / y2 in corpora of n1 / n2 tokens, and a
    vocabulary of |V| distinct words (alpha_0 = alpha_w * |V|), the score is

        log((y1 + alpha_w) / (n1 + alpha_0 - alpha_w - y1))
      - log((y2 + alpha_w) / (n2 + alpha_0 - alpha_w - y2))

    divided by sqrt(1 / (y1 + alpha_w) + 1 / (y2 + alpha_w)).

    Parameters
    ----------
    one_counts, two_counts : collections.Counter
        Word counts of the first and second corpus.
    alpha_w : float
        Pseudo-count added to every word.

    Returns
    -------
    dict
        ``{word: score}``; positive scores lean towards the first corpus.
        Keys are in first-seen order (first corpus, then words only in the
        second), so the ordering is the same on every run.
    """
    n1 = sum(one_counts.values())
    n2 = sum(two_counts.values())
    # dict.fromkeys de-duplicates while keeping insertion order, unlike a set,
    # whose iteration order for strings changes between interpreter sessions.
    vocab = list(dict.fromkeys([*one_counts, *two_counts]))
    alpha_0 = alpha_w * len(vocab)

    scores = {}
    for word in vocab:
        y1 = one_counts[word]  # Counter returns 0 for unseen words
        y2 = two_counts[word]
        numerator = (math.log((y1 + alpha_w) / (n1 + alpha_0 - alpha_w - y1))
                     - math.log((y2 + alpha_w) / (n2 + alpha_0 - alpha_w - y2)))
        denominator = math.sqrt((1 / (y1 + alpha_w)) + (1 / (y2 + alpha_w)))
        scores[word] = numerator / denominator
    return scores


def logodds_with_uninformative_prior(one_tokens, two_tokens, display=25,
                                     one_label="Rallies", two_label="Not Rallies",
                                     alpha_w=.01):
    """Print the words most characteristic of each of two token collections.

    Words are scored with a smoothed log-odds difference (every word gets the
    same pseudo-count ``alpha_w``), scaled by an estimate of its spread. The
    ``display`` highest-scoring words are printed under ``one_label`` and the
    ``display`` lowest-scoring words under ``two_label``, one ``word<TAB>score``
    pair per line.

    Parameters
    ----------
    one_tokens, two_tokens : iterable of hashable
        Tokens of the first and second corpus (each is consumed once).
    display : int, optional
        How many words to print per side (used as a slice bound). Default 25.
    one_label, two_label : str, optional
        Section headings for the two sides; defaults match the original
        hard-coded "Rallies" / "Not Rallies".
    alpha_w : float, optional
        Per-word pseudo-count of the prior. Default 0.01.

    Returns
    -------
    None
        Results are printed only, so calling this as the last expression of a
        notebook cell produces no extra ``Out[]`` value.

    Notes
    -----
    Words with exactly equal scores are listed in first-seen order, which makes
    the printed ranking reproducible across kernel restarts.
    """
    scores = _log_odds_scores(Counter(one_tokens), Counter(two_tokens), alpha_w)

    by_score = operator.itemgetter(1)
    positive_words = sorted(scores.items(), key=by_score, reverse=True)
    negative_words = sorted(scores.items(), key=by_score)

    print("%s:\n" % one_label)
    for word, score in positive_words[:display]:
        print("%s\t%s" % (word, score))
    print("\n\n%s:\n" % two_label)
    for word, score in negative_words[:display]:
        print("%s\t%s" % (word, score))

In [12]:
# Compare the rally token list against the non-rally one and print the top 25
# words on each side.
logodds_with_uninformative_prior(trump_rallies, trump_not_rallies, display=25)

Rallies:

said	24.86122971196984
vote	21.151084638358938
know	20.64923610486125
got	19.02721952553394
right	18.370402266291777
remember	17.562982924057472
guy	16.142592222833528
great	15.807316944133166
trump	15.484817605006095
ok	14.141159904606258
democrats	13.545129452395676
hell	12.841648480195815
sir	12.597041074311733
oh	12.268306120039293
winning	12.126280214184954
party	12.096299949334531
love	12.07496284150118
never	11.923178783453086
win	11.680564024052126
democrat	11.53291018932397
crazy	11.477835468645246
america	11.11521577286875
republican	10.985774628806718
like	10.633109437878183
guys	10.619671672800308


Not Rallies:

ahead	-19.59131380181445
think	-18.32105978977841
tremendous	-17.47662545172401
virus	-16.795085436827833
please	-16.785496924633954
yeah	-16.724168081626907
question	-14.306378152991341
governors	-13.937198067390186
ventilators	-13.855258719311752
okay	-13.49973932658898
things	-13.151307388999177
different	-12.989465096177817
certain	-12.804150345262164