In [1]:
import sys, operator, math, nltk
from collections import Counter
import pandas as pd
import os
import numpy as np
import re
from scipy.stats import mannwhitneyu

In [2]:
# Directory holding the intermediate data files. Later cells build file names by
# plain string concatenation (inpath2+'name'), so the value must end with a path
# separator; os.path.join(..., "") appends one only when it is missing.
# Set the APPLIED_NLP_DATA_DIR environment variable to run the notebook against a
# different location; when it is unset (or empty) the original path is used as-is.
DEFAULT_DATA_DIR="C:/Users/pcrrt/Haas/Applied_NLP/Data/Intermediate Data/"
inpath2=os.path.join(os.environ.get("APPLIED_NLP_DATA_DIR") or DEFAULT_DATA_DIR, "")

In [4]:
# Load the combined file; later cells read its candidate, event_type, title and
# text columns.
all_files_csv=inpath2+'final_all_files.csv'
df=pd.read_csv(all_files_csv)

In [5]:
# Keep only the rows whose candidate is 'trump'. .copy() makes the result an
# independent frame, so adding a column below does not write into a slice of the
# frame loaded from CSV (the pattern pandas reports as SettingWithCopyWarning).
df=df[df['candidate']=='trump'].copy()
# rally is 1 where event_type is exactly 'rally' and 0 for every other row.
df['rally']=np.where(df['event_type']=='rally', 1, 0)

In [6]:
# Build the row masks once, then take the column subsets for each group.
# The *2 variants additionally carry the title column.
is_rally=df['rally']==1
is_not_rally=df['rally']==0
trump_rallies=df.loc[is_rally, ['text']]
trump_rallies2=df.loc[is_rally, ['title', 'text']]
trump_not_rallies=df.loc[is_not_rally, ['text']]
trump_not_rallies2=df.loc[is_not_rally, ['title', 'text']]

In [7]:
# Dump the text column of each group to its own file, one record per row, with no
# index and no header line.
# NOTE(review): quoting=None does not switch quoting off; it presumably leaves
# pandas' default quoting in effect, so texts containing commas, quotes or line
# breaks are still wrapped in double quotes. Confirm against the to_csv docs.
for frame, out_name in (
    (trump_rallies, 'trump_rallies.txt'),
    (trump_not_rallies, 'trump_not_rallies.txt'),
):
    frame.to_csv(inpath2+out_name, index=False, header=False, quoting=None)

In [8]:
# Token pattern: a run of word characters optionally continued by hyphen-joined
# runs (group 1), or by apostrophe-joined runs (group 2).
# Written as a raw string so that \w reaches the regex engine directly instead of
# relying on Python passing an unrecognised string escape through (which emits an
# invalid-escape-sequence warning). The compiled pattern text is unchanged.
# NOTE(review): both alternatives start with \w+, so the first one succeeds
# wherever the second could and the apostrophe branch never wins: "don't" is
# matched as "don" followed by "t". The two capture groups are kept because
# read_and_tokenize indexes every match with [0].
look_for = re.compile(r"(\w+(?:-\w+)*)|(\w+(?:'\w+)*)")
# English stop words from NLTK's stopwords corpus, stored as a set so the
# per-token membership test in read_and_tokenize is a hash lookup.
# NOTE(review): presumably requires the 'stopwords' corpus to be installed
# locally for NLTK — confirm on a fresh environment.
english_stop_word_list = nltk.corpus.stopwords.words('english')
stop_words = set(english_stop_word_list)

In [9]:
def read_and_tokenize(filename):
    """Read a UTF-8 text file and return its lower-cased, stop-word-filtered tokens.

    Every line is tokenized with the module-level ``look_for`` pattern via
    ``nltk.tokenize.regexp_tokenize``. Element ``[0]`` of each match is
    lower-cased (presumably the first capture group, assuming the tokenizer
    returns findall-style tuples) and kept unless it is in the module-level
    ``stop_words`` set.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        The retained tokens in file order, across all lines.
    """
    kept = []
    with open(filename, encoding="utf-8") as handle:
        for raw_line in handle:
            for match in nltk.tokenize.regexp_tokenize(raw_line, look_for):
                # Lower-case before the stop-word test so the comparison is
                # case-insensitive with respect to the text.
                word = match[0].lower()
                if word not in stop_words:
                    kept.append(word)
    return kept

In [10]:
# Tokenize the two text dumps (rallies first, then non-rallies).
# NOTE: this rebinds trump_rallies / trump_not_rallies from the DataFrames built
# earlier to plain token lists, so the to_csv cell cannot be re-run after this one
# without first re-running the cell that builds the DataFrames.
trump_rallies, trump_not_rallies = (
    read_and_tokenize(inpath2+txt_name)
    for txt_name in ('trump_rallies.txt', 'trump_not_rallies.txt')
)

In [11]:
def get_counts(tokens):
    """Tally how often each token occurs.

    Parameters
    ----------
    tokens : iterable
        Hashable items to count; each iterated item contributes one occurrence.

    Returns
    -------
    collections.Counter
        Mapping from token to its number of occurrences (absent tokens read as 0).
    """
    # Feeding a generator means Counter always counts the iterated items one by
    # one, exactly as an explicit loop over `tokens` would.
    return Counter(item for item in tokens)

In [12]:
def logodds_with_uninformative_prior(one_tokens, two_tokens, display=25,
                                     one_label="Rallies", two_label="Not Rallies",
                                     alpha_w=.01):
    """Print the words most characteristic of each of two token lists.

    For every word w in the combined vocabulary, with y1/y2 its counts in the two
    lists, n1/n2 the list lengths and alpha_0 = alpha_w * |vocabulary|, the score is

        delta = log((y1+alpha_w)/(n1+alpha_0-alpha_w-y1))
              - log((y2+alpha_w)/(n2+alpha_0-alpha_w-y2))
        score = delta / sqrt(1/(y1+alpha_w) + 1/(y2+alpha_w))

    Positive scores lean towards ``one_tokens``, negative towards ``two_tokens``.

    Parameters
    ----------
    one_tokens, two_tokens : list
        Token sequences to compare (``len()`` is taken of each).
    display : int, optional
        How many words to print per side.
    one_label, two_label : str, optional
        Headings printed above the highest- and lowest-scoring words. The
        defaults reproduce the original hard-coded headings.
    alpha_w : float, optional
        Pseudo-count added to every word (the same value for all words).

    Returns
    -------
    None
        The ranked words and their scores are printed, tab-separated.
    """
    one_counts=Counter(one_tokens)
    two_counts=Counter(two_tokens)
    n1=len(one_tokens)
    n2=len(two_tokens)

    # Vocabulary in first-appearance order (list one, then words new in list two).
    # Unlike iterating a set, this order is the same on every run, so words with
    # equal scores are always printed in the same order (the sorts are stable).
    # It also avoids concatenating the two token lists just to collect the words.
    V=list(one_counts)+[word for word in two_counts if word not in one_counts]
    alpha_0=alpha_w*len(V)

    log_odds_dict={}
    for word in V:
        y1=one_counts[word]
        y2=two_counts[word]
        # Difference of the smoothed log-odds of the word in each list ...
        numerator=math.log((y1+alpha_w)/(n1+alpha_0-alpha_w-y1))-math.log((y2+alpha_w)/(n2+alpha_0-alpha_w-y2))
        # ... scaled by the square root of the summed reciprocal smoothed counts.
        denominator=math.sqrt((1/(y1+alpha_w))+(1/(y2+alpha_w)))
        log_odds_dict[word]=numerator/denominator
    positive_words=sorted(log_odds_dict.items(), key=operator.itemgetter(1), reverse=True)
    negative_words=sorted(log_odds_dict.items(), key=operator.itemgetter(1))

    print("%s:\n" % one_label)
    for k in positive_words[:display]:
        print("%s\t%s" % (k[0],k[1]))
    print("\n\n%s:\n" % two_label)
    for k in negative_words[:display]:
        print("%s\t%s" % (k[0],k[1]))

In [13]:
# Show the 25 highest-scoring words for each side (rallies vs. all other events).
logodds_with_uninformative_prior(trump_rallies, trump_not_rallies, display=25)

Rallies:

vote	21.920255735254027
said	21.523702344944414
got	17.255809737225867
remember	16.954070705112056
right	16.481435912585162
ok	16.154356712058643
great	15.950374422632743
know	15.593345949438671
democrats	15.27150812887106
guy	13.027457685053607
winning	12.17656092834039
sir	11.926603945959842
party	11.681032104954179
love	11.642471439273994
hell	11.61997158802787
trump	11.570842234247209
democrat	11.125252319887663
republican	10.984686639810153
beautiful	10.932202680602613
voted	10.748923749024192
republicans	10.626657363689626
oh	10.234791955950962
never	10.189919252070618
veterans	10.146655918066113
tax	10.122531850146673


Not Rallies:

okay	-19.23418302749438
ahead	-16.703893343935412
uh	-16.056181306989195
yeah	-15.725792250763408
virus	-15.51517204299181
think	-15.447070796956634
please	-14.657873302959485
tremendous	-14.338397616612825
york	-13.063533848199857
question	-12.669158991774573
governors	-12.209191575389474
quickly	-11.464710528520518
ventilators	-11.429278