In [1]:
import sys, operator, math, nltk
from collections import Counter
import pandas as pd
import os
import numpy as np
import re
from scipy.stats import mannwhitneyu

In [2]:
# Root folder of the project's intermediate data files. The original hard-coded
# location stays the default; set the APPLIED_NLP_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
inpath2 = os.environ.get(
    "APPLIED_NLP_DATA_DIR",
    "C:/Users/pcrrt/Applied NLP Final Project/Data/Intermediate Data/",
)
# Later cells build file paths by plain string concatenation (inpath2 + name),
# so the value has to end with a path separator.
if not inpath2.endswith(("/", "\\")):
    inpath2 += "/"

In [3]:
# Load the combined speech table from the intermediate-data folder. Later cells read
# the 'candidate', 'event_type', 'title' and 'text' columns from this frame.
df=pd.read_csv(inpath2+'final v4.csv')

In [4]:
# Keep only Trump's rows and add a binary 'rally' indicator (1 = rally, 0 = any
# other event type). Building the column with .assign returns a new frame, so the
# flag is never written into a boolean-filtered slice of the loaded table -- the
# pattern that can emit pandas' SettingWithCopyWarning.
df = df[df['candidate'] == 'trump'].assign(
    rally=lambda trump_rows: np.where(trump_rows['event_type'] == 'rally', 1, 0)
)

In [5]:
# Split Trump's rows by the 'rally' flag. The single-column frames hold only the
# speech text; the *2 variants keep the title alongside the text.
trump_rallies = df.loc[df['rally'] == 1, ['text']]
trump_rallies2 = df.loc[df['rally'] == 1, ['title', 'text']]
trump_not_rallies = df.loc[df['rally'] == 0, ['text']]
trump_not_rallies2 = df.loc[df['rally'] == 0, ['title', 'text']]

In [6]:
# Dump each subset's text column to its own file: no index column, no header row.
# quoting=None leaves pandas on its default quoting behaviour (documented as
# csv.QUOTE_MINIMAL), so fields containing commas, quotes or newlines are still
# wrapped in double quotes.
for out_name, frame in (
    ('trump_rallies.txt', trump_rallies),
    ('trump_not_rallies.txt', trump_not_rallies),
):
    frame.to_csv(inpath2 + out_name, index=False, header=False, quoting=None)

In [7]:
# Token pattern with two capture groups, so findall-style matching yields
# (group1, group2) tuples. Both alternatives start with \w+, so the first
# (hyphen-aware) alternative always wins and the apostrophe-aware second group is
# never filled: "don't" comes out as "don" and "t".
# Written as a raw string so that \w is not an invalid string escape (which warns
# on current Python versions); the compiled pattern text is unchanged.
look_for = re.compile(r"(\w+(?:-\w+)*)|(\w+(?:'\w+)*)")
# English stop-word list from NLTK, stored as a set for fast membership tests while
# filtering tokens. Needs the NLTK 'stopwords' corpus to be available locally
# (e.g. installed beforehand with nltk.download('stopwords')).
stop_words = set(nltk.corpus.stopwords.words('english'))

In [8]:
def read_and_tokenize(filename):
    """Read a UTF-8 text file and return its tokens as one flat list.

    Every line is tokenized with the module-level ``look_for`` regex via
    ``nltk.tokenize.regexp_tokenize``. Each match is indexed with ``[0]`` (the
    first capture group), lowercased, and kept only when it is not in the
    module-level ``stop_words`` set.

    Args:
        filename: Path of the text file to read.

    Returns:
        list: Lowercased tokens, in file order, with stop words removed.
    """
    kept = []
    with open(filename, encoding="utf-8") as handle:
        for row in handle:
            for match in nltk.tokenize.regexp_tokenize(row, look_for):
                # match[0] is the text captured by the pattern's first group.
                word = match[0].lower()
                if word not in stop_words:
                    kept.append(word)
    return kept

In [9]:
# Re-read the two text dumps and tokenize them.
# NOTE: this rebinds trump_rallies / trump_not_rallies from DataFrames to flat
# token lists, so the cell that writes the .txt files cannot be re-run after this
# one unless the cell that builds the DataFrames is re-run first.
trump_rallies=read_and_tokenize(inpath2+'trump_rallies.txt')
trump_not_rallies=read_and_tokenize(inpath2+'trump_not_rallies.txt')

In [10]:
def get_counts(tokens):
    """Count how many times each token occurs.

    Args:
        tokens: Iterable of hashable tokens (e.g. a list of strings).

    Returns:
        collections.Counter: Maps each distinct token to its number of
        occurrences; looking up a token that never occurred yields 0.
    """
    # Counter does the tallying itself. iter() makes it count the elements one by
    # one even when a mapping is passed, exactly as a plain loop over it would.
    return Counter(iter(tokens))

In [11]:
def logodds_with_uninformative_prior(one_tokens, two_tokens, display=25):
    n1=len(one_tokens)
    n2=len(two_tokens)
    alpha_w=.01
    V=list(set(one_tokens+two_tokens))
    alpha_0=alpha_w*len(V)

    one_counts=get_counts(one_tokens)
    two_counts=get_counts(two_tokens)

    log_odds_dict={}
    for word in V:
        y1=one_counts[word]
        y2=two_counts[word]
        numerator=math.log((y1+alpha_w)/(n1+alpha_0-alpha_w-y1))-math.log((y2+alpha_w)/(n2+alpha_0-alpha_w-y2))
        denominator=math.sqrt((1/(y1+alpha_w))+(1/(y2+alpha_w)))
        log_odds_dict[word]=numerator/denominator
    positive_words=sorted(log_odds_dict.items(), key=operator.itemgetter(1), reverse=True)
    negative_words=sorted(log_odds_dict.items(), key=operator.itemgetter(1))

    print ("Rallies:\n")
    for k in positive_words[:display]:
        print("%s\t%s" % (k[0],k[1]))
    print("\n\nNot Rallies:\n")
    for k in negative_words[:display]:
        print("%s\t%s" % (k[0],k[1]))

In [12]:
# Compare rally tokens (first argument) with non-rally tokens (second argument) and
# print the 25 highest- and 25 lowest-scoring words. Positive scores mark words that
# are relatively more frequent in the rally speeches, negative scores the opposite.
logodds_with_uninformative_prior(trump_rallies, trump_not_rallies, 25)

Rallies:

know	22.71252282585128
said	22.368467838786906
vote	20.530851571993278
got	17.617085583069976
remember	17.28142090295741
right	17.26852722657596
trump	17.06138489692587
guy	16.606722524464644
democrats	14.157898938510009
great	13.882045999828724
love	13.014326703117034
hell	12.393117484967371
state	12.109363687485748
like	12.065048255955693
crazy	11.934262127858007
taxes	11.834297260327594
oh	11.793473892421124
sir	11.642481029597452
fake	11.557558187227903
democrat	11.376815737556944
election	10.91357639640798
never	10.702906190601668
joe	10.452208649193292
borders	10.44459178499608
republican	10.43893806958612


Not Rallies:

ahead	-16.235705909807887
tremendous	-16.0200439714907
please	-15.804571236595976
think	-14.832383916408794
virus	-14.330902571452988
yeah	-14.07627822928747
agree	-12.71057507118347
things	-12.157286096490163
much	-12.119647250953998
many	-11.849802523876011
cases	-11.847270469324823
different	-11.639210837344036
ventilators	-11.623626824136078
govern