In [27]:
import spacy
from collections import defaultdict
import pandas as pd
import numpy as np
import re
import os

from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath

import nltk
from nltk.tokenize import RegexpTokenizer
from collections import Counter
from scipy.stats import wilcoxon

In [2]:
inpath2= os.path.dirname(os.path.dirname(os.getcwd()))+"\\Data\\Intermediate Data\\"
df=pd.read_csv(inpath2+'final_all_files.csv')

In [3]:
inpath=os.path.dirname(os.path.dirname(os.getcwd()))+"\\Data\\Source Data\\"
vad=pd.read_csv(inpath+'NRC-VAD-Lexicon.txt', sep='\t', header=None)
vad.set_index(0, inplace=True)

In [4]:
vad=vad[[1]]
vad_dict=vad.to_dict('index')
vad_dict={v:j for v,k in vad_dict.items() for i,j in k.items()}
len(vad_dict)

19971

In [5]:
df=df[df['candidate']=='trump']
df['rally']=np.where(df['event_type']=='rally', 1, 0)

In [6]:
rally_list=df[df['rally']==1]['text'].to_list()
rally_titles=df[df['rally']==1]['title'].to_list()
speech_list=df[df['rally']==0]['text'].to_list()
speech_titles=df[df['rally']==0]['title'].to_list()

#### Word Embedding Neighboor Valence Functions

In [7]:
look_for = re.compile("\w+(?:[-']\w+)*")
tokenizer=RegexpTokenizer(look_for)

In [8]:
#get the tokens for the rallies
def tokenize_for_w2v(text_list):
    num_tokens=0
    text_list_tokenized=[]
    for text in text_list:
        new_tokens=[i.lower() for i in tokenizer.tokenize(text)]
        num_tokens+=len(new_tokens)
        #print(num_tokens)
        text_list_tokenized.append(new_tokens)
    print(num_tokens)
    return text_list_tokenized

In [9]:
#model based on the tokens
def model_w2v(text_list_tokenized):
    model = Word2Vec(
        text_list_tokenized,
        vector_size=100,
        window=10,
        min_count=5,
        workers=10)
    #save the vectors
    trained_vectors = model.wv
    trained_vectors.save_word2vec_format('rally_embeddings.txt', binary=False)
    #normalizing the vectors
    model.init_sims(replace=True)
    return model, trained_vectors

In [10]:
def get_counts(tokens):
    counts=Counter()
    for text in tokens:
        for t in text:
            counts[t]+=1
    return counts

In [11]:
def centroid(words, text_list_tokenized, trained_vectors):
    word_counts=get_counts(text_list_tokenized)
    weight_list=[]
    arrays=[]
    for i in words:
        if i in trained_vectors.key_to_index.keys():
            weight_list.extend([word_counts[i]])
            array_i=trained_vectors.get_vector(i)
            arrays.append(array_i)
    weights=[i/np.sum(weight_list) for i in weight_list]
    weight_dict={}
    for w,j in zip(weights, words):
        weight_dict[j]=w
    centroid=np.average(arrays, axis=0, weights=weights)
    #print(weights)
    #print(arrays)
    return weight_dict, centroid

In [12]:
def valence_score(trained_vectors, centroid, words, topn=50):
    top_word_scores=[i for i in trained_vectors.similar_by_vector(centroid, topn=topn, restrict_vocab=None) if i[0] not in words] 
    top_words=[i[0] for i in trained_vectors.similar_by_vector(centroid, topn=topn, restrict_vocab=None) if i[0] not in words]   
    mean_valence=np.array([vad_dict[t] for t in top_words if t in vad_dict.keys()]).mean()
    return top_word_scores, mean_valence

In [13]:
def embedding_valence(words, text_list):
    #tokenize the speeches
    text_list_tokenized=tokenize_for_w2v(text_list)
    #run word2vec and create trained vectors
    model, trained_vectors=model_w2v(text_list_tokenized)
    #calculate the centroid of the words
    weights, centoidvector=centroid(words, text_list_tokenized, trained_vectors)
    #calculate the average valence of the 50 nearest neighbors
    top_words, mean_valence=valence_score(trained_vectors, centoidvector, words)
    return mean_valence, top_words, weights

#### Word Embedding Neighbor Valence

In [14]:
democrat_words=["democrat", "democrats", "liberals", "socialists", "socialist", "liberal", "democrat-run"]

In [15]:
rally_valence_score, rally_top_words, rally_weights=embedding_valence(democrat_words, rally_list)

1040144


  del sys.path[0]


In [16]:
rally_valence_score

0.4641304347826086

In [17]:
 rally_weights

{'democrat': 0.29707998307236566,
 'democrats': 0.5666525603046975,
 'liberals': 0.012272534913245875,
 'socialists': 0.014388489208633094,
 'socialist': 0.05459162082099027,
 'liberal': 0.0452814219212865,
 'democrat-run': 0.00973338975878121}

In [18]:
rally_top_words

[('radical', 0.8455104827880859),
 ('extreme', 0.7867962121963501),
 ('democratic', 0.7788916230201721),
 ('socialism', 0.7587141990661621),
 ('party', 0.7435035705566406),
 ('platform', 0.7169208526611328),
 ('resistance', 0.6994298696517944),
 ('mob', 0.6977725028991699),
 ('left-wing', 0.6773056983947754),
 ('do-nothing', 0.6740021109580994),
 ('crime', 0.6644102334976196),
 ('late-term', 0.6597546339035034),
 ('abortion', 0.6507912874221802),
 ('control', 0.6506705284118652),
 ('unhinged', 0.6469799280166626),
 ('corrupt', 0.6454719305038452),
 ('message', 0.6424803733825684),
 ('lawmakers', 0.6396994590759277),
 ('obstructionist', 0.6389696598052979),
 ('gain', 0.6273723840713501),
 ('republicans', 0.6202999949455261),
 ('agenda', 0.6181731820106506),
 ('washington', 0.6155006885528564),
 ('desperate', 0.6153740882873535),
 ('cues', 0.6115223169326782),
 ('far-left', 0.6112707853317261),
 ('sanders', 0.6100523471832275),
 ('voters', 0.6073018312454224),
 ('obstructionists', 0.6053

In [19]:
speech_valence_score, speech_top_words, speech_weights=embedding_valence(democrat_words, speech_list)

562856


  del sys.path[0]


In [20]:
speech_valence_score

0.5599200000000001

In [21]:
speech_top_words

[('republicans', 0.8390195965766907),
 ('word', 0.7972410917282104),
 ('voter', 0.7828963994979858),
 ('id', 0.775454044342041),
 ('blame', 0.7722207903862),
 ('kurds', 0.7642298936843872),
 ('press', 0.7584674954414368),
 ('negotiate', 0.7430112361907959),
 ('vote', 0.7398492693901062),
 ('russians', 0.7356542348861694),
 ('catch-and-release', 0.7354024648666382),
 ('existed', 0.7327938675880432),
 ('whatever', 0.7231122851371765),
 ('opposite', 0.717688262462616),
 ('media', 0.7120858430862427),
 ('whoever', 0.7116665840148926),
 ('observers', 0.710892915725708),
 ('crazy', 0.7056524157524109),
 ('anymore', 0.7044343948364258),
 ('lousy', 0.7039798498153687),
 ('necessarily', 0.7001340389251709),
 ('solve', 0.6955881118774414),
 ('votes', 0.6943065524101257),
 ('buybacks', 0.6931703090667725),
 ('independents', 0.693042516708374),
 ('handle', 0.6929306983947754),
 ('anyway', 0.6925455927848816),
 ('reason', 0.691535234451294),
 ('otherwise', 0.6863505840301514),
 ('advice', 0.6836447

In [22]:
speech_weights

{'democrat': 0.2552204176334107,
 'democrats': 0.6496519721577726,
 'liberals': 0.030162412993039442,
 'socialists': 0.04408352668213457,
 'socialist': 0.02088167053364269}

In [23]:
text_list_tokenized=tokenize_for_w2v(speech_list)
model, trained_vectors=model_w2v(text_list_tokenized)
#weights, centoidvector=centroid(democrat_words, text_list_tokenized, trained_vectors)
#top_words, mean_valence=valence_score(trained_vectors, centoidvector, democrat_words)

562856


  del sys.path[0]


In [None]:
#Wilcoxon’s signed-rank test
wilcoxon(0.4641304347826086)

#### Bootstrap

In [24]:
def bootstrap(rallies, speeches, words, B=10000, confidence_level=0.95):
    metric_bootstrap=[]
    upper_c=confidence_level*100
    lower_c=100-(confidence_level*100)
    for i in range(B):
        rally_valence_score, rally_top_words, rally_weights=embedding_valence(words, rallies)
        speech_valence_score, speech_top_words, speech_weights=embedding_valence(words, speeches)
        diff=speech_valence_score-rally_valence_score
        metric_bootstrap.append(diff)
    lower=np.percentile(metric_bootstrap, lower_c)
    upper=np.percentile(metric_bootstrap, upper_c)
    median=np.percentile(metric_bootstrap, 50)
    return lower, median, upper

In [25]:
lower, median, upper=bootstrap(rally_list, speech_list, democrat_words, B=100, confidence_level=0.95)
print("{}, {}, {}".format(lower, median, upper))

1040144


  del sys.path[0]


562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
1040144
562856
104

In [26]:
print("{}, {}, {}".format(lower, median, upper))

-0.022086310892172882, 0.04667876623376627, 0.11948689408867001
