In [6]:
import spacy
from collections import defaultdict
import networkx as nx
import pandas as pd
import numpy as np
import re
import os
import neuralcoref

In [7]:
nlp = spacy.load('en_core_web_sm')
coref = neuralcoref.NeuralCoref(nlp.vocab)
nlp.add_pipe(coref, name='neuralcoref')

In [61]:
inpath2= os.path.dirname(os.path.dirname(os.getcwd()))+"\\Data\\Intermediate Data\\"
df=pd.read_csv(inpath2+'final_all_files.csv')

In [103]:
inpath=os.path.dirname(os.path.dirname(os.getcwd()))+"\\Data\\Source Data\\"
vad=pd.read_csv(inpath+'NRC-VAD-Lexicon.txt', sep='\t', header=None)
vad.set_index(0, inplace=True)

In [121]:
vad=vad[[1]]
vad_dict=vad.to_dict('index')
vad_dict={v:j for v,k in vad_dict.items() for i,j in k.items()}
len(vad_dict)

19971

In [62]:
df=df[df['candidate']=='trump']
df['rally']=np.where(df['event_type']=='rally', 1, 0)

In [63]:
rally_list=df[df['rally']==1]['text'].to_list()
rally_titles=df[df['rally']==1]['title'].to_list()
speech_list=df[df['rally']==0]['text'].to_list()
speech_titles=df[df['rally']==0]['title'].to_list()

#### Create lists, both rallies and speeches, of all sentences that mention democrats

In [14]:
democrat_list=['Democrats', 'Democrat', 'democrats', 'democrat',
       'DEMOCRATS', 'DEMOCRAT', 'Liberals', 'Socialist', 
       'Socialists', 'Left Democrats',
       'Socialist Democrats']

In [51]:
def get_democrat_mentions(data):
    doc=nlp(data)
    mention_list=[]
    sentence_list=[]
    #for all entities in the document
    for entity in doc.ents:
        #if the entity is democrat
        if entity.text.lstrip().rstrip() in democrat_list:
            #add to the list of mentions
            mention_list.append(entity)
    #if there are some mentions of dems in the document
    if mention_list!=[]:
        #for each mention of democrat in the list of mentions
        for dem in mention_list:
            #if there is a coref cluster for the mention of democrat
            if dem._.coref_cluster is not None:
                #for each mention in the coref cluster
                for mention in dem._.coref_cluster:
                    #if the mention has not yet been included in mention list
                    if mention not in mention_list:
                        #include the mention
                        mention_list.append(mention)
        for dem2 in mention_list:
            if dem2.root.sent not in sentence_list:
                sentence_list.append(dem2.sent)
    
    return sentence_list

In [52]:
rally_democrat_sentences=[]
for ral, titl in zip(rally_list, rally_titles):
    print(titl)
    demo_mentions=get_democrat_mentions(ral)
    rally_democrat_sentences+=demo_mentions

Speech: Donald Trump Holds a Campaign Rally in Sanford, Florida
Speech: Donald Trump Holds a Campaign Rally in Duluth, Minnesota
Speech: Donald Trump Holds a Campaign Rally in Middletown, Pennsylvania
Speech: Donald Trump Holds a Campaign Rally in Newport News, Virginia
Speech: Donald Trump Holds a Campaign Rally in Jacksonville, Florida
Speech: Donald Trump Holds a Campaign Rally in Moon Township, Pennsylvania
Speech: Donald Trump Holds a Campaign Rally in Swanton, Ohio
Speech: Donald Trump Holds a Campaign Rally in Fayetteville, North Carolina
Speech: Donald Trump Holds a Campaign Rally in Bedmidji, Minnesota
Speech: Donald Trump Holds a Campaign Rally in Mosinee, Wisconsin
Speech: Donald Trump Holds a Campaign Rally in Minden, Nevada
Speech: Donald Trump Holds a Campaign Rally in Freeland, Michigan
Speech: Donald Trump Holds a Campaign Rally in Winston-Salem, North Carolina
Speech: Donald Trump Holds a Political Rally in Tulsa, Oklahoma
Speech: Donald Trump Holds a Political Rally i

In [56]:
len(rally_democrat_sentences)

1976

In [57]:
speech_democrat_sentences=[]
for spe, titl in zip(speech_list, speech_titles):
    print(titl)
    demo_mentions=get_democrat_mentions(spe)
    speech_democrat_sentences+=demo_mentions

Debate: 2016-09-26 Hempstead New York
Debate: 2016-10-09 St. Louis Missouri
Debate: 2016-10-19 Paradise Nevada
Donald Trump 4th of July Event Speech Transcript
Donald Trump Speech Transcript Turning Point USA Teen Student Summit Speech
Donald Trump Speech Transcript in Jamestown Virginia
Donald Trump Statement on Mass Shootings
Donald Trump, Emmanuel Macron G7 Press Conference & Speeches
E-Cigarette Regulation and John Bolton Discussion
Press Conference Response to Impechment Inquiry
Trump Transcript Trump Says China Should Investigate Biden
Trump Claims No Quid Pro Quo
Press Conference with Italian President
Donald Trump Cabinet Meeting Transcript Trump Calls Emoluments Clause of the Constitution Phony
Donald Trump Syria Press Conference Transcript Trump Orders All Turkey Sanctions Lifted
ISIS Leader Abu Bakr al-Baghdadi Killed
Macron Trump NATO Summit Meeting Transcript
Trump Makes Statement After Iran Attacks US Bases in Iraq
Davos World Economic Forum Trump Holds News Conference
Tr

In [59]:
len(speech_democrat_sentences)

354

#### Calculate mean valence score for lemmas of words in same sentence as the mention of democrats.

In [242]:
all_stopwords = nlp.Defaults.stop_words
democrat_list=['democrats', 'democrat', 'liberal', 'liberals', 'socialist', 'socialists']
def valence_scores(sentence_list):
    valence_list=[]
    for sent in sentence_list:
        x=[t.lemma_.lower() for t in sent if t.lemma_.lower() not in democrat_list and not t.is_punct and not t in all_stopwords] 
        mean_valence=np.array([vad_dict[t] for t in x if t in vad_dict.keys()]).mean()
        if not pd.isna(mean_valence):
            valence_list.append(mean_valence)
        if pd.isna(mean_valence):
            print('excluded for lack of words: {}'.format(sent))
    return valence_list

In [243]:
rally_valence_list=valence_scores(rally_democrat_sentences)
len(rally_valence_list)

excluded for lack of words: Democrat, all Democrat.
excluded for lack of words: How about the Democrats?  
excluded for lack of words: And how about these Democrats?
excluded for lack of words: All Democrats.
excluded for lack of words: All by the Democrats.
excluded for lack of words: The Democrats.
excluded for lack of words: How about the Democrats?
excluded for lack of words: Only a Democrat --
excluded for lack of words: Because the Democrats –
excluded for lack of words: Over a Democrat.
excluded for lack of words: Democrats


  import sys


1965

In [244]:
speech_valence_list=valence_scores(speech_democrat_sentences)
len(speech_valence_list)

excluded for lack of words: That’s the Democrats.
excluded for lack of words: He’s a Democrat.
excluded for lack of words: Democrats.
excluded for lack of words: It’s because of the Democrats.


  import sys


350

#### Run bootstrap to see whether there is a difference between the two texts

In [247]:
def bootstrap(rallies, speeches, B=10000, confidence_level=0.95):
    # your code here
    rally_len=len(rallies)
    speech_len=len(speeches)
    indices_rally=[i for i in range(rally_len)]  
    indices_speech=[i for i in range(speech_len)]
    rally_array=np.array(rallies)
    speeches_array=np.array(speeches)
    metric_bootstrap=[]
    upper_c=confidence_level*100
    lower_c=100-(confidence_level*100)
    for i in range(B):
        #print(rally_array[list(np.random.choice(indices_rally, rally_len, replace = True))])
        rally_sample=rally_array[list(np.random.choice(indices_rally, rally_len, replace = True))].mean()
        speech_sample=speeches_array[list(np.random.choice(indices_speech, speech_len, replace = True))].mean()
        diff=speech_sample-rally_sample
        metric_bootstrap.append(diff)
    lower=np.percentile(metric_bootstrap, lower_c)
    upper=np.percentile(metric_bootstrap, upper_c)
    median=np.percentile(metric_bootstrap, 50)
    return lower, median, upper

In [246]:
bootstrap(rally_valence_list, speech_valence_list)

(0.03157960563337052, 0.04009405777796965, 0.04843618384538449)