In [1]:
import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
import spacy
import pandas as pd
import numpy as np
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import spacy

nltk.download('punkt')
nltk.download('movie_reviews')

SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]
OBJECTS = ["dobj", "dative", "attr", "oprd"]
ADJECTIVES = ["acomp", "advcl", "advmod", "amod", "appos", "nn", "nmod", "ccomp", "complm",
              "hmod", "infmod", "xcomp", "rcmod", "poss"," possessive"]
COMPOUNDS = ["compound"]
PREPOSITIONS = ["prep"]

def getSubsFromConjunctions(subs):
    moreSubs = []
    for sub in subs:
        # rights is a generator
        rights = list(sub.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreSubs.extend([tok for tok in rights if tok.dep_ in SUBJECTS or tok.pos_ == "NOUN"])
            if len(moreSubs) > 0:
                moreSubs.extend(getSubsFromConjunctions(moreSubs))
    return moreSubs

def getObjsFromConjunctions(objs):
    moreObjs = []
    for obj in objs:
        # rights is a generator
        rights = list(obj.rights)
        rightDeps = {tok.lower_ for tok in rights}
        if "and" in rightDeps:
            moreObjs.extend([tok for tok in rights if tok.dep_ in OBJECTS or tok.pos_ == "NOUN"])
            if len(moreObjs) > 0:
                moreObjs.extend(getObjsFromConjunctions(moreObjs))
    return moreObjs

def getVerbsFromConjunctions(verbs):
    moreVerbs = []
    for verb in verbs:
        rightDeps = {tok.lower_ for tok in verb.rights}
        if "and" in rightDeps:
            moreVerbs.extend([tok for tok in verb.rights if tok.pos_ == "VERB"])
            if len(moreVerbs) > 0:
                moreVerbs.extend(getVerbsFromConjunctions(moreVerbs))
    return moreVerbs

def findSubs(tok):
    head = tok.head
    while head.pos_ != "VERB" and head.pos_ != "NOUN" and head.head != head:
        head = head.head
    if head.pos_ == "VERB":
        subs = [tok for tok in head.lefts if tok.dep_ == "SUB"]
        if len(subs) > 0:
            verbNegated = isNegated(head)
            subs.extend(getSubsFromConjunctions(subs))
            return subs, verbNegated
        elif head.head != head:
            return findSubs(head)
    elif head.pos_ == "NOUN":
        return [head], isNegated(tok)
    return [], False

def isNegated(tok):
    negations = {"no", "not", "n't", "never", "none"}
    for dep in list(tok.lefts) + list(tok.rights):
        if dep.lower_ in negations:
            return True
    return False

def findSVs(tokens):
    svs = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        if len(subs) > 0:
            for sub in subs:
                svs.append((sub.orth_, "!" + v.orth_ if verbNegated else v.orth_))
    return svs

def getObjsFromPrepositions(deps):
    objs = []
    for dep in deps:
        if dep.pos_ == "ADP" and dep.dep_ == "prep":
            objs.extend([tok for tok in dep.rights if tok.dep_  in OBJECTS or (tok.pos_ == "PRON" and tok.lower_ == "me")])
    return objs

def getAdjectives(toks):
    toks_with_adjectives = []
    for tok in toks:
        adjs = [left for left in tok.lefts if left.dep_ in ADJECTIVES]
        adjs.append(tok)
        adjs.extend([right for right in tok.rights if tok.dep_ in ADJECTIVES])
        tok_with_adj = " ".join([adj.lower_ for adj in adjs])
        toks_with_adjectives.extend(adjs)

    return toks_with_adjectives

def getObjsFromAttrs(deps):
    for dep in deps:
        if dep.pos_ == "NOUN" and dep.dep_ == "attr":
            verbs = [tok for tok in dep.rights if tok.pos_ == "VERB"]
            if len(verbs) > 0:
                for v in verbs:
                    rights = list(v.rights)
                    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
                    objs.extend(getObjsFromPrepositions(rights))
                    if len(objs) > 0:
                        return v, objs
    return None, None

def getObjFromXComp(deps):
    for dep in deps:
        if dep.pos_ == "VERB" and dep.dep_ == "xcomp":
            v = dep
            rights = list(v.rights)
            objs = [tok for tok in rights if tok.dep_ in OBJECTS]
            objs.extend(getObjsFromPrepositions(rights))
            if len(objs) > 0:
                return v, objs
    return None, None

def getAllSubs(v):
    verbNegated = isNegated(v)
    subs = [tok for tok in v.lefts if tok.dep_ in SUBJECTS and tok.pos_ != "DET"]
    if len(subs) > 0:
        subs.extend(getSubsFromConjunctions(subs))
    else:
        foundSubs, verbNegated = findSubs(v)
        subs.extend(foundSubs)
    return subs, verbNegated

def getAllObjs(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]
    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def getAllObjsWithAdjectives(v):
    # rights is a generator
    rights = list(v.rights)
    objs = [tok for tok in rights if tok.dep_ in OBJECTS]

    if len(objs)== 0:
        objs = [tok for tok in rights if tok.dep_ in ADJECTIVES]

    objs.extend(getObjsFromPrepositions(rights))

    potentialNewVerb, potentialNewObjs = getObjFromXComp(rights)
    if potentialNewVerb is not None and potentialNewObjs is not None and len(potentialNewObjs) > 0:
        objs.extend(potentialNewObjs)
        v = potentialNewVerb
    if len(objs) > 0:
        objs.extend(getObjsFromConjunctions(objs))
    return v, objs

def findSVOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "VERB" and tok.dep_ != "aux"]
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjs(v)
            for sub in subs:
                for obj in objs:
                    objNegated = isNegated(obj)
                    svos.append((sub.lower_, "!" + v.lower_ if verbNegated or objNegated else v.lower_, obj.lower_))
    return svos

def findSVAOs(tokens):
    svos = []
    verbs = [tok for tok in tokens if tok.pos_ == "AUX" or tok.pos_ == "VERB"]
    # for tok in tokens:
    #     print(tok, tok.pos_, tok.dep_)
    for v in verbs:
        subs, verbNegated = getAllSubs(v)
        # hopefully there are subs, if not, don't examine this verb any longer
        if len(subs) > 0:
            v, objs = getAllObjsWithAdjectives(v)
            # print(subs, v, objs)
            for sub in subs:
                if len(objs) == 0:
                    sub_compound = generate_sub_compound(sub)
                    svos.append((" ".join(tok.lower_ for tok in sub_compound), "!" + v.lower_ if verbNegated else v.lower_))
                else:
                    for obj in objs:
                        objNegated = isNegated(obj)
                        obj_desc_tokens = generate_left_right_adjectives(obj)
                        sub_compound = generate_sub_compound(sub)
                        svos.append((" ".join(tok.lower_ for tok in sub_compound), "!" + v.lower_ if verbNegated or objNegated else v.lower_, " ".join(tok.lower_ for tok in obj_desc_tokens)))
    return svos

def generate_sub_compound(sub):
    sub_compunds = []
    for tok in sub.lefts:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    sub_compunds.append(sub)
    for tok in sub.rights:
        if tok.dep_ in COMPOUNDS:
            sub_compunds.extend(generate_sub_compound(tok))
    return sub_compunds

def generate_left_right_adjectives(obj):
    obj_desc_tokens = []
    for tok in obj.lefts:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))
    obj_desc_tokens.append(obj)

    for tok in obj.rights:
        if tok.dep_ in ADJECTIVES:
            obj_desc_tokens.extend(generate_left_right_adjectives(tok))

    return obj_desc_tokens

[nltk_data] Downloading package punkt to /Users/ziqiwang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/ziqiwang/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [4]:
data = pd.read_csv("race&gender&pp-2016-all.csv", index_col=0)
print(data.shape)


(68166, 40)


In [4]:
parser = spacy.load('en_core_web_sm')

sentence = 'Biden enters the election about 2 points stronger than Clinton was in 2016, when Trump emerged with a 1-point victory.'
parse = parser(sentence)
print(findSVAOs(parse))

sentence = 'Donald Trump is the worst president of USA, but Biden is better than him'
parse = parser(sentence)
print(findSVAOs(parse))

sentence = 'Trump may have fared better than during the last debate, but then again, the bar is pretty low. Biden also outperformed himself, but at times appeared exasperated by the whole charade of a lifelong public servant debating a showman.'
parse = parser(sentence)
print(findSVAOs(parse))

sentence = 'Interrupting Joe Biden nearly every time he spoke, President Trump made little attempt to reassure swing voters about his leadership. Mr. Biden hit back: “This is so unpresidential.”'
parse = parser(sentence)
print(findSVAOs(parse))

sentence = "#Trump will be next #commander #chief ?! do u believe ...#uschoice #uselection #electionday… "
parse = parser(sentence)
print(findSVAOs(parse))

[('biden', 'enters', 'election stronger'), ('trump', 'emerged')]
[('donald trump', 'is', 'worst president'), ('biden', 'is', 'better')]
[('trump', 'fared', 'better'), ('bar', 'is', 'pretty low'), ('biden', 'outperformed', 'himself'), ('servant', 'debating', 'showman')]
[('he', 'spoke'), ('president trump', 'made', 'little attempt'), ('attempt', 'reassure', 'swing'), ('mr. biden', 'hit', 'back'), ('mr. biden', 'hit', 'is so unpresidential')]
[('trump', 'be', '# chief'), ('u', 'believe')]


In [5]:
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores("trump fared better")['compound']

0.4404

In [16]:
def political_orientation(text: str):
    '''
    Arg:
        text: any text
    Return:
        (int party, int score)
        party:  0-non_political or neutral,
                1-Republican Party,
                2-Democratic Party
        score: positive rate of sentiment 
    '''
    text = text.lower()
    republican_keywords = ['@realdonaldtrump','trump','dt ', ' dt', 'republican']
    democratic_keywords = ['@joebiden','biden','jb ', ' jb', 'democrat']
    analyzer = SentimentIntensityAnalyzer()
    parser = spacy.load('en_core_web_sm')

    if any(keyword in text for keyword in republican_keywords) and any(keyword in text for keyword in democratic_keywords): #text about both republican and democratic

        parse = findSVAOs(parser(text))

        republican_sentences = ''
        democratic_sentences = ''

        for tuple_words in parse:

            sentence = ' '.join(tuple_words)

            if any(word in sentence for word in republican_keywords):
                republican_sentences += sentence + ' '
            elif any(word in sentence for word in democratic_keywords):
                democratic_sentences += sentence + ' '

        # print(republican_sentences)
        # print(democratic_sentences)

        republican_sentimen_polarity = analyzer.polarity_scores(republican_sentences)['compound']
        democratic_sentimen_polarity = analyzer.polarity_scores(democratic_sentences)['compound']

        # print(republican_sentimen_polarity)
        # print(democratic_sentimen_polarity)

        if republican_sentimen_polarity * democratic_sentimen_polarity > 0 :
            return 0, 0
        elif republican_sentimen_polarity == 0 and democratic_sentimen_polarity == 0:
            return 0, 0
        elif republican_sentimen_polarity > democratic_sentimen_polarity:
            return 1, republican_sentimen_polarity
        else:
            return 2, democratic_sentimen_polarity


    elif any(keyword in text for keyword in republican_keywords): #text only about republican

        sentimen_polarity = analyzer.polarity_scores(text)['compound']

        if sentimen_polarity==0:
            return 0, 0
        elif sentimen_polarity>0:
            return 1, sentimen_polarity
        else:
            return 2, -sentimen_polarity

    elif any(keyword in text for keyword in democratic_keywords):  #text only about democratic

        sentimen_polarity = analyzer.polarity_scores(text)['compound']

        if sentimen_polarity==0:
            return 0, 0
        elif sentimen_polarity>0:
            return 2, sentimen_polarity
        else:
            return 1, -sentimen_polarity

    else: # non-political  
        
        return 0, 0

In [7]:
political_orientation('Biden enters the election about 2 points stronger than Clinton was in 2016, when Trump emerged with a 1-point victory.')

trump emerged 
biden enters election stronger 
0.0
0.3818


(2, 0.3818)

In [80]:
political_orientation("@guru7777 @alegretron @TheJimCornette Yeah, check Biden’s taxes because criminals often disclose their illegal activities.")

(1, 0.7269)

In [9]:
import time
start_time = time.clock()
political_orientation('Interrupting Joe Biden nearly every time he spoke, President Trump made little attempt to reassure swing voters about his leadership. Mr. Biden hit back: “This is so unpresidential.”')
print(time.clock() - start_time)

president trump made little attempt 
. biden hit back . biden hit is so unpresidential 
0.0
0.0
0.5465700000000009


In [84]:
import multiprocessing as mp

data = pd.read_csv("race&gender-2020-all.csv", index_col=0)
data_clean = data.copy()[:100]

data_clean['pp'] = np.nan
data_clean['pp_score'] = np.nan
cpu_count = mp.cpu_count()
data_split = np.array_split(data_clean, cpu_count)

def process_line(df):
    for i, row in df.iterrows():
        print(i)
        if row['language'] == 'en':
            text = re.sub(r"http\S+", "", row['tweet'])
            political_preference, score = political_orientation(text)
            df.loc[i, 'pp'] = political_preference
            df.loc[i, 'pp_score'] = score
    return df

t = mp.Pool(processes=4)
results = t.map(process_line, data_split)
t.close()
t.join()

pd.concat(results).to_csv('race&gender&pp-2020-all.csv')

# start_time = time.clock()
# for i, row in data_clean.iterrows():
#     if row['language'] == 'en':
#         text = re.sub(r"http\S+", "", row['tweet'])
#         political_preference, score = political_orientation(text)
#         data_clean.loc[i, 'pp'] = political_preference
#         data_clean.loc[i, 'pp_score'] = score
# print(time.clock() - start_time)

25
75
0
50
76
51
26
1
2
77
52
27
78
28
3
53
29
79
4
54
30
5
55
80
31
6
56
81
32
7
82
57
33
8
83
58
84
34
9
85
59
35
86
10
60
36
87
11
61
37
88
12
62
38
89
63
13
39
90
64
14
40
91
65
15
41
92
66
16
42
93
67
17
43
94
18
68
44
95
19
69
45
96
70
20
46
97
71
21
47
98
99
72
22
48
73
23
49
74
24


In [82]:
results[1]

Unnamed: 0,date,place,tweet,language,hashtags,username,name,link,retweet,search,near,reply_to,race/ethnicity,gender,pp,pp_score
25,2020-10-11 12:40:39,,"Prayers for our President, and first Lady , f...",en,[],TimGold69913864,@janetmGolden1,https://twitter.com/TimGold69913864/status/131...,False,trump OR biden,"Albertville, Alabama",[],,,0.0,0.0
26,2020-10-10 23:39:04,,Honestly after playing the Final Fantasy 7 gam...,en,[],MippieLou,MistyShay,https://twitter.com/MippieLou/status/131518009...,False,trump OR biden,"Albertville, Alabama",[],,,1.0,0.5859
27,2020-10-07 05:19:03,,@Xtinaresists @dpaulson123 @DanScavino @realDo...,en,[],EmilyStapler,Emily Stapler,https://twitter.com/EmilyStapler/status/131381...,False,trump OR biden,"Albertville, Alabama","[{'screen_name': 'Xtinaresists', 'name': 'Chri...",white,F,1.0,0.8271
28,2020-10-06 13:22:27,,"I’m attending Joe Biden for President’s event,...",en,['teamjoe'],trueanglican,Omar Reyes,https://twitter.com/trueanglican/status/131357...,False,trump OR biden,"Albertville, Alabama",[],hispanic,M,2.0,0.3595
29,2020-10-04 09:40:56,,@sharlycan What I like to hear beautiful ma’da...,en,['trump'],RoyDriscal,Roy D,https://twitter.com/RoyDriscal/status/13127948...,False,trump OR biden,"Albertville, Alabama","[{'screen_name': 'sharlycan', 'name': 'sharls'...",,M,1.0,0.7506
30,2020-10-02 16:17:40,,Thoughts and prayers for President Trump and ...,en,[],LauraBkay5,Peekingnews.com,https://twitter.com/LauraBkay5/status/13121699...,False,trump OR biden,"Albertville, Alabama",[],,,1.0,0.2263
31,2020-09-29 06:44:42,,@irishmike224 Shut the hell up with your trump...,en,[],RoyDriscal,Roy D,https://twitter.com/RoyDriscal/status/13109385...,False,trump OR biden,"Albertville, Alabama","[{'screen_name': 'irishmike224', 'name': 'M.K....",,M,2.0,0.8585
32,2020-09-25 08:15:42,,Chip in to elect Joe Biden and other Democrats...,en,[],MippieLou,MistyShay,https://twitter.com/MippieLou/status/130951190...,False,trump OR biden,"Albertville, Alabama",[],,,0.0,0.0
33,2020-09-23 14:04:57,,@RealJamesWoods Ole 'Plugs' &amp; 'Pimp' Biden...,en,['likefatherlikeson'],ChuckE35,Chuck Ellis,https://twitter.com/ChuckE35/status/1308875021...,False,trump OR biden,"Albertville, Alabama","[{'screen_name': 'RealJamesWoods', 'name': 'Ja...",,M,0.0,0.0
34,2020-09-19 19:27:19,,Donald Trump has an obligation to nominate a p...,en,[],Gknott1970,G. William Knott,https://twitter.com/Gknott1970/status/13075065...,False,trump OR biden,"Albertville, Alabama",[],,,1.0,0.1027


In [87]:
data = pd.read_csv("race&gender-2020-all.csv", index_col=0)
data_clean = data.copy()

data_clean['pp'] = np.nan
data_clean['pp_score'] = np.nan
cpu_count = mp.cpu_count()
data_split = np.array_split(data_clean, cpu_count-1)

def process_line(df):
    for i, row in df.iterrows():
        print(i)
        if row['language'] == 'en':
            text = re.sub(r"http\S+", "", row['tweet'])
            political_preference, score = political_orientation(text)
            df.loc[i, 'pp'] = political_preference
            df.loc[i, 'pp_score'] = score
    return df

t = mp.Pool(processes=cpu_count-1)
results = t.map(process_line, data_split)
t.close()
t.join()

pd.concat(results).to_csv('race&gender&pp-2020-all.csv')
# data_clean.to_csv('race&gender&pp-2020-alltest.csv')


1481
84075
166683
1482
84076
1483
166684
84077
1484
166685
84078
1485
166686
84079
1486
166687
84080
1487
166688
84081
1488
166689
84082
1489
166690
84083
1490
166691
84084
1491
166692
84085
1492
166693
84086
1493
166694
84087
1494
166695
84088
1495
166696
84089
1496
166697
84090
1497
166698
84091
1498
166699
84092
1499
166700
84093
1500
166701
84094
1501
166702
84095
1502
166703
84096
1503
166704
84097
1504
166705
84098
1505
166706
84099
1506
166707
84100
1507
166708
84101
1508
166709
84102
1509
166710
84103
1510
166711
84104
1511
166712
84105
84106
1512
166713
84107
1513
166714
84108
1514
166715
84109
1515
1516
166716
84110
84111
1517
166717
1518
84112
166718
1519
84113
166719
1520
84114
166720
1521
84115
166721
1522
84116
166722
1523
84117
166723
1524
84118
166724
1525
84119
166725
1526
84120
166726
1527
84121
166727
166728
1528
84122
166729
1529
84123
166730
1530
84124
166731
1531
84125
166732
1532
84126
166733
1533
84127
166734
1534
84128
166735
1535
84129
166736
1536
84130
16673

KeyboardInterrupt: 