In [1]:
import pandas as pd
import numpy as np
import re
import scipy.sparse as spsp
import itertools as it
from math import log
import networkx as nx
import twokenize
from keras.preprocessing.text import Tokenizer
# Raise pandas' display.max_rows option to 10000 for the rest of the notebook.
pd.set_option('display.max_rows', 10000)

In [2]:
# multinomial coefficient implementation 1
def multinomial(params):
    """Return the multinomial coefficient (k1 + ... + km)! / (k1! * ... * km!).

    The value is accumulated as a product of exact binomial coefficients:
    C(k1, k1) * C(k1 + k2, k2) * ... * C(k1 + ... + km, km).

    Parameters
    ----------
    params : iterable of int
        Non-negative group sizes k1..km.

    Returns
    -------
    int
        The multinomial coefficient; 1 for an empty or single-element input.
    """
    # `comb` is not brought into scope by the notebook's visible import cell,
    # so import it here to keep the function usable on a fresh kernel.
    from scipy.special import comb

    running_total = 0
    coefficient = 1
    for group_size in params:
        running_total += group_size
        # exact=True keeps the arithmetic in arbitrary-precision integers
        coefficient *= comb(running_total, group_size, exact=True)
    return coefficient

In [3]:
# multinomial coefficient implementation 2
def multinomial2(lst):
    """Return the multinomial coefficient of the group sizes in lst.

    Uses only integer multiplication and floor division: a running position
    counts the items consumed so far across all groups, and within each group
    the partial result is multiplied by that position and divided by the
    item's 1-based rank inside the group.
    """
    coefficient = 1
    position = 0
    for group_size in lst:
        for rank in range(1, group_size + 1):
            position += 1
            coefficient = coefficient * position // rank
    return coefficient

In [4]:
# p_value_A6 is the implementation of equation A6 in Martinez-Romo et al. (2011), DOI: 10.1103/PhysRevE.84.046108
# lst is a list of four counts with k1 + k2 + k3 + k4 = N, where k1 = k, k2 = n1 - k, k3 = n2 - k
def p_value_A6(lst):
    """Evaluate the two-product expression used for the co-occurrence p-value.

    With n1 = k1 + k2 and n2 = k1 + k3 the result is

        prod_{i < k3} (1 - n1 / (N - i))
        * prod_{j < k1} (n1 - j) * (n2 - j) / (N - k3 - j) / (k1 - j)

    Only the first four entries of lst are read; k4 enters through N alone.
    """
    k1, k2, k3, k4 = lst[0], lst[1], lst[2], lst[3]
    N = k1 + k2 + k3 + k4

    # first product: one factor per unit of k3
    first_product = 1
    for i in range(k3):
        first_product *= (1 - (k2 + k1) / (N - i))

    # second product: one factor per unit of k1
    second_product = 1
    for j in range(k1):
        second_product *= ((k2 + k1 - j) * (k3 + k1 - j) / (N - k3 - j) / (k1 - j))

    return first_product * second_product

In [5]:
def extract_hash_tags(s):
    """Return the set of hashtags found in the text s.

    Only whitespace-separated tokens that start with "#" are inspected, so a
    "#" inside a URL such as http://example.org/#comments is ignored. Inside
    such a token every piece made of one non-word character followed by word
    characters or apostrophes is collected when that piece starts with "#";
    runs of "#" in a piece are collapsed to a single "#".
    """
    hashtags = set()
    for token in s.split():
        if not token.startswith("#"):
            continue
        for piece in re.findall(r"\W[\w']+", token):
            if piece.startswith("#"):
                hashtags.add(re.sub(r"#+", "#", piece))
    return hashtags

In [6]:
# Sanity check of extract_hash_tags on a sample sentence: "#few" appears twice and the
# "#comments" URL fragment is part of a token that does not start with "#".
tweet = "There are a #few #hashtags in #this text but #only a #few: http://example.org/#comments"
print(extract_hash_tags(tweet))

{'#hashtags', '#only', '#few', '#this'}


In [7]:
### Read in tweet file
# NOTE(review): `path` is an absolute, machine-specific directory; adjust it before running elsewhere.
path = '/Users/msaif/Google Drive/Fall 2020 (Internship)/Processed/'
filename = 'trumpbiden0306_FINAL.csv'
# `path` already ends with a separator, so the two parts are simply concatenated
tweets = pd.read_csv(f'{path}{filename}')
tweets.head()

Unnamed: 0,poster,recipient,relationship,state,poster_id,tweet_id,tweet,quote_status,in_reply_to_user_id,mentioned_user_id,hashtags
0,_AngieBalderas,_AngieBalderas,tweet,CA,19831890.0,1.24e+18,"Tax cuts for the rich, judges, and deregulatio...",False,,,['Trumpdemic']
1,_Damian11,AC360,mentions,IL,1314546000.0,1.26e+18,Thanks #NancyPelosiAmericanHero 🤣🤣 #morbidlyob...,True,,227837742.0,"['NancyPelosiAmericanHero', 'morbidlyobese', '..."
2,_DavidCarter_,JoeBiden,reply,IL,48452320.0,1.26e+18,@JoeBiden @RepGalonski .@BernieSanders told us...,False,939091.0,,['MedicareForAll']
3,_deadsrsly_,_deadsrsly_,tweet,MA,4780615000.0,1.25e+18,Close friends on insta. Shit post on twitter. ...,False,,,['brand']
4,_EricCarr,IvankaTrump,reply,FL,237481200.0,1.24e+18,@IvankaTrump Thank you @IvankaTrump #AmericaSt...,False,52544275.0,,['AmericaStrong']


In [8]:
### Extract hashtags from tweets and calculate some statistics
# ht maps each lower-cased hashtag to the number of tweets it occurs in
# (extract_hash_tags returns a set, so a tweet counts a hashtag at most once)
ht = {}
for text in tweets['tweet']:
    for tag in extract_hash_tags(text.lower()):
        ht[tag] = ht.get(tag, 0) + 1

# (hashtag, count) pairs ordered from most to least frequent
rank_ht = sorted(ht.items(), key=lambda pair: pair[1], reverse=True)
print(rank_ht[:100])

[('#covid19', 374), ('#trump', 332), ('#coronavirus', 303), ('#trump2020', 230), ('#maga', 203), ('#biden2020', 136), ('#trumpvirus', 105), ('#blacklivesmatter', 103), ('#kag', 99), ('#obamagate', 97), ('#kag2020', 79), ('#covidー19', 72), ('#wwg1wga', 72), ('#trumpliesamericansdie', 69), ('#covid_19', 68), ('#fakenews', 66), ('#trumpownseverydeath', 65), ('#joebiden', 62), ('#coronaviruspandemic', 58), ('#biden', 56), ('#trumpmeltdown', 55), ('#leadright', 54), ('#dumptrump2020', 54), ('#gop', 52), ('#trumpgenocide', 49), ('#trumpliespeopledie', 48), ('#bernie2020', 46), ('#foxnews', 46), ('#trumpresignnow', 44), ('#votebluetoendthisnightmare', 44), ('#maga2020', 44), ('#trump2020landslide', 43), ('#votebluetosaveamerica', 43), ('#trumpistheworstpresidentever', 43), ('#resist', 43), ('#cnn', 42), ('#votebluenomatterwho', 41), ('#voteblue2020', 41), ('#donaldtrump', 41), ('#trump2020nowmorethanever', 40), ('#trumpisanidiot', 40), ('#dumptrump', 40), ('#covidiots', 39), ('#covid', 39), (

In [9]:
### Construct a hashtag adjacency matrix based on co-occurrence in the same tweet
ht_size = len(ht)
# give every hashtag a row/column index following the key order of ht
ht_id = {tag: index for index, tag in enumerate(ht)}
mt_hashtag = spsp.lil_matrix((ht_size, ht_size), dtype=float)

for text in tweets['tweet']:
    for first, second in it.combinations(extract_hash_tags(text.lower()), 2):
        a, b = ht_id[first], ht_id[second]
        mt_hashtag[a, b] += 1
        # mirror the updated count so the matrix stays symmetric
        mt_hashtag[b, a] = mt_hashtag[a, b]

# G = nx.from_scipy_sparse_matrix(mt_hashtag)
# nx.write_gexf(G, 'hashtag_mt.gexf')

In [10]:
# Number of hashtags that received a matrix index.
print(len(ht_id))

6769


In [11]:
# Number of distinct hashtags counted in ht (compare with len(ht_id)).
len(ht)

6769

In [12]:
# Print the complete hashtag -> index mapping.
# NOTE(review): this prints every entry of ht_id; consider showing only a small sample.
print(ht_id)



In [13]:
### Build a similarity network of hashtags
inv_ht_id = {index: tag for tag, index in ht_id.items()}
N = tweets.shape[0]
p0 = 1e-4 #1e-6 default
# keep only the strictly lower triangle (k=-1) of the co-occurrence matrix
mt_htwt = spsp.tril(mt_hashtag, k=-1, format='coo')

# replace every stored co-occurrence count by a similarity weight, in place
for idx in range(len(mt_htwt.data)):
    i, j = mt_htwt.row[idx], mt_htwt.col[idx]
    k = int(mt_htwt.data[idx])
    n1 = ht[inv_ht_id[i]]
    n2 = ht[inv_ht_id[j]]
    pvalue = p_value_A6([k, n1-k, n2-k, N-n1-n2+k])
    # pvalue = multinomial([k, n1-k, n2-k, N-n1-n2+k])/(comb(N, n1, exact=True)*comb(N, n2, exact=True))
    if pvalue == 0:
        weight = np.inf              # p-value evaluated to exactly zero
    elif 0 < pvalue < p0:
        weight = log(p0/pvalue)      # below the p0 cut-off: positive weight
    else:
        weight = 0.0                 # everything else gets weight zero
    mt_htwt.data[idx] = weight

# add the transpose to the lower triangle to obtain the full matrix, then convert to LIL
upper_part = mt_htwt.tocsr().transpose()
lower_part = mt_htwt.tocsc()
mt2 = (upper_part+lower_part).tolil()

In [14]:
# Print the stored (row, column) -> weight entries of the similarity matrix mt2.
print(mt2)

  (3, 4274)	4.583657522412939
  (3, 4275)	3.485879610096949
  (3, 4276)	4.583657522412939
  (3, 4278)	4.583657522412939
  (3, 4280)	4.583657522412939
  (4, 85)	1.6285322435575422
  (4, 396)	6.466128214057927
  (7, 2843)	7.06856417220094
  (11, 82)	7.861481965264112
  (12, 25)	0.7341553769400886
  (12, 30)	4.5869411768286925
  (12, 79)	1.0916566967189025
  (12, 82)	8.464327599456883
  (12, 442)	4.202016047862423
  (12, 465)	3.2635718074370543
  (12, 1763)	2.2927751227703155
  (12, 2045)	1.0998100079354844
  (12, 4587)	0.4891309467577443
  (12, 5188)	1.0998100079354844
  (12, 5189)	0.4891309467577443
  (14, 297)	2.0692655708219037
  (19, 138)	15.372545566550317
  (19, 188)	14.503747575420498
  (19, 311)	6.452711953171683
  (19, 1072)	21.354033261958357
  (19, 3073)	0.9010476813125986
  (19, 3335)	5.984957266813998
  (19, 3338)	0.9010476813125986
  (19, 3340)	0.9010476813125986
  (19, 3341)	5.984957266813998
  (19, 3342)	5.984957266813998
  (19, 3343)	0.9010476813125986
  (19, 4688)	0.901

In [15]:
### Setup an enlarged seed set of hashtags with labels
# Four hand-picked reference sets, written without the leading '#'; the prefix is added
# at the end of the cell. Every tag must be its own string literal followed by a comma:
# two adjacent literals with no comma between them are silently concatenated into one tag.

# reference hashtags grouped with the '#neverbiden' seed
ref_antib = {'neverbiden','democratsaredestroyingamerica','sleepyjoe','ibelievetarareade','blexit',
             'metoounlessitsbiden','dementiajoe','democratshateamerica','demexit','timesupbiden',
             'democratsaredangerous','rapistjoebiden','democratstheenemywithin','quidprojoe','chinajoe',
             'democratsareadisgrace','demexit2020','joebidenisaracist','democratsarecorrupt','donothingdems',
             'whyimnotvotingforbiden','votedemocratsout','walkawayfromdemocrats'}
             # only at p=1e-4 is neverbiden allowed

# reference hashtags grouped with the '#trump2020' seed
ref_prot = {'trump2020','leadright','maga','kag','kag2020','wwg1wga','trumptrain','draintheswamp','patriotsunite',
            'choochoobaby','patriots','obamagate','maga2020','voteredtosaveamerica','trump2020landslide','qanon',
            'trump2020nowmorethanever','americafirst','wethepeople','voteredtosaveamerica2020','fourmoreyears',
            'keepamericagreat','trump2020landslidevictory','blacksfortrump','trumpbeatsbiden','armyfortrump',
            'trumppence2020','trump4eva'}

# reference hashtags grouped with the '#dumptrump' seed
ref_antit = {'dumptrump','trumpliesamericansdie','ditchmitch','moscowmitch','trumpgenocide',
             'trumppandemic','trumpistheworstpresidentever','gopgenocide','republicansarekillingus',
             'trumpliedpeopledied','trumpout2020','votetrumpout','cult45','trumpvirus','trumpownseverydeath',
             'liarinchief','trumpdeathtoll','worstpresidentinhistory','dumptrump2020','traitortrump',
             'impotus','worstpresidentever','25thamendmentnow','trumpcrimefamily',
             'trumpisaracist','trumpisalaughingstock','trumpisanidiot','trumpisaloser','trumpgate','trumpplague',
             'byedon','trumpisunwell','gopcorruptionovercountry','fucktrump','wherearethetests','fbr','resist',
             'resistance','trumpmeltdown','trumphasnoplan','trumpresignnow','bunkerboy','trumpisnotwell',
             'voteoutthegop','gopbetrayedamerica','fakepresident','kingtrump','bunkerboytrump',
             'trumpliesaboutcoronavirus','bunkerbabytrump','notmypresident','bunkerdon','trumpwearsadultdiapers',
             'trumprecession','idiotinchief','votetrumpout2020','nevertrump','cowardinchief','removetrumpnow',
             'trumpnotfitforoffice','trumpvirus2020','byedon2020','trumpforprison2020','trumpderangementsyndrome',
             'moscowmitchmcconnell','trumpdeathtoll100k','trumpdemic','trumpcult','killerinchief','blametrump',
             'ripdonaldtrump','trumpisacoward','impotus45moron','trumppencemustresignow',
             'trumpcrimefamilyforprison','trumpincitesviolence','trumpisacompletefailure','traitorinchief',
             '25thamendmentbeforewealldie','trumpisa_danger_toamerica','trumpisanationaldisgrace','trumptreason',
             'bloodontrumpshands','everythingtrumptouchesdies','trumppenceoutnow','amendment25unfittolead',
             'removetrump','riptrump','presidentdeath','trumporamerica','failedpresident','wherearethemasks',
             'worstadministrationever','trumpisamoron','presidementia','fbrparty','trumpisunfitforoffice',
             'trumpisafailure','trumppenceaccountable','sexualpredatortrump','trumpliesamericadies',
             'thisistrumpsamerica','trumpisaclown','trumpviruscoverup','dictatortrump','trumptheworstpresidentever',
             'americaortrump','trumpdepression','tr45on','ettd','lockhimup'}

# reference hashtags grouped with the '#biden2020' seed
ref_prob = {'biden2020','bidenbeatstrump','ridinwithbiden','teamjoe','joebiden2020','bluewave',
            'votebluetoendthisnightmare','votebluenomatterwho','joebidenforpresident2020','bidenharris',
            'bidenforpresident','votebiden','ridenwithbiden','bidenharris2020','joementum','presidentbiden',
            'gojoe','joe2020','voteblue','votebluetosaveamerica','voteblue2020','gojoe2020','demcastbiden',
            'bluetsunami2020','votebiden2020','anyonebuttrump2020','votejoe','gojoebiden','teambiden','demcast',
            'voteforyourlife','saveourdemocracy'}
            # gojoe, joe2020, voteblue, votebluetosaveamerica, voteblue2020, gojoe2020 not allowed at p=1e-6

# prefix every reference tag with '#' so it matches the strings produced by extract_hash_tags
ref_prot = {'#' + tag for tag in ref_prot}
ref_antit = {'#' + tag for tag in ref_antit}
ref_prob = {'#' + tag for tag in ref_prob}
ref_antib = {'#' + tag for tag in ref_antib}

In [16]:
# every hashtag that has at least one stored neighbour in mt2 starts in a class of its own,
# identified by its own index
cls_ht = {tag: index for tag, index in ht_id.items() if mt2.rows[index] != []}

# reference hashtags present in cls_ht take over the class id of their seed hashtag;
# the groups are processed in this order, so a tag listed in several sets keeps the last one
seed_of_reference = (
    (ref_prob, '#biden2020'),
    (ref_antit, '#dumptrump'),
    (ref_prot, '#trump2020'),
    (ref_antib, '#neverbiden'),
)
for reference_tags, seed in seed_of_reference:
    for t in reference_tags:
        if t in cls_ht:
            cls_ht[t] = cls_ht[seed]

# miss > 0 lets the label-propagation loop start
miss = 1
seedtag = ['#biden2020', '#dumptrump', '#trump2020', '#neverbiden']
# class id -> seed hashtag
seedcls = {cls_ht[i]: i for i in seedtag}

In [17]:
# Class ids of the four seed hashtags.
print(seedcls.keys())

dict_keys([434, 1161, 311, 746])


In [18]:
# Mapping from seed class id to its seed hashtag.
print(seedcls)

{434: '#biden2020', 1161: '#dumptrump', 311: '#trump2020', 746: '#neverbiden'}


In [19]:
### Use label propagation method to classify more hashtags in the similarity network
# Seed NumPy's global RNG so the visiting order and the random tie-breaks below are
# repeatable when this cell is re-run on the same inputs.
# NOTE(review): ht_id is built from set iteration order, so results may still differ
# between kernel sessions - confirm if full reproducibility is required.
np.random.seed(42)

# Initialise the sweep counter here so the loop does not depend on a value left behind
# by another cell.
miss = 1
while miss > 0:
    # miss counts the hashtags whose class changed during this sweep; stop when none did
    miss = 0
    rnd = np.random.permutation(ht_size)
    for i in rnd:
        cols = mt2.rows[i]
        # skip hashtags without neighbours and hashtags already in a seed class
        if cols == [] or cls_ht[inv_ht_id[i]] in seedcls.keys():
            continue

        # total edge weight towards each neighbouring class
        w = [mt2[i,j] for j in cols]
        c = [cls_ht[inv_ht_id[j]] for j in cols]
        cw = pd.DataFrame({'c': c, 'w': w})
        reduce = cw.groupby(['c'])['w'].sum()
        winner = reduce[reduce == reduce.max()].keys().tolist()

        # the current class is (one of) the heaviest: nothing to change
        if cls_ht[inv_ht_id[i]] in winner:
            continue

        # among tied classes prefer the seed classes, if any is present
        seed_winner = set(winner) & seedcls.keys()
        if len(seed_winner) != 0:
            # sorted so the draw below does not depend on set iteration order
            winner = sorted(seed_winner)

        cls_ht[inv_ht_id[i]] = np.random.choice(winner, 1)[0]
        miss += 1

In [20]:
# Number of distinct class ids currently assigned in cls_ht.
len(set(cls_ht.values()))

91

In [21]:
### Pruning the classified hashtags in the network to only keep the significant ones
# one row per classified hashtag: matrix index, hashtag text, class id and tweet frequency
pd_cls_ht = pd.DataFrame({
    'id': [ht_id[tag] for tag in cls_ht],
    'ht': list(cls_ht),
    'cls': list(cls_ht.values()),
    'fr': [ht[tag] for tag in cls_ht],
})


def _significant_members(seed_tag):
    """Return (threshold, rows) for the class of seed_tag.

    threshold is 0.001 times the largest frequency in that class; rows are the
    members of the class whose frequency is strictly above the threshold.
    """
    in_class = pd_cls_ht['cls'] == cls_ht[seed_tag]
    threshold = 0.001 * pd_cls_ht[in_class]['fr'].max()
    return threshold, pd_cls_ht[in_class & (pd_cls_ht['fr'] > threshold)]


prot_threshold, prot = _significant_members('#trump2020')
prob_threshold, prob = _significant_members('#biden2020')
antit_threshold, antit = _significant_members('#dumptrump')
antib_threshold, antib = _significant_members('#neverbiden')

In [22]:
# Ten most frequent classified hashtags together with their class ids.
pd_cls_ht.sort_values(by=['fr'], ascending=False).head(10)

Unnamed: 0,id,ht,cls,fr
12,30,#covid19,30,374
4,12,#trump,30,332
27,82,#coronavirus,30,303
76,311,#trump2020,311,230
11,28,#maga,311,203
92,434,#biden2020,434,136
58,223,#trumpvirus,1161,105
62,264,#blacklivesmatter,1397,103
42,138,#kag,311,99
15,39,#obamagate,311,97


In [23]:
### Export to networkx graph for gephi for visualization
# matrix indices of the pruned members of the four seed classes, in the order
# pro-Trump, pro-Biden, anti-Trump, anti-Biden
cls_ids = prot['id'].tolist() + prob['id'].tolist() + antit['id'].tolist() + antib['id'].tolist()
# restrict the similarity matrix to those hashtags (rows first, then columns)
mt_cls = mt2[cls_ids,:][:, cls_ids]
dmt_cls = mt_cls.toarray()
# infinite weights are replaced by the finite stand-in 999 before the export
dmt_cls[np.isinf(dmt_cls)] = 999.
mt_cls = spsp.lil_matrix(dmt_cls)

# networkx 3.0 removed from_scipy_sparse_matrix in favour of from_scipy_sparse_array;
# use whichever the installed version provides
_graph_from_sparse = getattr(nx, 'from_scipy_sparse_array', None) or nx.from_scipy_sparse_matrix
g_cls = _graph_from_sparse(mt_cls)

# node i of the graph corresponds to position i in cls_ids
sub_id = dict(enumerate(cls_ids))
for i in g_cls.nodes:
    tag = inv_ht_id[sub_id[i]]
    g_cls.nodes[i]['ht'] = tag
    # cast to built-in int before storing as node attributes
    g_cls.nodes[i]['cls'] = int(cls_ht[tag])
    g_cls.nodes[i]['fr'] = int(ht[tag])

nx.write_gexf(g_cls, "g_cls.gexf")

In [23]:
### Classify tweets according to labeled hashtags to generate the training data
# Label values written to tweet_label:
#   -1                 tweet contains no hashtag from one of the four seed classes
#   <seed class id>    exactly one seed class has the most hashtags in the tweet
#                      (class ids of '#neverbiden', '#dumptrump', '#trump2020', '#biden2020')
#   99                 tie between exactly the '#biden2020' and '#dumptrump' classes
#   1000               tie between exactly the '#trump2020' and '#neverbiden' classes
#   0                  any other tie (ambiguous)

# hashtag -> class id, restricted to hashtags that ended up in a seed class
cls_lb = pd_cls_ht.loc[pd_cls_ht.cls.isin(seedcls.keys()), ['ht','cls']]
cls_ids_ht = dict(zip(cls_lb['ht'], cls_lb['cls']))

tweet_label = np.zeros(len(tweets))
supB = {cls_ht['#biden2020'], cls_ht['#dumptrump']}
supT = {cls_ht['#trump2020'], cls_ht['#neverbiden']}

for i, s in enumerate(tweets['tweet']):
    l = [cls_ids_ht[t] for t in extract_hash_tags(s.lower()) if t in cls_ids_ht]
    if l == []: # unlabeled
        tweet_label[i] = -1
        continue
    # count the hashtags per class and keep the class(es) with the highest count
    u, idx = np.unique(l, return_inverse=True)
    mx = np.bincount(idx)
    winner = u[mx == mx.max()]
    # l is non-empty here, so winner always holds at least one class
    if len(winner) == 1:
        tweet_label[i] = winner[0]
    elif set(winner) == supB: # support Biden
        tweet_label[i] = 99
    elif set(winner) == supT: # support Trump
        tweet_label[i] = 1000
    else: # any other tie: keep the neutral value explicitly
        tweet_label[i] = 0

In [24]:
# Display the first rows of the tweet table.
# NOTE(review): this asks for up to 6000 rows; a much smaller preview is probably enough.
tweets.head(6000)

Unnamed: 0,poster,recipient,relationship,state,poster_id,tweet_id,tweet,quote_status,in_reply_to_user_id,mentioned_user_id,hashtags
0,_AngieBalderas,_AngieBalderas,tweet,CA,19831890.0,1.24e+18,"Tax cuts for the rich, judges, and deregulatio...",False,,,['Trumpdemic']
1,_Damian11,AC360,mentions,IL,1314546000.0,1.26e+18,Thanks #NancyPelosiAmericanHero 🤣🤣 #morbidlyob...,True,,227837700.0,"['NancyPelosiAmericanHero', 'morbidlyobese', '..."
2,_DavidCarter_,JoeBiden,reply,IL,48452320.0,1.26e+18,@JoeBiden @RepGalonski .@BernieSanders told us...,False,939091.0,,['MedicareForAll']
3,_deadsrsly_,_deadsrsly_,tweet,MA,4780615000.0,1.25e+18,Close friends on insta. Shit post on twitter. ...,False,,,['brand']
4,_EricCarr,IvankaTrump,reply,FL,237481200.0,1.24e+18,@IvankaTrump Thank you @IvankaTrump #AmericaSt...,False,52544280.0,,['AmericaStrong']
5,_gabriellabc,GG30000,mentions,PA,103769300.0,1.27e+18,Just started first ep of #TopChef Italy finals...,False,,105747400.0,"['TopChef', 'LFG']"
6,_gkazzz,_gkazzz,tweet,CA,946928000.0,1.24e+18,"At today's press conference, the #CDC read a g...",False,,,"['CDC', 'TrumpCrash', 'CoronavirusOutbreak']"
7,_gkazzz,_gkazzz,tweet,CA,946928000.0,1.24e+18,#Trump SAYS something and thinks that he's DON...,False,,,"['Trump', 'TrumpSlump', 'TrumptheWorstPresiden..."
8,_iamjheani,_iamjheani,tweet,CA,1.16e+18,1.25e+18,Read &amp; share this. \n\nMel Gibson was blac...,True,,,"['outofshadowsdocumentary', 'savethechildren']"
9,_johnny_cakes__,realDonaldTrump,reply,NY,3162833000.0,1.27e+18,@realDonaldTrump @LacyJohnsonMN Hey John Barro...,False,25073880.0,,['BunkerBitch']


In [30]:
# Spot-check the label assigned to the tweet at position 30.
tweet_label[30]

310.0

In [25]:
# Attach the computed labels to the tweet table as a new 'label' column (modifies tweets in place).
tweets['label'] = tweet_label

In [26]:
## Output new label column to CSV
# Writes the whole tweet table (including the 'label' column) to the working directory,
# without the DataFrame index.
tweets.to_csv('tweetlabels_training.csv',index = False)

In [31]:
### Output training data to the formats used for word embedding learning
def _write_lines(path, lines):
    """Write every string in lines to path, one per line, encoded as UTF-8."""
    # Explicit UTF-8: the tweets contain emoji and other non-ASCII characters, and the
    # default encoding of open() depends on the platform locale.
    with open(path, 'w', encoding='utf-8') as out:
        for line in lines:
            out.write(line+'\n')


# lower-cased tweet text next to its label; only labels > 0 are kept for training
l_tw = pd.DataFrame({'tt': tweets['tweet'].str.lower(), 'lb': tweet_label})
train_l_tw = l_tw[l_tw['lb']>0]

# Biden-supporting side: '#biden2020' class, '#dumptrump' class, or the tie label 99
train_B = train_l_tw[train_l_tw['lb'].isin([float(cls_ht['#biden2020']), float(cls_ht['#dumptrump']), 99.0])]
# Trump-supporting side: '#trump2020' class, '#neverbiden' class, or the tie label 1000
train_T = train_l_tw[train_l_tw['lb'].isin([float(cls_ht['#trump2020']), float(cls_ht['#neverbiden']), 1000.0])]

# NOTE(review): a tweet containing line breaks will span several lines in these files - confirm
# that the downstream consumer does not require one tweet per line.
_write_lines('train_B.txt', train_B['tt'])
_write_lines('train_T.txt', train_T['tt'])

# tokenize every training tweet, dropping @-prefixed tokens and tokens matching
# twokenize's url / punctSeq patterns
tokenized_corpus = []
for t in train_l_tw['tt']:
    tokenized_corpus.append([tk for tk in twokenize.tokenizeRawTweetText(t) if not (tk.startswith('@') or re.match(twokenize.url, tk) or re.match(twokenize.punctSeq, tk))])
    #tokenized_corpus.append([tk for tk in twokenize.tokenizeRawTweetText(t) if not (tk.startswith('@') or re.match(twokenize.url, tk) or re.match(twokenize.punctSeq, tk) or (tk.startswith('#') and (tk in cls_ids_ht)))])

tok = Tokenizer(char_level=False)
tok.fit_on_texts(tokenized_corpus)

# vocabulary file: "<token> <index>" per line, with '<unk>' written first as index 0
with open('vocab.csv', 'w', encoding='utf-8') as out:
    out.write('<unk> 0\n')
    for k, v in tok.word_index.items():
        out.write(' '.join([k, str(v)]))
        out.write('\n')