# Import libraries

In [2]:
!pip install emoji
!pip install gensim



In [3]:
from google.colab import drive

import pandas as pd
import os
import string
import numpy as np

## Preprocessing
import emoji
import re
import html
import nltk
import regex
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from collections import Counter #token counter

# W2v
from gensim.models import Word2Vec

#Cosine
from scipy.spatial.distance import cosine


# Load dataset

In [4]:
drive.mount('/content/drive', force_remount = True)
root_dir = '/content/drive/MyDrive/ProgettoAI/dataset/'

trump = pd.read_csv(root_dir+'tweet_with_entities_trump.csv', index_col=0  ).rename(columns={'tweet' : 'text'})
clinton = pd.read_csv(root_dir+'tweet_with_entities_clinton.csv', index_col=0  ).rename(columns={'tweet' : 'text'})

Mounted at /content/drive


In [5]:
trump['text'].iloc[0]

'Join me for a 3pm Rallying - tomorrow at the MidAmericaCenter in RailsWestRailroadMuseum! Tickets:… https://t.co/dfzsbICiXc'

# Preprocessing

In [6]:
START_OF_LINE = r"^"
OPTIONAL = "?"
ANYTHING = "."
ZERO_OR_MORE = "*"
ONE_OR_MORE = "+"

SPACE = "\s"
SPACES = SPACE + ONE_OR_MORE
NOT_SPACE = "[^\s]" + ONE_OR_MORE
EVERYTHING_OR_NOTHING = ANYTHING + ZERO_OR_MORE

ERASE = ""
FORWARD_SLASH = "\/"
NEWLINES = r"[\r\n]"

RE_TWEET = START_OF_LINE + "RT" + SPACES

HYPERLINKS = ("http" + "s" + OPTIONAL + ":" + FORWARD_SLASH + FORWARD_SLASH
              + NOT_SPACE + NEWLINES + ZERO_OR_MORE)

HASH = "#"
TAG = "@"

def cleaning_up(tweets):
    print("CLEANING...")
    clean =[]
    
    for tweet in tweets['text']:

        tweet = html.unescape(html.unescape(tweet)) #caratteri html
        tweet = re.sub(RE_TWEET, ERASE, tweet)  #re:
        tweet = re.sub(HYPERLINKS, ERASE, tweet) #url
        tweet = re.sub(HASH, ERASE, tweet) #hash simbolo
        tweet = re.sub(TAG, ERASE, tweet) #delete @
        #tweet = re.sub("@[^\s]+",ERASE,tweet) #tag username
        tweet = re.sub('<.*?>', ERASE, tweet) #tag html
        tweet = emoji.demojize(tweet) #traduzione delle emoji
        tweet = re.sub(r":", ' ', tweet) #serve per le emoji
        tweet = re.sub(r'\d+', '', tweet) #number
        tweet = tweet.translate(str.maketrans('', '', string.punctuation) )
    
        clean.append(tweet)
      
    print("---CLEANED---")

    return clean

def get_wordnet_pos(tag):
    if tag[1].startswith('J'):
        return wordnet.ADJ
    elif tag[1].startswith('S'):
        return wordnet.ADJ_SAT
    elif tag[1].startswith('V'):
        return wordnet.VERB
    elif tag[1].startswith('N'):
        return wordnet.NOUN
    elif tag[1].startswith('R'):
        return wordnet.ADV
    else:
        return ''

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

def tweet_tokenize(tweets):
    
    print("TOKENIZATION...")
    all_tokens=[]
    tokens =[] 
    tokenizer = word_tokenize

    for tweet in tweets:
        temp=[]
        all_tokens = tokenizer(tweet)
        lemmatizer = WordNetLemmatizer()
        stop_words = stopwords.words('english')
        for token in all_tokens:
                token = token.lower()
                if not (
                    not token.isalpha()
                    or token in stop_words 
                    or token in string.punctuation
                    or (token.isalpha() and len(token) < 2)):
                        tagged = nltk.pos_tag([token])
                        parsed = get_wordnet_pos(tagged[0])
                        #if parsed != '':
                         #   token = lemmatizer.lemmatize(token, parsed)
                        temp.append(token)
        tokens.append(temp) 

    print("---TOKENIZED---")

    return tokens

def get_word_frequencies(corpus):
  frequencies = Counter()
  for sentence in corpus:
    for word in sentence:
      frequencies[word] += 1
  freq = frequencies.most_common()
  return freq

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
trump_cleaned = cleaning_up(trump)
trump_tokens = tweet_tokenize(trump_cleaned)

clinton_cleaned = cleaning_up(clinton)
clinton_tokens = tweet_tokenize(clinton_cleaned)

CLEANING...
---CLEANED---
TOKENIZATION...
---TOKENIZED---
CLEANING...
---CLEANED---
TOKENIZATION...
---TOKENIZED---


In [8]:
get_word_frequencies(trump_tokens)

[('thank', 517),
 ('makeamericagreatagain', 394),
 ('trump', 380),
 ('great', 346),
 ('donaldtrump', 276),
 ('donaldtrumponsocialmedia', 272),
 ('hillary', 262),
 ('people', 219),
 ('unitedstates', 198),
 ('crooked', 182),
 ('hillaryclinton', 174),
 ('get', 132),
 ('vote', 120),
 ('big', 117),
 ('like', 114),
 ('presidentoftheunitedstates', 112),
 ('would', 109),
 ('thetonightshow', 109),
 ('cnn', 106),
 ('many', 106),
 ('join', 104),
 ('tedcruz', 101),
 ('pm', 100),
 ('one', 99),
 ('jervycruz', 98),
 ('poll', 97),
 ('foxnews', 97),
 ('last', 89),
 ('enjoyrecords', 88),
 ('never', 87),
 ('new', 87),
 ('going', 82),
 ('back', 82),
 ('today', 80),
 ('said', 80),
 ('make', 79),
 ('country', 78),
 ('speech', 78),
 ('time', 76),
 ('media', 76),
 ('much', 73),
 ('marcorubio', 72),
 ('win', 71),
 ('iowa', 70),
 ('see', 69),
 ('want', 69),
 ('votetrump', 69),
 ('support', 67),
 ('watch', 66),
 ('job', 65),
 ('americafirst', 64),
 ('jobs', 64),
 ('bad', 64),
 ('interviewed', 64),
 ('us', 61),
 

In [9]:
get_word_frequencies(clinton_tokens)

[('donaldtrump', 677),
 ('unitedstates', 320),
 ('presidentoftheunitedstates', 291),
 ('hillary', 283),
 ('people', 190),
 ('make', 187),
 ('hillaryclinton', 186),
 ('one', 175),
 ('us', 163),
 ('donaldtrumps', 141),
 ('families', 132),
 ('need', 128),
 ('would', 121),
 ('women', 119),
 ('country', 115),
 ('like', 113),
 ('let', 110),
 ('together', 105),
 ('hydrogen', 105),
 ('americans', 104),
 ('get', 102),
 ('vote', 100),
 ('every', 95),
 ('first', 94),
 ('know', 89),
 ('donaldtrumponsocialmedia', 89),
 ('plan', 87),
 ('going', 86),
 ('never', 83),
 ('de', 80),
 ('time', 80),
 ('campaign', 80),
 ('today', 79),
 ('lets', 79),
 ('back', 75),
 ('watch', 73),
 ('work', 72),
 ('got', 72),
 ('live', 71),
 ('years', 69),
 ('good', 69),
 ('take', 69),
 ('say', 67),
 ('want', 66),
 ('right', 64),
 ('cant', 63),
 ('help', 62),
 ('better', 60),
 ('sure', 60),
 ('economy', 59),
 ('day', 59),
 ('hillarys', 58),
 ('says', 58),
 ('new', 58),
 ('great', 58),
 ('last', 57),
 ('many', 57),
 ('go', 56

In [10]:
i = 0
k = 0
for s in trump_tokens:
  for word in s:
    if word == 'hillary':
      print(i)
      k = k +1
  i = i + 1

  

17
59
88
90
93
112
117
172
191
197
216
231
232
235
242
281
283
284
290
290
302
303
311
313
317
361
363
409
410
412
414
427
482
484
486
531
533
534
538
539
552
555
556
562
564
566
572
573
577
585
586
587
591
595
600
605
619
635
639
640
649
656
657
668
669
671
672
675
678
680
682
683
700
704
706
764
766
767
768
775
776
777
778
782
782
787
810
811
814
827
835
835
842
843
844
849
851
852
853
854
859
862
864
865
866
873
881
882
884
891
906
912
913
915
919
933
941
942
945
948
949
951
952
954
956
966
972
991
993
1002
1004
1006
1008
1012
1014
1015
1016
1038
1061
1065
1076
1076
1078
1099
1100
1113
1114
1117
1120
1143
1144
1151
1152
1164
1193
1194
1195
1195
1198
1202
1203
1208
1212
1223
1243
1262
1266
1266
1280
1284
1285
1292
1293
1297
1299
1300
1301
1304
1307
1310
1311
1311
1313
1314
1323
1326
1326
1327
1328
1329
1330
1339
1342
1346
1349
1350
1382
1383
1386
1387
1389
1390
1392
1394
1429
1440
1447
1448
1453
1454
1461
1461
1462
1472
1485
1487
1496
1500
1506
1509
1514
1516
1526
1527
1547
1551
1556

In [11]:
k

262

In [12]:
trump_cleaned[2293]

'Hillary could lose to DonaldTrump in DemocraticPartyUnitedStates NewYorkCity\nMakeAmericaGreatAgain DonaldTrump\n'

In [13]:
trump_tokens[2293]

['hillary',
 'could',
 'lose',
 'donaldtrump',
 'democraticpartyunitedstates',
 'newyorkcity',
 'makeamericagreatagain',
 'donaldtrump']

#Polarity

In [14]:
def polarity(model, topic, positive, negative):
  positive_score = 0
  negative_score = 0

  for t in topic:

    positive_temp = 0
    for p in positive:
      positive_temp = positive_temp + ( 1 - cosine(model[t], model[p]))
    positive_temp = positive_temp/len(positive)
    

    negative_temp = 0
    for n in negative:
      negative_temp = negative_temp + (1 - cosine(model[t], model[n]))
    negative_temp = negative_temp/len(negative)

    positive_score = positive_score + positive_temp
    negative_score = negative_score + negative_temp

  print('Positive score: ', positive_score)
  print('Negative score: ', negative_score)
  print('Difference score: ', positive_score - negative_score)

# Trump

In [15]:
trump_model = Word2Vec(trump_tokens, sg=0, seed = 1, sample=0, window=5, min_count= 2 )
#vocabulary = list(trump_model.wv.vocab)

print(trump_model)

Word2Vec(vocab=2701, size=100, alpha=0.025)


In [16]:
summing = trump_model["hillaryclinton"] + trump_model["hillary"] 
trump_model.most_similar(positive=[summing])

  """Entry point for launching an IPython kernel.
  


[('hillary', 0.9996767640113831),
 ('hillaryclinton', 0.9993463754653931),
 ('beat', 0.9984186887741089),
 ('berniesanders', 0.9982447624206543),
 ('cant', 0.9980177879333496),
 ('barackobama', 0.9979698061943054),
 ('believe', 0.9978654980659485),
 ('lyin', 0.9977784156799316),
 ('wants', 0.9977516531944275),
 ('says', 0.9977107048034668)]

In [17]:
topic = [ 'barackobama']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(trump_model, topic, positive, negative)

Positive score:  0.9960370540618897
Negative score:  0.9994620680809021
Difference score:  -0.0034250140190124068


  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [18]:
topic = [ 'hillary', 'clinton', 'dem', 'hillaryclinton']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(trump_model, topic, positive, negative)

Positive score:  3.9646677136421205
Negative score:  3.9796245455741883
Difference score:  -0.014956831932067871


  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [19]:
1 - cosine(trump_model['hillary'], trump_model['bad'])

  """Entry point for launching an IPython kernel.


0.9954649806022644

In [20]:
1 - cosine(trump_model['trump'], trump_model['bad'])

  """Entry point for launching an IPython kernel.


0.9745658040046692

# Clinton

In [21]:
clinton_model = Word2Vec(clinton_tokens, sg=0, seed = 1, sample=0, window=5, min_count= 2 )
#vocabulary = list(trump_model.wv.vocab)

print(clinton_model)

Word2Vec(vocab=2766, size=100, alpha=0.025)


In [22]:
summing = clinton_model["great"]
clinton_model.most_similar(positive=[summing])

  """Entry point for launching an IPython kernel.
  


[('great', 0.9999999403953552),
 ('unitedstates', 0.9997862577438354),
 ('people', 0.9997547268867493),
 ('make', 0.9997491240501404),
 ('one', 0.9997435808181763),
 ('presidentoftheunitedstates', 0.9997369647026062),
 ('women', 0.9997368454933167),
 ('us', 0.9997341632843018),
 ('like', 0.9997197389602661),
 ('never', 0.9997180700302124)]

In [23]:
topic = [ 'donaldtrump', 'donald', 'republican']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong', 'war']

polarity(clinton_model, topic, positive, negative)

Positive score:  2.7165546298027037
Negative score:  2.85475688179334
Difference score:  -0.13820225199063652


  if __name__ == '__main__':
  from ipykernel import kernelapp as app


In [27]:
topic = [ 'barackobama']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong', 'war']

polarity(clinton_model, topic, positive, negative)

Positive score:  0.927637767791748
Negative score:  0.9712708592414856
Difference score:  -0.04363309144973759


  if __name__ == '__main__':
  from ipykernel import kernelapp as app


# Save model

In [30]:
trump_model.save('trump_w2v.model')
clinton_model.save('clinton_w2v.model')