# Import libraries

In [40]:
!pip install glove_python
!pip install emoji
!pip install gensim



In [41]:
from google.colab import drive

import pandas as pd
import os
import string
import numpy as np

## Preprocessing
import emoji
import re
import html
import nltk
import regex
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize.casual import TweetTokenizer
from collections import Counter #token counter

# Glove
from glove import Corpus, Glove

#Cosine
from scipy.spatial.distance import cosine

# Load dataset

In [42]:
# Mount Google Drive in the Colab runtime so the dataset folder below is reachable.
drive.mount('/content/drive', force_remount = True)
# Folder on the mounted drive that contains the two tweet CSV files.
root_dir = '/content/drive/MyDrive/ProgettoAI/dataset/'

# Load one DataFrame per candidate: the first CSV column becomes the index and
# the 'tweet' column is renamed to 'text', the column name cleaning_up() reads.
trump = pd.read_csv(root_dir+'tweet_with_entities_trump.csv', index_col=0  ).rename(columns={'tweet' : 'text'})
clinton = pd.read_csv(root_dir+'tweet_with_entities_clinton.csv', index_col=0  ).rename(columns={'tweet' : 'text'})

Mounted at /content/drive


In [43]:
# Peek at one raw tweet (row position 8) before any cleaning is applied.
trump['text'].iloc[8]

'Thank you for your endorsement, @GovernorSununu. #MakeAmericaGreatAgain \nhttps://t.co/8BEeQPsuyd'

# Pre-processing

In [44]:
# Named building blocks that are concatenated into the regular expressions
# used by cleaning_up(). Every fragment is written as a raw string so that
# backslash sequences such as \s and \/ reach the regex engine untouched
# instead of being (invalid) Python string escapes.
START_OF_LINE = r"^"
OPTIONAL = r"?"
ANYTHING = r"."
ZERO_OR_MORE = r"*"
ONE_OR_MORE = r"+"

SPACE = r"\s"
SPACES = SPACE + ONE_OR_MORE
NOT_SPACE = r"[^\s]" + ONE_OR_MORE
EVERYTHING_OR_NOTHING = ANYTHING + ZERO_OR_MORE

# Replacement text used when a match has to be deleted.
ERASE = ""
FORWARD_SLASH = r"\/"
NEWLINES = r"[\r\n]"

# Leading "RT" followed by whitespace.
RE_TWEET = START_OF_LINE + r"RT" + SPACES

# "http" or "https", "://", then everything up to the next whitespace,
# plus any newline characters that directly follow the link.
HYPERLINKS = (r"http" + r"s" + OPTIONAL + r":" + FORWARD_SLASH + FORWARD_SLASH
              + NOT_SPACE + NEWLINES + ZERO_OR_MORE)

# Single marker characters; cleaning_up() removes the symbol only, not the
# word that follows it.
HASH = r"#"
TAG = r"@"

def cleaning_up(tweets):
    """Return a list with the cleaned text of every tweet in ``tweets['text']``.

    Each text goes through the same ordered steps: HTML entities are
    unescaped (two passes), a leading "RT", hyperlinks, the '#' and '@'
    symbols and HTML tags are removed, emoji are converted to text, colons
    become spaces, digits are dropped and finally all ASCII punctuation is
    stripped. Progress banners are printed before and after.

    Args:
        tweets: mapping/DataFrame whose ``'text'`` entry iterates over strings.

    Returns:
        list of cleaned strings, in the same order as the input.
    """
    print("CLEANING...")

    def _scrub(text):
        # Decode HTML character references (applied twice).
        text = html.unescape(html.unescape(text))
        # Deleted in this order: retweet marker, URLs, '#' symbol, '@' symbol
        # (the user name itself is kept), HTML tags.
        for pattern in (RE_TWEET, HYPERLINKS, HASH, TAG, '<.*?>'):
            text = re.sub(pattern, ERASE, text)
        # Translate emoji into their textual names.
        text = emoji.demojize(text)
        # Colons are turned into spaces (needed for the emoji names).
        text = re.sub(r":", ' ', text)
        # Drop numbers.
        text = re.sub(r'\d+', '', text)
        # Strip every ASCII punctuation character.
        return text.translate(str.maketrans('', '', string.punctuation))

    clean = [_scrub(tweet) for tweet in tweets['text']]

    print("---CLEANED---")

    return clean

def get_wordnet_pos(tag):
    """Map a ``(token, pos_tag)`` pair to the matching ``wordnet`` POS constant.

    Only the first letter of ``tag[1]`` is inspected. When no known prefix
    matches, an empty string is returned.

    Args:
        tag: indexable pair whose second item is the POS tag string.

    Returns:
        ``wordnet.ADJ``, ``wordnet.ADJ_SAT``, ``wordnet.VERB``,
        ``wordnet.NOUN``, ``wordnet.ADV`` or ``''``.
    """
    pos = tag[1]
    # (tag prefix, name of the wordnet constant); the constant is resolved
    # only for the prefix that actually matches.
    # NOTE(review): if the tags come from nltk.pos_tag's default Penn Treebank
    # tagset, the 'S' prefix would match the symbol tag rather than satellite
    # adjectives -- confirm this branch is intended.
    prefix_table = (
        ('J', 'ADJ'),
        ('S', 'ADJ_SAT'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if pos.startswith(prefix):
            return getattr(wordnet, constant_name)
    return ''

# Fetch the NLTK resources requested by name below (tokenizer data, POS tagger,
# WordNet and the stop-word lists).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

def tweet_tokenize(tweets):
    """Tokenize cleaned tweets and keep only the informative word tokens.

    Every tweet is split with ``word_tokenize``; each token is lower-cased
    and kept only if it is purely alphabetic, at least two characters long
    and not an English stop word. Progress banners are printed before and
    after.

    Lemmatization is not applied. POS tagging is therefore not run, because
    it would only be needed to choose a lemma.

    Args:
        tweets: iterable of tweet strings (e.g. the output of cleaning_up).

    Returns:
        list of token lists, one per input tweet, in input order.
    """
    print("TOKENIZATION...")

    # Built once for the whole run; a set gives constant-time membership tests.
    stop_words = set(stopwords.words('english'))

    tokens = []
    for tweet in tweets:
        kept = []
        for token in word_tokenize(tweet):
            token = token.lower()
            # isalpha() already excludes punctuation and the empty string.
            if token.isalpha() and len(token) >= 2 and token not in stop_words:
                kept.append(token)
        tokens.append(kept)

    print("---TOKENIZED---")

    return tokens

def get_word_frequencies(corpus):
  """Count how often each word occurs across all sentences of ``corpus``.

  Args:
    corpus: iterable of sentences, each an iterable of hashable words.

  Returns:
    list of ``(word, count)`` pairs as produced by ``Counter.most_common()``
    (highest count first).
  """
  counts = Counter(word for sentence in corpus for word in sentence)
  return counts.most_common()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [45]:
# Run the two preprocessing stages on each candidate's tweets:
# cleaning_up() returns a list of cleaned strings, tweet_tokenize() turns that
# list into one token list per tweet.
trump_cleaned = cleaning_up(trump)
trump_tokens = tweet_tokenize(trump_cleaned)

clinton_cleaned = cleaning_up(clinton)
clinton_tokens = tweet_tokenize(clinton_cleaned)

CLEANING...
---CLEANED---
TOKENIZATION...
---TOKENIZED---
CLEANING...
---CLEANED---
TOKENIZATION...
---TOKENIZED---


In [46]:
# Same position (8) as the raw tweet inspected earlier, now after cleaning.
trump_cleaned[8]

'Thank you for your endorsement GovernorSununu MakeAmericaGreatAgain \n'

# GloVe Embedding

In [47]:
# TRUMP
# Build the corpus object and fit it on the token lists to obtain the
# co-occurrence matrix GloVe is trained on (context window of 10 tokens).
corpus_trump = Corpus()
corpus_trump.fit(trump_tokens, window=10)

# Train 5-dimensional embeddings on that matrix for 30 epochs using 4 threads.
model_trump = Glove(no_components=5, learning_rate=0.05)
model_trump.fit(corpus_trump.matrix, epochs=30, no_threads=4, verbose=True)

# Attach the corpus vocabulary so vectors can be looked up by word
# (model_trump.dictionary is used by the cells that follow).
model_trump.add_dictionary(corpus_trump.dictionary)


Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [48]:
# CLINTON
# Build the corpus object and fit it on the token lists to obtain the
# co-occurrence matrix GloVe is trained on (context window of 10 tokens).
corpus_clinton = Corpus()
corpus_clinton.fit(clinton_tokens, window=10)

# Train 5-dimensional embeddings on that matrix for 30 epochs using 4 threads.
model_clinton = Glove(no_components=5, learning_rate=0.05)
model_clinton.fit(corpus_clinton.matrix, epochs=30, no_threads=4, verbose=True)

# Attach the corpus vocabulary so vectors can be looked up by word
# (model_clinton.dictionary is used by the cells that follow).
model_clinton.add_dictionary(corpus_clinton.dictionary)

Performing 30 training epochs with 4 threads
Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4
Epoch 5
Epoch 6
Epoch 7
Epoch 8
Epoch 9
Epoch 10
Epoch 11
Epoch 12
Epoch 13
Epoch 14
Epoch 15
Epoch 16
Epoch 17
Epoch 18
Epoch 19
Epoch 20
Epoch 21
Epoch 22
Epoch 23
Epoch 24
Epoch 25
Epoch 26
Epoch 27
Epoch 28
Epoch 29


In [49]:
# Vector learned for 'trump'. A notebook cell only displays its last
# expression, so this value is computed but not shown.
model_trump.word_vectors[model_trump.dictionary['trump']]
# Words the Trump-tweet model ranks as most similar to 'hillary', with scores.
model_trump.most_similar('hillary')

[('crooked', 0.999053544242763),
 ('hillaryclinton', 0.9966795641283664),
 ('beat', 0.9925213993922313),
 ('northamericanfreetradeagreementtranspacificpartnership',
  0.9924602218856396)]

# Polarity

In [50]:
def polarity(model, topic, positive, negative):
  """Print how close the ``topic`` words lie to positive vs. negative words.

  For every topic word, the average cosine similarity (``1 - cosine
  distance``) to the ``positive`` words and to the ``negative`` words is
  computed. These per-word averages are summed over all topic words (the
  sums are not divided by ``len(topic)``), and the two totals and their
  difference are printed.

  Args:
    model: object exposing ``dictionary`` (word -> row index) and
      ``word_vectors`` (indexable by that row index).
    topic: words to evaluate.
    positive: reference words for the positive pole.
    negative: reference words for the negative pole.

  Returns:
    None; the three scores are only printed.
  """

  def _mean_similarity(word, references):
    # Average of (1 - cosine distance) between `word` and each reference word.
    total = 0
    for ref in references:
      total = total + ( 1 - cosine(model.word_vectors[model.dictionary[word]],
                                   model.word_vectors[model.dictionary[ref]]))
    return total/len(references)

  pos_total = 0
  neg_total = 0

  for t in topic:
    pos_mean = _mean_similarity(t, positive)
    neg_mean = _mean_similarity(t, negative)

    pos_total = pos_total + pos_mean
    neg_total = neg_total + neg_mean

  print('Positive score: ', pos_total)
  print('Negative score: ', neg_total)
  print('Difference score: ', pos_total - neg_total)

# Donald Trump

In [51]:
# Topic words naming Hillary Clinton and related accounts/terms, scored with
# the model trained on Trump's tweets against fixed positive/negative word lists.
topic = [ 'hillary', 'clinton', 'dem', 'hillaryclinton', 'barackobama']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(model_trump, topic, positive, negative)

Positive score:  -0.7530677179500164
Negative score:  1.9485560368527763
Difference score:  -2.701623754802793


In [52]:
# Same positive/negative word lists, now with topic words about Trump himself,
# still using the model trained on Trump's tweets.
topic = [ 'trump', 'republican', 'america']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(model_trump, topic, positive, negative)

Positive score:  0.9170464628010069
Negative score:  -0.7236186985498821
Difference score:  1.640665161350889


In [53]:
# Variant topic list without 'trump' (party/country words only), scored with
# the model trained on Trump's tweets.
topic = [ 'republican', 'rep', 'usa', 'america']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(model_trump, topic, positive, negative)

Positive score:  -0.599381912262045
Negative score:  -0.5118646670977314
Difference score:  -0.08751724516431358


# Hillary Clinton

In [54]:
# Topic words naming Hillary Clinton and related accounts/terms, this time
# scored with the model trained on Clinton's tweets.
topic = [ 'hillaryclinton', 'hillary', 'dem', 'barackobama']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(model_clinton, topic, positive, negative)

Positive score:  0.5335368242059649
Negative score:  -0.205782736467685
Difference score:  0.7393195606736499


# Save model

In [61]:
# NOTE(review): the recorded output of this cell is an AttributeError, so
# nothing is written by this call -- confirm which export method the model
# object actually provides, or remove the cell.
model_trump.save_word2vec_format('model.bin')

AttributeError: ignored

In [59]:
# NOTE(review): the recorded output of this cell is an AttributeError.
# The file names used here also differ from the ones later cells read
# ('./trump_glove.bin' and './trump_glove.model') -- confirm the intended names.
model_trump.save('trump_glove.txt')
model_clinton.save('clinton_glove.model')

AttributeError: ignored

In [56]:
# Convert a GloVe-format vector file into a word2vec-format text file.
# NOTE(review): no visible cell writes './trump_glove.bin'; the file presumably
# came from earlier session state -- confirm where it is produced.
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_input_file='./trump_glove.bin', word2vec_output_file="gensim_glove_vectors.txt")

(3525, 6)

In [57]:
# Load the converted vectors as text-format (binary=False) word2vec data.
# NOTE(review): the recorded output of this cell is a UnicodeDecodeError, so
# the file is presumably not plain text in the expected encoding -- verify.
from gensim.models.keyedvectors import KeyedVectors
model = KeyedVectors.load_word2vec_format("./gensim_glove_vectors.txt", binary=False)

UnicodeDecodeError: ignored

In [None]:
def loadGloveModel(File, encoding="utf-8"):
    """Load word vectors from a whitespace-separated text file into a dict.

    Each non-blank line is expected to hold a word followed by its vector
    components, e.g. ``"hello 0.1 -0.2 0.3"``. Blank lines are skipped.
    Progress messages are printed before and after loading.

    Args:
        File: path of the text file to read.
        encoding: text encoding used to open the file (default ``"utf-8"``).

    Returns:
        dict mapping each word to a 1-D ``numpy`` array of floats.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a vector component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    gloveModel = {}
    # The context manager closes the handle even when a line fails to parse.
    with open(File, 'r', encoding=encoding) as f:
        for line in f:
            splitLines = line.split()
            if not splitLines:
                # Blank line: nothing to index as a word.
                continue
            word = splitLines[0]
            wordEmbedding = np.array([float(value) for value in splitLines[1:]])
            gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

In [None]:
# NOTE(review): loadGloveModel parses the file as text lines of "word v1 v2 ...";
# confirm './trump_glove.bin' is in that format and where it is written.
loadGloveModel('./trump_glove.bin')

In [62]:

# Reload a saved Glove model from disk.
# NOTE(review): the visible save cell writes 'trump_glove.txt', not
# './trump_glove.model' -- confirm which file this is meant to read.
m = Glove.load('./trump_glove.model')

In [64]:
!gcc --version

gcc (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Copyright (C) 2017 Free Software Foundation, Inc.
This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.



In [65]:
!pip install mittens


Collecting mittens
  Downloading https://files.pythonhosted.org/packages/ce/c0/6e4fce5b3cb88edde2e657bb4da9885c0aeac232981706beed7f43773b00/mittens-0.2-py3-none-any.whl
Installing collected packages: mittens
Successfully installed mittens-0.2


In [68]:
from mittens import GloVe
