# Import libraries

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install tqdm

!pip install emoji



In [None]:
from transformers import BertTokenizer, BertModel
from transformers import pipeline

from google.colab import drive

import pandas as pd
import os
import string
import numpy as np

## Preprocessing
import emoji
import re
import html
import nltk
import regex

#Cosine
from scipy.spatial.distance import cosine

# Load dataset

In [None]:
drive.mount('/content/drive', force_remount = True)
root_dir = '/content/drive/MyDrive/ProgettoAI/dataset/'

trump = pd.read_csv(root_dir+'tweet_with_entities_trump.csv', index_col=0  ).rename(columns={'tweet' : 'text'})
clinton = pd.read_csv(root_dir+'tweet_with_entities_clinton.csv', index_col=0  ).rename(columns={'tweet' : 'text'})

Mounted at /content/drive


# Preprocessing


In [None]:
START_OF_LINE = r"^"
OPTIONAL = "?"
ANYTHING = "."
ZERO_OR_MORE = "*"
ONE_OR_MORE = "+"

SPACE = "\s"
SPACES = SPACE + ONE_OR_MORE
NOT_SPACE = "[^\s]" + ONE_OR_MORE
EVERYTHING_OR_NOTHING = ANYTHING + ZERO_OR_MORE

ERASE = ""
FORWARD_SLASH = "\/"
NEWLINES = r"[\r\n]"

RE_TWEET = START_OF_LINE + "RT" + SPACES

HYPERLINKS = ("http" + "s" + OPTIONAL + ":" + FORWARD_SLASH + FORWARD_SLASH
              + NOT_SPACE + NEWLINES + ZERO_OR_MORE)

HASH = "#"
TAG = "@"

def cleaning_up(tweets):
    print("CLEANING...")
    clean =[]
    
    for tweet in tweets['text']:

        tweet = html.unescape(html.unescape(tweet)) #caratteri html
        tweet = re.sub(RE_TWEET, ERASE, tweet)  #re:
        tweet = re.sub(HYPERLINKS, ERASE, tweet) #url
        tweet = re.sub(HASH, ERASE, tweet) #hash simbolo
        tweet = re.sub(TAG, ERASE, tweet)
        #tweet = re.sub("@[^\s]+",ERASE,tweet) #tag username
        tweet = re.sub('<.*?>', ERASE, tweet) #tag html
        tweet = emoji.demojize(tweet) #traduzione delle emoji
        tweet = re.sub(r":", ' ', tweet) #serve per le emoji
        tweet = re.sub(r'\d+', '', tweet) #number
        tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
        clean.append(tweet)
      
    print("---CLEANED---")

    return clean

def get_wordnet_pos(tag):
    if tag[1].startswith('J'):
        return wordnet.ADJ
    elif tag[1].startswith('S'):
        return wordnet.ADJ_SAT
    elif tag[1].startswith('V'):
        return wordnet.VERB
    elif tag[1].startswith('N'):
        return wordnet.NOUN
    elif tag[1].startswith('R'):
        return wordnet.ADV
    else:
        return ''

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

def tweet_tokenize(tweets):
    
    print("TOKENIZATION...")
    all_tokens=[]
    tokens =[] 
    tokenizer = word_tokenize

    for tweet in tweets:
        temp=[]
        all_tokens = tokenizer(tweet)
        lemmatizer = WordNetLemmatizer()
        stop_words = stopwords.words('english')
        for token in all_tokens:
                token = token.lower()
                if not (
                    not token.isalpha()
                    or token in stop_words 
                    or token in string.punctuation
                    or (token.isalpha() and len(token) < 2)):
                        tagged = nltk.pos_tag([token])
                        parsed = get_wordnet_pos(tagged[0])
                        #if parsed != '':
                         #   token = lemmatizer.lemmatize(token, parsed)
                        temp.append(token)
        tokens.append(temp) 

    print("---TOKENIZED---")

    return tokens

def get_word_frequencies(corpus):
  frequencies = Counter()
  for sentence in corpus:
    for word in sentence:
      frequencies[word] += 1
  freq = frequencies.most_common()
  return freq

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
trump_cleaned = cleaning_up(trump)

clinton_cleaned = cleaning_up(clinton)

CLEANING...
---CLEANED---
CLEANING...
---CLEANED---


# BERT Embedding


## Loading Pre-Trained BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
#logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
% matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()



BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
def sentenceEmb (model, tokenizer, text, concatenation = False):
  marked_text = "[CLS] " + text + " [SEP]"
  # Tokenize our sentence with the BERT tokenizer.
  tokenized_text = tokenizer.tokenize(marked_text)

  # Map the token strings to their vocabulary indeces.
  indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

  segments_ids = []
  # Mark each of the 22 tokens as belonging to sentence "1".
  segments_ids = [1] * len(tokenized_text)

  #Extracting embedding
  tokens_tensor = torch.tensor([indexed_tokens])
  segments_tensors = torch.tensor([segments_ids])

  #extract hidden layers
  with torch.no_grad():

    outputs = model(tokens_tensor, segments_tensors)

    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

  # Concatenate the tensors for all layers. We use `stack` here to
  # create a new dimension in the tensor.
  token_embeddings = torch.stack(hidden_states, dim=0)

  # Remove dimension 1, the "batches".
  token_embeddings = torch.squeeze(token_embeddings, dim=1)

  # Swap dimensions 0 and 1.
  token_embeddings = token_embeddings.permute(1,0,2)

  if concatenation:
    # Stores the token vectors, with shape [22 x 3,072]
    token_vecs = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:
        # `token` is a [12 x 768] tensor

        # Concatenate the vectors (that is, append them together) from the last 
        # four layers.
        # Each layer vector is 768 values, so `cat_vec` is length 3,072.
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        # Use `cat_vec` to represent `token`.
        token_vecs.append(cat_vec)
  else:
    # Stores the token vectors, with shape [22 x 768]
    token_vecs = []

    # `token_embeddings` is a [22 x 12 x 768] tensor.

    # For each token in the sentence...
    for token in token_embeddings:

        # `token` is a [12 x 768] tensor

        # Sum the vectors from the last four layers.
        sum_vec = torch.sum(token[-4:], dim=0)
        
        # Use `sum_vec` to represent `token`.
        token_vecs.append(sum_vec)

  return tokenized_text, token_vecs


In [None]:
tt, tv = sentenceEmb(model, tokenizer, 'hillaryclinton')

dt, tv = sentenceEmb(model, tokenizer, 'Make maerica great again')

In [None]:
tt[1:-1].flat()

AttributeError: ignored

In [52]:
#Trump tokenizer and embedding

tokens_trump = []
embeddings_trump = []

for text in trump_cleaned:
  tokenized_text, token_vecs = sentenceEmb(model, tokenizer, text)

  tokens_trump.append(tokenized_text)
  sentence_embedding = []
  for v in token_vecs:
    sentence_embedding.append(v.numpy())
  embeddings_trump.append(sentence_embedding)


#Clinton tokenizer and embedding
tokens_clinton = []
embeddings_clinton = []

for text in clinton_cleaned:
  tokenized_text, token_vecs = sentenceEmb(model, tokenizer, text)

  tokens_clinton.append(tokenized_text)
  sentence_embedding = []
  for v in token_vecs:
    sentence_embedding.append(v.numpy())
  embeddings_clinton.append(sentence_embedding)


In [54]:
sentence_embedding[0]

array([-7.01385915e-01,  1.37577176e+00,  2.08979845e+00, -2.17674121e-01,
       -8.38985145e-01,  2.56600767e-01,  4.17990148e-01,  8.82463217e-01,
       -1.60964370e+00, -2.23472714e+00, -1.04536319e+00,  1.51058626e+00,
        6.28996253e-01,  9.41049308e-02,  7.70534873e-01,  3.79816800e-01,
       -1.56935477e+00,  3.56678581e+00,  1.80336165e+00, -2.76877689e+00,
       -1.70014596e+00, -9.62671936e-02, -8.95076022e-02,  3.84426475e-01,
        1.42107630e+00, -1.19696617e+00, -7.86139667e-01,  1.22600913e+00,
       -1.31915554e-01,  1.39288642e-02,  6.60530925e-01,  1.37799606e-01,
        6.77503586e-01,  1.34147239e+00,  8.92655134e-01, -1.02731675e-01,
        2.67512053e-01, -3.33304666e-02,  8.58516634e-01, -1.35721505e+00,
       -6.66298568e-01, -5.48627853e-01,  1.51756191e+00, -2.19638586e+00,
        4.98484075e-02, -4.97163594e-01, -1.56651649e+01,  3.02153373e+00,
        1.13799417e+00, -2.19182515e+00,  1.99454618e+00, -1.11700392e+00,
        1.42123342e+00,  

In [None]:
def word_vec(tokens, embeddings, word):  # return the embedding of a word
  word_vecs = []
  i_sentence = 0
  for sentence in tokens:
    i_token = 0
    for t in sentence:
      if t == word:
        word_vecs.append(embeddings[i_sentence][i_token])

      i_token = i_token + 1
    i_sentence = i_sentence + 1

  return word_vecs

import numpy as np
def sum_vec(word_vecs):  
  vec_sum = np.zeros(len(word_vecs[0]))
  for v in word_vecs:
    vec_sum = vec_sum + v
  
  return vec_sum



In [None]:
def polarity(tokens, embeddings, topic, A, B):
  positive_score = 0
  negative_score = 0
  
  #Get list of vector
  t_list = []
  for t in topic:
    t_vec =  sum_vec(word_vec(tokens, embeddings, t))
    t_list.append(t_vec)

  a_list = []
  for a in A:
    a_vec =  sum_vec(word_vec(tokens, embeddings, a))
    a_list.append(a_vec)
      
  b_list = []
  for b in B:
    b_vec =  sum_vec(word_vec(tokens, embeddings, b))
    b_list.append(b_vec)

 # Polarity:
  for t in t_list:
    positive_temp = 0
    for a in a_list:
      positive_temp = positive_temp + ( 1 - cosine(t, a))
    positive_temp = positive_temp/len(a_list)
    

    negative_temp = 0
    for b in b_list:
      negative_temp = negative_temp + ( 1 - cosine(t, b))
    negative_temp = negative_temp/len(b_list)


    positive_score = positive_score + positive_temp
    negative_score = negative_score + negative_temp

  polarity = positive_score - negative_score
  print('Positive score: ', positive_score)
  print('Negative score: ', negative_score)
  print('Difference score: ', polarity)

  return polarity



# Donald Trump

In [None]:
topic = ['hillary', 'clinton', 'democratic', 'dem']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(tokens_trump, embeddings_trump, topic, positive, negative)

TypeError: ignored

In [None]:
topic = ['trump', 'republican']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(tokens_trump, embeddings_trump, topic, positive, negative)

In [None]:
topic = [ 'hillary', 'clinton', 'dem', 'democratic', 'obama', 'barack']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(tokens_trump, embeddings_trump, topic, positive, negative)

In [None]:
good_vecs = word_vec(tokens_trump, embeddings_trump, 'good')
good = sum_vec(good_vecs)

hillary_vecs = word_vec(tokens_trump, embeddings_trump, 'hillary')
hillary = sum_vec(hillary_vecs)

trump_vecs = word_vec(tokens_trump, embeddings_trump, 'trump')
trump = sum_vec(trump_vecs)

from scipy.spatial.distance import cosine
1 - cosine(good, trump)

# Hillary Clinton

In [None]:
topic = ['hillary', 'clinton', 'democratic', 'dem']
positive = ['good', 'great', 'nice', 'positive', 'love' ]
negative = ['bad', 'badly', 'negative', 'false', 'wrong']

polarity(tokens_clinton, embeddings_clinton, topic, positive, negative)

Positive score:  1.8833880798195333
Negative score:  1.757009232265519
Difference score:  0.12637884755401418


0.12637884755401418

In [None]:
good_vecs = word_vec(tokens_clinton, embeddings_clinton, 'good')
good = sum_vec(good_vecs)

hillary_vecs = word_vec(tokens_clinton, embeddings_clinton, 'hillary')
hillary = sum_vec(hillary_vecs)

trump_vecs = word_vec(tokens_clinton, embeddings_clinton, 'trump')
trump = sum_vec(trump_vecs)

from scipy.spatial.distance import cosine
1 - cosine(good, trump)

0.5290500518669969

# Save embedding

Trump

In [55]:

import csv

with open("tokens_trump_bert.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(tokens_trump)



In [56]:
 np.save('embedding_trump_bert.npy',embeddings_trump)     

  return array(a, dtype, copy=False, order=order, subok=True)


Clinton

In [None]:

with open("tokens_clinton_bert.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerows(tokens_clinton)


In [None]:
 #np.save('embedding_clinton_bert.npy',embeddings_clinton)     

  return array(a, dtype, copy=False, order=order, subok=True)


In [None]:
tokens_trump

[['[CLS]',
  'join',
  'me',
  'for',
  'a',
  'pm',
  'rally',
  '##ing',
  'tomorrow',
  'at',
  'the',
  'mid',
  '##ame',
  '##rica',
  '##cent',
  '##er',
  'in',
  'rails',
  '##west',
  '##rail',
  '##ro',
  '##ad',
  '##mus',
  '##eum',
  'tickets',
  '…',
  '[SEP]'],
 ['[CLS]',
  'once',
  'again',
  'we',
  'will',
  'have',
  'a',
  'government',
  'of',
  'by',
  'and',
  'for',
  'the',
  'people',
  'join',
  'the',
  'movement',
  'the',
  '##be',
  '##ach',
  '##boys',
  '##to',
  '##day',
  '[SEP]'],
 ['[CLS]',
  'on',
  'national',
  'voter',
  '##re',
  '##gist',
  '##ration',
  '##day',
  'make',
  'sure',
  'your',
  '##e',
  'registered',
  'to',
  'vote',
  'so',
  'we',
  'can',
  'make',
  '##ame',
  '##rica',
  '##gre',
  '##ata',
  '##gai',
  '##n',
  '…',
  '[SEP]'],
 ['[CLS]',
  'hillary',
  '##cl',
  '##inton',
  '##s',
  'campaign',
  'continues',
  'to',
  'make',
  'false',
  'united',
  '##sta',
  '##tes',
  '##house',
  '##com',
  '##mit',
  '##tee',
