##Creating a custom word embedding to understand sentiment of tweets with regular english words and emojis. 


In [1]:
#Install GetOldTweets (A package that allows us to query twitter and pull old tweets)
!pip install GetOldTweets3

Collecting GetOldTweets3
  Downloading https://files.pythonhosted.org/packages/ed/f4/a00c2a7c90801abc875325bb5416ce9090ac86d06a00cc887131bd73ba45/GetOldTweets3-0.0.11-py3-none-any.whl
Collecting pyquery>=1.2.10
  Downloading https://files.pythonhosted.org/packages/78/43/95d42e386c61cb639d1a0b94f0c0b9f0b7d6b981ad3c043a836c8b5bc68b/pyquery-1.4.1-py2.py3-none-any.whl
Collecting cssselect>0.7.9
  Downloading https://files.pythonhosted.org/packages/3b/d4/3b5c17f00cce85b9a1e6f91096e1cc8e8ede2e1be8e96b87ce1ed09e92c5/cssselect-1.1.0-py2.py3-none-any.whl
Installing collected packages: cssselect, pyquery, GetOldTweets3
Successfully installed GetOldTweets3-0.0.11 cssselect-1.1.0 pyquery-1.4.1


In [2]:
#Mounting drive to get the word embedding and vector embedding bins
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [3]:
#import necessary packages
import gensim.models as gs
import numpy as np
import pandas as pd

#Creating the word embedding

In [4]:
#Creating a Phrase2vec class to handle the concatenated dict (taken from github)
class Phrase2Vec:
    """Wrapper for the word2vec model and emoji2vec model, allowing us to compute phrases"""
    def __init__(self, dim, w2v, e2v=None):
        """Constructor for the Phrase2Vec model
        Args:
            dim: Dimension of the vectors in word2vec and emoji2vec
            w2v: Gensim object for word2vec
            e2v: Gensim object for emoji2vec
        """
        self.wordVecModel = w2v
        if e2v is not None:
            self.emojiVecModel = e2v
        else:
            self.emojiVecModel = dict()
        self.dimension = dim

    @classmethod
    def from_word2vec_paths(cls, dim, w2v_path='/data/word2vec/GoogleNews-vectors-negative300.bin',
                            e2v_path=None):
        """Creates a Phrase2Vec object based on paths for w2v and e2v
        Args:
            dim: Dimension of the vectors in word2vec and emoji2vec
            w2v_path: Path to word2vec vectors
            e2v_path: Path to emoji2vec vectors
        Returns:
        """
        if not os.path.exists(w2v_path):
            print(str.format('{} not found. Either provide a different path, or download binary from '
                             'https://code.google.com/archive/p/word2vec/ and unzip', w2v_path))

        w2v = gs.Word2Vec.load_word2vec_format(w2v_path, binary=True)
        if e2v_path is not None:
            e2v = gs.Word2Vec.load_word2vec_format(e2v_path, binary=True)
        else:
            e2v = dict()
        return cls(dim, w2v, e2v)

    def __getitem__(self, item):
        """Get the vector sum of all tokens in a phrase
        Args:
            item: Phrase to be converted into a vector sum
        Returns:
            phr_sum: Bag-of-words sum of the tokens in the phrase supplied
        """
        tokens = item.split(' ')
        phr_sum = np.zeros(self.dimension, np.float32)

        for token in tokens:
            if token in self.wordVecModel:
                phr_sum += self.wordVecModel[token]
            elif token in self.emojiVecModel:
                phr_sum += self.emojiVecModel[token]

        return phr_sum

    def from_emoji(self, emoji_vec, top_n=10):
        """Get the top n closest tokens for a supplied emoji vector
        Args:
            emoji_vec: Emoji vector
            top_n: number of results to return
        Returns:
            Closest n tokens for a supplied emoji_vec
        """
        return self.wordVecModel.most_similar(positive=emoji_vec, negative=[], topn=top_n)

    def __setitem__(self, key, value):
        self.wordVecModel[key] = value

The emoji2vec bin was taken from https://github.com/uclnlp/emoji2vec



In [5]:
#Read in the emoji vect word embedding bin
e2v_path = "/content/drive/My Drive/emoji2vec.bin"
e2v = gs.KeyedVectors.load_word2vec_format(e2v_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


The word2vec bin was taken from https://github.com/mmihaltz/word2vec-GoogleNews-vectors


In [6]:
#Read in the actual word vector embedding bin for words
w2v_path = "/content/drive/My Drive/GoogleNews-vectors-negative300.bin"
w2v = gs.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [7]:

# !gunzip '/content/drive/My Drive/GoogleNews-vectors-negative300.bin'

In [8]:
out_dim = 300
#Gives dictionary so when you call the word/emoji, you'll get a reference vector
p2v_our_emoji = Phrase2Vec(out_dim, w2v, e2v=e2v)

In [9]:
#Tokenize the new concatenated dictionary
p2vemojiToken = list(e2v.vocab.keys()) + list(w2v.vocab.keys())

word2index = dict((v,k) for (k,v) in enumerate(p2vemojiToken))

In [10]:
#Create the embedding matrix. len+1 to add an unknown bin for all unknown tokens. 
embedding_matrix = np.zeros((len(p2vemojiToken) + 1, 300)).astype(np.float32)
for i, word in enumerate(p2vemojiToken):
    embedding_vector = p2v_our_emoji[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
#Clean up resources that are noppt being used
del p2v_our_emoji,w2v,e2v
import gc
gc.collect()

207

##Read in and prepare the training set. The training set is taken from the "Sentiment140" Set. It is basically a labeled set of positive, negative, and neutral tweets. In total, the set consists of 1.6 million tweets. 


In [12]:
#Set unknown bin
unknown = len(embedding_matrix)-1
MAX_SEQUENCE_LENGTH = 50

#read in 1.6 million tweets that were prelabeled. The set is known as the Sentiment140 set. 
trialdata = pd.read_csv('/content/drive/My Drive/training.1600000.processed.noemoticon.csv', nrows = 16000000, encoding = 'latin-1')
trialdata.columns = ['sentiment','id','date','device','user','tweet']

#create xtrail and y trial 
xtrial,ytrial = unknown*np.ones((len(trialdata),MAX_SEQUENCE_LENGTH)).astype(np.int),[]
#The training set has the labeled tweets as 0 for neg, 2 for neutral, and 4 for positive
sentiments = [0,4,2]


#Iterate through tweets and add to xtrial/ytrial
for i,row in trialdata.iterrows():
    #print(row['text'])
    tweet = row['tweet'].split()
    xtrial[i,:len(tweet)] = [word2index.get(word,unknown) for word in row['tweet'].split()][:MAX_SEQUENCE_LENGTH]
    ytrial.append(sentiments.index(row['sentiment']))

ytrial = np.eye(3)[np.array(ytrial)]

#Split to train and validation set. 
index = np.arange(len(xtrial))
np.random.shuffle(index)
xtrain = xtrial[index[2000:]]
ytrain = ytrial[index[2000:]]


xvalid = xtrial[index[:2000]]
yvalid = ytrial[index[:2000]]

In [13]:
print(xtrain.shape)

(1597999, 50)


In [14]:
import keras 
from keras.layers import Embedding,Input,Dense,MaxPooling1D

#Create the Keras embedding layer
embedding_layer = Embedding(len(p2vemojiToken) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length= MAX_SEQUENCE_LENGTH,
                            trainable=False)

Using TensorFlow backend.


In [15]:
from keras import Model
from keras.layers import Embedding,Input,Dense,MaxPooling1D, Conv1D, Reshape

stride = 1
filtersize = 10

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(128, filtersize, activation='relu',strides =stride)(embedded_sequences)
x = MaxPooling1D(((MAX_SEQUENCE_LENGTH-filtersize)/stride,))(x)
x = Reshape((128,))(x)
preds = Dense(3, activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 300)           900498600 
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 128)           384128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1.0, 128)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
Total params: 900,883,115
Trainable params: 384,515
Non-trainable params: 900,498,600
_______________________________________

In [16]:
# happy learning!
model.fit(xtrain, ytrain, epochs=3,batch_size=128, validation_data = (xvalid,yvalid))



Train on 1597999 samples, validate on 2000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f64f2d152b0>

In [17]:
#testprobs = model.predict(xtrial)

#testpreds = np.argmax(testprobs, axis=1)
#np.array(sentiments)[testpreds]

model.evaluate(xvalid,yvalid)



[0.45520978164672854, 0.7914999723434448]

#RNN Model with the same word embedding


In [19]:
#RNN Model
from keras.layers import LSTM
from keras.optimizers import RMSprop

filtersize = 10
opt = RMSprop(learning_rate = 0.0003)
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedded_sequences = embedding_layer(sequence_input)
x = LSTM(64)(embedded_sequences)

preds = Dense(3, activation='softmax')(x)


model1 = Model(sequence_input, preds)
model1.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])

print(model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 300)           900498600 
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 41, 128)           384128    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1.0, 128)          0         
_________________________________________________________________
reshape_1 (Reshape)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 387       
Total params: 900,883,115
Trainable params: 384,515
Non-trainable params: 900,498,600
_______________________________________

In [20]:
# happy learning!
model1.fit(xtrain, ytrain, epochs=3,batch_size=128, validation_data = (xvalid,yvalid))


Train on 1597999 samples, validate on 2000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f64f2e45ac8>

In [21]:
model1.evaluate(xtrain,ytrain)



[0.44766011127058875, 0.7852995991706848]

#CNN Model with an RNN Layer with the same word embedding

In [22]:
#Stacking models (using RNN as a layer in CNN)
filtersize = 5
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
opt = RMSprop(learning_rate = 0.0003)
embedded_sequences = embedding_layer(sequence_input)
x = Conv1D(32, filtersize, activation='relu', strides = 2)(embedded_sequences)
x = LSTM(32)(x)
x = Reshape((32,))(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(5)(x)
# x = Conv1D(128, 5, activation='relu')(x)
# x = MaxPooling1D(35)(x)  # global max pooling
# x = Flatten()(x)
# x = Dense(128, activation='relu')(x)
preds = Dense(3, activation='softmax')(x)
#preds = Dense(len(labels_index), activation='softmax')(x)

model2 = Model(sequence_input, preds)
model2.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])

print(model2.summary())

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 300)           900498600 
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 23, 32)            48032     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
reshape_2 (Reshape)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 3)                 99        
Total params: 900,555,051
Trainable params: 56,451
Non-trainable params: 900,498,600
________________________________________

In [23]:
# happy learning!
model2.fit(xtrain, ytrain,
          epochs=3,batch_size=128, validation_data = (xvalid,yvalid))


Train on 1597999 samples, validate on 2000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x7f64f310ec88>

In [None]:
model.evaluate(xtrial,ytrial)



[0.35055361391133444, 0.8426399230957031]

Hand labeled set evaluated using the best model

In [24]:
emojidata = pd.read_csv('/content/LabeledTweets.csv')
print(emojidata.head())

#create xtrail and y trial 
xemoji,yemoji = unknown*np.ones((len(emojidata),MAX_SEQUENCE_LENGTH)).astype(np.int),[]
#The training set has the labeled tweets as 0 for neg, 2 for neutral, and 4 for positive
sentiments = [0,4,1]


#Iterate through tweets and add to xtrial/ytrial
for i,row in emojidata.iterrows():
    #print(row['text'])
    tweet = row['Tweet'].split()
    xemoji[i,:len(tweet)] = [word2index.get(word,unknown) for word in row['Tweet'].split()][:MAX_SEQUENCE_LENGTH]
    yemoji.append(sentiments.index(row['Label']))
print(yemoji)
for n,i in enumerate(yemoji):
  if i == 2:
    yemoji[n] = 4

# yemoji = np.eye(3)[np.array(yemoji)]
  


   Label                                              Tweet
0      1  The original manuscript of the #UN Charter pre...
1      0  Where is #BML on this?  How is this ok, partic...
2      0  @pritipatel Presumeably Cressida Dick will obt...
3      1  God speed to the person filming this and the a...
4      0  #bournemouthbeach oh pandemic people are lying...
[2, 0, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 0, 2, 0, 0, 0, 2, 0, 0, 2, 0, 2, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 0, 2, 2, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 0, 0, 2, 0, 2, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 

In [None]:
print(len(predictedProbs))

1000


In [27]:

testprobs = model2.predict(xemoji)

testpreds = np.argmax(testprobs, axis=1)
predictedProbs = np.array(sentiments)[testpreds]

correct = []
for n,i in enumerate(predictedProbs):
  if(i == yemoji[n]):
    correct.append(i)
success = len(correct)/len(predictedProbs)
print(success)

0.675


#Pull Tweets and Analyze on the CNN Model since it performed the best.

In [None]:
import GetOldTweets3 as got
import numpy as np
import datetime
import time

base = datetime.datetime(2020, 6, 15)
numdays = 30
date_list = [base - datetime.timedelta(days=x) for x in range(numdays)]
print(date_list)

percentageArray = []
for date in date_list:
    print(date)
    tweets = 1000
    #Pull tweets from the given date
    tweetCriteria = got.manager.TweetCriteria().setQuerySearch('$#BLM-filter:retweets').setSince('2020-05-02').setUntil(date.strftime("%Y-%m-%d")).setMaxTweets(tweets)
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)
    tweet_list = []
    tweet_time = []
    for t in tweet:
        tweet_list.append(t.text)
        tweet_time.append(t.date)
    df = pd.DataFrame(tweet_list, tweet_time)
    df.to_csv('out.csv')
    testdata = pd.read_csv('out.csv')
    testdata.columns = ['date', 'tweet']
    xtest = unknown*np.ones((len(testdata),MAX_SEQUENCE_LENGTH)).astype(np.int)
    sentiments = [0,4,2]



    for i,row in testdata.iterrows():
        #print(row['text'])
        tweet = row['tweet'].split()

        ##add space before unicode starter
        #xtest[i,:len(tweet)] = [word2index.get(word,unknown) for word in row['tweet'].split()][:MAX_SEQUENCE_LENGTH]
        xtest[i,:len(tweet)] = [word2index.get(word,unknown) for word in row['tweet'].split()][:MAX_SEQUENCE_LENGTH]

    testprobs = model1.predict(xtest)

    testpreds = np.argmax(testprobs, axis=1)
    
    posnegresults = np.array(sentiments)[testpreds]
    pos = np.where(posnegresults == 4)[0]
    neg = np.where(posnegresults == 0)[0]
    percentagepos = len(pos)/(len(pos)+len(neg))
    percentageArray.append(percentagepos)
    print(percentagepos)
    #Timer to allow continuous pull from Twitter API
    time.sleep(60)

[datetime.datetime(2020, 6, 15, 0, 0), datetime.datetime(2020, 6, 14, 0, 0), datetime.datetime(2020, 6, 13, 0, 0), datetime.datetime(2020, 6, 12, 0, 0), datetime.datetime(2020, 6, 11, 0, 0), datetime.datetime(2020, 6, 10, 0, 0), datetime.datetime(2020, 6, 9, 0, 0), datetime.datetime(2020, 6, 8, 0, 0), datetime.datetime(2020, 6, 7, 0, 0), datetime.datetime(2020, 6, 6, 0, 0), datetime.datetime(2020, 6, 5, 0, 0), datetime.datetime(2020, 6, 4, 0, 0), datetime.datetime(2020, 6, 3, 0, 0), datetime.datetime(2020, 6, 2, 0, 0), datetime.datetime(2020, 6, 1, 0, 0), datetime.datetime(2020, 5, 31, 0, 0), datetime.datetime(2020, 5, 30, 0, 0), datetime.datetime(2020, 5, 29, 0, 0), datetime.datetime(2020, 5, 28, 0, 0), datetime.datetime(2020, 5, 27, 0, 0), datetime.datetime(2020, 5, 26, 0, 0), datetime.datetime(2020, 5, 25, 0, 0), datetime.datetime(2020, 5, 24, 0, 0), datetime.datetime(2020, 5, 23, 0, 0), datetime.datetime(2020, 5, 22, 0, 0), datetime.datetime(2020, 5, 21, 0, 0), datetime.datetime(20