# Word2Vec / Neural Networks

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import glob

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils.np_utils import to_categorical

import gensim

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

pd.set_option('max_colwidth', 1000)

Using TensorFlow backend.


In [2]:
training_data = pd.read_csv('data/SemEval Tweets/cleaned_training_data.csv', index_col=0, encoding='utf-8')

In [3]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61648 entries, 0 to 61647
Data columns (total 3 columns):
ID           61648 non-null int64
Tweet        61607 non-null object
Sentiment    61648 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


For some reason some tweet column entries have been lost from the export and import of the csv. Will recreate the training_data dataframe using the same method as in the Bag Of Words model notebook.

In [4]:
all_files = glob.glob("data/SemEval Tweets/*.txt")
column_names = ['ID', 'Sentiment', 'Tweet']
list = []

for filename in all_files:
    df = pd.read_csv(filename, sep='\t', names=column_names, index_col=False, encoding='utf-8')
    list.append(df)

training_data = pd.concat(list, axis=0, ignore_index=True)

In [5]:
training_data

Unnamed: 0,ID,Sentiment,Tweet
0,260097528899452929,neutral,Won the match #getin . Plus\u002c tomorrow is a very busy day\u002c with Awareness Day\u2019s and debates. Gulp. Debates...
1,263791921753882624,neutral,Some areas of New England could see the first flakes of the season Tuesday.
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony Romo. The man who likes to share the ball with everyone. Including the other team.
3,264041328420204544,neutral,#Thailand Washington - US President Barack Obama vowed Wednesday as he visited storm-ravaged New Jersey shore to... http://t.co/Xzl4LFhs
4,263816256640126976,neutral,Did y\u2019all hear what Tony Romo dressed up as for Halloween? A Giants quaterback! Cause that\u2019s all he could throw to sunday night.
...,...,...,...
50078,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLife in December is a task just hope hes up for it.
50079,639979760735662080,neutral,everyone who sat around me at metlife was so annoying but i didnt let it ruin such an amazing night
50080,640196838260363269,neutral,what giants or niners fans would wanna go to the sunday night game at Metlife?
50081,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs Peru at MetLife?


#### Tweet Cleanup

In [6]:
training_data = training_data.drop_duplicates()
training_data

Unnamed: 0,ID,Sentiment,Tweet
0,260097528899452929,neutral,Won the match #getin . Plus\u002c tomorrow is a very busy day\u002c with Awareness Day\u2019s and debates. Gulp. Debates...
1,263791921753882624,neutral,Some areas of New England could see the first flakes of the season Tuesday.
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony Romo. The man who likes to share the ball with everyone. Including the other team.
3,264041328420204544,neutral,#Thailand Washington - US President Barack Obama vowed Wednesday as he visited storm-ravaged New Jersey shore to... http://t.co/Xzl4LFhs
4,263816256640126976,neutral,Did y\u2019all hear what Tony Romo dressed up as for Halloween? A Giants quaterback! Cause that\u2019s all he could throw to sunday night.
...,...,...,...
50078,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLife in December is a task just hope hes up for it.
50079,639979760735662080,neutral,everyone who sat around me at metlife was so annoying but i didnt let it ruin such an amazing night
50080,640196838260363269,neutral,what giants or niners fans would wanna go to the sunday night game at Metlife?
50081,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs Peru at MetLife?


In [7]:
training_data['Tweet'] = training_data['Tweet'].str.replace('\\\\u002c', ',')
training_data['Tweet'] = training_data['Tweet'].str.replace('\\\\u2019', "'")
training_data['Tweet'] = training_data['Tweet'].str.replace('\\\\""', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
training_data

Unnamed: 0,ID,Sentiment,Tweet
0,260097528899452929,neutral,"Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates..."
1,263791921753882624,neutral,Some areas of New England could see the first flakes of the season Tuesday.
2,264194578381410304,negative,@francesco_con40 2nd worst QB. DEFINITELY Tony Romo. The man who likes to share the ball with everyone. Including the other team.
3,264041328420204544,neutral,#Thailand Washington - US President Barack Obama vowed Wednesday as he visited storm-ravaged New Jersey shore to... http://t.co/Xzl4LFhs
4,263816256640126976,neutral,Did y'all hear what Tony Romo dressed up as for Halloween? A Giants quaterback! Cause that's all he could throw to sunday night.
...,...,...,...
50078,639855845958885376,positive,@Racalto_SK ok good to know. Punting at MetLife in December is a task just hope hes up for it.
50079,639979760735662080,neutral,everyone who sat around me at metlife was so annoying but i didnt let it ruin such an amazing night
50080,640196838260363269,neutral,what giants or niners fans would wanna go to the sunday night game at Metlife?
50081,640975710354567168,positive,Anybody want a ticket for tomorrow Colombia vs Peru at MetLife?


In [9]:
training_data['Sentiment'] = training_data['Sentiment'].replace('negative', 0)
training_data['Sentiment'] = training_data['Sentiment'].replace('neutral', 1)
training_data['Sentiment'] = training_data['Sentiment'].replace('positive', 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
training_data['Sentiment'].value_counts()

1    22175
2    19552
0     7691
Name: Sentiment, dtype: int64

In [11]:
col_titles = ['ID', 'Tweet', 'Sentiment']
training_data = training_data.reindex(columns=col_titles)

In [12]:
training_data

Unnamed: 0,ID,Tweet,Sentiment
0,260097528899452929,"Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates...",1
1,263791921753882624,Some areas of New England could see the first flakes of the season Tuesday.,1
2,264194578381410304,@francesco_con40 2nd worst QB. DEFINITELY Tony Romo. The man who likes to share the ball with everyone. Including the other team.,0
3,264041328420204544,#Thailand Washington - US President Barack Obama vowed Wednesday as he visited storm-ravaged New Jersey shore to... http://t.co/Xzl4LFhs,1
4,263816256640126976,Did y'all hear what Tony Romo dressed up as for Halloween? A Giants quaterback! Cause that's all he could throw to sunday night.,1
...,...,...,...
50078,639855845958885376,@Racalto_SK ok good to know. Punting at MetLife in December is a task just hope hes up for it.,2
50079,639979760735662080,everyone who sat around me at metlife was so annoying but i didnt let it ruin such an amazing night,1
50080,640196838260363269,what giants or niners fans would wanna go to the sunday night game at Metlife?,1
50081,640975710354567168,Anybody want a ticket for tomorrow Colombia vs Peru at MetLife?,2


In [13]:
negative_sample = pd.read_csv('data/sentiment140/negative_sample_addin.csv')

In [14]:
negative_sample = negative_sample.drop('Unnamed: 0', axis=1)


In [15]:
negative_sample

Unnamed: 0,ID,Tweet,Sentiment
0,2246713398,"@TheRealScarab PA system bugs are a bummer, sorry.",0
1,2237331866,"oh daaamnnn! the firemen ball's on the 14th and i'll still be in Cannes So many handsome men united 2gether, and i miss it! *out tonite*",0
2,1468288564,"I don't want to be cold in April, but I am",0
3,2246385590,@JaredOngie haha its too cold down here bt other than that im quite fine.. jst extremely bored wht are your plans for the day?,0
4,2072038618,Upset I can't find my CHI!,0
...,...,...,...
12225,1977229871,Sounders are down 0-1 against the Columbus crew at the half.,0
12226,1990568686,Prayers for the family of the air France plane that is missing.,0
12227,1794321826,@pianoduet we miss you too,0
12228,2053449275,never felt so down in the dumps,0


In [16]:
training_data = training_data.append(negative_sample)
training_data = training_data.reset_index(drop=True)
training_data

Unnamed: 0,ID,Tweet,Sentiment
0,260097528899452929,"Won the match #getin . Plus, tomorrow is a very busy day, with Awareness Day's and debates. Gulp. Debates...",1
1,263791921753882624,Some areas of New England could see the first flakes of the season Tuesday.,1
2,264194578381410304,@francesco_con40 2nd worst QB. DEFINITELY Tony Romo. The man who likes to share the ball with everyone. Including the other team.,0
3,264041328420204544,#Thailand Washington - US President Barack Obama vowed Wednesday as he visited storm-ravaged New Jersey shore to... http://t.co/Xzl4LFhs,1
4,263816256640126976,Did y'all hear what Tony Romo dressed up as for Halloween? A Giants quaterback! Cause that's all he could throw to sunday night.,1
...,...,...,...
61643,1977229871,Sounders are down 0-1 against the Columbus crew at the half.,0
61644,1990568686,Prayers for the family of the air France plane that is missing.,0
61645,1794321826,@pianoduet we miss you too,0
61646,2053449275,never felt so down in the dumps,0


In [17]:
training_data['Sentiment'].value_counts()

1    22175
0    19921
2    19552
Name: Sentiment, dtype: int64

In [18]:
def remove_stopwords(input_text):
    stopwords_list = stopwords.words('english')
    # Some words which might indicate a certain sentiment are kept via a whitelist
    whitelist = ["not", "no"]
    words = input_text.split() 
    clean_words = [word for word in words if (word not in stopwords_list or word in whitelist) and len(word) > 1] 
    return " ".join(clean_words) 

In [19]:
def tweet_cleaner(tweet_column):
    #remove links
    tweet_column = tweet_column.str.replace('\w+:\/\/\S+', '', case=False)
    #remove @user
    tweet_column = tweet_column.str.replace('@[A-Za-z0-9_]+', '')
    #remove underscores
    tweet_column = tweet_column.str.replace('_', ' ')
    #remove apostrophes
    tweet_column = tweet_column.str.replace("'", '')
    #remove hashtags
    tweet_column = tweet_column.str.replace('#[A-Za-z0-9]+', '')
    #remove punctuation 
    tweet_column = tweet_column.str.replace("[^\w\d\s#’]", " ")
    #lower case
    tweet_column = tweet_column.str.lower()
    #remove whitespace
    tweet_column = tweet_column.str.replace(r'\s\s+', ' ')
    #remove beginning and end whitespace
    tweet_column = tweet_column.str.lstrip(' ')
    tweet_column = tweet_column.str.rstrip(' ')
    
    return tweet_column   

In [20]:
training_data['Tweet'] = training_data['Tweet'].apply(remove_stopwords)

In [21]:
training_data['Tweet'] = tweet_cleaner(training_data['Tweet'])

In [22]:
training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61648 entries, 0 to 61647
Data columns (total 3 columns):
ID           61648 non-null int64
Tweet        61648 non-null object
Sentiment    61648 non-null int64
dtypes: int64(2), object(1)
memory usage: 1.4+ MB


In [23]:
review_lines= []
lines = training_data['Tweet'].values.tolist()

In [24]:
for line in lines:
    tokens = word_tokenize(line)
    #lemmatize  
    lemmatizer = nltk.stem.WordNetLemmatizer()
    words = [lemmatizer.lemmatize(token) for token in tokens]
    review_lines.append(words)

In [25]:
review_lines[1]

['some',
 'area',
 'new',
 'england',
 'could',
 'see',
 'first',
 'flake',
 'season',
 'tuesday']

In [26]:
#train word2vec model
w2v_model = gensim.models.Word2Vec(sentences=review_lines, size=100, window=5, workers=4, min_count=5)

In [27]:
w2v_model.wv.most_similar('mom')

[('dad', 0.9590665698051453),
 ('yall', 0.9422159194946289),
 ('ima', 0.9404256939888),
 ('okay', 0.9401607513427734),
 ('hey', 0.935899019241333),
 ('bc', 0.9352407455444336),
 ('fucking', 0.9343057870864868),
 ('omg', 0.9313623905181885),
 ('cry', 0.931204080581665),
 ('as', 0.9265304803848267)]

Very strange word vectors, will use pre-trained GloVe Embeddings (100 Dimension Tweet Embeddings)

In [28]:
X_train, X_test, y_train, y_test = train_test_split(training_data['Tweet'], training_data['Sentiment'], test_size=0.2, random_state=1, stratify=training_data['Sentiment'])
print('# Train data samples:', X_train.shape[0])
print('# Test data samples:', X_test.shape[0])

# Train data samples: 49318
# Test data samples: 12330


In [29]:
NB_WORDS = 20000 #Max number of words
NB_START_EPOCHS = 10  # Number of epochs
MAX_LEN = 24  # Max number of words in a sequence
GLOVE_DIM = 100  # Number of dimensions of the GloVe word embeddings

In [30]:
#tokenize
tk = Tokenizer(num_words=NB_WORDS,
               filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
               lower=True,
               split=" ")
tk.fit_on_texts(X_train)

#convert to sequences
X_train_seq = tk.texts_to_sequences(X_train)
X_test_seq = tk.texts_to_sequences(X_test)

In [31]:
X_train[1]

'some areas new england could see first flakes season tuesday'

In [32]:
#split sequences
seq_lengths = X_train.apply(lambda x: len(x.split(' ')))

In [33]:
#padding
X_train_seq_trunc = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_seq_trunc = pad_sequences(X_test_seq, maxlen=MAX_LEN)

In [34]:
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [35]:
#split training data into train and validation sets
X_train_emb, X_valid_emb, y_train_emb, y_valid_emb = train_test_split(X_train_seq_trunc, y_train_oh, test_size=0.2, random_state=1)


print('Shape of validation set:',X_valid_emb.shape)

Shape of validation set: (9864, 24)


In [36]:
#create embedding dict from txt file
glove_file = 'glove.twitter.27B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
glove = open(f'data/glove.twitter.27B/glove.twitter.27B.{GLOVE_DIM}d.txt', encoding='utf-8')
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [37]:
#create embedding matrix
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # Limit word index tokens
    if i < NB_WORDS:
        vect = emb_dict.get(w)
        # Check if the word from the training data occurs in the GloVe word embeddings
        # Otherwise the vector is kept with only zeros
        if vect is not None:
            emb_matrix[i] = vect
    else:
        break

In [38]:
emb_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.50931001,  0.25145999,  0.13897   , ...,  0.78486001,
        -0.36993   , -0.41058999],
       [ 0.64710999,  0.13349   , -0.23288   , ..., -0.05992   ,
        -0.26833001, -0.70072001],
       ...,
       [ 0.29385   ,  0.048252  ,  0.35034999, ...,  0.46015   ,
         0.53100997, -0.27226001],
       [-0.13681   ,  0.68089998, -0.23052999, ...,  0.54215002,
        -0.14811   ,  0.66140997],
       [-0.83674002, -0.013398  ,  0.073317  , ..., -0.16075   ,
         0.43700999,  0.021054  ]])

# RNN

In [39]:
glove_model = Sequential()
glove_model.add(Embedding(NB_WORDS, GLOVE_DIM, input_length=MAX_LEN))
glove_model.add(LSTM(300, activation='relu'))
glove_model.add(Dense(3, activation='softmax'))
glove_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
glove_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 100)           2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 300)               481200    
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 903       
Total params: 2,482,103
Trainable params: 2,482,103
Non-trainable params: 0
_________________________________________________________________


In [40]:
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False

In [41]:
#THIS WILL TAKE A WHILE TO RUN
glove_model.fit(X_train_emb, y_train_emb, batch_size=128, epochs=10, validation_data=(X_valid_emb, y_valid_emb), verbose=1)

  'Discrepancy between trainable weights and collected trainable'


Train on 39454 samples, validate on 9864 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x20e825c9780>

Because the accuracy is marginally higher than the Bag of Words model, I will use the Bag of Words model for the sentiment analysis.