Walkthrough: [Sentiment Analysis on Twitter using Word2Vec](http://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html)

In [1]:
!pip install gensim



In [2]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence # we'll talk about this down below

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

Using TensorFlow backend.


In [4]:
def ingest():
    data = pd.read_csv('./training.1600000.processed.noemoticon.csv', header=None, names=['Sentiment', 'ItemID', 'Date', 'SentimentSource', 'SentimentAuthor','SentimentText'])
    data.drop(['ItemID', 'SentimentSource'], axis=1, inplace=True)
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map(int)
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print 'dataset loaded with shape', data.shape    
    return data

data = ingest()
data.head(5)

dataset loaded with shape (1600000, 4)


Unnamed: 0,Sentiment,Date,SentimentAuthor,SentimentText
0,0,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
4,0,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
data.groupby(['Sentiment']).count()

Unnamed: 0_level_0,Date,SentimentAuthor,SentimentText
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,800000,800000,800000
4,800000,800000,800000


In [6]:
def tokenize(tweet):
    try:
        tweet = unicode(tweet.decode('utf-8').lower())
        tokens = tokenizer.tokenize(tweet)
        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        return tokens
    except:
        return 'NC'

In [7]:
def postprocess(data, n=1000000):
    data = data.head(n)
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)

progress-bar: 100%|██████████| 1000000/1000000 [01:20<00:00, 12371.17it/s]


## Word2Vec

In [8]:
n=1000000
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(n).tokens),
                                                    np.array(data.head(n).Sentiment), test_size=0.2)

In [9]:
data.head(3)
# print x_train.size
# print x_test.size
# data.groupby(['Sentiment']).count()

Unnamed: 0,Sentiment,Date,SentimentAuthor,SentimentText,tokens
0,0,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[-, awww, ,, that's, a, bummer, ., you, should..."
1,0,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,0,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, ., man..."


In [10]:
def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
x_train[0]

799677it [00:04, 198710.24it/s]
199920it [00:01, 155283.35it/s]


TaggedDocument(words=[u'-', u'i', u'have', u'no', u'idea', u'...', u'sorry', u'...'], tags=['TRAIN_0'])

In [11]:
n_dim=200
tweet_w2v = Word2Vec(size=n_dim, min_count=10)

sentences = [x.words for x in tqdm(x_train)]
print len(sentences)
tweet_w2v.build_vocab(sentences)
tweet_w2v.train(sentences, total_examples=len(sentences), epochs=2)

100%|██████████| 799677/799677 [00:00<00:00, 1395782.24it/s]


799677


17179662

In [12]:
tweet_w2v['good']

array([-0.39284831,  1.36364448,  0.46635178,  0.2713958 , -0.48652118,
        0.50193161,  0.84410262, -1.70936477, -1.53120172,  0.80197912,
        0.57396722,  0.68029863, -0.8885026 , -2.00265718,  0.5918566 ,
        0.67837614,  0.73199332,  1.94352651,  0.74260366,  0.33350798,
       -2.12764955,  0.14545818,  1.39353657,  1.01106155, -0.14764364,
        1.9596951 ,  1.22798669,  0.23021062, -2.53860641, -1.98425472,
       -2.14621234,  1.89197671, -0.16925699, -1.7334522 , -0.93809384,
        0.43863964, -1.62318647, -0.15077855, -0.07147733, -1.13040972,
        0.44699579,  1.0332346 , -0.09699228,  0.45051339,  0.83192277,
       -0.21107209,  1.07202363, -0.15585245,  0.01553344, -0.17187543,
        0.13921049, -0.24037331, -0.82979113,  0.38501233, -0.26025748,
        1.03123605, -0.57371223,  0.39221483, -0.6640591 , -1.73946011,
       -0.17789911, -1.19150722, -0.46982923, -1.10960889, -0.06245032,
        1.57024109,  2.03154755,  1.3740834 ,  0.61822087,  1.62

In [13]:
tweet_w2v.most_similar('good')

[(u'great', 0.7294263243675232),
 (u'cool', 0.6542844772338867),
 (u'nice', 0.6475257873535156),
 (u'bad', 0.6421359777450562),
 (u'goood', 0.640008807182312),
 (u'rough', 0.6335230469703674),
 (u'fantastic', 0.626408040523529),
 (u'tough', 0.62546306848526),
 (u'pleasant', 0.6167764663696289),
 (u'terrible', 0.6051438450813293)]

In [14]:
tweet_w2v.most_similar('bar')

[(u'table', 0.8246245384216309),
 (u'cafe', 0.814526379108429),
 (u'restaurant', 0.7878074049949646),
 (u'market', 0.7773846387863159),
 (u'shop', 0.7375199794769287),
 (u'pub', 0.735166609287262),
 (u'target', 0.728895902633667),
 (u'grill', 0.7050116062164307),
 (u'hotel', 0.7033606767654419),
 (u'ball', 0.6947258710861206)]

In [15]:
tweet_w2v.most_similar('facebook')

[(u'myspace', 0.904416561126709),
 (u'fb', 0.8932054042816162),
 (u'twitter', 0.8309180736541748),
 (u'flickr', 0.8028220534324646),
 (u'yahoo', 0.7822731137275696),
 (u'aim', 0.7739897966384888),
 (u'status', 0.7601168751716614),
 (u'youtube', 0.759611964225769),
 (u'msn', 0.756036639213562),
 (u'skype', 0.7522198557853699)]

In [16]:
tweet_w2v.most_similar('iphone')

[(u'upgrade', 0.7877534627914429),
 (u'app', 0.7873260974884033),
 (u'software', 0.7820095419883728),
 (u'mac', 0.7801790237426758),
 (u'itunes', 0.7801249027252197),
 (u'pc', 0.777249813079834),
 (u'3.0', 0.7729201912879944),
 (u'update', 0.7724477052688599),
 (u'tweetdeck', 0.7670104503631592),
 (u'blackberry', 0.7582669258117676)]

In [17]:
!pip install bokeh



In [18]:
# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 10000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 10000. each is of 200 dimensions
word_vectors = [tweet_w2v[w] for w in tweet_w2v.wv.vocab.keys()[:5000]]

# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = tweet_w2v.wv.vocab.keys()[:5000]

# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.118706
[t-SNE] KL divergence after 100 iterations with early exaggeration: 0.966649
[t-SNE] Error after 300 iterations: 0.966649


![alt text](https://github.com/mulloymorrow/notebooks/blob/master/bokeh_plot.png "Bokeh Plot")

In [19]:
print 'building tf-idf matrix ...'
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in x_train])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print 'vocab size :', len(tfidf)

building tf-idf matrix ...
vocab size : 23026


In [20]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [21]:
from sklearn.preprocessing import scale
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

100%|██████████| 799677/799677 [01:19<00:00, 10024.60it/s]
100%|██████████| 199920/199920 [00:17<00:00, 11378.88it/s]


In [22]:
from keras.models import Sequential
from keras.layers import Dense

model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

from keras.callbacks import TensorBoard
from keras_tqdm import TQDMNotebookCallback
tensorBoardCallback = TensorBoard(log_dir='./logs/twittersentiment', write_graph=True)
model.fit(train_vecs_w2v, y_train, nb_epoch=9, batch_size=64, verbose=0, callbacks=[TQDMNotebookCallback(), tensorBoardCallback],)





<keras.callbacks.History at 0x7f3bd2bfd350>

In [23]:
train_vecs_w2v

array([[ 0.82599945, -0.57772837,  1.05451329, ..., -0.1278278 ,
        -0.55098089, -0.46753695],
       [-0.72978697, -0.24671593,  1.28129483, ..., -0.35002373,
         0.45771551,  0.27133315],
       [-0.22051631, -0.42906915,  0.84291023, ...,  0.21921565,
         0.62473854, -0.16559235],
       ..., 
       [ 0.25265982,  0.54207512, -1.28215199, ...,  0.14834653,
         0.23589818, -0.97843311],
       [-0.29757413, -1.52787302, -0.27852199, ..., -0.81587341,
         0.95921117, -0.04558614],
       [-0.37591557, -0.53767461,  1.56600403, ..., -0.170078  ,
         0.47914968, -0.69260922]])

In [None]:
score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=0)
print score[1]

0.481627651072


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, Flatten, Dropout
from keras.layers.pooling import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard

input_dim=200
max_review_length=200
embedding_vecor_length = 300
top_words=10000
model = Sequential()
# model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))

# Convolutional model (3x conv, flatten, 2x dense)
model.add(Convolution1D(64, 3, border_mode='same'))
model.add(Convolution1D(32, 3, border_mode='same'))
model.add(Convolution1D(16, 3, border_mode='same'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(250,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# Log to tensorboard
from keras.callbacks import TensorBoard
from keras_tqdm import TQDMNotebookCallback

batch_size=128
tensorBoardCallback = TensorBoard(log_dir='./logs/twittersentiment', write_graph=True)
model.fit(train_vecs_w2v, y_train, nb_epoch=9, verbose=0, callbacks=[TQDMNotebookCallback(), tensorBoardCallback], batch_size=batch_size)

# model.fit(X_train, y_train, nb_epoch=3, verbose=0, callbacks=[TQDMNotebookCallback()], batch_size=64)

# Evaluation on the test set
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

## Other Twitter Data
Twitter Data from: ***SemEval-2016 Task 4: Sentiment Analysis on Twitter***