#### Sentiment analysis on Twitter using word2vec and keras

In [1]:
import pandas as pd
from nltk.tokenize import TweetTokenizer

In [19]:
#!curl -L -o data/tweet.zip 'https://drive.google.com/uc?id=0B04GJPshIjmPRnZManQwWEdTZjg&export=download'
# descargar la data con headers https://github.com/adamwulf/movie-review-sentiment-data

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   383    0   383    0     0    165      0 --:--:--  0:00:02 --:--:--   165
100 77.5M    0 77.5M    0     0   556k      0 --:--:--  0:02:22 --:--:--  617k912k    0     0   317k      0 --:--:--  0:00:12 --:--:--  379k 524k      0 --:--:--  0:01:40 --:--:--  608k


In [2]:
data = pd.read_csv('./data/tweet/training.1600000.processed.noemoticon.csv', encoding='latin1')
data = data.head(100000)
print("shape %s"%(str(data.shape)))
data.head()

shape (100000, 6)


Unnamed: 0,Sentiment,ItemID,Date,Blank,SentimentSource,SentimentText
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
data.drop(['ItemID', 'Date', 'Blank', 'SentimentSource'],axis=1,inplace=True)
data = data[data.Sentiment.isnull()==False]
data = data[data.SentimentText.isnull()==False]
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [4]:
data['SentimentText'] = data['SentimentText'].str.replace('http\S+|www.\S+', '', case=False)
data.head()

Unnamed: 0,Sentiment,SentimentText
0,0,"@switchfoot - Awww, that's a bummer. You sho..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
tt = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
data['tokens'] = data['SentimentText'].apply(tt.tokenize)
data.head()

Unnamed: 0,Sentiment,SentimentText,tokens
0,0,"@switchfoot - Awww, that's a bummer. You sho...","[-, awww, ,, that's, a, bummer, ., you, should..."
1,0,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
2,0,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, ., man..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,"@nationwideclass no, it's not behaving at all....","[no, ,, it's, not, behaving, at, all, ., i'm, ..."


In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

In [19]:
x_train, x_test, y_train, y_test = train_test_split(data.tokens,
                                                    data.Sentiment, test_size=0.1)

In [22]:
print("train set ")
print("-"*10)
print(x_train.head())
      
print("\n\n test set")
print("-"*10)
print(y_train.head())

train set 
----------
79274    [missing, the, party, at, the, saloon, for, a,...
87839                          [is, sad, coz, you're, sad]
87404                                   [i, hope, so, ...]
88983    [todo, list, this, week, :, pack, ,, watch, st...
30756    [yup, ,, as, i, predicted, my, macbook, ran, o...
Name: tokens, dtype: object


 test set
----------
79274    0
87839    0
87404    0
88983    0
30756    0
Name: Sentiment, dtype: int64


In [26]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import gensim
from gensim.models.word2vec import Word2Vec

In [27]:
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

  import sys
90000it [00:00, 146840.87it/s]
10000it [00:00, 266272.89it/s]


In [29]:
x_train[0]

LabeledSentence(words=['missing', 'the', 'party', 'at', 'the', 'saloon', 'for', 'a', 'final', 'cut', 'project', '.'], tags=['TRAIN_0'])

In [31]:
tweet_w2v = Word2Vec(size=200, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(x_train)])
tweet_w2v.train([x.words for x in tqdm(x_train)])

100%|██████████| 90000/90000 [00:00<00:00, 1640643.07it/s]
100%|██████████| 90000/90000 [00:00<00:00, 2564034.12it/s]


ValueError: You must specify either total_examples or total_words, for proper job parameters updationand progress calculations. The usual value is total_examples=model.corpus_count.

### References 
- https://ahmedbesbes.com/sentiment-analysis-on-twitter-using-word2vec-and-keras.html