In [1]:
import numpy as np
import re
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
training = pd.read_csv('trainingandtestdata/training.1600000.processed.noemoticon.csv', encoding="latin1", header=None)

In [3]:
training.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
training = training.drop(columns = [1,2,3, 4])

In [5]:
training = training.rename(columns={0:'sentiment', 5:'Tweet'})

In [6]:
training.head()

Unnamed: 0,sentiment,Tweet
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
from tqdm import tqdm, tqdm_notebook
tqdm.pandas(desc = "progress-bar")

In [8]:
from nltk.tokenize import TweetTokenizer
tkznr = TweetTokenizer()

In [9]:
# let us define a function for tokenizing the tweets

In [10]:
def tokenize(tweet):
    try:
        tweet = str(tweet.lower())#make all letters lower case
        tokens = tkznr.tokenize(tweet)
        prefix = 'http'
        for word in tokens:
            if word.startswith(prefix):
                tokens.remove(word)
                tokens.remove('-')
            if word.startswith('#'):
                tokens.remove(words)
            if word.startswith('@'):
                tokens.remove(words)
        return tokens
    except:
        return 'NC'

In [11]:
#Define funtion to process all tweets and incorporate the tokenize function 
def process(data):
    #data = data.head(n=n)
    data['tokens'] = data['Tweet'].progress_map(tokenize)
    data = data[data['tokens']!='NC']
    data.reset_index(inplace = True)
    data.drop(columns = 'index', inplace= True)
    return data

In [12]:
training_tokenized = process(training)

progress-bar: 100%|███████████████████████████████████████████████████████| 1600000/1600000 [01:08<00:00, 23263.42it/s]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [13]:
training_tokenized.head()

Unnamed: 0,sentiment,Tweet,tokens
0,0,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can't, update, his, face..."
1,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
2,0,Need a hug,"[need, a, hug]"
3,0,spring break in plain city... it's snowing,"[spring, break, in, plain, city, ..., it's, sn..."
4,0,I just re-pierced my ears,"[i, just, re-pierced, my, ears]"


In [14]:
#Positive sentiment is labeled as zero let's convert it to 1
def convert2one(x):
    if x > 0 :
        return 1
    else:
        return x

In [15]:
training_tokenized['sentiment'] = training_tokenized['sentiment'].apply(convert2one)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Implementing word2vec using gensim

We will be using gensim to create a word2vec for the corpus that we have here. We will be using the gensim package to create a vector space of the words that exist in the corpus. Once we map all the words in the corpus to a vector space we will convert our tokenized tweets into vectors on the basis of the word2vec mapping.

In [16]:
#Splitting the data into training and test set

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(training_tokenized['tokens'], training_tokenized['sentiment'], test_size = 0.2, shuffle = True)

Before performing word2vec we need to convert the training set into a tagged document

In [19]:
from gensim.models.doc2vec import TaggedDocument



In [20]:
def tag_doc(data, label_type, explorer = 10):
    taggedtweets = []
    i = 0
    for tweet in tqdm_notebook(data):
        label = '{}_{}'.format(label_type, i)
        tag_the_tweet = TaggedDocument(tweet, label)
        taggedtweets.append(tag_the_tweet)
        if i == explorer:
            print(tweet)
            print(tag_the_tweet)
        i+=1
        
    return taggedtweets
    
        

In [21]:
train_taggedtweets = tag_doc(X_train, 'TRAIN', explorer=0)

HBox(children=(IntProgress(value=0, max=640624), HTML(value='')))

['is', 'really', 'bummed', 'that', 'it', 'crowd', ':', 'episode', '4', 'season', '2', 'does', 'not', 'work']
TaggedDocument(['is', 'really', 'bummed', 'that', 'it', 'crowd', ':', 'episode', '4', 'season', '2', 'does', 'not', 'work'], TRAIN_0)



In [22]:
train_taggedtweets[12]

TaggedDocument(words=['im', 'passing', 'over', 'the', 'hoover', 'dam', 'now', '.', 'almost', 'there'], tags='TRAIN_12')

Now let us train our a word2vec model and get a vector representation of most words in the training set. 

In [23]:
from gensim.models import Word2Vec

In [24]:
tweet_word2vec = Word2Vec(size = 200, min_count= 10)

In [25]:
tweet_word2vec.build_vocab([tweet.words for tweet in tqdm_notebook(train_taggedtweets)])


HBox(children=(IntProgress(value=0, max=640624), HTML(value='')))




In [26]:
tweet_word2vec.train([tweet.words for tweet in tqdm_notebook(train_taggedtweets)], total_examples=len(train_taggedtweets), epochs=10)

HBox(children=(IntProgress(value=0, max=640624), HTML(value='')))




(67813861, 94567140)

In [27]:
tweet_word2vec.wv.most_similar(positive = 'good')

  if np.issubdtype(vec.dtype, np.int):


[('goood', 0.7706956267356873),
 ('goooood', 0.7457799911499023),
 ('gooood', 0.7304670214653015),
 ('great', 0.7254845499992371),
 ('rough', 0.6570172905921936),
 ('gd', 0.6378801465034485),
 ('gud', 0.6343272924423218),
 ('nice', 0.6214916706085205),
 ('terrible', 0.6093655824661255),
 ('fantastic', 0.6032646298408508)]

In [28]:
tweet_word2vec.wv.most_similar(positive = 'boy')

  if np.issubdtype(vec.dtype, np.int):


[('girl', 0.6843696236610413),
 ('man', 0.5516564846038818),
 ('kid', 0.5508373975753784),
 ('puppy', 0.5402565002441406),
 ('guy', 0.5377308130264282),
 ('boyfriend', 0.5344223976135254),
 ('fella', 0.5124790668487549),
 ('girlfriend', 0.5088516473770142),
 ('nephew', 0.5062916874885559),
 ('kitten', 0.5003682374954224)]

In [29]:
tweet_word2vec.wv.most_similar('team')

  if np.issubdtype(vec.dtype, np.int):


[('league', 0.6075648069381714),
 ('championship', 0.5920431613922119),
 ('teams', 0.5652551651000977),
 ('tournament', 0.5589582920074463),
 ('crew', 0.5536748170852661),
 ('1-0', 0.547774612903595),
 ('leaders', 0.5429645776748657),
 ('yankees', 0.5413712859153748),
 ('match', 0.5280784368515015),
 ('players', 0.5245149731636047)]

We can see that the word2vec model does a good job of grouping similar words together. If we look only at the vector representation of one the words we will see that it is a vector of length 200 (since we trained it that way). Also we have to remember that words that are does not occur more than 10 times are rejected

In [30]:
tweet_word2vec.wv['good'];

In order to train the data we need to convert each of the tweet into vectors. In order to do this we will add all the vector equivalent of words in a given tweet mutiplied by the weight carried by the word, which we can relate to the frequency of occurrence of the words in the corpus. Once we have the vector equivalent of the tweets we can then start building the model for sentiment analysis.

In [31]:
#Getting the weights of each word
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
vectorizer = TfidfVectorizer(analyzer=lambda x:x, min_df=10)

In [33]:
vectorizer.fit_transform([x.words for x in train_taggedtweets])

<640624x20993 sparse matrix of type '<class 'numpy.float64'>'
	with 8306130 stored elements in Compressed Sparse Row format>

In [34]:
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

Now we are fully equipped to make a vector equivalent of all tweets

In [35]:
def build_vectors(tweet, tfidf):
    tweet_vector = np.zeros(200).reshape(1, 200)
    count = 0.0
    for word in tweet:
        try:
            tweet_vector+=tfidf[word]*tweet_word2vec.wv[word].reshape(1, 200)
            
        except:
            continue
        count+=1
    if count !=0:
        return tweet_vector/count
    else:
        return tweet_vector

In [36]:
vectorized_tweet = []
for tweet in tqdm_notebook(X_train):
    vectorized_tweet.append(build_vectors(tweet, tfidf))

        

HBox(children=(IntProgress(value=0, max=640624), HTML(value='')))




In [37]:
train_tweets = np.array(vectorized_tweet).reshape(-1, 200)

In [38]:
train_tweets.shape

(640624, 200)

In [39]:
test_tweets = []
for tweet in tqdm_notebook(X_test):
    test_tweets.append(build_vectors(tweet, tfidf))

HBox(children=(IntProgress(value=0, max=160156), HTML(value='')))




In [40]:
test_tweets = np.array(test_tweets).reshape(-1, 200)

In [41]:
test_tweets.shape

(160156, 200)

Let us use an XGBoost classifier to perform a classification. We do not require neural nets unless we are using special networks such as RNN 

In [42]:
#Let us scale the data in both training and test
from sklearn.preprocessing import StandardScaler, MinMaxScaler
sc = MinMaxScaler()
x_train = train_tweets
x_test = test_tweets
#We do not need to scale for xgboost
#reshaping the data for LSTMs
#x_train_re = x_train.reshape(-1, 200, 1)
#x_test_re = x_test.reshape(-1, 200, 1)

In [43]:
#print('Shape of x_train: ', x_train_re.shape)
#print('Shape of x_test: ', x_test_re.shape)


In [45]:
y_train = np.array(y_train).flatten()
y_test = np.array(y_test).flatten()

In [46]:
np.array(y_train).reshape(-1,1).shape

(640624, 1)

In [47]:
from xgboost import XGBClassifier

In [48]:
model = XGBClassifier(max_depth=5,
                      n_estimators=500, 
                      objective = 'gpu:binary:logistic',
                      n_jobs=-1, 
                      kvargs = {'tree_method':'gpu_exact'}, )

In [49]:
model.fit(x_train, y=y_train, eval_set=[(x_train, y_train), (x_test, y_test)], eval_metric= 'error')

[0]	validation_0-error:0.306682	validation_1-error:0.310291
[1]	validation_0-error:0.299545	validation_1-error:0.302948
[2]	validation_0-error:0.294567	validation_1-error:0.297485
[3]	validation_0-error:0.290576	validation_1-error:0.294076
[4]	validation_0-error:0.287833	validation_1-error:0.290923
[5]	validation_0-error:0.284921	validation_1-error:0.287595
[6]	validation_0-error:0.282417	validation_1-error:0.284754
[7]	validation_0-error:0.279805	validation_1-error:0.282606
[8]	validation_0-error:0.276613	validation_1-error:0.279758
[9]	validation_0-error:0.274877	validation_1-error:0.277511
[10]	validation_0-error:0.272946	validation_1-error:0.275968
[11]	validation_0-error:0.270775	validation_1-error:0.274582
[12]	validation_0-error:0.268932	validation_1-error:0.272116
[13]	validation_0-error:0.26677	validation_1-error:0.269924
[14]	validation_0-error:0.265376	validation_1-error:0.268763
[15]	validation_0-error:0.263626	validation_1-error:0.266946
[16]	validation_0-error:0.262263	va

[135]	validation_0-error:0.206809	validation_1-error:0.21544
[136]	validation_0-error:0.206661	validation_1-error:0.215328
[137]	validation_0-error:0.206522	validation_1-error:0.215334
[138]	validation_0-error:0.206357	validation_1-error:0.21504
[139]	validation_0-error:0.206254	validation_1-error:0.214947
[140]	validation_0-error:0.206099	validation_1-error:0.214859
[141]	validation_0-error:0.205846	validation_1-error:0.214672
[142]	validation_0-error:0.205639	validation_1-error:0.214535
[143]	validation_0-error:0.205529	validation_1-error:0.214397
[144]	validation_0-error:0.205336	validation_1-error:0.214316
[145]	validation_0-error:0.205169	validation_1-error:0.214054
[146]	validation_0-error:0.205027	validation_1-error:0.214116
[147]	validation_0-error:0.204981	validation_1-error:0.213885
[148]	validation_0-error:0.204846	validation_1-error:0.213841
[149]	validation_0-error:0.204616	validation_1-error:0.213748
[150]	validation_0-error:0.204457	validation_1-error:0.213735
[151]	vali

[268]	validation_0-error:0.192066	validation_1-error:0.205006
[269]	validation_0-error:0.19198	validation_1-error:0.204869
[270]	validation_0-error:0.191974	validation_1-error:0.204838
[271]	validation_0-error:0.191875	validation_1-error:0.2049
[272]	validation_0-error:0.191786	validation_1-error:0.204925
[273]	validation_0-error:0.191769	validation_1-error:0.204713
[274]	validation_0-error:0.191693	validation_1-error:0.204669
[275]	validation_0-error:0.19163	validation_1-error:0.204563
[276]	validation_0-error:0.191554	validation_1-error:0.204594
[277]	validation_0-error:0.191493	validation_1-error:0.204513
[278]	validation_0-error:0.191427	validation_1-error:0.204538
[279]	validation_0-error:0.191346	validation_1-error:0.204332
[280]	validation_0-error:0.191251	validation_1-error:0.204207
[281]	validation_0-error:0.191225	validation_1-error:0.204113
[282]	validation_0-error:0.191092	validation_1-error:0.204307
[283]	validation_0-error:0.190983	validation_1-error:0.204245
[284]	valida

[401]	validation_0-error:0.183591	validation_1-error:0.200242
[402]	validation_0-error:0.183557	validation_1-error:0.200292
[403]	validation_0-error:0.183507	validation_1-error:0.200249
[404]	validation_0-error:0.183396	validation_1-error:0.200261
[405]	validation_0-error:0.183406	validation_1-error:0.200167
[406]	validation_0-error:0.183432	validation_1-error:0.200192
[407]	validation_0-error:0.183352	validation_1-error:0.200242
[408]	validation_0-error:0.18331	validation_1-error:0.200111
[409]	validation_0-error:0.183237	validation_1-error:0.200067
[410]	validation_0-error:0.183204	validation_1-error:0.200036
[411]	validation_0-error:0.183189	validation_1-error:0.200117
[412]	validation_0-error:0.183121	validation_1-error:0.199943
[413]	validation_0-error:0.183022	validation_1-error:0.199943
[414]	validation_0-error:0.18292	validation_1-error:0.199924
[415]	validation_0-error:0.182842	validation_1-error:0.200074
[416]	validation_0-error:0.182764	validation_1-error:0.200055
[417]	vali

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, kvargs={'tree_method': 'gpu_exact'},
       learning_rate=0.1, max_delta_step=0, max_depth=5,
       min_child_weight=1, missing=None, n_estimators=500, n_jobs=-1,
       nthread=None, objective='gpu:binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [107]:
y_pred = model.predict(x_test)

  if diff:


In [108]:
y_true = y_test.flatten()

In [109]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [110]:
confusion_matrix(y_true, y_pred)

array([[80703, 13218],
       [18547, 47688]], dtype=int64)

In [111]:
accuracy_score(y_true, y_pred)

0.8016621294238118

We are able to get around 80% accuracy on the test set which is using XGboost (no parameter tuning done). It would also be interesting to see if we can do better with RNNs which are better with sequences.

In [61]:
model.save_model('xgb_twitter_sentiment_model.model')
model.save_model('xgb-twitter_sentiment.bin')

In [64]:
x_train.shape

(442834, 200)

In [118]:
sample = ["It's", "really", "bad", "weather", "out", "there"]
#ample = ['I', 'like', 'the', 'weather', 'today']

In [119]:
sample_vector =build_vectors(sample, tfidf)

In [120]:
sample_vector.shape

(1, 200)

In [121]:
model.predict(sample_vector)

  if diff:


array([0], dtype=int64)