# Imports

In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle
import pickle
import h5py
import json
import matplotlib.pyplot as plt
import re

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from tensorflow.keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json


# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook



# Loading Data

In [2]:
def ingest():
    data = pd.read_csv('data.csv', encoding='latin-1') # Enter your file location
    data.columns=["Date","SentimentText","Sentiment"]
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map( {'Positive':1, "Negative":0}) 
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape  )  
    return data

data = ingest()
data.head()

dataset loaded with shape (12524, 3)


Unnamed: 0,Date,SentimentText,Sentiment
0,2020-03-25 06:36:25+00:00,"$DG #Dollar General #Options #maxpain Chart, O...",1.0
1,2020-03-25 06:18:32+00:00,??Education is not only a ladder of opportuni...,
2,2020-03-25 06:05:46+00:00,$GCLT news coming soon #fintech #finance https...,
3,2020-03-25 06:35:29+00:00,That amazing feeling when you finally close a ...,1.0
4,2020-03-25 06:15:49+00:00,From OHM to SHIB: 5 Most Impressive Altcoins o...,1.0


# Processing Data

In [3]:
data.dropna(inplace = True)
data.head()

Unnamed: 0,Date,SentimentText,Sentiment
0,2020-03-25 06:36:25+00:00,"$DG #Dollar General #Options #maxpain Chart, O...",1.0
3,2020-03-25 06:35:29+00:00,That amazing feeling when you finally close a ...,1.0
4,2020-03-25 06:15:49+00:00,From OHM to SHIB: 5 Most Impressive Altcoins o...,1.0
5,2020-03-25 06:12:38+00:00,https://t.co/8gINJWxBxN #Finance #StockMarket ...,1.0
6,2020-03-25 06:32:38+00:00,TP HITS 1814???????ø\n\nTo get daily signals l...,1.0


In [4]:
def TextClean(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'@[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'#[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'&[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'[?!.+,;$%&"]+', '', tweet)
    tweet = re.sub(r'rt[\s]+', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'\$', '', tweet)
    tweet = re.sub(r'rt+', '', tweet)
    tweet = re.sub(r'https?:?\/\/\S+', '', tweet)
    
    return tweet

In [5]:
tokenizer = TweetTokenizer()
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'

In [6]:
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)
data.head()

progress-bar: 100%|██████████████████████████████████████████████████████████████| 8008/8008 [00:02<00:00, 2892.50it/s]


Unnamed: 0,Date,SentimentText,Sentiment,tokens
0,2020-03-25 06:36:25+00:00,"$DG #Dollar General #Options #maxpain Chart, O...",1.0,"[$, dg, general, chart, ,, open, interest, cha..."
1,2020-03-25 06:35:29+00:00,That amazing feeling when you finally close a ...,1.0,"[that, amazing, feeling, when, you, finally, c..."
2,2020-03-25 06:15:49+00:00,From OHM to SHIB: 5 Most Impressive Altcoins o...,1.0,"[from, ohm, to, shib, :, 5, most, impressive, ..."
3,2020-03-25 06:12:38+00:00,https://t.co/8gINJWxBxN #Finance #StockMarket ...,1.0,"[$, avct, $, bjdx, market, down, after, 1, day..."
4,2020-03-25 06:32:38+00:00,TP HITS 1814???????ø\n\nTo get daily signals l...,1.0,"[tp, hits, 1814, ?, ?, ?, ø, to, get, daily, s..."


# Labelise Data

In [7]:
LabeledSentence = gensim.models.doc2vec.TaggedDocument # we'll talk about this down below

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [8]:
#Splitting for training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(8000).tokens),
                                                    np.array(data.head(8000).Sentiment), test_size=0.2)


In [9]:
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST') 

6400it [00:00, 37802.70it/s]
1600it [00:00, ?it/s]


In [10]:
data_labellised= labelizeTweets(np.array(data.tokens), 'data')

8008it [00:00, 366609.03it/s]


# Builidng word2vec vocabulary and training

In [11]:
n=5000
n_dim = 200
tweet_w2v = Word2Vec(vector_size=200, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(data_labellised)])

100%|██████████████████████████████████████████████████████████████████████████████████████| 8008/8008 [00:00<?, ?it/s]


In [12]:
tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs) 

100%|██████████████████████████████████████████████████████████████████████████| 8008/8008 [00:00<00:00, 512519.82it/s]


(667601, 1106010)

In [13]:
#Save the w2v model
tweet_w2v.save('w2vmodel')
#Load the w2v model
new_w2vmodel = gensim.models.Word2Vec.load('w2vmodel')

In [14]:
#convert any word(present in vocabulary) to vector.
tweet_w2v.wv.key_to_index['happy']

415

In [15]:
#Find similar words
tweet_w2v.wv.most_similar('happy')

[('ship', 0.9468527436256409),
 ('wonderful', 0.9378995299339294),
 ('soon', 0.9376754760742188),
 ('pick', 0.9323378801345825),
 ('super', 0.9287994503974915),
 ('strategy', 0.9198437929153442),
 ('serve', 0.9156471490859985),
 ('god', 0.9155476689338684),
 ('guys', 0.9145596623420715),
 ('going', 0.9136531352996826)]

# TF-IDF matrix of data

In [16]:
print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in data_labellised])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 1761


In [17]:
#Save the tfidf 
# with open("tfidfdict.txt", "wb") as myFile:
#     pickle.dump(tfidf, myFile)
# with open("tfidfdict.txt", "rb") as myFile:
#     tfidf = pickle.load(myFile)


# Build tweet vector to give input to FeedForward Neural Network

In [18]:
def buildWordVector(tokens, vector_size):
    vec = np.zeros(vector_size).reshape((1, vector_size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v.wv[word].reshape((1, vector_size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not vec += model_w2v.wv[word].reshape((1, size))
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [19]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

6400it [00:03, 1985.03it/s]
1600it [00:00, 2034.29it/s]


# Training 3 layered FFNN

In [20]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=600, verbose=2)

Epoch 1/100
11/11 - 1s - loss: 0.5138 - accuracy: 0.8136 - 1s/epoch - 107ms/step
Epoch 2/100
11/11 - 0s - loss: 0.3785 - accuracy: 0.8647 - 62ms/epoch - 6ms/step
Epoch 3/100
11/11 - 0s - loss: 0.3531 - accuracy: 0.8661 - 38ms/epoch - 3ms/step
Epoch 4/100
11/11 - 0s - loss: 0.3445 - accuracy: 0.8666 - 31ms/epoch - 3ms/step
Epoch 5/100
11/11 - 0s - loss: 0.3389 - accuracy: 0.8666 - 38ms/epoch - 3ms/step
Epoch 6/100
11/11 - 0s - loss: 0.3354 - accuracy: 0.8664 - 31ms/epoch - 3ms/step
Epoch 7/100
11/11 - 0s - loss: 0.3308 - accuracy: 0.8669 - 53ms/epoch - 5ms/step
Epoch 8/100
11/11 - 0s - loss: 0.3270 - accuracy: 0.8673 - 47ms/epoch - 4ms/step
Epoch 9/100
11/11 - 0s - loss: 0.3242 - accuracy: 0.8666 - 53ms/epoch - 5ms/step
Epoch 10/100
11/11 - 0s - loss: 0.3216 - accuracy: 0.8669 - 47ms/epoch - 4ms/step
Epoch 11/100
11/11 - 0s - loss: 0.3193 - accuracy: 0.8675 - 47ms/epoch - 4ms/step
Epoch 12/100
11/11 - 0s - loss: 0.3173 - accuracy: 0.8700 - 38ms/epoch - 3ms/step
Epoch 13/100
11/11 - 0s -

<keras.callbacks.History at 0x19ffe7f6730>

In [21]:
# Evaluating accuracy score
score = model.evaluate(test_vecs_w2v, y_test, batch_size=800, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

2/2 - 0s - loss: 0.3113 - accuracy: 0.8737 - 232ms/epoch - 116ms/step
loss :  0.31129220128059387 
 accuracy :  0.8737499713897705
