# Imports

In [1]:
import pandas as pd # provide sql-like data manipulation tools. very handy.
pd.options.mode.chained_assignment = None
import numpy as np # high dimensional vector computing library.
from copy import deepcopy
from string import punctuation
from random import shuffle
import pickle
import h5py
import json
import matplotlib.pyplot as plt
import re

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
from nltk import word_tokenize

from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from tensorflow.keras.optimizers import SGD
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.utils import np_utils
from keras.callbacks import Callback
from keras.models import model_from_json


# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook



# Loading Data

In [2]:
def ingest():
    data = pd.read_csv('training.csv', encoding='latin-1') # Enter your file location
    data.columns=["Date","SentimentText","Sentiment"]
    data = data[data.Sentiment.isnull() == False]
    data['Sentiment'] = data['Sentiment'].map( {4:1, 0:0}) #Converting 4 to 1
    data = data[data['SentimentText'].isnull() == False]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', data.shape  )  
    return data

data = ingest()
data.head()

dataset loaded with shape (8698, 3)


Unnamed: 0,Date,SentimentText,Sentiment
0,2020-03-25 06:18:32+00:00,??Education is not only a ladder of opportuni...,0
1,2020-03-25 06:05:46+00:00,$GCLT news coming soon #fintech #finance https...,0
2,2020-03-25 06:35:29+00:00,That amazing feeling when you finally close a ...,1
3,2020-03-25 06:15:49+00:00,From OHM to SHIB: 5 Most Impressive Altcoins o...,1
4,2020-03-25 06:12:38+00:00,https://t.co/8gINJWxBxN #Finance #StockMarket ...,1


# Processing Data

In [3]:
def TextClean(tweet):
    tweet = tweet.lower()
    tweet = re.sub(r'@[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'#[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'&[a-z0-9_]\S+', '', tweet)
    tweet = re.sub(r'[?!.+,;$%&"]+', '', tweet)
    tweet = re.sub(r'rt[\s]+', '', tweet)
    tweet = re.sub(r'\d+', '', tweet)
    tweet = re.sub(r'\$', '', tweet)
    tweet = re.sub(r'rt+', '', tweet)
    tweet = re.sub(r'https?:?\/\/\S+', '', tweet)
    
    return tweet

In [4]:
tokenizer = TweetTokenizer()
def tokenize(tweet):
    try:
        tweet = tweet.lower()
        tokens = tokenizer.tokenize(tweet)
        tokens = list(filter(lambda t: not t.startswith('@'), tokens))
        tokens = list(filter(lambda t: not t.startswith('#'), tokens))
        tokens = list(filter(lambda t: not t.startswith('http'), tokens))
        return tokens
    except:
        return 'NC'

In [5]:
def postprocess(data):
    data['tokens'] = data['SentimentText'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
    data = data[data.tokens != 'NC']
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)
data.head()

progress-bar: 100%|██████████████████████████████████████████████████████████████| 8698/8698 [00:02<00:00, 3250.48it/s]


Unnamed: 0,Date,SentimentText,Sentiment,tokens
0,2020-03-25 06:18:32+00:00,??Education is not only a ladder of opportuni...,0,"[, ?, ?, education, is, not, only, a, ladder,..."
1,2020-03-25 06:05:46+00:00,$GCLT news coming soon #fintech #finance https...,0,"[$, gclt, news, coming, soon]"
2,2020-03-25 06:35:29+00:00,That amazing feeling when you finally close a ...,1,"[that, amazing, feeling, when, you, finally, c..."
3,2020-03-25 06:15:49+00:00,From OHM to SHIB: 5 Most Impressive Altcoins o...,1,"[from, ohm, to, shib, :, 5, most, impressive, ..."
4,2020-03-25 06:12:38+00:00,https://t.co/8gINJWxBxN #Finance #StockMarket ...,1,"[$, avct, $, bjdx, market, down, after, 1, day..."


# Labelise Data

In [6]:
LabeledSentence = gensim.models.doc2vec.TaggedDocument # we'll talk about this down below

def labelizeTweets(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [7]:
#Splitting for training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(data.head(8000).tokens),
                                                    np.array(data.head(8000).Sentiment), test_size=0.2)


In [8]:
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST') 

6400it [00:00, 136540.97it/s]
1600it [00:00, 102382.82it/s]


In [9]:
data_labellised= labelizeTweets(np.array(data.tokens), 'data')

8698it [00:00, 202696.11it/s]


# Builidng word2vec vocabulary and training

In [10]:
n=8000
n_dim = 200
tweet_w2v = Word2Vec(vector_size=200, min_count=10)
tweet_w2v.build_vocab([x.words for x in tqdm(data_labellised)])

100%|██████████████████████████████████████████████████████████████████████████████████████| 8698/8698 [00:00<?, ?it/s]


In [11]:
tweet_w2v.train([x.words for x in tqdm(data_labellised)],total_examples=tweet_w2v.corpus_count, epochs=tweet_w2v.epochs) 

100%|██████████████████████████████████████████████████████████████████████████| 8698/8698 [00:00<00:00, 557463.08it/s]


(613160, 1068920)

In [12]:
#Save the w2v model
tweet_w2v.save('w2vmodel')
#Load the w2v model
new_w2vmodel = gensim.models.Word2Vec.load('w2vmodel')

In [13]:
#convert any word(present in vocabulary) to vector.
tweet_w2v.wv.key_to_index['up']

64

In [14]:
#Find similar words
tweet_w2v.wv.most_similar('down')

[('person', 0.9880647659301758),
 ('losing', 0.9856185913085938),
 ('size', 0.9850658178329468),
 ('were', 0.9850647449493408),
 ('companies', 0.9845852255821228),
 ('returns', 0.9843725562095642),
 ('increase', 0.9835025072097778),
 ('through', 0.9824102520942688),
 ('safe', 0.9821949005126953),
 ('buying', 0.9821088314056396)]

# TF-IDF matrix of data

In [15]:
print ('building tf-idf matrix ...')
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
matrix = vectorizer.fit_transform([x.words for x in data_labellised])
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
print ('vocab size :', len(tfidf))

building tf-idf matrix ...
vocab size : 1800


In [16]:
#Save the tfidf 
# with open("tfidfdict.txt", "wb") as myFile:
#     pickle.dump(tfidf, myFile)
# with open("tfidfdict.txt", "rb") as myFile:
#     tfidf = pickle.load(myFile)


# Build tweet vector to give input to FeedForward Neural Network

In [17]:
def buildWordVector(tokens, vector_size):
    vec = np.zeros(vector_size).reshape((1, vector_size))
    count = 0.
    for word in tokens:
        try:
            vec += tweet_w2v.wv[word].reshape((1, vector_size)) * tfidf[word] #combining w2v vectors with tfidf value of words in the tweet.
            count += 1.
        except KeyError: # handling the case where the token is not vec += model_w2v.wv[word].reshape((1, size))
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [18]:
train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_train))])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x: x.words, x_test))])
test_vecs_w2v = scale(test_vecs_w2v)

6400it [00:02, 2297.82it/s]
1600it [00:00, 2363.29it/s]


# Training 3 layered FFNN

In [19]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=200))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=100, batch_size=600, verbose=2)

Epoch 1/100
11/11 - 1s - loss: 0.6548 - accuracy: 0.6017 - 1s/epoch - 97ms/step
Epoch 2/100
11/11 - 0s - loss: 0.6176 - accuracy: 0.6511 - 47ms/epoch - 4ms/step
Epoch 3/100
11/11 - 0s - loss: 0.6050 - accuracy: 0.6598 - 53ms/epoch - 5ms/step
Epoch 4/100
11/11 - 0s - loss: 0.5965 - accuracy: 0.6583 - 47ms/epoch - 4ms/step
Epoch 5/100
11/11 - 0s - loss: 0.5891 - accuracy: 0.6644 - 47ms/epoch - 4ms/step
Epoch 6/100
11/11 - 0s - loss: 0.5857 - accuracy: 0.6714 - 31ms/epoch - 3ms/step
Epoch 7/100
11/11 - 0s - loss: 0.5788 - accuracy: 0.6700 - 47ms/epoch - 4ms/step
Epoch 8/100
11/11 - 0s - loss: 0.5760 - accuracy: 0.6734 - 38ms/epoch - 3ms/step
Epoch 9/100
11/11 - 0s - loss: 0.5717 - accuracy: 0.6803 - 31ms/epoch - 3ms/step
Epoch 10/100
11/11 - 0s - loss: 0.5684 - accuracy: 0.6825 - 38ms/epoch - 3ms/step
Epoch 11/100
11/11 - 0s - loss: 0.5644 - accuracy: 0.6869 - 31ms/epoch - 3ms/step
Epoch 12/100
11/11 - 0s - loss: 0.5612 - accuracy: 0.6909 - 38ms/epoch - 3ms/step
Epoch 13/100
11/11 - 0s - 

<keras.callbacks.History at 0x29f104cc8e0>

In [20]:
# Evaluating accuracy score
score = model.evaluate(test_vecs_w2v, y_test, batch_size=800, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

2/2 - 0s - loss: 0.5619 - accuracy: 0.7006 - 232ms/epoch - 116ms/step
loss :  0.5619224309921265 
 accuracy :  0.7006250023841858


# Saving model

In [21]:
#Saving the model
model_json = model.to_json() # serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model_json)

model.save_weights("smodel.h5") # serialize weights to HDF5
print("Saved model to disk")

#Loading the model
# newmodel = model_from_json(open('model.json').read())
# newmodel.load_weights('smodel.h5')

Saved model to disk


# Predicting for test file (Validation)

In [22]:
def ingesttest():
    testdata = pd.read_csv('testing.csv', encoding='latin-1')
    testdata.columns=["Date","SentimentText","Sentiment"] 
    testdata = testdata[testdata.Sentiment.isnull() == False]
    testdata['Sentiment'] = testdata['Sentiment'].map( {4:1, 0:0, 2:1})
    testdata = testdata[testdata['SentimentText'].isnull() == False]
    testdata.reset_index(inplace=True)
    testdata.drop('index', axis=1, inplace=True)
    print ('dataset loaded with shape', testdata.shape  )  
    return testdata

testdata = ingesttest()

dataset loaded with shape (3824, 3)


In [23]:
testdata = postprocess(testdata)
testdata.head(5)

progress-bar: 100%|██████████████████████████████████████████████████████████████| 3824/3824 [00:01<00:00, 3237.94it/s]


Unnamed: 0,Date,SentimentText,Sentiment,tokens
0,2020-03-31 04:55:00+00:00,$BLDE - Blade Air Mobility: Poised To Fly High...,1,"[$, blde, -, blade, air, mobility, :, poised, ..."
1,2020-03-31 04:44:56+00:00,https://t.co/WTo4dnVIQu | Subscribe Now ! | st...,0,"[|, subscribe, now, !, |, stockmarket, ?, ÿ]"
2,2020-03-31 04:54:42+00:00,Courtesy of Your Home Real Estate Pro\n\n#budg...,1,"[courtesy, of, your, home, real, estate, pro, ..."
3,2020-03-31 04:43:47+00:00,4 Simple Things You Can Do Today To Improve Yo...,1,"[4, simple, things, you, can, do, today, to, i..."
4,2020-03-31 04:54:29+00:00,cryptocurrency news.\n\nAdvice For Crypto Star...,0,"[cryptocurrency, news, ., advice, for, crypto,..."


In [24]:
test_X=np.array(testdata.tokens)
test_y=np.array(testdata.Sentiment)

In [25]:
test_w2v_vecs = np.concatenate([buildWordVector(z, n_dim) for z in tqdm(map(lambda x:x, test_X))])

3824it [00:01, 2306.48it/s]


In [26]:
test_w2v_vecs.shape

(3824, 200)

In [27]:
# model.predict_classes(test_w2v_vecs)
score = model.evaluate(test_w2v_vecs,test_y, batch_size=128, verbose=2)
print(model.metrics_names[0],": ",score[0],"\n",model.metrics_names[1],": ",score[1])

30/30 - 0s - loss: 0.7436 - accuracy: 0.5442 - 232ms/epoch - 8ms/step
loss :  0.7435910701751709 
 accuracy :  0.5441945791244507
