In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

from __future__ import print_function

import os
import sys
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Bidirectional,LSTM, Embedding, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

In [2]:
def split_document_to_c(row, fName, chunkLength):
    chunkText = row[fName]
    if(chunkText!='[]'):
        label = row['label']
        lines = sent_tokenize(chunkText)
        results = []
        for i in range(0,len(lines), chunkLength):
            if(i+chunkLength < len(lines)):
                chunk = ''.join(lines[i:i+chunkLength])
            else:
                chunk = ''.join(lines[i:len(lines)])
            results.append((chunk,label))
        return results
    
    
def prepare_dataSet(dataSet,fName, chunk_size=3):
    chunkedTextLabels = dataSet.apply(split_document_to_c, args=(fName, chunk_size), axis=1)
    X=[]
    y=[]
    #print("Size before chunking: ", len(chunkedTextLabels))
    for c in chunkedTextLabels:
        if(c is not None):
            for chunk in c:
                X.append(chunk[0])
                y.append(chunk[1])
    return X,y

In [3]:
dataSets = pd.read_csv('../Data/final_dataset_joined_aapl240_onlyMentions.csv')
dataSets.loc[dataSets['label']==-1,'label'] = 0
trainDf = dataSets[dataSets.stock_time <= "2018-12-01 00:00:00"]
testDf = dataSets[dataSets.stock_time > "2018-12-01 00:00:00"]

FileNotFoundError: [Errno 2] No such file or directory: '../Data/final_dataset_joined_aapl240_onlyMentions.csv'

In [None]:
XTrainLabels,yTrainLabels = prepare_dataSet(trainDf,'filteredtext_aapl')
XTestLabels,yTestLabels = prepare_dataSet(testDf,'filteredtext_aapl')
print("Train Shape: ",len(XTrainLabels),len(yTrainLabels))
print("Test Shape: ",len(XTestLabels),len(yTestLabels))

Train Shape:  59278 59278
Test Shape:  13349 13349


In [None]:
MAX_SEQ_LENGTH = 100
MAX_WORDS = 30000
EMBD_DIM = 100

totX = XTrainLabels+ XTestLabels
totY = yTrainLabels + yTestLabels

print('Processing text dataset')
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(totX)
line_seq = tokenizer.texts_to_sequences(totX)

wordInd = tokenizer.word_index
print('Found %s unique tokens.' % len(wordInd))

data = pad_sequences(line_seq, maxlen=MAX_SEQ_LENGTH)

labels = to_categorical(np.asarray(totY))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Processing text dataset
Found 132808 unique tokens.
Shape of data tensor: (72627, 100)
Shape of label tensor: (72627, 2)


In [None]:
XTrain = data[:len(XTrainLabels)]
yTrain = labels[:len(yTrainLabels)]
XTest = data[len(XTrainLabels):]
yTest = labels[len(yTrainLabels):]
print('Shape of train data tensor:', XTrain.shape)
print('Shape of train label tensor:', yTrain.shape)
print('Shape of test data tensor:', XTest.shape)
print('Shape of test label tensor:', yTest.shape)

Shape of train data tensor: (59278, 100)
Shape of train label tensor: (59278, 2)
Shape of test data tensor: (13349, 100)
Shape of test label tensor: (13349, 2)


In [None]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')
print('Indexing word vectors.')
embdInd = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embdInd[word] = coefs

print('Found %s word vectors.' % len(embdInd))


print('Preparing embedding matrix.')
# prepare embedding matrix
numberOfWords = min(MAX_WORDS, len(wordInd) + 1)
embdMatrx = np.zeros((numberOfWords, EMBD_DIM))
for word, i in wordInd.items():
    if i >= MAX_WORDS:
        continue
    embdVector = embdInd.get(word)
    if embdVector is not None:
        # words not found in embedding index will be all-zeros.
        embdMatrx[i] = embdVector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed

Indexing word vectors.
Found 400000 word vectors.
Preparing embedding matrix.


In [None]:
model = Sequential()
model.add(Embedding(numberOfWords,
                            EMBD_DIM,
                            embeddings_initializer=Constant(embdMatrx),
                            input_length=MAX_SEQ_LENGTH,
                            trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# try using different optimizers and different optimizer configs
adam = Adam(0.01)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=adam)

print('Train...')
model.fit(XTrain, yTrain,
          batch_size=128,
          epochs=50,
          validation_data=[XTest, yTest])

Train...
Train on 59278 samples, validate on 13349 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50