In [27]:
import pandas as pd
import numpy as np

from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

from sklearn.model_selection import train_test_split

# fix random seed for reproducibility
numpy.random.seed(2017)

The bear-bull contest is essentially sentiment classification from the text content of the news headlines.
As such a good example to follow for LSTM is this:
http://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [6]:
# Load the tdm csv file
df = pd.read_csv('Basic_TDM.csv')

df.head

<bound method NDFrame.head of        00  000  000s  00am  011  02  03  04  05  07   ...    zombie  zone  \
0       0    0     0     0    0   0   0   0   0   0   ...         0     0   
1       0    0     0     0    0   0   0   0   0   0   ...         0     0   
2       0    0     0     0    0   0   0   0   0   0   ...         0     0   
3       0    0     0     0    0   0   0   0   0   0   ...         0     0   
4       0    0     0     0    0   0   0   0   0   0   ...         0     0   
5       0    0     0     0    0   0   0   0   0   0   ...         0     0   
6       0    0     0     0    0   0   0   0   0   0   ...         0     0   
7       0    0     0     0    0   0   0   0   0   0   ...         0     0   
8       0    0     0     0    0   0   0   0   0   0   ...         0     0   
9       0    0     0     0    0   0   0   0   0   0   ...         0     0   
10      0    0     0     0    0   0   0   0   0   0   ...         0     0   
11      0    0     0     0    0   0   0   0   

The input for the LSTM is a sequence of words, so the TDM will need to be converted. The easiest is to replace all words with their index in the list of columns. Then a doc will be represented by a list of integers

In [7]:
def create_input_sequence(sparse_vector):
    # Take a TDM vector as input, return indices of each non-zero term
    indices = [i for i, x in enumerate(sparse_vector) if x > 0]

    return indices

In [8]:
# Use the function to create a sequence dataset
seq_data = []
for i in range(df.shape[0]):
    seq_data.append(create_input_sequence(df.iloc[i]))

max([len(x) for x in seq_data])

33

In [9]:
seq_data[0]

[2249, 3545, 6145, 12148, 15170, 15192]

In [11]:
max_seq_length = 33
padded_seq = sequence.pad_sequences(seq_data, maxlen=max_seq_length)

Load in the labels from the original dataset

In [13]:
train = pd.read_csv("Datasets/Combined_News_DJIA_train.csv")
Input_y = train['Target']

In [22]:
# Reshape the sequence data
train = np.reshape(padded_seq, (1611,825))

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    train, Input_y, test_size=0.33, random_state=42, stratify=Input_y)

In [30]:
# Build and train model

embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(df.shape[1], embedding_vecor_length, input_length=max_seq_length*25))
model.add(Dropout(0.5, seed=2017))
model.add(LSTM(100))
model.add(Dropout(0.5, seed=2017))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=64)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
embedding_6 (Embedding)          (None, 825, 32)       500896      embedding_input_6[0][0]          
____________________________________________________________________________________________________
dropout_3 (Dropout)              (None, 825, 32)       0           embedding_6[0][0]                
____________________________________________________________________________________________________
lstm_5 (LSTM)                    (None, 100)           53200       dropout_3[0][0]                  
____________________________________________________________________________________________________
dropout_4 (Dropout)              (None, 100)           0           lstm_5[0][0]                     
___________________________________________________________________________________________

<keras.callbacks.History at 0x298b0074470>

In [31]:
# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 52.82%
