In [1]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding, Flatten
from keras.layers import Conv1D, GlobalMaxPool1D, MaxPooling1D
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline

In [3]:
import numpy as np
import pandas as pd

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [5]:
import spacy
nlp = spacy.load('en')

In [6]:
max_features = 5000
max_len = 1000
batch_size = 8
embedding_dims = 50
filters = 250
kernel_size = 4
hidden_dims = 250
epochs = 30

In [7]:
merged_data = pd.read_pickle('merged_data_pct_change.pkl')

In [8]:
merged_data = merged_data[merged_data['1day pct change'].abs() > 3.]

# X, y, split

In [9]:
X = merged_data['rawText']
y = merged_data['1day pct change'] > 0
#y = np_utils.to_categorical(y, 2)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [11]:
tfidf = TfidfVectorizer(max_df=0.5, min_df=10, ngram_range=(1,2), stop_words='english')
svd = TruncatedSVD(max_len)
#lsa = make_pipeline(tfidf, svd)
lsa = make_pipeline(tfidf)

# Run SVD on the training data, then project the training data.
X_train = lsa.fit_transform(X_train)
X_test = lsa.transform(X_test)

In [12]:
X_train = sequence.pad_sequences(X_train.toarray(), maxlen=max_len)
X_test = sequence.pad_sequences(X_test.toarray(), maxlen=max_len)

In [13]:
X_train.shape

(874, 1000)

In [14]:
X_test.shape

(219, 1000)

In [15]:
if X_train.shape[1] < max_len:
    max_len = X_train.shape[1]

In [16]:
model = Sequential()
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=max_len))
model.add(Conv1D(filters=32,
                 kernel_size=3,
                 padding='same',
                 activation='relu'))
model.add(Flatten())
model.add(Dense(200, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
history = model.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = epochs,
                    validation_data = (X_test, y_test), verbose=2)

Train on 874 samples, validate on 219 samples
Epoch 1/30
12s - loss: 0.6935 - acc: 0.5183 - val_loss: 0.6944 - val_acc: 0.4795
Epoch 2/30
10s - loss: 0.6926 - acc: 0.5229 - val_loss: 0.6957 - val_acc: 0.4795
Epoch 3/30
10s - loss: 0.6924 - acc: 0.5229 - val_loss: 0.6950 - val_acc: 0.4795
Epoch 4/30
10s - loss: 0.6926 - acc: 0.5229 - val_loss: 0.6946 - val_acc: 0.4795
Epoch 5/30
10s - loss: 0.6925 - acc: 0.5229 - val_loss: 0.6953 - val_acc: 0.4795
Epoch 6/30
10s - loss: 0.6925 - acc: 0.5229 - val_loss: 0.6958 - val_acc: 0.4795
Epoch 7/30
10s - loss: 0.6927 - acc: 0.5229 - val_loss: 0.6950 - val_acc: 0.4795
Epoch 8/30
10s - loss: 0.6924 - acc: 0.5229 - val_loss: 0.6956 - val_acc: 0.4795
Epoch 9/30
10s - loss: 0.6923 - acc: 0.5229 - val_loss: 0.6959 - val_acc: 0.4795
Epoch 10/30
10s - loss: 0.6926 - acc: 0.5229 - val_loss: 0.6952 - val_acc: 0.4795
Epoch 11/30
10s - loss: 0.6924 - acc: 0.5229 - val_loss: 0.6961 - val_acc: 0.4795
Epoch 12/30
10s - loss: 0.6923 - acc: 0.5229 - val_loss: 0.69

In [17]:
model = Sequential()
model.add(Embedding(max_features, 
                    embedding_dims,
                    input_length=max_len))
model.add(Dropout(0.2))

model.add(Conv1D(filters, 
                 kernel_size, 
                 padding='valid', 
                 activation='relu', 
                 strides=1))
model.add(GlobalMaxPool1D())

model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = epochs,
                    validation_data = (X_test, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1000, 50)          250000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000, 50)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 997, 250)          50250     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 250)               62750     
_________________________________________________________________
dropout_2 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation_1 (Activation)    (None, 250)               0         
__________

In [18]:
model = Sequential()
model.add(Embedding(max_features, 
                    embedding_dims, 
                    input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()

history = model.fit(X_train, y_train,
                    batch_size = batch_size,
                    epochs = epochs,
                    validation_data = (X_test, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 1000, 50)          250000    
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 1000, 32)          4832      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 250, 32)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               2000250   
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 251       
Total params: 2,255,333
Trainable params: 2,255,333
Non-trainable params: 0
_________________________________________________________________


In [19]:
model = Sequential()
model.add(Embedding(max_features, 
                    embedding_dims, 
                    input_length=max_len))
model.add(Conv1D(filters=8, kernel_size=2, padding='same', activation='relu')) 
# input_shape=(max_len, 1)
model.add(Conv1D(filters=8, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=8, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=6))
model.add(Conv1D(filters=16, kernel_size=2, padding='same', activation='relu'))
model.add(Conv1D(filters=16, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=16, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=8))
model.add(Conv1D(filters=64, kernel_size=2, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
model.add(Conv1D(filters=64, kernel_size=8, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=8))
model.add(Flatten())
model.add(Dense(1024, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

model.summary()
'''
history = model.fit(X_train.reshape(X_train.shape[0],X_train.shape[1],1), y_train,
                    batch_size = 16,
                    epochs = 20,
                    validation_data = (X_test.reshape(X_test.shape[0],X_test.shape[1],1), y_test), verbose=2)
'''
history = model.fit(X_train, y_train,
                    batch_size = 16,
                    epochs = 20,
                    validation_data = (X_test, y_test), verbose=2)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1000, 50)          250000    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 1000, 8)           808       
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 1000, 8)           264       
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 1000, 8)           520       
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 166, 8)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 166, 16)           272       
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 166, 16)           1040      
__________