In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [407]:
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Lambda
from keras.layers import Layer
from keras.layers import Softmax
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Dropout
from keras import optimizers
from keras.regularizers import l2

import tensorflow as tf
import keras
import keras.backend as K

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [408]:
# Params
maxlen=20
embedding_dim = 100
text_vocabulary_size = 140000
n_epochs = 500

### Training

In [49]:
train_df= pd.read_pickle('train_df.pkl')

In [50]:
test_df = pd.read_pickle('test_df.pkl')

## Associate word_indices with embeddings

### Train

In [17]:
train_df.loc[:, ['news_title']] = train_df.loc[:, ['news_title']].fillna('')

In [18]:
# Preprocess
train_df['words'] = train_df.news_title.map(simple_preprocess)

### Create weight matrix

In [10]:
embeddings_index = dict()
f = open("D:\\GitHub\\glove.6B\\glove.6B.100d.txt", encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [19]:
# Tokenize and fit on text
t = Tokenizer()
t.fit_on_texts(train_df['news_title'])

In [20]:
# From https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
embedding_matrix = np.zeros((text_vocabulary_size, 100))

In [21]:
#create a weight matrix for words in training docs
for word, i in t.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
# Text to sequence
train_df['embeddings'] = t.texts_to_sequences(train_df['words'])

In [24]:
# Pad sequences
train_embed = pad_sequences(train_df['embeddings'], maxlen=maxlen, padding='post')

In [25]:
train_df

Unnamed: 0,index,Date,datetime,news_title,source,stock,words,embeddings
0,0,2006-10-20 00:00:00-04:00,2006-10-20 16:16:16-04:00,"Inco's Net Soars on Higher Metal Prices, Break...",Bloomberg,1,"[inco, net, soars, on, higher, metal, prices, ...","[14411, 222, 1237, 3, 172, 1541, 53, 3018, 1404]"
1,1,2006-10-20 00:00:00-04:00,2006-10-20 16:25:00-04:00,"Hey buddy, can you spare $600 for a Google sha...",Reuters,1,"[hey, buddy, can, you, spare, for, google, share]","[12905, 17975, 440, 2332, 6801, 5, 318, 246]"
2,2,2006-10-20 00:00:00-04:00,2006-10-20 18:15:00-04:00,Exxon Mobil offers plan to end Alaska dispute.,Reuters,1,"[exxon, mobil, offers, plan, to, end, alaska, ...","[1044, 4648, 449, 56, 1, 184, 3089, 788]"
3,3,2006-10-20 00:00:00-04:00,2006-10-20 20:08:44-04:00,"Jim Cramer: Diageo, Anheuser-Busch, Monster Wo...",Bloomberg,1,"[jim, cramer, diageo, anheuser, busch, monster...","[4543, 24198, 5067, 3019, 3686, 3519, 3090, 318]"
4,4,2006-10-20 00:00:00-04:00,2006-10-21 14:21:00-04:00,AOL CEO says sales may shrink for two years -p...,Reuters,1,"[aol, ceo, says, sales, may, shrink, for, two,...","[2528, 49, 4, 10, 11, 2031, 5, 93, 191, 666]"
5,5,2006-10-20 00:00:00-04:00,2006-10-21 20:11:00-04:00,Pluspetrol says losing $2.4 mln/day in Peru pr...,Reuters,1,"[pluspetrol, says, losing, mln, day, in, peru,...","[24199, 4, 1091, 1165, 68, 2, 847, 1336]"
6,6,2006-10-20 00:00:00-04:00,2006-10-22 06:46:00-04:00,EU to urge China to open economy further.,Reuters,1,"[eu, to, urge, china, to, open, economy, further]","[80, 1, 1657, 21, 1, 257, 73, 873]"
7,7,2006-10-20 00:00:00-04:00,2006-10-22 12:14:00-04:00,"Fed to keep hawkish tone, hold rates steady.",Reuters,1,"[fed, to, keep, hawkish, tone, hold, rates, st...","[63, 1, 487, 15302, 5407, 492, 171, 851]"
8,8,2006-10-20 00:00:00-04:00,2006-10-22 20:36:00-04:00,Weatherford profit jumps 78 percent.,Reuters,1,"[weatherford, profit, jumps, percent]","[12258, 27, 310, 121]"
9,9,2006-10-20 00:00:00-04:00,2006-10-22 21:51:00-04:00,Saudi Arabia tells Japan to cut its Nov crude ...,Reuters,1,"[saudi, arabia, tells, japan, to, cut, its, no...","[398, 1571, 406, 100, 1, 59, 256, 2052, 207, 276]"


In [26]:
train_df['embeddings'] = [np.squeeze(x) for x in np.split(train_embed, train_embed.shape[0])]

In [27]:
train_df['embeddings'] = [embedding_matrix[x] for x in train_df['embeddings']]

### Test

In [28]:
test_df.loc[:, ['news_title']] = test_df.loc[:, ['news_title']].fillna('')

In [29]:
# Preprocess
test_df['words'] = test_df.news_title.map(simple_preprocess)

In [30]:
# Text to sequence
test_df['embeddings'] = t.texts_to_sequences(test_df['words'])

In [31]:
# Pad sequences
test_embed = pad_sequences(test_df['embeddings'], maxlen=maxlen, padding='post')

In [32]:
test_df['embeddings'] = [np.squeeze(x) for x in np.split(test_embed, test_embed.shape[0])]

In [33]:
test_df['embeddings'] = [embedding_matrix[x] for x in test_df['embeddings']]

### Aggregate events by day

In [34]:
train_df['stock'] = train_df['stock'].astype(int)
test_df['stock'] = test_df['stock'].astype(int)

In [35]:
train_df.head()

Unnamed: 0,index,Date,datetime,news_title,source,stock,words,embeddings
0,0,2006-10-20 00:00:00-04:00,2006-10-20 16:16:16-04:00,"Inco's Net Soars on Higher Metal Prices, Break...",Bloomberg,1,"[inco, net, soars, on, higher, metal, prices, ...","[[0.45153000950813293, 1.2962000370025635, -0...."
1,1,2006-10-20 00:00:00-04:00,2006-10-20 16:25:00-04:00,"Hey buddy, can you spare $600 for a Google sha...",Reuters,1,"[hey, buddy, can, you, spare, for, google, share]","[[0.37766000628471375, 0.42607998847961426, 1...."
2,2,2006-10-20 00:00:00-04:00,2006-10-20 18:15:00-04:00,Exxon Mobil offers plan to end Alaska dispute.,Reuters,1,"[exxon, mobil, offers, plan, to, end, alaska, ...","[[1.0338000059127808, 0.4104999899864197, 0.09..."
3,3,2006-10-20 00:00:00-04:00,2006-10-20 20:08:44-04:00,"Jim Cramer: Diageo, Anheuser-Busch, Monster Wo...",Bloomberg,1,"[jim, cramer, diageo, anheuser, busch, monster...","[[-0.5946000218391418, 0.24015000462532043, 0...."
4,4,2006-10-20 00:00:00-04:00,2006-10-21 14:21:00-04:00,AOL CEO says sales may shrink for two years -p...,Reuters,1,"[aol, ceo, says, sales, may, shrink, for, two,...","[[0.2813799977302551, -0.5907800197601318, 0.4..."


In [36]:
g = train_df.groupby('Date')

agg_train_df = pd.concat([g.embeddings.apply(np.mean, axis=0),
                          g.stock.apply(np.mean, axis=0)
                         ],
                         axis=1)

agg_train_df.reset_index(inplace=True)

agg_train_df.to_pickle('agg_train_df.pkl')

In [37]:
g = test_df.groupby('Date')

agg_test_df = pd.concat([g.embeddings.apply(np.mean, axis=0),
                          g.stock.apply(np.mean, axis=0)
                         ],
                         axis=1)

agg_test_df.reset_index(inplace=True)

agg_test_df.to_pickle('agg_test_df.pkl')

## Load data

In [5]:
agg_train_df = pd.read_pickle('agg_train_df.pkl')

In [6]:
agg_test_df = pd.read_pickle('agg_test_df.pkl')

In [7]:
agg_train_df.head()

Unnamed: 0,Date,embeddings,stock
0,2006-10-20 00:00:00-04:00,"[[0.15673499777913094, 0.1809510998427868, 0.4...",1.0
1,2006-10-23 00:00:00-04:00,"[[-0.0007124438293670353, 0.062406974550532665...",1.0
2,2006-10-24 00:00:00-04:00,"[[0.20981161274248733, 0.13421877161599696, 0....",1.0
3,2006-10-25 00:00:00-04:00,"[[0.21190344616479706, 0.13870281755225733, 0....",1.0
4,2006-10-26 00:00:00-04:00,"[[0.2520672485232353, 0.06778985364362597, 0.3...",0.0


In [8]:
# Convert back from series of arrays to large array of (events, maxlen, embedding_dims)
x_train = np.array(agg_train_df.embeddings.to_list())
y_train = np.array(agg_train_df.stock)

In [9]:
# Convert back from series of arrays to large array of (events, maxlen, embedding_dims)
x_test = np.array(agg_test_df.embeddings.to_list())
y_test = np.array(agg_test_df.stock)

## Stock Prediction Model

In [701]:
#l2_penalty = 0.0000001
l2_penalty = 0

In [702]:
del cnn_model

In [703]:
K.clear_session()
import gc
gc.collect()

0

In [704]:
# Take real inputs
event_in_ = Input(shape=(None, maxlen, embedding_dim), dtype='float32', name='x_train')

# # Average embeddings for o1, p, o2
average = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=2))
event_ = average(event_in_) # Output dim (100)


# Hidden layer
hidden1_ = keras.layers.Dense(units=100, activation='relu', use_bias=True, kernel_initializer='glorot_uniform', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(event_)

# event_day_ = Conv1D(filters=embedding_dim, kernel_size=1, strides=1, padding='same', activation='relu',
#                     use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
#                    )(hidden1_)

event_week_ = Conv1D(filters=embedding_dim, kernel_size=5, strides=1, padding='causal', activation='relu',
                     use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                    )(hidden1_)

event_month_ = Conv1D(filters=embedding_dim, kernel_size=20, strides=1, padding='causal', activation='relu',
                      use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                     )(hidden1_)


# # Max pooling of weekly and monthly events
max_pool_week_ = MaxPooling1D(pool_size=3, strides=1, padding='same', data_format='channels_last'
                              )(event_week_)

max_pool_month_ = MaxPooling1D(pool_size=3, strides=1, padding='same', data_format='channels_last'
                              )(event_month_)

# # Concatenate daily, weekly, monthly
concat_ = keras.layers.Concatenate(axis=2)([hidden1_, event_week_, event_month_])

# Hidden layer
hidden2_ = keras.layers.Dense(units=50, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(concat_)

# Softmax layer
y_pred_ = keras.layers.Dense(units=1, activation='sigmoid', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden2_)

cnn_model = Model(inputs=event_in_, outputs=y_pred_)


In [705]:
sgd = optimizers.SGD(lr=0.001, clipvalue=0.5)
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
cnn_model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [710]:
cnn_model.fit(x=np.expand_dims(x_train, 0),
          y=np.expand_dims(np.expand_dims(y_train, -1), 0), 
          batch_size=200, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e1877a3780>

In [711]:
y_pred = cnn_model.predict(x=np.expand_dims(x_test, 0), batch_size=1)

In [712]:
confusion_matrix(y_test, y_pred.squeeze() > .5, labels=[1, 0])

array([[148,   0],
       [104,   1]], dtype=int64)

In [713]:
round(accuracy_score(y_test, y_pred.squeeze() > .5), 2)

0.59

In [714]:
f1_score(y_test, y_pred.squeeze() > .5)

0.74

In [850]:
K.clear_session()
import gc
gc.collect()

38

In [851]:
# Simplified model

In [852]:
l2_penalty = 0.000001

In [853]:
# Take real inputs
event_in_ = Input(shape=(None, maxlen, embedding_dim), dtype='float32', name='x_train')
# Average embeddings for o1, p, o2
average = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=2))
event_ = average(event_in_) # Output dim (100)

# Hidden layer
hidden1_ = keras.layers.Dense(units=100, activation='relu', use_bias=True, kernel_initializer='glorot_uniform', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(event_)


# Hidden layer
hidden2_ = keras.layers.Dense(units=50, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden1_)

# Hidden layer
hidden3_ = keras.layers.Dense(units=25, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden2_)

# Softmax layer
y_pred_ = keras.layers.Dense(units=1, activation='sigmoid', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden3_)

model = Model(inputs=event_in_, outputs=y_pred_)

In [854]:
sgd = optimizers.SGD(lr=0.01, clipvalue=0.5)
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [855]:
model.fit(x=np.expand_dims(x_train, 0),
          y=np.expand_dims(np.expand_dims(y_train, -1), 0), 
          batch_size=100, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1e194c5e898>

In [856]:
y_pred = model.predict(x=np.expand_dims(x_test, 0), batch_size=100)
confusion_matrix(y_test, y_pred.squeeze() > .5, labels=[1, 0])

array([[148,   0],
       [105,   0]], dtype=int64)

In [849]:
round(accuracy_score(y_test, y_pred.squeeze() > .5), 2)

0.58

In [846]:
del lstm_model

NameError: name 'lstm_model' is not defined

In [792]:
K.clear_session()
import gc
gc.collect()

8

In [231]:
l2_penalty = 0

In [232]:
# Take real inputs
event_in_ = Input(shape=(None, maxlen, embedding_dim), dtype='float32', name='x_train')
# # Average embeddings for o1, p, o2
average = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=2))
event_ = average(event_in_) # Output dim (100)

# LSTM layer
lstm1_ = keras.layers.LSTM(units=100, return_sequences=True)(event_)
#lstm1_ = keras.layers.Dropout(rate = 0.9)(lstm1_)

# LSTM layer
lstm2_ = keras.layers.LSTM(units=50, return_sequences=True )(lstm1_)
#lstm2_ = keras.layers.Dropout(rate = 0.9)(lstm2_)

# LSTM layer
lstm3_ = keras.layers.LSTM(units=50, return_sequences=True )(lstm1_)
#lstm3_ = keras.layers.Dropout(rate = 0.9)(lstm3_)

# Dense layer
hidden_ = keras.layers.Dense(units=50, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(lstm3_)

# Softmax layer
y_pred_ = keras.layers.Dense(units=1, activation='sigmoid', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden_)

lstm_model = Model(inputs=event_in_, outputs=y_pred_)


In [233]:
sgd = optimizers.SGD(lr=0.001, clipvalue=0.5)
adam = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
lstm_model.compile(loss='binary_crossentropy', optimizer=sgd, metrics=['accuracy'])

In [234]:
lstm_model.fit(x=np.expand_dims(x_train, 0),
          y=np.expand_dims(np.expand_dims(y_train, -1), 0), 
          batch_size=100, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1e161ef2080>

In [235]:
y_pred = lstm_model.predict(x=np.expand_dims(x_test, 0), batch_size=100).squeeze()
confusion_matrix(y_test, y_pred.squeeze() > .5, labels=[1, 0])

array([[148,   0],
       [105,   0]], dtype=int64)

In [236]:
round(accuracy_score(y_pred=y_pred>.5, y_true=y_test), 2)

0.58