In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [3]:
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Lambda
from keras.layers import Layer
from keras.layers import Softmax
from keras.layers.convolutional import Conv1D
from keras.layers import MaxPooling1D

import tensorflow as tf
import keras
import keras.backend as K

In [4]:
# Params
maxlen=20
embedding_dim = 100
text_vocabulary_size = 140000
n_epochs = 500

## Load data

In [43]:
w2v =Word2Vec.load('w2v.model')

### Training

In [549]:
train_df= pd.read_pickle('train_df.pkl')

In [550]:
test_df = pd.read_pickle('test_df.pkl')

## Associate word_indices with embeddings

### Create weight matrix

In [551]:
# Tokenize and fit on text
t = Tokenizer()
t.fit_on_texts(vocab)

In [552]:
# From https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/
embedding_matrix = np.zeros((text_vocabulary_size, 100))

In [553]:
#create a weight matrix for words in training docs
for word, i in t.word_index.items():
    try:
        embedding_vector = w2v.wv.get_vector(word)
        embedding_matrix[i] = embedding_vector
    except KeyError:
        continue

### Train

In [554]:
train_df.loc[:, ['news_title']] = train_df.loc[:, ['news_title']].fillna('')

In [555]:
# Preprocess
train_df['words'] = train_df.news_title.map(simple_preprocess)

In [556]:
# Text to sequence
train_df['embeddings'] = t.texts_to_sequences(train_df['words'])

In [557]:
# Pad sequences
train_embed = pad_sequences(train_df['embeddings'], maxlen=maxlen, padding='post')

In [558]:
train_df

Unnamed: 0,index,Date,datetime,news_title,source,stock,words,embeddings
0,0,2006-10-20 00:00:00-04:00,2006-10-20 16:16:16-04:00,"Inco's Net Soars on Higher Metal Prices, Break...",Bloomberg,1,"[inco, net, soars, on, higher, metal, prices, ...","[4185, 1198, 8081, 86, 373, 7597, 418, 20386, ..."
1,1,2006-10-20 00:00:00-04:00,2006-10-20 16:25:00-04:00,"Hey buddy, can you spare $600 for a Google sha...",Reuters,1,"[hey, buddy, can, you, spare, for, google, share]","[266, 267, 182, 268, 269, 80, 270, 271]"
2,2,2006-10-20 00:00:00-04:00,2006-10-20 18:15:00-04:00,Exxon Mobil offers plan to end Alaska dispute.,Reuters,1,"[exxon, mobil, offers, plan, to, end, alaska, ...","[31, 32, 33, 34, 35, 36, 37, 38]"
3,3,2006-10-20 00:00:00-04:00,2006-10-20 20:08:44-04:00,"Jim Cramer: Diageo, Anheuser-Busch, Monster Wo...",Bloomberg,1,"[jim, cramer, diageo, anheuser, busch, monster...","[2243, 12399, 31064, 4988, 4989, 7611, 1391, 270]"
4,4,2006-10-20 00:00:00-04:00,2006-10-21 14:21:00-04:00,AOL CEO says sales may shrink for two years -p...,Reuters,1,"[aol, ceo, says, sales, may, shrink, for, two,...","[535, 536, 537, 538, 361, 539, 80, 216, 134, 540]"
5,5,2006-10-20 00:00:00-04:00,2006-10-21 20:11:00-04:00,Pluspetrol says losing $2.4 mln/day in Peru pr...,Reuters,1,"[pluspetrol, says, losing, mln, day, in, peru,...","[885, 537, 175, 886, 887, 88, 888, 889]"
6,6,2006-10-20 00:00:00-04:00,2006-10-22 06:46:00-04:00,EU to urge China to open economy further.,Reuters,1,"[eu, to, urge, china, to, open, economy, further]","[943, 35, 944, 945, 35, 946, 693, 644]"
7,7,2006-10-20 00:00:00-04:00,2006-10-22 12:14:00-04:00,"Fed to keep hawkish tone, hold rates steady.",Reuters,1,"[fed, to, keep, hawkish, tone, hold, rates, st...","[683, 35, 684, 685, 686, 687, 688, 689]"
8,8,2006-10-20 00:00:00-04:00,2006-10-22 20:36:00-04:00,Weatherford profit jumps 78 percent.,Reuters,1,"[weatherford, profit, jumps, percent]","[1186, 460, 1187, 310]"
9,9,2006-10-20 00:00:00-04:00,2006-10-22 21:51:00-04:00,Saudi Arabia tells Japan to cut its Nov crude ...,Reuters,1,"[saudi, arabia, tells, japan, to, cut, its, no...","[1944, 1945, 2006, 2007, 35, 822, 167, 2008, 9..."


In [459]:
train_df['embeddings'] = [np.squeeze(x) for x in np.split(train_embed, train_embed.shape[0])]

In [460]:
train_df['embeddings'] = [embedding_matrix[x] for x in train_df['embeddings']]

### Test

In [59]:
test_df.loc[:, ['news_title']] = test_df.loc[:, ['news_title']].fillna('')

In [60]:
# Preprocess
test_df['words'] = test_df.news_title.map(simple_preprocess)

In [61]:
# Text to sequence
test_df['embeddings'] = t.texts_to_sequences(test_df['words'])

In [62]:
# Pad sequences
test_embed = pad_sequences(test_df['embeddings'], maxlen=maxlen, padding='post')

In [63]:
test_df['embeddings'] = [np.squeeze(x) for x in np.split(test_embed, test_embed.shape[0])]

In [64]:
test_df['embeddings'] = [embedding_matrix[x] for x in test_df['embeddings']]

### Aggregate events by day

In [65]:
train_df['stock'] = train_df['stock'].astype(int)
test_df['stock'] = test_df['stock'].astype(int)

In [66]:
train_df.head()

Unnamed: 0,index,Date,datetime,news_title,source,stock,words,embeddings
0,0,2006-10-20 00:00:00-04:00,2006-10-20 16:16:16-04:00,"Inco's Net Soars on Higher Metal Prices, Break...",Bloomberg,1,"[inco, net, soars, on, higher, metal, prices, ...","[[-0.3715226352214813, 0.684317409992218, -1.1..."
1,1,2006-10-20 00:00:00-04:00,2006-10-20 16:25:00-04:00,"Hey buddy, can you spare $600 for a Google sha...",Reuters,1,"[hey, buddy, can, you, spare, for, google, share]","[[0.32294243574142456, -0.03166818246245384, -..."
2,2,2006-10-20 00:00:00-04:00,2006-10-20 18:15:00-04:00,Exxon Mobil offers plan to end Alaska dispute.,Reuters,1,"[exxon, mobil, offers, plan, to, end, alaska, ...","[[0.09510381519794464, 0.34654828906059265, -0..."
3,3,2006-10-20 00:00:00-04:00,2006-10-20 20:08:44-04:00,"Jim Cramer: Diageo, Anheuser-Busch, Monster Wo...",Bloomberg,1,"[jim, cramer, diageo, anheuser, busch, monster...","[[0.812869668006897, 0.0374612882733345, 0.543..."
4,4,2006-10-20 00:00:00-04:00,2006-10-21 14:21:00-04:00,AOL CEO says sales may shrink for two years -p...,Reuters,1,"[aol, ceo, says, sales, may, shrink, for, two,...","[[0.05968537926673889, 0.16134525835514069, -0..."


In [67]:
g = train_df.groupby('Date')

agg_train_df = pd.concat([g.embeddings.apply(np.mean, axis=0),
                          g.stock.apply(np.mean, axis=0)
                         ],
                         axis=1)

agg_train_df.reset_index(inplace=True)

agg_train_df.to_pickle('agg_train_df.pkl')

In [68]:
g = test_df.groupby('Date')

agg_test_df = pd.concat([g.embeddings.apply(np.mean, axis=0),
                          g.stock.apply(np.mean, axis=0)
                         ],
                         axis=1)

agg_test_df.reset_index(inplace=True)

agg_test_df.to_pickle('agg_test_df.pkl')

## Load data

In [583]:
agg_train_df = pd.read_pickle('agg_train_df.pkl')

In [584]:
agg_test_df = pd.read_pickle('agg_train_df.pkl')

In [585]:
# Convert back from series of arrays to large array of (events, maxlen, embedding_dims)
x_train = np.array(agg_train_df.embeddings.to_list())
y_train = np.array(agg_train_df.stock)

In [600]:
# Convert back from series of arrays to large array of (events, maxlen, embedding_dims)
x_test = np.array(agg_test_df.embeddings.to_list())
y_test = np.array(agg_test_df.stock)

## Stock Prediction Model

In [586]:
seqlen = x_train.shape[0]

In [587]:
from keras.regularizers import l2

In [588]:
l2_penalty = 0.01

In [589]:
K.clear_session()

In [590]:
# Take real inputs
event_in_ = Input(shape=(seqlen, maxlen, embedding_dim), dtype='float32', name='x_train')
print(event_in_.shape)
# # Average embeddings for o1, p, o2
average = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=2))
event_ = average(event_in_) # Output dim (100)
print(event_.shape)

# Hidden layer
hidden1_ = keras.layers.Dense(units=100, activation='relu', use_bias=True, kernel_initializer='glorot_uniform', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(event_)

event_day_ = Conv1D(filters=embedding_dim, kernel_size=1, strides=1, padding='same', activation='relu',
                    use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                   )(hidden1_)

event_week_ = Conv1D(filters=embedding_dim, kernel_size=5, strides=1, padding='causal', activation='relu',
                     use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                    )(hidden1_)

event_month_ = Conv1D(filters=embedding_dim, kernel_size=20, strides=1, padding='causal', activation='relu',
                      use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                     )(hidden1_)


# # Max pooling of weekly and monthly events
max_pool_week_ = MaxPooling1D(pool_size=3, strides=1, padding='same', data_format='channels_last'
                              )(event_week_)
print(max_pool_week_.shape)
max_pool_month_ = MaxPooling1D(pool_size=3, strides=1, padding='same', data_format='channels_last'
                              )(event_month_)

# # Concatenate daily, weekly, monthly
concat_ = keras.layers.Concatenate(axis=2)([event_day_, max_pool_week_, max_pool_month_])

# Hidden layer
hidden2_ = keras.layers.Dense(units=50, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(concat_)

# Softmax layer
y_pred_ = keras.layers.Dense(units=1, activation='sigmoid', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(hidden2_)

model = Model(inputs=event_in_, outputs=y_pred_)


(?, 1534, 20, 100)
(?, 1534, 100)
(?, 1534, 100)


In [591]:
from keras import optimizers

In [592]:
sgd = optimizers.SGD(lr=0.001, clipvalue=0.5)

In [593]:
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

In [594]:
model.fit(x=np.expand_dims(x_train, 0),
          y=np.expand_dims(np.expand_dims(y_train, -1), 0), 
          batch_size=1, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29cb283b748>

In [604]:
test = model.predict(x=np.expand_dims(x_test, 0), batch_size=1)

In [608]:
np.mean(test[0])

0.50105786

In [546]:
test2

array([[False],
       [False],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [447]:
from keras.layers import Flatten

In [261]:
# Embed events
# Take training events and return embedded vectors
# Create copy of training events, randomly sample events from data, return embedded vectors
# Multiply each by tensor
# Calculate loss


# Take real inputs
event_in_ = Input(shape=(seqlen, maxlen, embedding_dim), dtype='float32', name='x_train')
print(event_in_.shape)
# # Average embeddings for o1, p, o2
average = keras.layers.Lambda(lambda x: keras.backend.mean(x, axis=2))
event_ = average(event_in_) # Output dim (100)
print(event_.shape)

# Hidden layer
event_day_ = Conv1D(filters=embedding_dim, kernel_size=1, strides=1, padding='same', activation='relu',
                    use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                   )(event_)

event_week_ = Conv1D(filters=embedding_dim, kernel_size=5, strides=1, padding='causal', activation='relu',
                     use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                    )(event_)

event_month_ = Conv1D(filters=embedding_dim, kernel_size=20, strides=1, padding='causal', activation='relu',
                      use_bias=True, kernel_regularizer=l2(l2_penalty), bias_regularizer=l2(l2_penalty)
                     )(event_)


# # Max pooling of weekly and monthly events
# # Narrow convolution of 3 neighbouring
# event_week_ = MaxPooling1D(pool_size=5, strides=None, padding='valid', data_format='channels_last'
#                           )(event_week_)

# max_pool_month_ = MaxPooling1D(pool_size=3, strides=None, padding='valid', data_format='channels_last'
#                               )(event_month_)

# # Concatenate daily, weekly, monthly
concat_ = keras.layers.Concatenate(axis=2)([event_day_, event_week_, event_month_])

# Hidden layer
hidden_ = keras.layers.Dense(units=50, activation='relu', use_bias=True, kernel_initializer='normal', 
                             bias_initializer='zeros', kernel_regularizer=l2(l2_penalty), 
                             bias_regularizer=l2(l2_penalty),
                            )(concat_)

y_pred_ = keras.layers.Flatten()(hidden_)
model = Model(inputs=event_in_, outputs=y_pred_)

(?, 1534, 20, 100)
(?, 1534, 100)


In [262]:
model.compile(loss='hinge',
              optimizer='adam',
              metrics=['accuracy'])

In [263]:
model.fit(x=np.expand_dims(x_train, 0),
          y=np.expand_dims(y_train, -1),
          batch_size=1, epochs=50)

ValueError: Error when checking target: expected flatten_4 to have shape (76700,) but got array with shape (1,)