In [1]:
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, Reshape
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [3]:
import nltk
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Load data from file

In [5]:
merged_data = pd.read_pickle('merged_data_pct_change.pkl')

In [6]:
filtered_merged_data = merged_data[merged_data['1day pct change'].abs() > 3.]

In [7]:
len(filtered_merged_data)

1093

# Create train data and test data

In [8]:
X = filtered_merged_data['rawText']
y = filtered_merged_data['1day pct change'] > 0
y = np_utils.to_categorical(y, 2)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Tf-Idf

In [10]:
# not used
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t.lower(), 'v') for t in word_tokenize(doc)]

In [11]:
tfidf = TfidfVectorizer(max_df=0.5, min_df=50, ngram_range=(1,2), stop_words='english')
svd = TruncatedSVD(2000)
#lsa = make_pipeline(tfidf, svd)
lsa = make_pipeline(tfidf)

# Run SVD on the training data, then project the training data.
train_vectors = lsa.fit_transform(X_train)
test_vectors = lsa.transform(X_test)

In [12]:
train_vectors.shape

(874, 6116)

In [13]:
train_vectors = train_vectors.toarray().reshape((train_vectors.shape[0], 1, train_vectors.shape[1], 1))
test_vectors = test_vectors.toarray().reshape((test_vectors.shape[0], 1, test_vectors.shape[1], 1))

In [14]:
train_vectors.shape

(874, 1, 6116, 1)

In [15]:
y_train.shape

(874, 2)

# Model

In [None]:
cnn = Sequential()
cnn.add(Conv2D(16, (4, 1), padding="same", activation="relu", input_shape=(1, train_vectors.shape[2], 1)))
#cnn.add(Conv2D(64, (10, 1), padding="same", activation="relu"))
cnn.add(MaxPooling2D(pool_size=(1,4)))

#cnn.add(Conv2D(128, (8, 1), padding="same", activation="relu"))
#cnn.add(Conv2D(128, (8, 1), padding="same", activation="relu"))
#cnn.add(Conv2D(128, (8, 1), padding="same", activation="relu"))
#cnn.add(MaxPooling2D(pool_size=(1,5)))
    
cnn.add(Conv2D(16, (4, 1), padding="same", activation="relu"))
cnn.add(Conv2D(32, (4, 1), padding="same", activation="relu"))
cnn.add(Conv2D(64, (4, 1), padding="same", activation="relu"))
cnn.add(MaxPooling2D(pool_size=(1,8)))
    
cnn.add(Flatten())
cnn.add(Dense(512, activation='relu'))
cnn.add(Dense(2, activation='softmax'))

cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn.fit(train_vectors, y_train, epochs=40, verbose=2)

Epoch 1/40
7s - loss: 0.6942 - acc: 0.4943
Epoch 2/40
4s - loss: 0.6933 - acc: 0.4977
Epoch 3/40
4s - loss: 0.6934 - acc: 0.5046
Epoch 4/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 5/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 6/40
4s - loss: 0.6934 - acc: 0.5046
Epoch 7/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 8/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 9/40
4s - loss: 0.6935 - acc: 0.4634
Epoch 10/40
4s - loss: 0.6932 - acc: 0.4748
Epoch 11/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 12/40
4s - loss: 0.6933 - acc: 0.5046
Epoch 13/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 14/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 15/40
4s - loss: 0.6931 - acc: 0.5046
Epoch 16/40
4s - loss: 0.6931 - acc: 0.5046
Epoch 17/40
4s - loss: 0.6932 - acc: 0.5046
Epoch 18/40
4s - loss: 0.6931 - acc: 0.5046
Epoch 19/40
4s - loss: 0.6930 - acc: 0.5046
Epoch 20/40
4s - loss: 0.6937 - acc: 0.5046
Epoch 21/40
4s - loss: 0.6929 - acc: 0.5069
Epoch 22/40
4s - loss: 0.6903 - acc: 0.5481
Epoch 23/40
4s - loss: 0.6614 - acc: 0.59

# Prediction

In [23]:
y_predicted = cnn.predict_classes(test_vectors)



In [36]:
y_test = y_test[:,1].astype(int)

In [37]:
print(classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          0       0.52      0.50      0.51       121
          1       0.41      0.42      0.41        98

avg / total       0.47      0.47      0.47       219

