# Reddit stock market prediction

### Import libraries

In [2]:
# pull libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

### Preprocess data

In [3]:
# pull data
djia_data = pd.read_csv('./stocknews/Combined_News_DJIA.csv', parse_dates=True)

djia_data['combined'] = djia_data[djia_data.columns[2:]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)

def sanitise_row(row):
    return re.sub('[^A-Za-z ]+', '', row.replace("b\"", "").replace("b'", ""))

def remove_stop_words(sentence):
    sentence = sanitise_row(sentence)
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(sentence) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

djia_data['combined'] = djia_data['combined'].map(lambda x: remove_stop_words(x))

# print(type(djia_data['combined']))

In [4]:
# Ratio of label 1s and 0s
data_label = np.array(djia_data['Label'])

print(np.sum(data_label == 1) / data_label.shape[0])

0.5354449472096531


### Preprocess data

In [15]:
# prepare train and test data
all_data = djia_data['combined']
data_label = np.array(djia_data['Label'])

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(all_data)
# print(vectorizer.get_feature_names())
# print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
    all_data, data_label, test_size=0.50, random_state=23)
# we also need dictionary that maps word to number of occurences -> 
# to handle cases where word is not found in the training corpus

In [63]:
# We need to convert training data into a vector for LSTM
# how does the vector look like?
# need to build word_index first!

def gen_word_indexes(data): 
    word_index = {}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    index = 4
    # loop through everything in all_data
    for row in data:
        for word in row.split():
            word = word.lower()
            if word not in word_index:
                word_index[word] = index
                index += 1

    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return (word_index, reverse_word_index)


def vectorise_text_data(data, word_index):
    vectorised_data = []
    for row in data:
        current_row = []
        for word in row.split():
            word = word.lower()
            current_row.append(word_index[word])
        vectorised_data.append(current_row)
    return vectorised_data

In [68]:
(word_index, reverse_word_index) = gen_word_indexes(all_data)
X_vectorised = vectorise_text_data(all_data, word_index)

In [101]:
from keras.preprocessing import sequence

# number of most-frequent words to use
nb_words = 40000
# cut texts after this number of words
maxlen = 200

X_v_train, X_v_test, y_v_train, y_v_test = train_test_split(
    X_vectorised, data_label, test_size=0.20, random_state=23)

X_v_pad_train = sequence.pad_sequences(X_v_train,
                                       value=word_index["<PAD>"],
                                       padding='post',
                                       maxlen=maxlen)
X_v_pad_test = sequence.pad_sequences(X_v_test, 
                                      value=word_index["<PAD>"],
                                      padding='post',
                                      maxlen=maxlen)

print(X_v_pad_train.shape, X_v_pad_test.shape)
print(X_v_pad_train[2])

(1591, 200) (398, 200)
[20016  6591   586  1746 20017  1740  1331   679  6555 19006   534  4183
   356   215   356 11047  1818   453    83  6386  1887  9140  2584 20018
   712  6846 20019 20020   853   365  3480  1041   844  1088    66   634
  4161  2750  2842   831  4624  3316    42    40    13   176  2251    80
  8738   763   862   264   619  3931 14426  5305    97   318  4442    58
   175 20021   318  1716   799  3183  1092  2841  3907   950   892  1311
  3897 20022   118   574  1464  7092  2408 11785  2056  1638  1189  2754
  1635    12  3638  1669    58    12  6300  3897   799   841   901  4974
   123  4043  1189  2171  2351  2845  3372    83  3638  3203  1669  4700
 20023 20024   238  4651    29 20024  5880  6625  2519   769   841   301
    79  5359   933  7283  2426   929 12239 20025  4653  8497 20026  3239
  5203 20027    58  2821  1581   439  3648  4054 20028  3648  4054  2117
 10117    91  2171   405  3410  7587  6340 11804  1626   439  3648  4054
  2117 20029  1581  2822  28

Now what? We train!

### Training using Naive Bayes

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# LogisticRegression(random_state=0, solver='lbfgs')

v = CountVectorizer()

text_clf = Pipeline([
    ('vect', v),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)  

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


0.5386934673366834

### Training using LSTM

In [105]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

EMBEDDING_DIM = 50 # dimension for dense embeddings for each token
LSTM_DIM = 32 # total LSTM units

model = Sequential()
model.add(Embedding(input_dim=nb_words, output_dim=EMBEDDING_DIM, input_length=maxlen))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(LSTM_DIM, dropout=0.5, recurrent_dropout=0.2))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [106]:
batch_size = 128
model.fit(X_v_pad_train, y_v_train, epochs=5, batch_size=batch_size, 
          shuffle=True, validation_split=0.1, verbose=1)

Train on 1431 samples, validate on 160 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x150772ef0>

In [104]:
scores = model.evaluate( X_v_pad_test, y_v_test, verbose=2)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 56.53%
