# Reddit stock market prediction

### Import libraries

In [72]:
# pull libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

### Preprocess data

In [73]:
# pull data
djia_data = pd.read_csv('./stocknews/Combined_News_DJIA.csv', parse_dates=True)

djia_data['combined'] = djia_data[djia_data.columns[2:]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)

def sanitise_row(row):
    return re.sub('[^A-Za-z ]+', '', row.replace("b\"", "").replace("b'", ""))

def remove_stop_words(sentence):
    sentence = sanitise_row(sentence)
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(sentence) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

djia_data['combined'] = djia_data['combined'].map(lambda x: remove_stop_words(x))

# print(type(djia_data['combined']))

In [74]:
# Ratio of label 1s and 0s
data_label = np.array(djia_data['Label'])

print(np.sum(data_label == 1) / data_label.shape[0])

0.5354449472096531


### Preprocess data

In [75]:
# prepare train and test data
all_data = djia_data['combined']
data_label = np.array(djia_data['Label'])

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(all_data)
# print(vectorizer.get_feature_names())
# print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
    all_data, data_label, test_size=0.80, random_state=23)
# we also need dictionary that maps word to number of occurences -> 
# to handle cases where word is not found in the training corpus

In [76]:
# We need to convert training data into a vector for LSTM
# how does the vector look like?
# need to build word_index first!


# generate word numbers
def gen_word_indexes(data): 
    word_index = {}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    index = 4
    # loop through everything in all_data
    for row in data:
        for word in row.split():
            word = word.lower()
            if word not in word_index:
                word_index[word] = index
                index += 1

    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return (word_index, reverse_word_index)


# text --> numbers corresponding to words
def vectorise_text_data(data, word_index):
    vectorised_data = []
    for row in data:
        current_row = []
        for word in row.split():
            word = word.lower()
            if word not in word_index:
                current_row.append(word_index["<UNUSED>"])
            else:
                current_row.append(word_index[word])
        vectorised_data.append(current_row)
    return vectorised_data

def aggregate_previous_days(num_days, data):
    new = data.copy()
    for i in range(num_days, len(data) - num_days):
        for d in range(1, num_days):
            new.at[i] = new[i] + " " + data[d+i-1]
    return new    

In [77]:
(word_index, reverse_word_index) = gen_word_indexes(all_data)
# save word_index somewhere
# import pickle
# pickle_out = open("word_index.pickle","wb")
# pickle.dump(word_index, pickle_out)
# pickle_out.close()

X_vectorised = vectorise_text_data(all_data, word_index)
# time_series_data = aggregate_previous_days(5, all_data)
# X_series_vectorised = vectorise_text_data(time_series_data, word_index)

In [9]:
# print(data_label)
# data_label_5day = data_label.copy()

# # remove the first 4 and last data label:
# data_label_5day = data_label_5day[5:]
# X_5day_vectorised = X_series_vectorised[4:-1]

# print("sizes" + str(len(X_5day_vectorised)) + " " +  str(len(data_label_5day)))

[0 1 0 ... 1 1 1]
sizes1984 1984


In [80]:
from keras.preprocessing import sequence

# number of most-frequent words to use
nb_words = 40000
# cut texts after this number of words
maxlen = 1000

# we can change the "X???_vectorised" based on the number of cumulative days:
X_v_train, X_v_test, y_v_train, y_v_test = train_test_split(
    X_vectorised, data_label, test_size=0.20, random_state=32)

X_v_pad_train = sequence.pad_sequences(X_v_train,
                                       value=word_index["<PAD>"],
                                       padding='post',
                                       maxlen=maxlen)
X_v_pad_test = sequence.pad_sequences(X_v_test, 
                                      value=word_index["<PAD>"],
                                      padding='post',
                                      maxlen=maxlen)

print(X_v_pad_train.shape, X_v_pad_test.shape)
print(X_v_pad_train[6])

(1591, 1000) (398, 1000)
[   7 1003 1004 3171 3172 3173 3174 1530   16 3175 1812   83  758 1318
 1252 3176 3177   89 3178 3179 3180   12  794 3181   81 3182 2841  180
  586 3183 2128   19   35   83 1339 1761 2388  100 3114 3184  499 1098
  942 3185 3186  527 3187 3188  599  847  128 1055 3189   42   83 2487
  879  836 3190 1923   83 2469 3191   16 3192 3193 2384 3194 1996   87
 3195 3196 1350 1129 1138 1364 1365 2540   87 3197 3198 3199   40 3131
 2953 3132 1021 1778  163 3200 1746  715   83  163  797 1673 3201 2854
 3202 3203 1768 3204 3205 3206  589 3207 3208 3209 3210 1749 1332  797
 1695 3211 1212 3212 3213 3214 2739  215 3215 2576   58 3216 1872 3217
 3218  156  973 1096 3219 3220 1388 2423 3221 1628  180 2518 2602 3222
 3223 3224 1794 2883 3225  289  752  272 3226  121 1283 3227 2789  318
 2536 1812 3228  197 3229  258 2485 3230 2067  280 3231 2519 1313    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0  

Now what? We train!

### Training using Naive Bayes

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# LogisticRegression(random_state=0, solver='lbfgs')

v = CountVectorizer()

text_clf = Pipeline([
    ('vect', v),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)  

0.535175879396985

### Training using LSTM

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

EMBEDDING_DIM = 128 # dimension for dense embeddings for each token
LSTM_DIM = 64 # total LSTM units

model = Sequential()
model.add(Embedding(input_dim=nb_words, output_dim=EMBEDDING_DIM, input_length=maxlen))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(LSTM_DIM, dropout=0.8, recurrent_dropout=0.8))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [None]:
batch_size = 128
model.fit(X_v_pad_train, y_v_train, epochs=7, batch_size=batch_size, 
          shuffle=True, validation_split=0.3, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 1110 samples, validate on 477 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
 256/1110 [=====>........................] - ETA: 9s - loss: 0.6874 - acc: 0.5547 

In [20]:
scores = model.evaluate(X_v_pad_test, y_v_test, verbose=2)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 53.90%


In [21]:
# Save the model
from keras.models import model_from_json

#serialize to JSON
model_json = model.to_json()
with open("model3.json", "w") as json_file:
    json_file.write(model_json)
#serialize weights to HDF5
model.save_weights("model3.h5")
print("Saved model to the disk")

Saved model to the disk


In [90]:
# load some stuff

json_file = open('model2.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(loaded_model_json)
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# get data from json
data_from_server = [
    "If You Cannot Trust Saudis With Bone Saw, Says US Lawmaker, 'You Should Not Trust Them With Nuclear Weapons': Trump administration's secret authorizations of nuclear technology sales to Saudi Arabia spark alarm in Congress",
    "French healthcare system 'should not fund homeopathy' - French medical and drug experts say homeopathic medicines should no longer be paid for by the country\u2019s health system because there is no evidence they work.",
    "Trump says Navy SEAL charged with war crimes, committing premeditated murder and shooting at unarmed Iraqi civilians, will be moved to 'less restrictive confinement'",
    "Years of Mark Zuckerberg's old Facebook posts have vanished. The company says it 'mistakenly deleted' them.",
    "France and Germany hold historic first joint parliamentary session, commit to joint defence and \"a common military culture\"",
    "Russia tells Trump its troops will stay in Venezuela for 'as long as needed' in a blunt rejection of his demand they leave immediately",
    "Trump cuts all direct assistance to Northern Triangle countries Honduras, El Salvador, Guatemala",
    "Churchill's policies caused millions of Indian famine deaths, study says",
    "Man jailed for harbouring Paris attackers",
    "Secret tape increases pressure on Trudeau",
    "Puerto Rico governor warns White House: 'If the bully gets close, I'll punch the bully in the mouth'",
    "Philippines beach resort slams 'freeloading' social media influencers",
    "Huawei says US has 'loser's attitude' because it can't compete",
    "A new study reveals the Amazon is losing surface water | A major new study of the Amazon has revealed an alarming trend, with the region losing as much as 350 km2 of surface freshwater every year on average. The loss is related to the construction of hydropower dams, deforestation and climate change",
    "The day North Korea talks collapsed, Trump passed Kim a note demanding he turn over his nukes",
    "Burundi bans BBC and suspends Voice of America, activists cry foul",
    "Schoolboy finds lost medieval gravestone",
    "Questions are mounting over special counsel Mueller\u2019s inquiry into whether Trump obstructed justice as lawmakers on Capitol Hill await the release of his report.",
    "China's Antarctic bases within Australia's claim are going unchecked",
    "Norway agreed on Thursday to hand back thousands of artefacts removed from Easter Island by the explorer Thor Heyerdahl during his trans-Pacific raft expeditions in the 1950s.",
    "Brunei defends tough new Islamic laws that would allow death by stoning for adultery and homosexuality against growing backlash",
    "Pope, Morocco's king, say Jerusalem must be open to all faiths",
    "Video of father and son illegally killing mama bear, shrieking cubs released in Alaska",
    "WTO rules against the US and Boeing in mammoth trade row with the EU",
    "French 'yellow vests' stage 20th day of protests"
]

data_from_server_vector = vectorise_text_data(data_from_server, word_index)

data_from_server_vector_pad = sequence.pad_sequences(data_from_server_vector, 
                                      value=word_index["<PAD>"],
                                      padding='post',
                                      maxlen=maxlen)

loaded_model.predict(data_from_server_vector_pad)

array([[0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209],
       [0.4938209]], dtype=float32)