# Reddit stock market prediction

### Import libraries

In [72]:
# pull libraries
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

### Preprocess data

In [73]:
# pull data
djia_data = pd.read_csv('./stocknews/Combined_News_DJIA.csv', parse_dates=True)

djia_data['combined'] = djia_data[djia_data.columns[2:]].apply(
    lambda x: ' '.join(x.astype(str)),
    axis=1
)

def sanitise_row(row):
    return re.sub('[^A-Za-z ]+', '', row.replace("b\"", "").replace("b'", ""))

def remove_stop_words(sentence):
    sentence = sanitise_row(sentence)
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(sentence) 
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_sentence)

djia_data['combined'] = djia_data['combined'].map(lambda x: remove_stop_words(x))

# print(type(djia_data['combined']))

In [74]:
# Ratio of label 1s and 0s
data_label = np.array(djia_data['Label'])

print(np.sum(data_label == 1) / data_label.shape[0])

0.5354449472096531


### Preprocess data

In [75]:
# prepare train and test data
all_data = djia_data['combined']
data_label = np.array(djia_data['Label'])

# vectorizer = CountVectorizer()
# X = vectorizer.fit_transform(all_data)
# print(vectorizer.get_feature_names())
# print(X.shape)

X_train, X_test, y_train, y_test = train_test_split(
    all_data, data_label, test_size=0.80, random_state=23)
# we also need dictionary that maps word to number of occurences -> 
# to handle cases where word is not found in the training corpus

In [76]:
# We need to convert training data into a vector for LSTM
# how does the vector look like?
# need to build word_index first!


# generate word numbers
def gen_word_indexes(data): 
    word_index = {}
    word_index["<PAD>"] = 0
    word_index["<START>"] = 1
    word_index["<UNK>"] = 2  # unknown
    word_index["<UNUSED>"] = 3

    index = 4
    # loop through everything in all_data
    for row in data:
        for word in row.split():
            word = word.lower()
            if word not in word_index:
                word_index[word] = index
                index += 1

    reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])
    return (word_index, reverse_word_index)


# text --> numbers corresponding to words
def vectorise_text_data(data, word_index):
    vectorised_data = []
    for row in data:
        current_row = []
        for word in row.split():
            word = word.lower()
            if word not in word_index:
                current_row.append(word_index["<UNUSED>"])
            else:
                current_row.append(word_index[word])
        vectorised_data.append(current_row)
    return vectorised_data

def aggregate_previous_days(num_days, data):
    new = data.copy()
    for i in range(num_days, len(data) - num_days):
        for d in range(1, num_days):
            new.at[i] = new[i] + " " + data[d+i-1]
    return new    

In [77]:
(word_index, reverse_word_index) = gen_word_indexes(all_data)
# # X_vectorised = vectorise_text_data(all_data, word_index)

time_series_data = aggregate_previous_days(1, all_data)
# print(time_series_data[6])
# print(all_data[6])
print("all data:")
print(all_data[5])

In [7]:
print(time_series_data[5])




In [8]:
X_series_vectorised = vectorise_text_data(time_series_data, word_index)
print(X_series_vectorised)
print(len(X_series_vectorised))

[[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 7, 26, 27, 28, 29, 21, 22, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 38, 47, 7, 26, 48, 21, 22, 49, 4, 50, 6, 7, 51, 13, 4, 52, 21, 22, 16, 53, 54, 55, 56, 57, 58, 59, 60, 61, 47, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 33, 19, 72, 73, 74, 29, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 4, 12, 16, 85, 86, 87, 88, 89, 90, 91, 42, 83, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 4, 12, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 9, 122, 82, 123, 12, 124, 125, 17, 4, 52, 21, 22, 16, 126, 127, 128, 129, 4, 130, 131, 132, 12, 133, 134, 135, 136, 137, 85, 58, 83, 54, 138, 139, 93, 140, 91, 93, 141, 142, 143, 144, 89, 145, 146, 147, 148, 94, 149, 150, 151, 152, 58, 153, 154, 155, 156, 157, 91, 158, 159, 160, 4, 161, 162, 163, 164, 165, 7, 166, 167, 168, 169, 21, 22, 170, 171, 172, 173, 91, 174, 175, 83, 176, 177, 123, 178, 179, 180, 4, 

In [9]:
# print(data_label)
# data_label_5day = data_label.copy()

# remove the first 4 and last data label:
data_label_5day = data_label_5day[1:]
X_5day_vectorised = X_series_vectorised[0:-1]

# print("sizes" + str(len(X_5day_vectorised)) + " " +  str(len(data_label_5day)))

[0 1 0 ... 1 1 1]
sizes1988 1988


In [80]:
from keras.preprocessing import sequence

# number of most-frequent words to use
nb_words = 40000
# cut texts after this number of words
maxlen = 500

# we can change the "X???_vectorised" based on the number of cumulative days:
X_v_train, X_v_test, y_v_train, y_v_test = train_test_split(
    X_vectorised, data_label, test_size=0.20, random_state=32)

X_v_pad_train = sequence.pad_sequences(X_v_train,
                                       value=word_index["<PAD>"],
                                       padding='post',
                                       maxlen=maxlen)
X_v_pad_test = sequence.pad_sequences(X_v_test, 
                                      value=word_index["<PAD>"],
                                      padding='post',
                                      maxlen=maxlen)

print(X_v_pad_train.shape, X_v_pad_test.shape)
print(X_v_pad_train[6])

(1590, 500) (398, 500)
[ 2413  3553  1009   238 23290  4355  4761 19422  1605   156  9135  2841
  3823  3501  1088  1029   206  5129  1576    35 30441  1548  2296  7177
  7693    94  5129    35   158  4953  1491  1492 10807  1356  6704 30797
  9096   353 11195  6909  6368  1491  1492  7827    93  2998  6573 11290
 17898 11748    89   768    58  2310  3444  4561  9812 22268  5807  9693
  4061  6057  3780   879  8469 20630  4172  3886   176   462 30798 30799
   214  1088  2296   238  4306  1669  1670  3034  3884  3592 17113    68
  7656   392   101 30800   101 19200   101 30801  8717  2310  2842   238
  1483  3831  5083  2430 15804  3584  6901 13763  1127   238    16   100
  7751 30802  5989   256   704  2302  4070   213  4071  3848  2077  2630
   231  8372  1095 16801   529   453  1357   827  8997   453    58   681
  5320   827    36  3967    37  8572  4368  1037    21    29   947  3232
  1514  7046   213  6309  6450  2371  4551  1903  3592  3578   150   448
   346   347  4249  8083  10

Now what? We train!

### Training using Naive Bayes

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# LogisticRegression(random_state=0, solver='lbfgs')

v = CountVectorizer()

text_clf = Pipeline([
    ('vect', v),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)  

0.535175879396985

### Training using LSTM

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, SpatialDropout1D
from keras.layers import LSTM

EMBEDDING_DIM = 128 # dimension for dense embeddings for each token
LSTM_DIM = 64 # total LSTM units

model = Sequential()
model.add(Embedding(input_dim=nb_words, output_dim=EMBEDDING_DIM, input_length=maxlen))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(LSTM_DIM, dropout=0.8, recurrent_dropout=0.8))
model.add(Dense(1, activation="sigmoid"))

model.compile(loss="binary_crossentropy", optimizer="adam",
              metrics=["accuracy"])

In [13]:
batch_size = 128
model.fit(X_v_pad_train, y_v_train, epochs=7, batch_size=batch_size, 
          shuffle=True, validation_split=0.3, verbose=1)

Instructions for updating:
Use tf.cast instead.
Train on 1113 samples, validate on 477 samples
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f95be05f518>

In [14]:
scores = model.evaluate(X_v_pad_test, y_v_test, verbose=2)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

acc: 57.79%


In [15]:
# Save the model
from keras.models import model_from_json

#serialize to JSON
model_json = model.to_json()
with open("1daymodel.json", "w") as json_file:
    json_file.write(model_json)
#serialize weights to HDF5
model.save_weights("1daymodel_predicts.h5")
print("Saved model to the disk")

Saved model to the disk


In [92]:
# load some stuff

json_file = open('1daymodel.json', 'r')
loaded_model_json = json_file.read()
json_file.close()

loaded_model = model_from_json(loaded_model_json)
loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# get data from json
data_from_server = [
    "If You Cannot Trust Saudis With Bone Saw, Says US Lawmaker, 'You Should Not Trust Them With Nuclear Weapons': Trump administration's secret authorizations of nuclear technology sales to Saudi Arabia spark alarm in Congress",
    "French healthcare system 'should not fund homeopathy' - French medical and drug experts say homeopathic medicines should no longer be paid for by the country\u2019s health system because there is no evidence they work.",
    "Trump says Navy SEAL charged with war crimes, committing premeditated murder and shooting at unarmed Iraqi civilians, will be moved to 'less restrictive confinement'",
    "Years of Mark Zuckerberg's old Facebook posts have vanished. The company says it 'mistakenly deleted' them.",
    "France and Germany hold historic first joint parliamentary session, commit to joint defence and \"a common military culture\"",
    "Russia tells Trump its troops will stay in Venezuela for 'as long as needed' in a blunt rejection of his demand they leave immediately",
    "Trump cuts all direct assistance to Northern Triangle countries Honduras, El Salvador, Guatemala",
    "Churchill's policies caused millions of Indian famine deaths, study says",
    "Man jailed for harbouring Paris attackers",
    "Secret tape increases pressure on Trudeau",
    "Puerto Rico governor warns White House: 'If the bully gets close, I'll punch the bully in the mouth'",
    "Philippines beach resort slams 'freeloading' social media influencers",
    "Huawei says US has 'loser's attitude' because it can't compete",
    "A new study reveals the Amazon is losing surface water | A major new study of the Amazon has revealed an alarming trend, with the region losing as much as 350 km2 of surface freshwater every year on average. The loss is related to the construction of hydropower dams, deforestation and climate change",
    "The day North Korea talks collapsed, Trump passed Kim a note demanding he turn over his nukes",
    "Burundi bans BBC and suspends Voice of America, activists cry foul",
    "Schoolboy finds lost medieval gravestone",
    "Questions are mounting over special counsel Mueller\u2019s inquiry into whether Trump obstructed justice as lawmakers on Capitol Hill await the release of his report.",
    "China's Antarctic bases within Australia's claim are going unchecked",
    "Norway agreed on Thursday to hand back thousands of artefacts removed from Easter Island by the explorer Thor Heyerdahl during his trans-Pacific raft expeditions in the 1950s.",
    "Brunei defends tough new Islamic laws that would allow death by stoning for adultery and homosexuality against growing backlash",
    "Pope, Morocco's king, say Jerusalem must be open to all faiths",
    "Video of father and son illegally killing mama bear, shrieking cubs released in Alaska",
    "WTO rules against the US and Boeing in mammoth trade row with the EU",
    "French 'yellow vests' stage 20th day of protests"
]

data_from_server_vector = vectorise_text_data(data_from_server, word_index)

data_from_server_vector_pad = sequence.pad_sequences(data_from_server_vector, 
                                      value=word_index["<PAD>"],
                                      padding='post',
                                      maxlen=500)

loaded_model.predict(data_from_server_vector_pad)

array([[0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386],
       [0.5174386]], dtype=float32)