In [1]:
import pandas as pd
import numpy as np
import glob as glob
import json
import sklearn
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from random import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score, explained_variance_score


import keras
from keras.layers import Dense, Dropout, RepeatVector, BatchNormalization, Convolution1D, Flatten, Lambda, Permute, MaxPooling1D, AlphaDropout
from keras.models import Sequential
from keras.utils import to_categorical
import keras.backend as K
from sklearn.model_selection import train_test_split
from keras.models import load_model

Using TensorFlow backend.


In [3]:
root_path = 'features_by_valence/'
label_set = ['low', 'high']
low_files = glob.glob(root_path + label_set[0] + '/*')
high_files = glob.glob(root_path + label_set[1] + '/*')

all_features = []

for filename in low_files:
    word_histogram = json.loads(open(filename).read())
    all_features.append({
        'dictionary': word_histogram, 
        'filename': filename.split('/')[-1]})
for filename in high_files:
    word_histogram = json.loads(open(filename).read())
    all_features.append({
        'dictionary': word_histogram, 
        'filename': filename.split('/')[-1]})

In [14]:
root_path = 'stock_price_changes_by_ticker_and_date/'

# filenames = [f['filename'] for f in all_features]
for i, f in enumerate(all_features):
    pc = float(open(root_path + f['filename']).read())
    all_features[i]['percent_change'] = pc

In [17]:
n_total = int(len(all_features))
n_train = int(n_total * 0.8)
n_test = int(n_total * 0.2)

print('total: ' + str(n_total))
print('train: ' + str(n_train))
print('test: ' + str(n_test))

total: 252
train: 201
test: 50


In [18]:
# one hot encoding

def word_list(percent_occurance):
    key_set = {}

    for feature_d in all_features:
        keys = feature_d['dictionary'].keys()
        for key in keys:
            key_set[key] = key_set.get(key, 0) + 1
    key_set
    # Return words that occur in the top percent_occurance% of documents
    key_set = dict([[key, key_set[key]] for key in key_set if (key_set[key]/float(n_total) > float(percent_occurance))])

    words_sorted_alphabetically = list(key_set.keys())
    words_sorted_alphabetically.sort()
    return words_sorted_alphabetically

In [19]:
def get_index(reference, key):
    try:
        return reference.index(key)
    except:
        return len(reference)

In [20]:
ohe = {}
i_2_label = {}

def one_hot_encode(reference, dictionary):
    ohe = np.zeros(len(reference) + 1)
    for key in dictionary.keys():
        pos = get_index(reference, key)
        ohe[pos] = dictionary[key]
    norm_lookup = float(np.linalg.norm(ohe))
    normalized_ohe = [float(item)/float(norm_lookup) for item in ohe]
    return normalized_ohe

In [21]:
def preds_and_true(clf, af, ohe_words):
    correct_preds = 0
    all_preds = 0
    preds = []
    true = []
    for d_item in af:
        feature = d_item['ohe_feature']
        label = d_item['percent_change']
        classification = clf.predict([feature])[0]
        preds.append(classification)
        true.append( label )
    return [preds, true]

In [22]:
def encode_features_and_words(all_features, word_threshold):
    ohe_words = word_list(word_threshold)
    
    for i, f in enumerate(all_features):
        ohe_feature = one_hot_encode(ohe_words, f['dictionary'])
        all_features[i]['ohe_feature'] = ohe_feature
    return [all_features, ohe_words]


In [23]:
# n = 0.001
len(word_list(0.25))

2822

In [25]:
def get_train_test(all_features, n=0.25):
    all_features, ohe_words = encode_features_and_words(all_features, n)

    shuffle(all_features)

    train_features = all_features[0:n_train]
    test_features = all_features[n_train:n_total]

    X = [f['ohe_feature'] for f in train_features]
    y = [f['percent_change'] for f in train_features]
    
    x_val = [f['ohe_feature'] for f in test_features]
    y_val = [f['percent_change'] for f in test_features]
    return [X, y, x_val, y_val]

In [26]:
# 0.25 is a solid value for n. Lower values mean letting in larger amounts of words (features)
# Once validation accuracy is 66.67%, save the weights. That's the max. 
n = 0.25
X, y, x_val, y_val = get_train_test(all_features, n)

In [71]:
np.average(np.array(y)*100)

0.53579222257765668

In [27]:
shape = len(word_list(n)) + 1
shape

2823

In [75]:
shape = len(word_list(n)) + 1

m1 = Sequential([
    BatchNormalization(input_shape=(shape,)),
    Dense(40, activation='relu'),
    Dropout(0.5),
    Dense(20, activation='tanh'),
    Dropout(0.5),
    BatchNormalization(),
    Dense(1)   
])

m1.compile(optimizer='rmsprop',
              loss='mean_squared_error',
              metrics=['mae'])

sched = [[0.0001, 2], [0.001, 20], [0.01, 2], [0.1, 2], [0.5, 1], [0.1, 5], [0.01, 20], [0.001, 40], [0.0001, 80], [0.00005, 120]]


In [None]:
for info in sched:
    lr, epochs = info
    m1.optimizer.lr = lr
    m1.fit(np.array(X), np.array(y)*100, epochs=epochs,  batch_size=64, validation_data=(np.array(x_val), np.array(y_val)*100))

Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/2
Epoch 2/2
Train on 201 samples, validate on 51 samples
Epoch 1/1
Train on 201 samples, validate on 51 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Train on 201 samples, validate on 51 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 201 samples, validate on 51 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40


Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Train on 201 samples, validate on 51 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80

array([[ 0.00646421]], dtype=float32)

In [63]:
for i in range(len(X)):
    print(m1.predict(np.array([X[i]])))
    print(y[i])
    print("\n")

[[-0.03393828]]
-0.0005292405398252453


[[-0.03410605]]
0.008333333333333255


[[-0.01393568]]
-0.04828973843058361


[[-0.05682974]]
0.008258680999914739


[[-0.06315508]]
-0.03969128996692402


[[ 0.0414346]]
-0.010504201680672232


[[-0.07775185]]
0.00330429558425952


[[-0.01503236]]
-0.017325680272108797


[[-0.05437827]]
0.0689941812136326


[[-0.01032812]]
-0.004714757190004782


[[ 0.03748613]]
0.11038961038961026


[[ 0.03277944]]
0.06974477958236659


[[ 0.0618001]]
0.060831509846826996


[[-0.00046651]]
-0.018439716312056723


[[-0.0925583]]
-0.019653179190751477


[[-0.02209948]]
0.017944535073409516


[[ 0.01048759]]
0.009907755380936084


[[ 0.03909699]]
0.03676012461059189


[[-0.05168632]]
0.02627257799671595


[[-0.00342466]]
-0.02448210922787202


[[ 0.05398517]]
0.05106382978723409


[[ 0.00389128]]
-0.021212121212121165


[[ 0.01657477]]
0.3072440944881889


[[ 0.0322951]]
0.040425531914893585


[[-0.06094308]]
-0.06740389678778293


[[ 0.00656767]]
0.0138888888888