In [34]:
import numpy as np
import pandas as pd 
import datetime

# imports
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Import TFIDFVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import text 

#tokenizers
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re

#neural network RNN
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.callbacks import EarlyStopping



In [2]:
# to load file 
daily = pd.read_csv('/Users/plarkin/Downloads/daily.csv')
daily.head()

Unnamed: 0.1,Unnamed: 0,date,text,price,pct_change,price_direction,day_of_week,is_holiday
0,0,2011-07-29,"Drug App Comes Free, Ads Included. Epocrates h...",1292.28,,first,4,0
1,1,2011-07-30,Global Concern Over U.S. Debt Ceiling Disagree...,1292.28,0.0,same,5,0
2,2,2011-07-31,"Deal May Avert Default, but Some Ask, ‘Is That...",1292.28,0.0,same,6,0
3,3,2011-08-01,"Charging a Premium for Movies, at a Cost. High...",1286.94,-0.004132,down,0,0
4,4,2011-08-02,"Pearls, Finer but Still Cheap, Flow From China...",1254.05,-0.025557,down,1,0


## Sentiment Analysis

#### Vader Sentiment Analysis

#### Textblob Sentiment Analysis
source : https://neptune.ai/blog/sentiment-analysis-python-textblob-vs-vader-vs-flair

https://textblob.readthedocs.io/en/dev/

In [3]:
#Adding in Sentiment analysis with designated columns for each output (pos, neg, neu, compound)
analyzer = SentimentIntensityAnalyzer()

#daily['vader'] = daily['text'].map(lambda x:analyzer.polarity_scores(str(x)))


daily['vader_compound'] = [analyzer.polarity_scores(x)['compound'] for x in daily['text']]
# draft_df['vd_neg'] = [analyzer.polarity_scores(x)['neg'] for x in draft_df['alltext']]
# draft_df['vd_neu'] = [analyzer.polarity_scores(x)['neu'] for x in draft_df['alltext']]
# draft_df['vd_pos'] = [analyzer.polarity_scores(x)['pos'] for x in draft_df['alltext']]

%time

from textblob import TextBlob
#testimonial = TextBlob()
#draft_df['tb_polarity'] = [testimonial.polarity(x)['polarity'] for x in draft_df['alltext']]
#draft_df['tb_subj'] = [testimonial.sentiment(x)['subjectivity'] for x in draft_df['alltext']]
daily['textblob_polarity'] = daily['text'].map(lambda words: TextBlob(str(words)).polarity) #polarity is more applicable and comparable to vader compound. subjectivity is more about opinion vs fact 

%time

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs
CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.77 µs


### Text Cleaning
eliminate the punctuation, URL, and @
#source: https://monkeylearn.com/blog/text-cleaning/

In [4]:
#Use this to remove http, punctuation, URL, and @
daily['text'] = daily['text'].map(lambda x: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", str(x.lower())))
#convert price_direction to numerical and drop first row with NA value
daily.dropna(inplace=True)
daily['price_direction'] = daily['price_direction'].map({'down' : -1,'same' : 0 , 'up' : 1})

tokenize and lemmatize
(no longer lemmatizing, results from gridsearch showed superior accuracy without lemmatizing)

## RNN with TFIDF

In [5]:
#declaring features and target variable for tfidf. It will not take an array as the X input
the_text = daily['text']
the_target = daily['price_direction']

# TFIDF, increased to 5000 to capture more dates
rnn_tvec = TfidfVectorizer(stop_words='english', max_features = 5000, ngram_range= (1,2))
rnn_tvec_df = pd.DataFrame(rnn_tvec.fit_transform(the_text).todense(), columns=rnn_tvec.get_feature_names())


In [6]:
daily.head(1)

Unnamed: 0.1,Unnamed: 0,date,text,price,pct_change,price_direction,day_of_week,is_holiday,vader_compound,textblob_polarity
1,1,2011-07-30,global concern over us debt ceiling disagreeme...,1292.28,0.0,0,5,0,0.664,0.072709


In [7]:
rnn_tvec_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3652 entries, 0 to 3651
Columns: 5000 entries, 10 to zuckerberg mark
dtypes: float64(5000)
memory usage: 139.3 MB


In [8]:
#merging vectorized dataframe with original dataset including sentiment analysis. More features!
merged_df = rnn_tvec_df.join(daily, how ='inner',lsuffix = '_')
merged_df.head(2)


Unnamed: 0.1,10,10 billion,10 percent,100,100 million,11,12,12 billion,13,14,...,Unnamed: 0,date,text,price,pct_change,price_direction,day_of_week,is_holiday,vader_compound,textblob_polarity
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,2011-07-30,global concern over us debt ceiling disagreeme...,1292.28,0.0,0,5,0,0.664,0.072709
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,2011-07-31,deal may avert default but some ask is that go...,1292.28,0.0,0,6,0,0.9962,0.094384


In [9]:
#setting date as index and dropping text column for modeling
merged_df.set_index('date', inplace=True)
merged_df.sort_index(inplace=True)

merged_df.drop(columns= ['text'], inplace=True)


### Train Test Split

In [10]:
X = merged_df.drop(columns= ['price_direction'])
y = merged_df[['price_direction']].values
yy = merged_df['price_direction']


In [11]:
yy.value_counts(normalize=True)

 1    0.380992
-1    0.311148
 0    0.307861
Name: price_direction, dtype: float64

In [12]:
#sticking with a test size of 0.20 to save 2 years of data to test on
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, shuffle=False, test_size=0.2)


In [13]:
#need to one hot encode multiclass target in order to process in nn
y_train = to_categorical(y_train, 3)
y_test = to_categorical(y_test, 3)

#source https://stackoverflow.com/questions/61550026/valueerror-shapes-none-1-and-none-3-are-incompatible

In [14]:
# Scale for neural networks
ss = StandardScaler()

Xs_train = ss.fit_transform(X_train)
Xs_test = ss.transform(X_test)

In [15]:
# the length parameter dictates how many rows will constitute a sample
train_sequences = TimeseriesGenerator(Xs_train, y_train, length=90, batch_size=268) #increased batch sizes from 64 to 268
# test sequences
test_sequences = TimeseriesGenerator(Xs_test, y_test, length=90, batch_size=268)

In [16]:
train_sequences[0][0].shape

(268, 90, 5007)

In [17]:
input_shape = train_sequences[0][0][0].shape
input_shape

(90, 5007)

# idea, add some more regularization 
https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17

In [None]:
#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(32, activation='relu'))#added and increased both hidden layers to 32
model.add(BatchNormalization()) #added to help regularize model
model.add(Dropout(0.4)) #added to help regularize model
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.4)) #added to help regularize model
model.add(Dense(3, activation='softmax')) #softmax for multi-classification

#compile it
model.compile(optimizer='Adam', loss='CategoricalCrossentropy', metrics=['acc']) #categorical crossentropy for multi-classification

#fit it
#adding early stopping as a regularization technique
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')


history = model.fit(train_sequences, validation_data=test_sequences, epochs=20, verbose=0, callbacks=[early_stop]) #increased epochs from 100 to 300


In [None]:
#plot our results
plt.plot(history.history['loss'], label='Train loss')
plt.plot(history.history['val_loss'], label='Test loss')
plt.legend();


In [None]:
plt.plot(history.history['acc'], label='Train accuracy')
plt.plot(history.history['val_acc'], label='Test accuracy')
plt.legend();

In [None]:
SVM XGB

In [22]:
history.summary()

AttributeError: 'History' object has no attribute 'summary'

In [51]:
max(history.history['val_acc'])

0.4836193323135376

In [None]:
import pickle
import joblib

In [None]:
# save the model to disk
filename = 'finalized_model.sav'
all_files.append(filename)
joblib.dump(history, filename)

# Create and train a new model instance.
model = create_model()
model.fit(train_images, train_labels, epochs=5)

# Save the entire model as a SavedModel.
!mkdir -p saved_model
model.save('saved_model/my_model') 

In [None]:
# load the model from disk
loaded_model = joblib.load(filename)
result = loaded_model.score(X_test, Y_test)
print(result)

In [None]:
# save the model to disk
#all_files= []
filename = 'model2_500epochs.sav'
all_files.append(filename)

pickle.dump(model, open(filename, 'wb'))
 

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)

RNN Model history/specs

FIRST
max test accuracy 0.45553821325302124


#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(2, activation='relu'))
model.add(Dense(2, activation='relu'))
model.add(Dense(3, activation='softmax'))

#compile it
model.compile(optimizer=Adam(learning_rate=.0005), loss='CategoricalCrossentropy', metrics=['acc'])

#fit it
history = model.fit(train_sequences, validation_data=test_sequences, epochs=100, verbose=0)

notes: maxing out after 15-20 epochs 

SECOND
max test accuracy 0.4570982754230499

#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(8, activation='relu'))#added and increased both hidden layers to 8
model.add(Dense(8, activation='relu'))
model.add(Dense(3, activation='softmax'))

#compile it
model.compile(optimizer=Adam(learning_rate=.0005), loss='CategoricalCrossentropy', metrics=['acc'])

#fit it
history = model.fit(train_sequences, validation_data=test_sequences, epochs=300, verbose=0) #increased epochs from 100 to 300

notes: maxing out after 20 epochs 

THIRD
0.4586583375930786

#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(32, activation='relu'))#added and increased both hidden layers to 32
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

#compile it
model.compile(optimizer='Adam', loss='CategoricalCrossentropy', metrics=['acc'])

#fit it

history = model.fit(train_sequences, validation_data=test_sequences, epochs=20, verbose=0) #decreased epochs from 300 to 20


FOURTH
0.4602183997631073
added earlystopping

#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(32, activation='relu'))#added and increased both hidden layers to 32
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax')) #softmax for multi-classification

#compile it
model.compile(optimizer='Adam', loss='CategoricalCrossentropy', metrics=['acc']) #categorical crossentropy for multi-classification

#fit it
#adding early stopping as a regularization technique
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')


history = model.fit(train_sequences, validation_data=test_sequences, epochs=20, verbose=0, callbacks=[early_stop]) 

FIFTH
0.4492979645729065

#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(32, activation='relu'))#added and increased both hidden layers to 32
model.add(BatchNormalization()) #added to help regularize model
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax')) #softmax for multi-classification

#compile it
model.compile(optimizer='Adam', loss='CategoricalCrossentropy', metrics=['acc']) #categorical crossentropy for multi-classification

#fit it
#adding early stopping as a regularization technique
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')


history = model.fit(train_sequences, validation_data=test_sequences, epochs=20, verbose=0, callbacks=[early_stop]) #increased epochs from 100 to 300


SIXTH
0.4836193323135376
adding dropout 0.2

#modeling and layers
model = Sequential()

model.add(GRU(8, input_shape=input_shape, return_sequences= True))
model.add(GRU(8, return_sequences= False))


model.add(Dense(32, activation='relu'))#added and increased both hidden layers to 32
model.add(BatchNormalization()) #added to help regularize model
model.add(Dropout(0.2)) #added to help regularize model
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2)) #added to help regularize model
model.add(Dense(3, activation='softmax')) #softmax for multi-classification

#compile it
model.compile(optimizer='Adam', loss='CategoricalCrossentropy', metrics=['acc']) #categorical crossentropy for multi-classification

#fit it
#adding early stopping as a regularization technique
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=0, mode='auto')


history = model.fit(train_sequences, validation_data=test_sequences, epochs=20, verbose=0, callbacks=[early_stop]) #increased epochs from 100 to 300


In [None]:
accuracy_score(y_test, preds, sample_weight=None)
#aim for best accuracy, but also reduce false positives and negatives. 

In [46]:
# calculate null accuracy (for multi-class classification problems)
# .head(1) assesses the value 1208
null_accuracy = y_test.value_counts().head(1) / len(y_test)
print('Null accuracy:', null_accuracy)

AttributeError: 'numpy.ndarray' object has no attribute 'value_counts'

In [47]:
# print message text for the false negatives (spam incorrectly classified as ham)

fn= pd.DataFrame(X_test[preds < y_test])
fn['vader_compound']
# alternative less elegant but easier to understand
# X_test[(y_pred_class=0) & (y_test=1)]

NameError: name 'preds' is not defined

# TODO" model best params with multinomial bayes, how accurate is it?


## Evaluation of Models

