In [1]:
import warnings
warnings.filterwarnings('ignore')

import math
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Dense, Activation

import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

from sklearn import preprocessing, metrics
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the two raw inputs from CSV files in the working directory:
# the AAPL price table and the India news-headlines archive.
stock_price = pd.read_csv('AAPL.csv')
stock_headlines = pd.read_csv('india-news-headlines.csv')

In [3]:
# Preview the first rows of the price table to check its column layout.
stock_price.head()


Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-05-15,75.087502,76.974998,75.052498,76.927498,76.412621,166348400
1,2020-05-18,78.292503,79.125,77.580002,78.739998,78.212997,135178400
2,2020-05-19,78.7575,79.629997,78.252502,78.285004,77.76104,101729600
3,2020-05-20,79.169998,79.879997,79.129997,79.807503,79.273354,111504800
4,2020-05-21,79.665001,80.222504,78.967499,79.212502,78.682327,102688800


In [4]:
# (rows, columns) of the price table.
stock_price.shape

(252, 7)

In [5]:
# Preview the first rows of the headlines table to check its column layout.
stock_headlines.head()

Unnamed: 0,publish_date,headline_category,headline_text
0,20010102,unknown,Status quo will not be disturbed at Ayodhya; s...
1,20010102,unknown,Fissures in Hurriyat over Pak visit
2,20010102,unknown,America's unwanted heading for India?
3,20010102,unknown,For bigwigs; it is destination Goa
4,20010102,unknown,Extra buses to clear tourist traffic


In [6]:
# (rows, columns) of the headlines table.
stock_headlines.shape

(3424067, 3)

In [7]:
# For each table, flag per column whether it contains any missing value.
stock_price.isna().any(), stock_headlines.isna().any()

(Date         False
 Open         False
 High         False
 Low          False
 Close        False
 Adj Close    False
 Volume       False
 dtype: bool,
 publish_date         False
 headline_category    False
 headline_text        False
 dtype: bool)

Stock Price Data

In [8]:
# Remove rows that are exact duplicates of an earlier row.
stock_price = stock_price.drop_duplicates()

In [9]:
# Turn the Date strings into timestamps truncated to midnight, so they can be
# aligned with the per-day headline dates later on.
parsed_dates = pd.to_datetime(stock_price['Date'])
stock_price = stock_price.assign(Date=parsed_dates.dt.normalize())

In [10]:
# Keep only the columns used downstream, with Close placed right after Date.
price_columns = ['Date', 'Close', 'Open', 'High', 'Low', 'Volume']
stock_price = stock_price.filter(items=price_columns)

In [11]:
# Index the price rows by their date.
stock_price = stock_price.set_index('Date')

In [12]:
# Put the rows in chronological order (oldest first), then show the result.
# sort_index() defaults to axis=0 and ascending=True.
stock_price = stock_price.sort_index()
stock_price

Unnamed: 0_level_0,Close,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-05-15,76.927498,75.087502,76.974998,75.052498,166348400
2020-05-18,78.739998,78.292503,79.125000,77.580002,135178400
2020-05-19,78.285004,78.757500,79.629997,78.252502,101729600
2020-05-20,79.807503,79.169998,79.879997,79.129997,111504800
2020-05-21,79.212502,79.665001,80.222504,78.967499,102688800
...,...,...,...,...,...
2021-05-10,126.849998,129.410004,129.539993,126.809998,88071200
2021-05-11,125.910004,123.500000,126.269997,122.769997,126142800
2021-05-12,122.769997,123.400002,124.639999,122.250000,112172300
2021-05-13,124.970001,124.580002,126.150002,124.260002,105752400


Stock News Headlines Data

In [13]:
# Remove headline rows that are exact duplicates of an earlier row.
stock_headlines = stock_headlines.drop_duplicates()

In [14]:
# Convert the yyyymmdd publish dates (e.g. 20010102) into timestamps.
# An explicit format lets pandas parse the whole column in one vectorised
# call, instead of slicing every string in a Python lambda to insert dashes.
# The format has no time component, so the results are already at midnight.
stock_headlines['publish_date'] = pd.to_datetime(
    stock_headlines['publish_date'].astype(str),
    format='%Y%m%d',
)

In [15]:
# Only the publication date and the headline text are needed from here on.
headline_columns = ['publish_date', 'headline_text']
stock_headlines = stock_headlines.filter(items=headline_columns)

In [16]:
# Collapse to one row per publish date by joining that day's headlines into a
# single comma-separated string.
headlines_by_day = stock_headlines.groupby(['publish_date'])['headline_text']
stock_headlines = headlines_by_day.apply(','.join).reset_index()

In [17]:
# Index the daily headline rows by their publish date.
stock_headlines = stock_headlines.set_index('publish_date')

In [18]:
# Put the daily headlines in chronological order (oldest first) and show them.
# sort_index() defaults to axis=0 and ascending=True.
stock_headlines = stock_headlines.sort_index()
stock_headlines

Unnamed: 0_level_0,headline_text
publish_date,Unnamed: 1_level_1
2001-01-02,Status quo will not be disturbed at Ayodhya; s...
2001-01-03,"Powerless north India gropes in the dark,Think..."
2001-01-04,The string that pulled Stephen Hawking to Indi...
2001-01-05,Light combat craft takes India into club class...
2001-01-06,Light combat craft takes India into club class...
...,...
2020-12-27,#BigInterview! Dhritiman Chatterjee: Nobody da...
2020-12-28,Horoscope Today; 28 December 2020: Check astro...
2020-12-29,Man recovers charred remains of 'thief' from h...
2020-12-30,Numerology Readings 30 December 2020: Predicti...


Combining both datasets

In [19]:
# Place prices and daily headlines side by side, aligned on their date index.
# Dates present in only one of the two frames get NaN in the other's columns.
stock_data = pd.concat((stock_price, stock_headlines), axis='columns')

In [20]:
# Discard every row that has a missing value, i.e. dates that lack either
# price data or headlines.
stock_data = stock_data.dropna(axis=0)


In [21]:
# Show the combined price + headline frame.
stock_data


Unnamed: 0,Close,Open,High,Low,Volume,headline_text
2020-05-15,76.927498,75.087502,76.974998,75.052498,166348400.0,Poll prospects hang on how we tackle corona ou...
2020-05-18,78.739998,78.292503,79.125000,77.580002,135178400.0,#CoronaCrisis With fashion shows being called ...
2020-05-19,78.285004,78.757500,79.629997,78.252502,101729600.0,10 more council seats set to fall vacant on Ma...
2020-05-20,79.807503,79.169998,79.879997,79.129997,111504800.0,Plans of doubling production at HGMCL hit lock...
2020-05-21,79.212502,79.665001,80.222504,78.967499,102688800.0,"J&K's first lady honours frontline warriors,Co..."
...,...,...,...,...,...,...
2020-12-24,131.970001,131.320007,133.460007,131.100006,54930100.0,How to set the mood for sex during cold winter...
2020-12-28,136.690002,133.990005,137.339996,133.509995,124486200.0,Horoscope Today; 28 December 2020: Check astro...
2020-12-29,134.869995,138.050003,138.789993,134.339996,121047300.0,Man recovers charred remains of 'thief' from h...
2020-12-30,133.720001,135.580002,135.990005,133.399994,96452100.0,Numerology Readings 30 December 2020: Predicti...


In [22]:
# Add empty-string placeholder columns for the four sentiment scores that are
# filled in later, then preview the frame.
for score_column in ('compound', 'negative', 'neutral', 'positive'):
    stock_data[score_column] = ''
stock_data.head()

Unnamed: 0,Close,Open,High,Low,Volume,headline_text,compound,negative,neutral,positive
2020-05-15,76.927498,75.087502,76.974998,75.052498,166348400.0,Poll prospects hang on how we tackle corona ou...,,,,
2020-05-18,78.739998,78.292503,79.125,77.580002,135178400.0,#CoronaCrisis With fashion shows being called ...,,,,
2020-05-19,78.285004,78.7575,79.629997,78.252502,101729600.0,10 more council seats set to fall vacant on Ma...,,,,
2020-05-20,79.807503,79.169998,79.879997,79.129997,111504800.0,Plans of doubling production at HGMCL hit lock...,,,,
2020-05-21,79.212502,79.665001,80.222504,78.967499,102688800.0,"J&K's first lady honours frontline warriors,Co...",,,,


In [23]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata

In [24]:
# Sentiment scorer from nltk.sentiment.vader, used to score the headline text.
# NOTE(review): presumably needs the NLTK 'vader_lexicon' resource to be
# downloaded beforehand — confirm for a fresh environment.
sid = SentimentIntensityAnalyzer()


In [None]:
# Score each day's combined headline text once and reuse the resulting dict
# for all four columns. Previously polarity_scores() ran four times per row,
# quadrupling the cost of the slowest step in the notebook.
polarity = stock_data['headline_text'].apply(sid.polarity_scores)

# Map each output column to the key polarity_scores() uses for it.
score_keys = (
    ('compound', 'compound'),
    ('negative', 'neg'),
    ('neutral', 'neu'),
    ('positive', 'pos'),
)
for score_column, score_key in score_keys:
    # Bind score_key as a default argument so each lambda keeps its own key.
    stock_data[score_column] = polarity.apply(lambda scores, key=score_key: scores[key])

In [None]:
# Preview the frame now that the sentiment columns are filled in.
stock_data.head()

In [None]:
# Keep the numeric columns only (headline_text is dropped) in a fixed order.
# Later cells pick columns by position, so this order matters.
ordered_columns = [
    'Close',
    'compound', 'negative', 'neutral', 'positive',
    'Open', 'High', 'Low', 'Volume',
]
stock_data = stock_data.loc[:, ordered_columns]

In [None]:
# Preview the reordered, numeric-only frame.
stock_data.head()


In [None]:
# Save the scored frame to stock_data.csv in the working directory.
stock_data.to_csv('stock_data.csv')

In [None]:
# Reload the saved frame. index_col=False keeps the first CSV column (the
# saved date index, which pandas names 'Unnamed: 0') as an ordinary column so
# the next cells can rename it and set it as the index.
# parse_dates=[0] restores that column as real timestamps. Without it the
# dates come back as plain strings and the later plots lose their date axis.
stock_data = pd.read_csv('stock_data.csv', index_col=False, parse_dates=[0])

In [None]:
# Give the reloaded, unnamed first column its proper name.
stock_data = stock_data.rename(columns={'Unnamed: 0': 'Date'})

In [None]:
# Index the reloaded frame by date again.
stock_data = stock_data.set_index('Date')

In [None]:
# Preview the reloaded, date-indexed frame.
stock_data.head()

In [None]:
# (rows, columns) of the modelling frame.
stock_data.shape

In [None]:
# Flag per column whether any value is missing after the CSV round trip.
stock_data.isna().any()

In [None]:
# Summary statistics for the columns of the modelling frame.
stock_data.describe()

In [None]:
# Print the frame's column dtypes and non-null counts.
stock_data.info()

Visualization

In [None]:
# Line chart of the closing price over the rows of stock_data.
fig, ax = plt.subplots(figsize=(16, 10))
stock_data['Close'].plot(ax=ax)

ax.set_title("Close Price")
ax.set_xlabel('Date')
ax.set_ylabel('Close Price ($)')

In [None]:
# 7-row rolling mean of every column; the first 6 rows are NaN because the
# window is not yet full. Only the first 20 rows are shown.
stock_data.rolling(7).mean().head(20)

In [None]:
# Closing price overlaid with its 30-row rolling mean.
# The mean is computed on the Close column only, not on every column and then
# sliced. Labels, a title and a legend make the two lines distinguishable.
fig, ax = plt.subplots(figsize=(16, 10))
stock_data['Close'].plot(ax=ax, label='Close')
stock_data['Close'].rolling(window=30).mean().plot(ax=ax, label='30-row rolling mean')

ax.set_title('Close Price with 30-row Rolling Mean')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price ($)')
ax.legend()
plt.show()

In [None]:
# Fraction of the available rows to use for modelling.
percentage_of_data = 1.0

total_data = len(stock_data)
# One row is held back from the total, so with percentage_of_data = 1.0 the
# slice used later starts at row 1 (start == 1).
data_to_use = int(percentage_of_data * (total_data - 1))
start = total_data - data_to_use

# The first 80% of the usable rows form the training portion.
train_end = int(data_to_use * 0.8)

# These are the nominal split sizes. The exact sizes after shifting and
# dropping nulls are printed later from the array shapes.
print("Number of records in Training Data:", train_end)
print("Number of records in Test Data:", total_data - train_end)

In [None]:
# Forecast horizon: one row ahead.
steps_to_predict = 1

# Rows used for modelling. Columns are taken by position, so this relies on
# the order: Close, compound, negative, neutral, positive, Open, High, Low,
# Volume.
window = stock_data.iloc[start:total_data]
(
    close_price,
    compound,
    negative,
    neutral,
    positive,
    open_price,
    high,
    low,
    volume,
) = (window.iloc[:, position] for position in range(9))

# Show the captured close prices.
print("Close Price:")
close_price

In [None]:
# The next row's close price is the prediction target.
close_price_shifted = close_price.shift(-1)

# The next row's compound sentiment is used as an extra feature.
compound_shifted = compound.shift(-1)

# Assemble the modelling frame. `keys` supplies the column names directly.
feature_series = {
    'close_price': close_price,
    'close_price_shifted': close_price_shifted,
    'compound': compound,
    'compound_shifted': compound_shifted,
    'volume': volume,
    'open_price': open_price,
    'high': high,
    'low': low,
}
data = pd.concat(list(feature_series.values()), axis=1, keys=list(feature_series.keys()))

# The shift leaves NaNs in the last row; drop it.
data = data.dropna()
data.head(10)

In [None]:
# Target: the next row's close price.
y = data.loc[:, 'close_price_shifted']
y

In [None]:
# Feature columns fed to the model, in this order.
cols = [
    'close_price',
    'compound',
    'compound_shifted',
    'volume',
    'open_price',
    'high',
    'low',
]
x = data.loc[:, cols]
x

In [None]:

# Scale features and target to [-1, 1].
# Both scalers are fitted on the first `train_end` rows only (the rows that
# become the training set). Fitting on the full data would leak the test
# period's min/max into training. Test values may therefore fall slightly
# outside [-1, 1], which is expected.

# scaling the feature dataset
scaler_x = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x = np.array(x).reshape((len(x), len(cols)))
scaler_x.fit(x[:train_end])
x = scaler_x.transform(x)

# scaling the target variable
scaler_y = preprocessing.MinMaxScaler(feature_range=(-1, 1))
y = np.array(y).reshape((len(y), 1))
scaler_y.fit(y[:train_end])
y = scaler_y.transform(y)

# displaying the scaled feature dataset and the target variable
x, y

In [None]:
# Chronological split: the first `train_end` rows train the model and every
# remaining row is held out for testing.
# The test slice starts at `train_end`, not `train_end + 1`. The previous
# offset silently dropped the sample at row `train_end`, so it was used in
# neither set.
X_train = x[:train_end]
X_test = x[train_end:]
y_train = y[:train_end]
y_test = y[train_end:]

# printing the shape of the training and the test datasets
print('Number of rows and columns in the Training set X:', X_train.shape, 'and y:', y_train.shape)
print('Number of rows and columns in the Test set X:', X_test.shape, 'and y:', y_test.shape)

In [None]:
# Append a trailing axis of length 1 so each sample has shape (len(cols), 1),
# matching the input_shape declared for the first LSTM layer.
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Report the new shapes.
print('Shape of Training set X:', X_train.shape)
print('Shape of Test set X:', X_test.shape)

In [None]:
# Seed every random number generator the model depends on.
# np.random.seed alone does not control Keras weight initialisation or
# dropout, so runs were not reproducible. keras.utils.set_random_seed seeds
# Python, NumPy and the backend together. Older Keras releases lack it, in
# which case only the NumPy seed below applies.
np.random.seed(2016)
try:
    from keras.utils import set_random_seed
    set_random_seed(2016)
except ImportError:
    pass  # older Keras: NumPy seed above is all that is available

# Model architecture: three stacked 100-unit LSTM layers, each followed by
# 10% dropout, and a single-unit dense output. The input is shaped
# (len(cols), 1), so the features are fed as a length-len(cols) sequence with
# one value per step.
model = Sequential()
model.add(LSTM(100, return_sequences=True, activation='tanh', input_shape=(len(cols), 1)))
model.add(Dropout(0.1))
model.add(LSTM(100, return_sequences=True, activation='tanh'))
model.add(Dropout(0.1))
# The last LSTM returns only its final output, which feeds the dense layer.
model.add(LSTM(100, activation='tanh'))
model.add(Dropout(0.1))
model.add(Dense(1))

# printing the model summary
model.summary()

In [None]:

# Mean-squared-error loss, Adam optimizer.
model.compile(optimizer='adam', loss='mse')

# Train for 10 epochs in batches of 8, with 20% of the training data held out
# for validation.
fit_options = {
    'validation_split': 0.2,
    'epochs': 10,
    'batch_size': 8,
    'verbose': 1,
}
model.fit(X_train, y_train, **fit_options)


In [None]:
# Predict on the test features; the model outputs values on the scaled axis.
scaled_predictions = model.predict(X_test)

# Convert the predictions back to the original price scale.
predictions = scaler_y.inverse_transform(
    np.array(scaled_predictions).reshape((len(scaled_predictions), 1))
)

# Show the first few unscaled predictions.
print('Predictions:')
predictions[:5]

In [None]:

# Model loss (MSE on the scaled targets) for the training and test sets,
# evaluated one sample per batch.
train_loss = model.evaluate(X_train, y_train, batch_size=1)
test_loss = model.evaluate(X_test, y_test, batch_size=1)

# Report both losses rounded to four decimals.
for label, loss in (('Train Loss =', train_loss), ('Test Loss =', test_loss)):
    print(label, round(loss, 4))

In [None]:

# Root mean squared error on the original price scale.
# `predictions` has already been inverse-transformed, but `y_test` is still
# scaled at this point, so it is unscaled into a separate array before the
# comparison. Comparing the two directly mixed scales and gave a meaningless
# error. `y_test` itself is left untouched for the cells that follow.
y_test_unscaled = scaler_y.inverse_transform(np.array(y_test).reshape((len(y_test), 1)))
root_mean_square_error = np.sqrt(np.mean(np.power(y_test_unscaled - predictions, 2)))
print('Root Mean Square Error =', round(root_mean_square_error, 4))

In [None]:
# Same RMSE via sklearn.metrics, on the original price scale.
# `y_test` is still scaled here while `predictions` is not, so the targets
# are unscaled into a separate array first; `y_test` itself is left untouched.
y_test_unscaled = scaler_y.inverse_transform(np.array(y_test).reshape((len(y_test), 1)))
mse = metrics.mean_squared_error(y_test_unscaled, predictions)
rmse = np.sqrt(mse)
print('Root Mean Square Error (sklearn.metrics) =', round(rmse, 4))

In [None]:
# Bring the test features back to their original units. The trailing axis
# added for the LSTM is flattened away first.
flat_X_test = np.array(X_test).reshape((len(X_test), len(cols)))
X_test = scaler_x.inverse_transform(flat_X_test)

# Bring the training and test targets back to the original price scale.
y_train = scaler_y.inverse_transform(np.array(y_train).reshape((len(y_train), 1)))
y_test = scaler_y.inverse_transform(np.array(y_test).reshape((len(y_test), 1)))

In [None]:
# Predicted vs. actual close price over the test rows.
plt.figure(figsize=(16, 10))

actual_close = [row[0] for row in y_test]
plt.plot(predictions, label="Predicted Close Price")
plt.plot(actual_close, label="Testing Close Price")

# Legend sits below the axes, centred, with both entries on one row.
legend_options = {
    'loc': 'upper center',
    'bbox_to_anchor': (0.5, -0.05),
    'fancybox': True,
    'shadow': True,
    'ncol': 2,
}
plt.legend(**legend_options)
plt.show()