# Topic            : Stock Market Prediction using Numerical and Textual Analysis
## Task                :  7

## Level               : Advanced
## Submitted by : Mohd Tarique Khan
## Date                : 07/08/2021

# Objective: Create a hybrid model for stock price/performance prediction using numerical analysis of historical stock prices and sentiment analysis of news headlines.
# Stock to analyze and predict - SENSEX (S&P BSE SENSEX)
# Data used: historical stock prices from finance.yahoo.com
# Data used: textual (news) data from https://bit.ly/36fFPI6


In [1]:
# Import Libraries

# To ignore warnings during the session
import warnings
warnings.filterwarnings('ignore') 

# Importing essential libraries
import pandas as pd
import numpy as np
import math

# Importing Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

KeyboardInterrupt: 

# Downloading Historical Stock Prices from Yahoo Finance

In [None]:
# Importing required libraries
import time
import datetime

In [None]:
# Download settings. Several tickers can be listed, but only Reliance Industries
# (RELIANCE.NS) is used for this assessment.
# tickers = ['MSFT', 'GOOG', 'AAPL', 'RELIANCE.NS']  # example: several companies at once
tickers = ['RELIANCE.NS']
interval = '1d'

# Start and end of the download window as Unix epoch seconds. The datetimes are
# declared as UTC explicitly so the resulting values are the same on every
# machine instead of depending on the local time zone.
period1 = int(datetime.datetime(2016, 8, 10, 23, 59, tzinfo=datetime.timezone.utc).timestamp())
period2 = int(datetime.datetime(2020, 12, 31, 23, 59, tzinfo=datetime.timezone.utc).timestamp())
print(period1)
print(period2)

In [None]:
# Download the daily price history of every ticker and store each one as its own
# sheet of a single workbook. The context manager saves and closes the workbook
# when the loop is done; without it the writer was never saved or closed.
with pd.ExcelWriter('Historical_Stock_Prices.xlsx', engine='openpyxl') as xlwriter:
    for ticker in tickers:
        query_string = (
            f'https://query1.finance.yahoo.com/v7/finance/download/{ticker}'
            f'?period1={period1}&period2={period2}&interval={interval}'
            '&events=history&includeAdjustedClose=true'
        )
        df = pd.read_csv(query_string)
        df.to_excel(xlwriter, sheet_name=ticker, index=False)

# Save the most recently downloaded ticker as a CSV file
df.to_csv('Historical_Stock_Prices_csv.csv', index=False)

In [None]:
# reading the datasets into pandas

stock_price = pd.read_csv('C:/Users/IT/TSF GRIP/TSF-GRIP-DSBA-Internship/Historical_Stock_Prices_csv.csv')

In [None]:
stock_price.head()

# Loading already downloaded Times of India News Headlines from Harvard Dataverse

In [None]:
# Load the news headlines CSV from the working directory
news_headlines = pd.read_csv('india-news-headlines.csv')

In [None]:
# Preview the first rows
news_headlines.head()

In [None]:
# Preview the last rows
news_headlines.tail()

# Data Visualization & Cleaning

In [None]:
# Stock Price Data

In [None]:
# checking for null values in both the datasets
# (one boolean per column: True when that column holds at least one missing value)
stock_price.isna().any()

In [None]:
news_headlines.isna().any()

In [None]:
# Summary of each dataset's columns
stock_price.info()

In [None]:
news_headlines.info()

In [None]:
# Clean the price table in a single pipeline:
#   1. convert 'Date' from text to datetime with the time of day reset to midnight
#   2. keep only the columns needed for the analysis
#   3. use 'Date' as the index
#   4. order the rows by date, oldest first
stock_price = (
    stock_price
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']).dt.normalize())
    .filter(items=['Date', 'Close', 'Open', 'High', 'Low', 'Volume'])
    .set_index('Date')
    .sort_index(axis=0, ascending=True)
)
stock_price

In [None]:
from matplotlib.pyplot import figure

# Plot the four daily price series on one chart
figure(figsize=(14, 6), dpi=80)
for column in ("Close", "Open", "High", "Low"):
    plt.plot(stock_price[column], label=column)

plt.legend()
# The data in use is the RELIANCE.NS ticker, so the chart is titled accordingly
plt.title("Stock Prices of Reliance Industries (RELIANCE.NS)")
plt.xlabel("Date")
plt.ylabel("Price")

In [None]:
# Plot the daily traded volume. Title and y-axis describe volume, since that is
# the series being drawn here.
figure(figsize=(14, 5), dpi=80)
plt.plot(stock_price["Volume"], label="Volume")

plt.legend()
plt.title("Traded Volume of Reliance Industries (RELIANCE.NS)")
plt.xlabel("Date")
plt.ylabel("Volume")

# News Data

In [None]:
# Converting 'publish_date' (stored as YYYYMMDD) to datetime. Passing the format
# lets pandas parse the whole column in one vectorised call instead of
# rebuilding every date string row by row in Python.
news_headlines['publish_date'] = pd.to_datetime(
    news_headlines['publish_date'].astype(str), format='%Y%m%d'
).dt.normalize()

# One row per day: all headlines published on the same date are joined with
# commas. Grouping already leaves 'publish_date' as the index; the sort keeps
# the rows in ascending date order.
news_headlines = (
    news_headlines
    .groupby('publish_date')['headline_text']
    .agg(','.join)
    .to_frame()
    .sort_index(ascending=True, axis=0)
)
news_headlines

# Concatenating(combining) both Stock Price and News data

In [None]:
# Place the price columns and the headline column side by side, aligned on
# their date index
concat_data = pd.concat(objs=[stock_price, news_headlines], axis='columns')

# Keep only the dates for which every column has a value
concat_data.dropna(axis='index', inplace=True)

# displaying the combined stock_data
concat_data

# Adding Columns for Sentiment scoring

In [None]:
# Append four empty placeholder columns that will hold the sentiment scores
concat_data = concat_data.assign(Compound='', Negative='', Neutral='', Positive='')
concat_data.head()

In [None]:
# Importing Libraries for developing Sentiment Analysis
import nltk
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import unicodedata
# nltk.download('vader_lexicon')


In [None]:
# instantiating the Sentiment Analyzer
s_analyzer = SentimentIntensityAnalyzer()

# calculating sentiment scores
concat_data['Compound'] = concat_data['headline_text'].apply(lambda x: s_analyzer.polarity_scores(x)['compound'])
concat_data['Negative'] = concat_data['headline_text'].apply(lambda x: s_analyzer.polarity_scores(x)['neg'])
concat_data['Neutral'] = concat_data['headline_text'].apply(lambda x: s_analyzer.polarity_scores(x)['neu'])
concat_data['Positive'] = concat_data['headline_text'].apply(lambda x: s_analyzer.polarity_scores(x)['pos'])

# displaying the stock data
concat_data.head()

In [None]:
concat_data.shape

# Preparing concatenated data for analysis and saving it

In [None]:
# The raw headline text is no longer needed once it has been scored
concat_data.drop(columns=['headline_text'], inplace=True)

# Reorder the columns so that 'Close' comes first, followed by the sentiment
# scores and then the remaining price/volume columns
concat_data = concat_data.loc[
    :,
    ['Close', 'Compound', 'Negative', 'Neutral', 'Positive', 'Open', 'High', 'Low', 'Volume'],
]

concat_data.head()

In [None]:
# Save concat_data, including its date index, to 'Final_stock_data.csv'
concat_data.to_csv('Final_stock_data.csv')

# Opening the "Final_stock_data.csv" as DataFrame

In [None]:
# Read the saved file back; no index column is specified here, so the saved
# dates come back as an ordinary column
fsdata = pd.read_csv('Final_stock_data.csv')
fsdata.head()

In [None]:
# renaming the index column (it is labelled 'Unnamed: 0' after reading the CSV)
fsdata.rename(columns={'Unnamed: 0': 'Date'}, inplace=True)

# The dates are read back from the CSV as plain text; convert them to datetimes
# so that charts drawn from this frame get a real time axis instead of one
# categorical tick per row
fsdata['Date'] = pd.to_datetime(fsdata['Date'])

# setting the column 'Date' as the index column
fsdata.set_index('Date', inplace=True)


In [None]:
# Preview the first rows now that 'Date' is the index
fsdata.head()

In [None]:
# (number of rows, number of columns)
fsdata.shape

In [None]:
# Summary of the columns
fsdata.info()

In [None]:
# Descriptive statistics of the columns
fsdata.describe()

In [None]:
# Compare the daily closing and opening prices of the final dataset
figure(figsize=(14, 6), dpi=80)
plt.plot(fsdata["Close"], label="Close")
plt.plot(fsdata["Open"], label="Open")

plt.legend()
# The data in use is the RELIANCE.NS ticker, so the chart is titled accordingly
plt.title("Stock Prices of Reliance Industries (RELIANCE.NS)")
plt.xlabel("Date")
plt.ylabel("Price")

In [None]:
# Calculating the 3-day rolling mean (moving average) of every column:
# RM1 = (D1 + D2 + D3) / 3, RM2 = (D2 + D3 + D4) / 3, ...
# Only the first 10 rows of the result are displayed.
fsdata.rolling(3).mean().head(10)

In [None]:
# Creating a visualization of the closing price against its 15-day moving average
figure(figsize=(14, 6), dpi=80)

fsdata['Close'].plot(label='Close')
# Roll over the 'Close' column only; averaging every column first and then
# selecting 'Close' gives the same line with needless extra work
fsdata['Close'].rolling(window=15).mean().plot(label='15-day moving average')

# Both lines carry a label, so the legend can tell them apart
plt.legend()
plt.title("Closing Stock Prices of Reliance Industries (RELIANCE.NS) vs Moving Average (15 days)")
plt.xlabel("Date")
plt.ylabel("Price")

# Data Preparation for Modelling

In [None]:
# dropping rows that contain any null value
fsdata = fsdata.dropna()

In [None]:
# Column names of the final dataset
fsdata.columns

In [None]:
# Names of the model's input columns ('Close' is left out of this list)
cols = ['Compound', 'Negative', 'Neutral', 'Positive', 'Open', 'High','Low', 'Volume']
len(cols)

In [None]:
# Input features: every column after the first one.
# A DataFrame cannot be sliced NumPy-style as fsdata[:, 1:]; positional
# row/column slicing has to go through .iloc.
features = fsdata.iloc[:, 1:]
features

In [None]:
# Prediction target: the first column only, kept two-dimensional.
# A DataFrame cannot be sliced NumPy-style as fsdata[:, :1]; positional
# row/column slicing has to go through .iloc.
target = fsdata.iloc[:, :1]
target

# Scaling the data to manage the sensitivity of LSTM Model. 
# Applying the MinMaxScaler

In [None]:
# Importing Libraries for Data Preprocessing
from sklearn import preprocessing
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

In [None]:
# The scalers are fitted on the chronologically first 70% of the rows only --
# the same share that the train/test split further down uses for training -- so
# that the value range of the held-out rows does not leak into the scaling.
# Every row is then transformed with the fitted scalers, which means held-out
# values may fall outside the (-1, 1) range.
n_fit_rows = int(len(features) * 0.70)

# scaling the feature dataset
scaler_features = preprocessing.MinMaxScaler(feature_range=(-1, 1))
x = np.array(features).reshape((len(features), len(cols)))
scaler_features.fit(x[:n_fit_rows])
x = scaler_features.transform(x)

# scaling the target variable
scaler_target = preprocessing.MinMaxScaler(feature_range=(-1, 1))
y = np.array(target).reshape((len(target), 1))
scaler_target.fit(y[:n_fit_rows])
y = scaler_target.transform(y)

# displaying the scaled feature dataset and the target variable
x, y

In [None]:
# Row counts of the split: the first 70% of the rows (rounded down) go to
# training, the remainder to testing
training_size=int(len(x)*0.70)
test_size=len(x)-training_size
# train_data,test_data= scaled_data[0:training_size,:], scaled_data[training_size:len(scaled_data),:]

In [None]:
# (training rows, test rows)
training_size,test_size

In [None]:
# train_data and test_data are only assigned in commented-out code, so the
# shapes of the two slices of x are reported directly instead
x[:training_size].shape, x[training_size:].shape

In [None]:
# Split by position: rows before training_size form the training set and the
# remaining rows form the test set
X_train = x[:training_size, :]
X_test = x[training_size:, :]  # the name needs an underscore: 'X-test' is not a valid assignment target
y_train = y[:training_size, :]
y_test = y[training_size:, :]

In [None]:
# scale = MinMaxScaler()
# scaled_data = scale.fit_transform(np.array(fsdata))

In [None]:
# print(scaled_data)

# Splitting the dataset into train and test sets. Because time-series data is date dependent, it cannot be separated randomly with the usual train-test split method; it has to be split chronologically.

In [None]:
# training_size=int(len(scaled_data)*0.70)
# test_size=len(scaled_data)-training_size
# train_data,test_data= scaled_data[0:training_size,:], scaled_data[training_size:len(scaled_data),:]

In [None]:
# training_size,test_size

In [None]:
# train_data.shape,test_data.shape

In [None]:
# train_data

In [None]:
# test_data

In [None]:
# test_data.columns

# Separating Train and test data features and target.

In [None]:
# scaled_data.columns

In [None]:
# Training features (a slice of the scaled array x)
X_train

In [None]:
# Training target (a slice of the scaled array y)
y_train

In [None]:
# Test features (a slice of the scaled array x)
X_test

In [None]:
# Test target (a slice of the scaled array y)
y_test

In [None]:

# Report the dimensions of the feature and target arrays of each split
print(f'Number of rows and columns in the Training set X: {X_train.shape} and y: {y_train.shape}')
print(f'Number of rows and columns in the Test set X: {X_test.shape} and y: {y_test.shape}')

# Reshape input to be [samples, time steps, features] which is required for LSTM

In [None]:
# Append a trailing axis of length 1 to both feature arrays
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Show the resulting shapes of the features and of the (unchanged) targets
print(f'Shape of Training set X: {X_train.shape}')
print(f'Shape of Test set X: {X_test.shape}')
print(f'Shape of Training set y: {y_train.shape}')
print(f'Shape of Test set y: {y_test.shape}')

In [None]:
# Importing Libraries for Model Building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Activation


In [None]:
# setting the seeds to achieve consistent and less random predictions at each execution.
# NumPy's seed does not cover TensorFlow, which has its own random generator,
# so that generator is seeded with the same value as well.
import tensorflow as tf

np.random.seed(2000)
tf.random.set_seed(2000)

In [None]:
# Model architecture: three stacked 50-unit LSTM layers (the first two pass
# their full output sequence on to the next layer) followed by a single-unit
# dense output. Compiled with the Adam optimizer and mean squared error loss.
model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(len(cols), 1)),
    LSTM(50, return_sequences=True),
    LSTM(50),
    Dense(1),
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

In [None]:
# Train for 100 epochs in batches of 64 rows; the test split is passed as
# validation data
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    verbose=1,
    validation_data=(X_test, y_test),
)


In [None]:
import tensorflow as tf
tf.__version__

In [None]:
### Run the trained model on both splits; the predictions are used to check performance
train_predict=model.predict(X_train)
test_predict=model.predict(X_test)

In [None]:
# Predictions for the training rows
train_predict

In [None]:
# Predictions for the test rows
test_predict

In [None]:
## Transform the predictions back to their original form.
# Both splits are inverted with scaler_target, the scaler that was fitted on the
# target column ('scale_target' is not a name that exists in this notebook).
train_predict = scaler_target.inverse_transform(np.array(train_predict).reshape((len(train_predict), 1)))
test_predict = scaler_target.inverse_transform(np.array(test_predict).reshape((len(test_predict), 1)))


In [None]:
# printing the first five predictions of each split. Both arrays are passed to
# print() explicitly: a bare expression in the middle of a cell is evaluated
# but not displayed.
print('Train Predictions:')
print(train_predict[0:5])
print('Test Predictions:')
print(test_predict[0:5])