In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
np.random.seed(1)
import tensorflow
from sklearn import preprocessing
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras import optimizers
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
import datetime as dt
import time
plt.style.use('ggplot')

In [2]:
data = yf.download('CL=F', start ='2000-08-23' , end='2021-04-22')
values = data.values
data.tail()

sa_news = pd.read_csv("sa_twitter.csv")
sa_news = sa_news[(9890-5155):]
print(len(data))
print(len(sa_news))

boo = sa_news["polarity"].to_list()

data["SA"] = boo

data.tail()

[*********************100%***********************]  1 of 1 completed
5155
5155


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,SA
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-04-15,62.830002,63.57,62.529999,63.459999,63.459999,243370,-0.7
2021-04-16,63.32,63.880001,62.830002,63.130001,63.130001,87795,0.0
2021-04-19,62.98,63.630001,62.630001,63.380001,63.380001,74384,0.17
2021-04-20,63.470001,64.25,61.5,62.439999,62.439999,476046,-0.7
2021-04-21,62.389999,62.560001,60.860001,61.349998,61.349998,431486,-0.7


In [3]:
data.corr()['Close']

Open         0.998137
High         0.998921
Low          0.999343
Close        1.000000
Adj Close    1.000000
Volume       0.085182
SA          -0.131514
Name: Close, dtype: float64

In [4]:
print(data.describe().Volume) 
data.drop(data[data['Volume']==0].index, inplace = True)

count    5.155000e+03
mean     2.988259e+05
std      2.224149e+05
min      0.000000e+00
25%      1.089310e+05
50%      2.547240e+05
75%      3.993110e+05
max      2.288230e+06
Name: Volume, dtype: float64


In [5]:
n = EarlyStopping(monitor='val_loss', min_delta=0.0001, patience=80,  verbose=1, mode='min')
call = [n]

In [6]:
#Build and train the model
def fitting(train,val,timesteps,hl,lr,batch,epochs):
    X_train = []
    Y_train = []
    X_val = []
    Y_val = []
  
    # Loop for training data
    for i in range(timesteps,train.shape[0]):
        X_train.append(train[i-timesteps:i])
        Y_train.append(train[i][0])
    X_train,Y_train = np.array(X_train),np.array(Y_train)
  
    # Loop for val data
    for i in range(timesteps,val.shape[0]):
        X_val.append(val[i-timesteps:i])
        Y_val.append(val[i][0])
    X_val,Y_val = np.array(X_val),np.array(Y_val)
    
    # Adding Layers to the model
    model = Sequential()
    model.add(LSTM(X_train.shape[2],input_shape = (X_train.shape[1],X_train.shape[2]),return_sequences = True,
                   activation = 'relu'))
    for i in range(len(hl)-1):        
        model.add(LSTM(hl[i],activation = 'relu',return_sequences = True))
    model.add(LSTM(hl[-1],activation = 'relu'))
    model.add(Dense(1))
    model.compile(optimizer = optimizers.Adam(lr = lr), loss = 'mean_squared_error')
    #print(model.summary())
  
    # Training the data
    history = model.fit(X_train,Y_train,epochs = epochs,batch_size = batch,validation_data = (X_val, Y_val),verbose = 0,
                        shuffle = False, callbacks=call)
    model.reset_states()
    return model, history.history['loss'], history.history['val_loss']

In [7]:
def eval(model_setup,test,step):
    X_test = []
    Y_test = []

    for i in range(step,test.shape[0]):
        X_test.append(test[i-step:i])
        Y_test.append(test[i][0])
    X_test,Y_test = np.array(X_test),np.array(Y_test)
    Y_hat = model_setup.predict(X_test)

In [8]:
# Plotting the predictions
def plot_data(Y_test,Y_hat):
    plt.plot(Y_test,c = 'r')
    plt.plot(Y_hat,c = 'y')
    plt.xlabel('Day')
    plt.ylabel('Price')
    plt.title('Prediction using Multivariate-LSTM with Twitter model')
    plt.legend(['Actual','Predicted'],loc = 'lower right')
    plt.show()


In [9]:
# Plotting the training errors
def plot_error(train_loss,val_loss):
    plt.plot(train_loss,c = 'r')
    plt.plot(val_loss,c = 'b')
    plt.ylabel('Loss')
    plt.legend(['train','val'],loc = 'upper right')
    plt.show()

In [10]:
# Extracting the series
series = data[['Close','High','SA']] # Picking the series with high correlation
print(series.shape)
print(series.tail())

(5149, 3)
                Close       High    SA
Date                                  
2021-04-15  63.459999  63.570000 -0.70
2021-04-16  63.130001  63.880001  0.00
2021-04-19  63.380001  63.630001  0.17
2021-04-20  62.439999  64.250000 -0.70
2021-04-21  61.349998  62.560001 -0.70


In [11]:
train_data = data[0:int(len(data)*0.6)]
val_data = data[int(len(data)*0.6): int(len(data)*0.8)]
test_data = data[int(len(data)*0.8):]

print(train_data.shape,val_data.shape,test_data.shape)

(3089, 7) (1030, 7) (1030, 7)


In [12]:
# Normalisation
sc = MinMaxScaler()
train = sc.fit_transform(train_data)
val = sc.transform(val_data)
test = sc.transform(test_data)
print(train.shape,val.shape,test.shape)

(3089, 7) (1030, 7) (1030, 7)


In [13]:
step = 50
hl = [40,35]
lr = 1e-3
batch_size = 64
num_epochs = 250

In [None]:
model_setup,train_error,val_error = fitting(train,val,step,hl,lr,batch_size,num_epochs)
plot_error(train_error,val_error)

In [None]:
mse, rmse, r2_value,true,predicted = evaluate_model(model,test,timesteps)
print('MSE = {}'.format(mse))
print('RMSE = {}'.format(rmse))
print('R-Squared Score = {}'.format(r2_value))
plot_data(true,predicted)

In [None]:
f= open("twitter-pred-multi-lstm.txt","w")

for i in range(0,len(predicted)):
    f.write(str(predicted[i][0]) +",")
    
f.close()