In [None]:
import sys
# Record the interpreter version this run was executed with.
print(sys.version)

In [None]:
# Importing dependencies

import numpy as np
# The NumPy and TensorFlow seeds are set here, before any Keras module is imported.
np.random.seed(1)
# NOTE(review): `set_random_seed` is importable from the top-level `tensorflow`
# package only in TF 1.x; TF 2.x exposes the equivalent as `tf.random.set_seed`.
# Confirm the pinned TensorFlow version before upgrading.
from tensorflow import set_random_seed
set_random_seed(2)
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential, load_model
# NOTE(review): `keras.layers.core` / `keras.layers.recurrent` are legacy module
# paths; newer Keras releases expect `from keras.layers import Dense, LSTM` — confirm.
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras import optimizers
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
import datetime as dt
# Apply the 'ggplot' style to every matplotlib figure drawn below.
plt.style.use('ggplot')

### Data Preprocessing

In [None]:
# Setting up an early stop: watch the validation loss (lower is better) with a
# minimum-improvement threshold of 1e-4 and a patience of 80 epochs.
# NOTE: nothing in the visible notebook passes callbacks_list to training yet.
earlystop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=80,
    verbose=1,
)
callbacks_list = [earlystop]

In [None]:
# Loading the dataset (the commented-out line is an alternative local path)
url = 'https://raw.githubusercontent.com/ninja3697/dataset/master/CSV.csv'
#url = '../../CSV.csv'
df = pd.read_csv(
    url,
    index_col=0,       # first CSV column becomes the index ...
    parse_dates=True,  # ... and pandas is asked to parse it as dates
)
df.tail()

In [None]:
# Correlation matrix, narrowed to the column of correlations with 'Close'
corr_matrix = df.corr()
corr_matrix['Close']

In [None]:
# Summary statistics of Volume before cleaning
print(df.describe()['Volume'])

# Dropping rows with volume value 0 (in place, by index label)
zero_volume_labels = df.loc[df['Volume'] == 0].index
df.drop(index=zero_volume_labels, inplace=True)

# Distribution of the remaining volumes
df['Volume'].hist(bins=10)

In [None]:
# Hyper-parameter grid explored by the search cell further down.
timesteps = np.arange(30, 55, 10)                          # look-back window lengths
hl = [[units, units - 5] for units in range(30, 55, 10)]   # hidden-layer sizes, one list per candidate
lr = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]                        # learning rates
batch_size = [16, 32, 64]
num_epochs = [20]

In [None]:
#Build and train the model
def fit_model(train,val,timesteps,hl,lr,batch,epochs,callbacks=None):
    """Build a stacked LSTM regressor and train it on sliding windows.

    Parameters
    ----------
    train, val : 2-D arrays indexed as [row, feature]. Column 0 is the value
        being predicted; every column is used as an input feature.
    timesteps : number of consecutive rows in each input window.
    hl : non-empty sequence of unit counts, one per hidden LSTM layer
        (added after a first LSTM layer sized to the feature count).
    lr : learning rate handed to the Adam optimizer.
    batch : batch size used for training.
    epochs : number of training epochs.
    callbacks : optional list of Keras callbacks forwarded to ``model.fit``
        (for example an early-stopping callback). Defaults to None, which
        trains exactly as before.

    Returns
    -------
    (model, train_loss, val_loss) where the two losses are the per-epoch
    'loss' and 'val_loss' lists recorded by ``model.fit``.

    Raises
    ------
    ValueError
        If ``hl`` is empty, or if ``train`` or ``val`` has no more rows than
        ``timesteps`` (no window could be built from it).
    """
    # Fail early with a clear message instead of an IndexError deep inside
    # the model construction.
    if len(hl) == 0:
        raise ValueError('hl must contain at least one hidden layer size')
    for label, data in (('train', train), ('val', val)):
        if data.shape[0] <= timesteps:
            raise ValueError(
                '{} data has {} rows; need more than timesteps={} to build a window'
                .format(label, data.shape[0], timesteps))

    X_train, Y_train = _make_windows(train, timesteps)
    X_val, Y_val = _make_windows(val, timesteps)

    # Adding Layers to the model
    n_steps, n_features = X_train.shape[1], X_train.shape[2]
    model = Sequential()
    model.add(LSTM(n_features, input_shape=(n_steps, n_features), return_sequences=True,
                   activation='relu'))
    # Every hidden layer except the last returns the full sequence so that it
    # can feed the next LSTM layer.
    for units in hl[:-1]:
        model.add(LSTM(units, activation='relu', return_sequences=True))
    model.add(LSTM(hl[-1], activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer=optimizers.Adam(lr=lr), loss='mean_squared_error')

    # Training the data; shuffling is off so windows are fed in time order.
    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch,
                        validation_data=(X_val, Y_val), verbose=0,
                        shuffle=False, callbacks=callbacks)
    # NOTE(review): none of the layers above is created with stateful=True, so
    # this presumably has no effect; kept from the original — confirm.
    model.reset_states()
    return model, history.history['loss'], history.history['val_loss']


def _make_windows(data, timesteps):
    """Return (X, y): windows of `timesteps` consecutive rows and, for each, column 0 of the next row."""
    ends = range(timesteps, data.shape[0])
    X = np.array([data[end - timesteps:end] for end in ends])
    y = np.array([data[end][0] for end in ends])
    return X, y


In [None]:
# Evaluating the model
def evaluate_model(model,test,timesteps):
    """Score `model` on sliding windows cut from `test`.

    Each input is `timesteps` consecutive rows of `test`; the matching target
    is column 0 of the row that follows the window. Metrics are computed on
    the values exactly as they appear in `test`.

    Returns (rmse, r2, Y_test, Y_hat): root mean squared error, R-squared
    score, the true targets and the model's predictions.
    """
    window_ends = range(timesteps, test.shape[0])
    X_test = np.array([test[end - timesteps:end] for end in window_ends])
    Y_test = np.array([test[end][0] for end in window_ends])

    # Prediction Time !!!!
    Y_hat = model.predict(X_test)
    squared_error = mean_squared_error(Y_test, Y_hat)
    rmse = sqrt(squared_error)
    r2 = r2_score(Y_test, Y_hat)
    return rmse, r2, Y_test, Y_hat

In [None]:
# Plotting the predictions
def plot_data(Y_test,Y_hat):
    """Draw the actual values (red) and the predicted values (yellow) on one chart and show it."""
    for values, colour in ((Y_test, 'r'), (Y_hat, 'y')):
        plt.plot(values, c=colour)
    plt.title('Stock Prediction Graph using Multivariate-LSTM model')
    plt.xlabel('Day')
    plt.ylabel('Price')
    # Legend entries follow the order in which the lines were drawn above.
    plt.legend(['Actual', 'Predicted'], loc='lower right')
    plt.show()

In [None]:
# Plotting the training errors
def plot_error(train_loss,val_loss):
    """Draw the training loss (red) and validation loss (blue) curves and show the chart."""
    for losses, colour in ((train_loss, 'r'), (val_loss, 'b')):
        plt.plot(losses, c=colour)
    plt.ylabel('Loss')
    # Legend entries follow the order in which the lines were drawn above.
    plt.legend(['train', 'val'], loc='upper right')
    plt.show()

### Model 1

In [None]:
# Extracting the series: the columns picked for their high correlation.
# 'Close' is listed first, so it ends up in column 0 of the extracted frame.
selected_columns = ['Close', 'High', 'Volume']
series = df[selected_columns]
for summary in (series.shape, series.tail()):
    print(summary)

In [None]:
# Train Val Test Split: three consecutive, non-overlapping date ranges
train_start, train_end = dt.date(1997,1,1), dt.date(2006,12,31)
val_start, val_end = dt.date(2007,1,1), dt.date(2008,12,31)
test_start, test_end = dt.date(2009,1,1), dt.date(2010,12,31)

# Label-based slices on the index of `series`
train_data = series.loc[train_start:train_end]
val_data = series.loc[val_start:val_end]
test_data = series.loc[test_start:test_end]

print(train_data.shape, val_data.shape, test_data.shape)

In [None]:
# Normalisation: the scaler is fitted on the training split only and then
# applied unchanged to the validation and test splits.
sc = MinMaxScaler()
train = sc.fit_transform(train_data)
val, test = sc.transform(val_data), sc.transform(test_data)
print(train.shape, val.shape, test.shape)

In [None]:
# Grid search: train one model per hyper-parameter combination and keep the
# final-epoch training and validation losses of each run.
result_columns = ['Timestep','Hidden_Layers','Learning_Rate','Batch_Size','Train_Loss','Val_Loss']
grid = (
    (steps, layers, step_size, n_batch, n_epochs)
    for steps in timesteps
    for layers in hl
    for step_size in lr
    for n_batch in batch_size
    for n_epochs in num_epochs
)

results = list()
for t, l, rate, batch, epochs in grid:
    model, train_loss, val_loss = fit_model(train, val, t, l, rate, batch, epochs)
    results.append([t, l, rate, batch, train_loss[-1], val_loss[-1]])

pd.DataFrame(results, columns=result_columns).to_csv('Multivariate-3-LSTM_model1.csv')
        

In [None]:
# Disabled cell (the code is wrapped in a string literal, so none of it runs):
# trains a single Model 1 configuration, plots its loss curves, scores it on
# the test split and plots the predictions.
# NOTE(review): if enabled, it rebinds timesteps/hl/lr/batch_size/num_epochs
# from the grid lists defined earlier to single values, and evaluate_model is
# given a literal 42 rather than `timesteps`.
'''
timesteps = 42
hl = [35,30]
lr = 1.5e-5
batch_size = 32
num_epochs = 500
model,train_error,val_error = fit_model(train,val,timesteps,hl,lr,batch_size,num_epochs)
plot_error(train_error,val_error)
rmse, r2_value,true,predicted = evaluate_model(model,test,42)
print('R-Squared Score = {}'.format(r2_value))
plot_data(true,predicted)
'''

In [None]:
# Save a model
#model.save('model1.h5')

# Load a model
#model = load_model('model1.h5')

### Model 2
Convert Volume to a log scale and see how the results change.

In [None]:
# Disabled cell (string literal, not executed): adds a 'Volume_log' column
# holding the natural log of Volume, then prints its summary and histogram.
# Rows with Volume == 0 are dropped from df in an earlier cell; the log here
# relies on that cell having run first.
'''
# Converting Volume to log scale
df['Volume_log'] = np.log(df['Volume'])
print(df['Volume_log'].describe())
df['Volume_log'].hist(bins=20)
'''

In [None]:
# Disabled cell (string literal, not executed): same extraction as Model 1 but
# with 'Volume_log' in place of 'Volume'. It needs the 'Volume_log' column,
# which is only created by the disabled cell before it.
'''
# Extracting the series
series = df[['Close','High','Volume_log']] # Picking the multivariate series 
print(series.shape)
print(series.tail())
'''

In [None]:
# Disabled cell (string literal, not executed): repeats the Model 1
# train/val/test split with the same date boundaries, applied to whatever
# `series` currently holds.
'''
# Train Val Test Split
train_start = dt.date(1997,1,1)
train_end = dt.date(2006,12,31)
train_data = series.loc[train_start:train_end]

val_start = dt.date(2007,1,1)
val_end = dt.date(2008,12,31)
val_data = series.loc[val_start:val_end]

test_start = dt.date(2009,1,1)
test_end = dt.date(2010,12,31)
test_data = series.loc[test_start:test_end]

print(train_data.shape,val_data.shape,test_data.shape)
'''

In [None]:
# Disabled cell (string literal, not executed): repeats the Model 1
# normalisation — a new MinMaxScaler fitted on train_data and applied to
# val_data and test_data. If enabled it overwrites sc, train, val and test.
'''
# Normalisation
sc = MinMaxScaler()
train = sc.fit_transform(train_data)
val = sc.transform(val_data)
test = sc.transform(test_data)
print(train.shape,val.shape,test.shape)
'''

In [None]:
# Disabled cell (string literal, not executed): grid search for Model 2 over
# hl, lr, batch_size and num_epochs with the window length fixed to timesteps[1].
# NOTE(review): `timesteps[1]` only works while `timesteps` is still the array
# from the hyper-parameter cell; the disabled single-run cells rebind it to an int.
'''
results = list()
for l in hl:
    for rate in lr:
        for batch in batch_size:
            for epochs in num_epochs:
                model,train_loss,val_loss = fit_model(train,val,timesteps[1],l,rate,batch,epochs)
                results.append([timesteps[1],l,rate,batch,train_loss[-1],val_loss[-1]])
pd.DataFrame(results,columns=['Timestep','Hidden_Layers','Learning_Rate','Batch_Size','Train_Loss','Val_Loss']).to_csv('Multivariate-3-LSTM_1.csv')
'''    

In [None]:
# Disabled cell (string literal, not executed): trains a single Model 2
# configuration, plots its loss curves, scores it on the test split and plots
# the predictions.
# NOTE(review): the model is trained with timesteps = 40 but evaluate_model is
# called with a literal 42, so the test windows would not match the training
# window length — pass `timesteps` instead before enabling this cell.
'''
timesteps = 40
hl = [40,35]
lr = 0.001
batch_size = 64
num_epochs = 500
model,train_error,val_error = fit_model(train,val,timesteps,hl,lr,batch_size,num_epochs)
plot_error(train_error,val_error)
rmse, r2_value,true,predicted = evaluate_model(model,test,42)
print('R-Squared Score = {}'.format(r2_value))
plot_data(true,predicted)
'''

In [None]:
# Save a model
#model.save('model_LSTM_30_4035_1e-3.h5')
#del model # Deletes the model
# Load a model
#model = load_model('model2.h5')