In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.optimizers import SGD 
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
import itertools
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.linear_model import LinearRegression
import time


In [None]:
def DataProcessing(data, prev_per, forw_per,jump=1):
    """Cut a 2-D array into sliding-window (features, target) samples.

    Every column of ``data`` except the last is treated as a feature; the
    last column is the target series.

    Parameters
    ----------
    data : 2-D array supporting ``data[:, :-1]`` / ``data[:, -1]`` slicing
        Rows are consecutive observations.
    prev_per : int
        Number of consecutive rows placed in each feature window.
    forw_per : int
        Number of target values taken immediately after each window.
    jump : int, default 1
        Step between the start rows of successive windows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` stacks the feature windows (``prev_per`` rows each) and ``Y``
        stacks the matching target slices (``forw_per`` values each). Both
        are empty 1-D arrays when ``data`` is too short for a single window.
    """
    features = data[:, :-1]
    target = data[:, -1]

    # A window starting at row s needs rows s .. s + prev_per + forw_per - 1.
    stop = len(data) - prev_per - forw_per + 1
    starts = range(0, stop, jump)

    windows = [features[s:s + prev_per] for s in starts]
    horizons = [target[s + prev_per:s + prev_per + forw_per] for s in starts]
    return np.array(windows), np.array(horizons)

In [None]:
# --- Experiment configuration -------------------------------------------------
# CSV files to process; each must provide 'Date', 'Time' and 'Close' columns.
names=['AAPL.txt','GS.txt','EBAY.txt','CSCO.txt']
# Look-back window lengths (rows of past features fed to the LSTM).
prev_per = [1,2,3,4,5,10]
# Unit counts tried for the first and the second LSTM layer.
FirstLayer = [5,15,25,50]
SecondLayer = [1,5,15,25]
# Epoch counts tried for training.
n_epoch = [25]

# File name -> (long name used in titles/prints, short name used in the CSV name).
TICKER_NAMES = {
    'AAPL.txt': ('Apple Inc.', 'Apple'),
    'PG.txt': ('The Procter & Gamble Company', 'PG'),
    # NOTE(review): CVX is usually Chevron's ticker, not CVS Health's — the
    # label is kept as it was; confirm which company this file holds.
    'CVX.txt': ('CVS Health Corporation', 'CVX'),
    'GS.txt': ('The Goldman Sachs Group, Inc.', 'GS'),
    'EBAY.txt': ('eBay Inc.', 'EBAY'),
    'CSCO.txt': ('Cisco Systems, Inc.', 'CSCO'),
}


for name in names:

    # Raise KeyError for an unregistered file instead of silently reusing the
    # names left over from the previous ticker (or hitting a NameError).
    long_name, short_name = TICKER_NAMES[name]

    # Read the data; 'Date' and 'Time' are merged into one datetime column 'dt'.
    all_data = pd.read_csv(name, sep=",",parse_dates={'dt' : ['Date', 'Time']},infer_datetime_format=True)
    # Index by timestamp and keep only the close price. The explicit copy makes
    # the column assignments below operate on an independent frame.
    all_data.index = all_data['dt']
    all_data = all_data[['Close']].copy()
    # Log of the close price.
    all_data['Log_Close']=np.log(all_data['Close'])
    # Log returns: difference between consecutive rows.
    all_data['Returns']=all_data.Log_Close.diff()
    # Drop the first row, whose return is NaN.
    all_data = all_data.iloc[1:].copy()

    plt.figure()
    plt.plot(all_data['Returns'])
    plt.title('Returns of ' + long_name)
    plt.show()

    # Daily realized volatility: square root of the per-day sum of squared returns.
    all_data['Returns^2']=np.square(all_data['Returns'])
    all_data_d=all_data.groupby(by=[all_data.index.year,all_data.index.month,all_data.index.day]).sum()
    data_RV=pd.DataFrame({'RVol_d': np.sqrt(all_data_d['Returns^2'])})
    data_RV.index.names=['Year','Month','Day']

    # Rolling means of the daily series over 5 and 22 days; the first 22 days
    # are dropped afterwards so the remaining rows use complete windows.
    data_RV['RV_w']=data_RV['RVol_d'].rolling(min_periods=1, window=5).mean()
    data_RV['RV_m']=data_RV['RVol_d'].rolling(min_periods=1, window=22).mean()
    data_RV=data_RV.iloc[22:].copy()

    # Target column: next day's volatility. The last row has no target.
    data_RV['RVol_d+1']=data_RV.RVol_d.shift(-1)
    data_RV = data_RV.iloc[:-1]

    # Replace the (Year, Month, Day) index by real dates so plots get a time axis.
    # data_RV_1 is the same object as data_RV (kept as an alias, as before).
    data_RV_1=data_RV
    data_RV_1.index=pd.to_datetime(pd.DataFrame(data_RV_1.index.values.tolist(), columns=['year','month','day']))

    # Single unlabeled series: no legend is requested, since asking for one
    # with no labeled artists only produces a warning.
    plt.figure(figsize = (15,10))
    plt.plot(data_RV_1['RVol_d'],label='_nolegend_')
    plt.ylabel('RVol day', size=15)
    plt.xlabel('Time', size=15)
    plt.title('Realized Volatility of ' + long_name)
    plt.show()

    # Columns: RVol_d, RV_w, RV_m (features) and RVol_d+1 (target, last column).
    data=data_RV.values.reshape(data_RV.shape[0],4)

    # Number of predicting periods
    forw_per = 1
    # Number of out-of-sample predictions: the last 25% of the days.
    num_per = int(len(data_RV)*0.25)
    # First row of the test period; identical for every look-back length.
    div = len(data) - num_per*forw_per

    N_Per=[]
    NL1=[]
    NL2=[]
    N_Epochs=[]
    RMSE=[]
    MSE_VAR=[]
    Predictions=[]
    Elapsed=[]

    for prev in prev_per:
        # Prepend `prev` rows so the first test window ends right before `div`;
        # this yields exactly num_per test samples whatever `prev` is.
        data_test = data[div-prev:]
        data_train=data[:div]

        x_test,y_test = DataProcessing(data_test,prev,forw_per,forw_per)
        y_test = np.array([list(a.ravel()) for a in y_test])

        x,y = DataProcessing(data_train,prev,forw_per)
        y = np.array([list(a.ravel()) for a in y])

        # Chronological split (no shuffling): the last 20% is the validation set.
        x_train, x_val, y_train, y_val = train_test_split(x, y, train_size=0.8, test_size=0.2, shuffle=False)

        for L1 in FirstLayer:
            for L2 in SecondLayer:
                for epoch in n_epoch:
                    # Release the backend state of previously built models so
                    # memory does not grow across the grid. Done before the
                    # timer starts so the measurement is unaffected.
                    keras.backend.clear_session()

                    start = time.time()
                    model = Sequential()
                    model.add(LSTM(L1,input_shape=(prev,3), return_sequences=True))
                    # Only the first layer of a Sequential model takes an input
                    # shape; later layers infer theirs from the previous layer.
                    model.add(LSTM(L2))
                    model.add(Dense(forw_per))
                    model.compile(loss='mean_squared_error', optimizer='adam')

                    history = model.fit(x_train,y_train, epochs=epoch, batch_size=100,
                                        validation_data=(x_val, y_val))
                    end = time.time()
                    time_elapsed=end-start
                    y_pred = model.predict(x_test)
                    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
                    # Error measured on squared values (variance scale).
                    mse_var=mean_squared_error(np.square(y_test), np.square(y_pred))

                    N_Per.append(prev)
                    NL1.append(L1)
                    NL2.append(L2)
                    N_Epochs.append(epoch)
                    Elapsed.append(time_elapsed)
                    RMSE.append(rmse)
                    MSE_VAR.append(mse_var)
                    Predictions.append(y_pred)
                    del history
                    del model

    # One row per trained configuration.
    Results=pd.DataFrame({
        'Previous Periods': N_Per,
        'Number Layers 1': NL1,
        'Number Layers 2': NL2,
        'Epochs': N_Epochs,
        'Time elapsed': Elapsed,
        'RMSE': RMSE,
        'MSE_VAR': MSE_VAR,
    })

    csv_name= 'Results_'+short_name + ".csv"

    Results.to_csv(csv_name, sep=',',index=False)

    # Best configuration = lowest test RMSE; columns are looked up by name so
    # the selection does not depend on column order.
    index_min_rmse = Results['RMSE'].argmin()
    opt_rmse=Results['RMSE'].iloc[index_min_rmse]
    opt_mse_var=Results['MSE_VAR'].iloc[index_min_rmse]
    opt_y_pred=Predictions[index_min_rmse]

    #HAR MODEL
    # Regress next-day volatility on the daily, weekly AND monthly components
    # (columns 0..2). The slice must be 0:3 — 0:2 would drop RV_m.
    x_HAR_train=data_RV.iloc[:-num_per,0:3]
    y_HAR_train=data_RV.iloc[:-num_per,3]
    x_HAR_test=data_RV.iloc[-num_per:,0:3]
    y_HAR_test=data_RV.iloc[-num_per:,3]

    HAR_model=LinearRegression()
    HAR_model.fit(x_HAR_train,y_HAR_train)
    y_HAR_pred = HAR_model.predict(x_HAR_test)
    rmse_HAR = np.sqrt(mean_squared_error(y_HAR_test, y_HAR_pred))
    mse_var_HAR=mean_squared_error(np.square(y_HAR_test), np.square(y_HAR_pred))
    print('Test RMSE LSTM ' + long_name+' :', opt_rmse)
    print('Test RMSE HAR ' + long_name+' :', rmse_HAR)
    print('Test MSE_VAR LSTM ' + long_name+' :', opt_mse_var)
    print('Test MSE_VAR HAR ' + long_name+' :', mse_var_HAR)

    # Actual vs. predicted volatility over the test period. y_test comes from
    # the last look-back iteration; the test targets are the same for all of them.
    plt.figure(figsize = (15,10))
    aa=list(range(num_per))
    plt.plot(aa, y_test[:num_per], marker='.', label="actual")
    plt.plot(aa, opt_y_pred[:num_per], 'r', label="prediction LSTM")
    plt.plot(aa, y_HAR_pred[:num_per], 'y', label="prediction HAR")
    plt.ylabel('RVol day', size=15)
    plt.xlabel('Time', size=15)
    plt.legend(fontsize=15)
    plt.title('Prediction of ' + long_name)
    plt.show()
