<a href="https://colab.research.google.com/github/pangxiangyang/StockPricePrediction/blob/master/CNN_LSTM_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from numpy import array
from numpy import hstack
import pandas as pd
import numpy as np
import numpy as np
import keras
import tensorflow
from numpy import mean
from numpy import dstack
from pandas import read_csv
from keras.models import Sequential, load_model
from keras.layers import Dense, TimeDistributed, RepeatVector, Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import LSTM
import matplotlib.pyplot as plt
import time
from sklearn import metrics
from sklearn import preprocessing
import csv
from sklearn.metrics import mean_squared_error
from math import sqrt
from decimal import *
from keras.utils import plot_model

Using TensorFlow backend.


In [0]:
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps_in, n_steps_out):
    """Cut a 2-D multivariate series into overlapping input/target windows.

    Every sample pairs ``n_steps_in`` consecutive rows (all columns) with the
    last-column values of the ``n_steps_out`` rows that directly follow them.
    Windows that would run past the end of ``sequences`` are not produced.

    Returns:
        ``(X, y)`` as numpy arrays built from the collected windows.
    """
    total_rows = len(sequences)
    window = n_steps_in + n_steps_out
    # Number of start rows for which the whole input+output window still fits.
    n_samples = min(total_rows, max(total_rows - window + 1, 0))
    starts = range(n_samples)

    inputs = [sequences[s:s + n_steps_in, :] for s in starts]
    targets = [sequences[s + n_steps_in:s + window, -1] for s in starts]
    return array(inputs), array(targets)

In [0]:
def makePrediction( model, Xtest, ytest, priceScaler, isCNNLSTM= True ):
    """Score a fitted model on the test windows and simulate a simple trading rule.

    The rule buys at the actual price of sample ``i`` whenever the price
    predicted for sample ``i + 1`` is more than 1% higher, and sells at the
    actual price of sample ``i + 1``. Prices are rounded with ``customRound``.

    Args:
        model: fitted model exposing ``predict(x, verbose=0)``.
        Xtest: test inputs indexed as (samples, timesteps, features).
        ytest: scaled test targets, one value per sample.
        priceScaler: fitted scaler; its ``inverse_transform`` converts scaled
            values back to prices.
        isCNNLSTM: when true, ``Xtest`` is reshaped into sub-sequences of
            7 timesteps before being passed to the model.

    Returns:
        ``(rmse, averageprofit, totalprofit, predictionPrices, actualPrices)``.
        ``rmse`` is computed on the scaled values; the profits are percentages.
        ``averageprofit`` is 0.0 when no trade was made.
    """
    if isCNNLSTM:
        # Take the window length from the data itself rather than from a
        # module-level ``n_steps_in``, which only exists after GetData has
        # been unpacked at top level.
        n_length = 7
        n_steps = Xtest.shape[1] // n_length
        x_input = Xtest.reshape((Xtest.shape[0], n_steps, n_length, Xtest.shape[2]))
    else:
        x_input = Xtest
    yhat = model.predict(x_input, verbose=0)

    # RMSE on the scaled values
    scaledActual = ytest.reshape(ytest.shape[0], 1)
    rmse = sqrt(mean_squared_error(scaledActual, yhat))

    # Back to real prices for the profit check
    predictionPrices = priceScaler.inverse_transform(yhat).flatten()
    actualPrices = priceScaler.inverse_transform(scaledActual).flatten()

    buyPrices = []
    sellPrices = []
    for entryPrice, predictedNext, actualNext in zip(
            actualPrices[:-1], predictionPrices[1:], actualPrices[1:]):
        buyPrice = customRound(entryPrice)
        if buyPrice <= 0:
            # A price that rounds to zero cannot be used as a return denominator.
            continue
        predictedExitPrice = customRound(predictedNext)
        # Decimal arithmetic on the rounded prices avoids binary float noise
        # right at the 1% threshold.
        buyDecimal = Decimal(str(buyPrice))
        expectedReturn = (Decimal(str(predictedExitPrice)) - buyDecimal) / buyDecimal
        if expectedReturn > 0.01:  # Only trade if profitability > 1%
            buyPrices.append(buyPrice)
            sellPrices.append(customRound(actualNext))

    buys, sells = array(buyPrices), array(sellPrices)
    returnsPct = (sells - buys) * 100 / buys
    # With no trades the mean is undefined; report 0.0 so one idle run does
    # not turn the averages computed by the callers into NaN.
    averageprofit = np.average(returnsPct) if len(returnsPct) else 0.0
    totalprofit = returnsPct.sum()

    print( 'Average profit percentage', round( averageprofit, 3 ) )
    print( 'Total profit percentage', round( totalprofit, 3 ) )
    print( 'Trade made', len(sellPrices) )
    print ( 'Positive trade', np.count_nonzero(sells > buys) )
    return rmse, averageprofit, totalprofit, predictionPrices, actualPrices

In [0]:
def customRound( value):
    """Round a price to its tick size.

    Values above 1.00 are rounded to two decimals; all other values are
    rounded to the nearest multiple of 1/200 (0.005).
    """
    if not (value > 1.00):
        return round(value * 200) / 200
    return round(value, 2)

In [0]:
def GetData(filepath):
    """Load one stock CSV and build scaled train/test windows for the models.

    The file is read with pandas, extended by ``AddTechnicalIndicators`` and
    stripped of rows containing NaN. Each feature column is min-max scaled
    independently, the price column is scaled with the scaler returned by
    ``GetPriceScaler``, and the columns are stacked with the price last so
    that ``split_sequences`` uses it as the prediction target.

    Rows before the last row whose 'Unnamed: 0' value equals the cutoff date
    form the training set; rows from that row onward (excluding the final row
    of the file) form the test set.

    Args:
        filepath: path of the CSV file to load.

    Returns:
        ``(X, y, Xtest, ytest, n_steps_in, n_steps_out, priceScaler)`` where
        ``y`` and ``ytest`` are reshaped to (samples, n_steps_out, 1).

    Raises:
        ValueError: if no remaining row carries the cutoff date.
    """
    cutoffdate = 20200122
    # choose a number of input / output time steps
    n_steps_in, n_steps_out = 14, 1
    # Column order defines the feature order seen by the models.
    featureColumns = ['TotalDemand', 'TotalSupply', 'BuyUpTrade', 'SellDownTrade', 'Popularity',
                      'MA14', 'MA28', 'RSI14', 'EMA14', 'EMA28']
    priceColumn = 'Last Done Price'

    df = pd.read_csv( filepath )
    # Add MA14, MA28, RSI14, EMA14, EMA28
    df = AddTechnicalIndicators( df, 14 )
    df.dropna(inplace= True )

    # dropna keeps the original index labels, while the numpy arrays below are
    # positional, so the split point must be the row *position* of the cutoff
    # date, not its label.
    cutoffPositions = np.flatnonzero( array( df['Unnamed: 0'] == cutoffdate ) )
    if len(cutoffPositions) == 0:
        raise ValueError( "cutoff date %s not found in %s" % (cutoffdate, filepath) )
    index = int( cutoffPositions[-1] )

    # Data normalization, one scaler per column.
    # NOTE(review): every scaler is fitted on the full series, test rows
    # included — confirm this look-ahead is acceptable for the evaluation.
    scaledColumns = [
        preprocessing.MinMaxScaler().fit_transform( array( df[column] ).reshape(-1, 1) )
        for column in featureColumns
    ]
    lastDonePrice = array( df[priceColumn] ).reshape(-1, 1)
    priceScaler = GetPriceScaler( lastDonePrice )
    scaledColumns.append( priceScaler.transform( lastDonePrice ) )

    # horizontally stack columns: [rows, features + price]
    fullDataset = hstack( scaledColumns )

    # Train dataset: convert into input/output windows
    X, y = split_sequences( fullDataset[0:index], n_steps_in, n_steps_out )
    y = y.reshape( y.shape[0], y.shape[1], 1 )

    # Test dataset; the final row is left out, as before — TODO confirm intent.
    Xtest, ytest = split_sequences( fullDataset[index:len(fullDataset) - 1], n_steps_in, n_steps_out )
    ytest = ytest.reshape( ytest.shape[0], ytest.shape[1], 1 )

    return X,y, Xtest, ytest, n_steps_in, n_steps_out, priceScaler

In [0]:
def AddTechnicalIndicators( df, n ):
    """Add moving-average, RSI and EMA columns derived from 'Last Done Price'.

    New columns: MA14, MA28, Price Difference, Gain, Loss, AverageGain,
    AverageLoss, RS, RSI14, EMA14 and EMA28. The frame is modified in place.

    Args:
        df: DataFrame with a 'Last Done Price' column. The ``df.loc[n:]``
            assignments are label-based, so a default 0..N-1 index is assumed.
        n: smoothing period passed to ``rma`` for the RSI averages.

    Returns:
        The same DataFrame, with the indicator columns added.
    """
    price = df['Last Done Price']

    # Add MA14, MA28
    df["MA14"] = price.rolling(window=14).mean()
    df["MA28"] = price.rolling(window=28).mean()

    # Add RSI
    change = price.diff()
    df['Price Difference'] = change
    df['Gain'] = change.mask( change < 0, 0.000 )
    # Losses are stored as positive magnitudes; with signed (negative) losses
    # RS becomes negative and the RSI formula leaves the 0..100 range.
    df['Loss'] = change.mask( change > 0, 0.000 ).abs()
    # Seed each running average with the mean of the first n changes
    # (row 0 is NaN from diff() and is skipped by mean()).
    df.loc[n:,'AverageGain'] = rma( df['Gain'][n+1:].values, n, df.loc[:n, 'Gain'].mean())
    df.loc[n:,'AverageLoss'] = rma( df['Loss'][n+1:].values, n, df.loc[:n, 'Loss'].mean())
    df['RS'] = df['AverageGain'] / df['AverageLoss']
    df['RSI14'] = 100 - (100 / (1 + df['RS'] ) )

    # Add EMA, computed on the price series only instead of smoothing every
    # column of the frame and then discarding all but one.
    df['EMA14'] = price.ewm(span = 14).mean()
    df['EMA28'] = price.ewm(span = 28).mean()
    return df

In [0]:
# Calculate moving average
# Source: https://stackoverflow.com/questions/57006437/calculate-rsi-indicator-from-pandas-dataframe
def rma(x, n, y0):
    """Return the running moving average of ``x`` with period ``n``.

    The series follows ``out[0] = y0`` and
    ``out[k] = ((n - 1) * out[k - 1] + x[k - 1]) / n``.

    The recurrence is evaluated directly. The equivalent closed form divides
    by powers of ``(n - 1) / n``, which underflow to zero on long inputs and
    turn the leading results into NaN.

    Args:
        x: 1-D sequence of values to smooth.
        n: smoothing period.
        y0: seed value used as the first element of the result.

    Returns:
        A float numpy array of length ``len(x) + 1``.
    """
    values = np.asarray(x, dtype=float)
    decay = (n - 1) / n
    out = np.empty(len(values) + 1, dtype=float)
    out[0] = y0
    for k, value in enumerate(values, start=1):
        out[k] = decay * out[k - 1] + value / n
    return out

In [0]:
def GetPriceScaler( values ):
    """Return a MinMaxScaler fitted on ``values``."""
    scaler = preprocessing.MinMaxScaler()
    scaler.fit( values )
    return scaler

In [0]:
# define model
# Normal stacked
def LSTM_Model(X,y, Xtest, ytest, priceScaler, n_steps_in, n_steps_out):
    """Train and evaluate a two-layer stacked LSTM.

    The model is rebuilt and refitted ``repeat`` times for ``epoch`` epochs
    each (both read from module level). Every run is scored with
    ``makePrediction``, and the mean RMSE and profits over all runs are
    printed.

    Returns:
        The model from the last run.
    """
    targets = y.squeeze()
    featureCount = X.shape[2]
    runScores = []  # one (rmse, averageprofit, totalprofit) tuple per run

    for _ in range(repeat):
        model = Sequential([
            LSTM(100, activation='relu', return_sequences=True, input_shape=(n_steps_in, featureCount)),
            LSTM(100, activation='relu'),
            Dense(n_steps_out),
        ])
        model.compile(optimizer='adam', loss='mse')
        # fit model
        model.fit(X, targets, epochs=epoch, batch_size=32, verbose=0)
        model.summary()
        # prediction and results
        rmse, averageprofit, totalprofit = makePrediction(model, Xtest, ytest, priceScaler, False)[:3]
        runScores.append( (rmse, averageprofit, totalprofit) )

    print( 'Average RMSE:', np.average( [score[0] for score in runScores] ) )
    print( 'Total profit:', np.average( [score[2] for score in runScores] ) )
    print( 'average profit:', np.average( [score[1] for score in runScores] ) )
    return model

In [0]:
# define model mode
# CNN
def CNN_Model(X,y,n_steps_in, Xtest, ytest, priceScaler):
    """Train and evaluate a 1-D convolutional model.

    The model is rebuilt and refitted ``repeat`` times for ``epoch`` epochs
    each (both read from module level). Every run is scored with
    ``makePrediction``, and the mean RMSE and profits over all runs are
    printed.

    Returns:
        The model from the last run.
    """
    targets = y.squeeze()
    featureCount = X.shape[2]
    runScores = []  # one (rmse, averageprofit, totalprofit) tuple per run

    for _ in range(repeat):
        model = Sequential([
            Conv1D(64, 2, activation='relu', input_shape=(n_steps_in, featureCount)),
            Conv1D(64, 2, activation='relu'),
            MaxPooling1D(),
            Flatten(),
            Dense(50, activation='relu'),
            Dense(1),
        ])
        model.compile(optimizer='adam', loss='mse')
        # fit model
        model.fit(X, targets, batch_size=32, epochs=epoch, verbose=0)
        rmse, averageprofit, totalprofit = makePrediction(model, Xtest, ytest, priceScaler, False)[:3]
        runScores.append( (rmse, averageprofit, totalprofit) )

    print( 'Average RMSE:', np.average( [score[0] for score in runScores] ) )
    print( 'Total profit:', np.average( [score[2] for score in runScores] ) )
    print( 'average profit:', np.average( [score[1] for score in runScores] ) )
    return model

In [0]:
# define model 
# CNN+LSTM
def CNN_LSTM_Model(X,y,n_steps_in, Xtest, ytest, priceScaler):
    """Train and evaluate a CNN feature extractor followed by stacked LSTMs.

    Each input window is split into ``n_steps_in / 7`` sub-sequences of
    7 timesteps. The convolution, pooling and flatten layers are applied to
    every sub-sequence through ``TimeDistributed``, and the resulting sequence
    is fed to two LSTM layers and a dense head.

    The model is rebuilt and refitted ``repeat`` times for ``epoch`` epochs
    each (both read from module level). Every run is scored with
    ``makePrediction``, and the mean RMSE and profits over all runs are
    printed.

    Returns:
        The model from the last run.
    """
    n_features = X.shape[2]
    # reshape data into time steps of sub-sequences
    n_steps, n_length = int(n_steps_in/7 ), 7 # come from 14 input timestep
    n_filters, n_kernel = 64, 2
    n_nodes = 100
    x1 = X.reshape((X.shape[0], n_steps, n_length, n_features))
    y1 = y.squeeze()
    # (sub-sequences, timesteps per sub-sequence, features); the number of
    # sub-sequences is left open.
    inputshape = (None, n_length, n_features)
    rmselist = []
    averageprofits = []
    totalprofits = []

    for i in range(repeat):
        model = Sequential()
        # The input shape belongs to the TimeDistributed wrapper: given to the
        # wrapped Conv1D it has the wrong rank and never reaches Sequential,
        # leaving the model unbuilt when plot_model is called below.
        model.add(TimeDistributed(Conv1D(n_filters, n_kernel, activation='relu'), input_shape=inputshape))
        model.add(TimeDistributed(Conv1D(n_filters, n_kernel, activation='relu')))
        model.add(TimeDistributed(MaxPooling1D()))
        model.add(TimeDistributed(Flatten()))
        model.add(LSTM(n_nodes,  return_sequences=True))
        model.add(LSTM(n_nodes))
        model.add(Dense(n_nodes, activation='relu'))
        model.add(Dense(1))
        model.compile(loss='mse', optimizer='adam', metrics=['mae', 'mse'])

        # plot model
        plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

        # fit
        model.fit(x1, y1, batch_size=32, epochs=epoch, verbose=0)
        rmse, averageprofit, totalprofit, predictionPrices, actualPrices = makePrediction(model, Xtest, ytest, priceScaler, True)
        averageprofits.append( averageprofit )
        totalprofits.append( totalprofit)
        rmselist.append( rmse)

    print( 'Average RMSE:', np.average(  rmselist ) )
    print( 'Total profit:', np.average(  totalprofits ) )
    print( 'average profit:',np.average(  averageprofits ) )
    return model

In [0]:
def getFilePaths():
    """Return a new list with the CSV path of every stock to process."""
    return [
        "/content/drive/My Drive/FinalizedData/MAYBANK.csv",
        "/content/drive/My Drive/FinalizedData/REVENUE.csv",
        "/content/drive/My Drive/FinalizedData/HIBISCS.csv",
        "/content/drive/My Drive/FinalizedData/AIRASIA.csv",
        "/content/drive/My Drive/FinalizedData/EKOVEST.csv",
        "/content/drive/My Drive/FinalizedData/FGV.csv",
        "/content/drive/My Drive/FinalizedData/GENM.csv",
        "/content/drive/My Drive/FinalizedData/JAKS.csv",
        "/content/drive/My Drive/FinalizedData/TOPGLOVE.csv",
        "/content/drive/My Drive/FinalizedData/DIGI.csv",
        "/content/drive/My Drive/FinalizedData/PMETAL.csv",
        "/content/drive/My Drive/FinalizedData/PENTA.csv",
    ]

In [0]:
# Training configuration. LSTM_Model, CNN_Model and CNN_LSTM_Model read both
# names as module-level globals. The trailing comments presumably record the
# values used for full runs — TODO confirm.
epoch = 1;#200
repeat = 1;#5

# For every configured stock file: build the datasets, then train, evaluate
# and save one model of each architecture. Everything assigned in this loop
# stays bound at module level after the loop ends.
filepaths = getFilePaths()
for filepath in filepaths:
  X,y,Xtest,ytest,n_steps_in, n_steps_out, priceScaler = GetData(filepath) # Get data
  # Name used in the printed header and in the saved model paths
  fileName = GetFileName( filepath)
  print( fileName )

  # Stacked LSTM
  print( '--------------------------------LSTM---------------------------')
  model = LSTM_Model(X,y, Xtest, ytest, priceScaler, n_steps_in, n_steps_out)
  model.save('/content/drive/My Drive/Models/LSTM/' + fileName + "_LSTM") 

  # Convolutional model
  print( '--------------------------------CNN----------------------------')
  model = CNN_Model(X,y, n_steps_in, Xtest, ytest,priceScaler )
  model.save('/content/drive/My Drive/Models/CNN/' + fileName + "_CNN") 

  # CNN + LSTM hybrid; note the directory is CNN_LSTM while the file suffix is _LSTM_CNN
  print( '------------------------------LSTM+CNN-------------------------')
  model = CNN_LSTM_Model(X,y,n_steps_in,Xtest, ytest, priceScaler)
  model.save('/content/drive/My Drive/Models/CNN_LSTM/' + fileName + "_LSTM_CNN") 
  #model = CNN_LSTM_Model2( X,y, n_steps_in)
  # Second reference to the hybrid model trained for this file
  model2 = model;
  print( '=================================================================')
  print( '=================================================================')

In [0]:
def LoadModelAndPredict( algorithm ):
    """Reload the saved model of one architecture for every stock and plot its predictions.

    For each configured file the datasets are rebuilt with ``GetData``, the
    stored model is loaded, scored with ``makePrediction`` and drawn with
    ``plotLineChart``.

    Args:
        algorithm: ``'LSTM'``, ``'CNN'`` or ``'LSTM_CNN'`` (``'CNN_LSTM'`` is
            accepted as an alias for the hybrid model).
    """
    # The hybrid models are saved under Models/CNN_LSTM/ with the file suffix
    # _LSTM_CNN, so the directory cannot simply be derived from `algorithm`.
    isHybrid = algorithm in ("LSTM_CNN", "CNN_LSTM")
    modelDir = "CNN_LSTM" if isHybrid else algorithm
    modelSuffix = "LSTM_CNN" if isHybrid else algorithm

    for filepath in getFilePaths():
        X,y,Xtest,ytest,n_steps_in, n_steps_out, priceScaler = GetData(filepath) # Get data
        # Get filename
        fileName = GetFileName( filepath)
        print( fileName)
        model = keras.models.load_model('/content/drive/My Drive/Models/' + modelDir + '/' + fileName + '_' + modelSuffix)
        # Only the hybrid model needs its input reshaped into sub-sequences.
        rmse, averageprofit, totalprofit, prediction, actual = makePrediction(model, Xtest, ytest, priceScaler, isHybrid )
        print( '-----------------------------')
        plotLineChart( actual, prediction, fileName, algorithm)

In [0]:
# Reload the saved models of each architecture and plot their predictions
# for every configured stock file.
LoadModelAndPredict('LSTM')
LoadModelAndPredict('CNN')
LoadModelAndPredict( 'LSTM_CNN')

In [0]:
def plotLineChart( actual, prediction, name, algorithm ):
    """Plot actual and predicted prices as two lines over the sample index.

    Args:
        actual: sequence of actual prices.
        prediction: sequence of predicted prices, same length as ``actual``.
        name: stock name used in the title.
        algorithm: model name used in the title.
    """
    x = range(len(actual))
    # plotting the points
    plt.plot(x, actual, label = 'Actual')
    plt.plot(x, prediction, label = 'Prediction')
    # naming the axes
    plt.xlabel('Timebin')
    plt.ylabel('Share price')

    # giving a title to the graph
    plt.title( name + ' ' + algorithm + ' share price prediction')

    plt.legend()
    # show the plot
    plt.show()

In [0]:
# Write a diagram of whatever the module-level `model` currently references
# to model_plot.png, including layer names and shapes.
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [0]:
def GetFileName( filepath):
    """Return the file name of ``filepath`` without directory and extension.

    Example: ``".../FinalizedData/MAYBANK.csv"`` gives ``"MAYBANK"``. The
    result no longer depends on the path containing ``"Data/"`` or on the
    extension being exactly four characters long.
    """
    import os

    return os.path.splitext(os.path.basename(filepath))[0]

In [0]:
def plotScatterChart():
    """Draw the hard-coded average results of the three models as a bubble chart.

    Each model is one point: x is labelled 'Average profit(%)', y is labelled
    'RMSE', and the marker size is the third value times 300 (described in
    the legend as total profit). The figure is saved to 'legend.png' and then
    shown.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    import matplotlib.patches as mpatches

    # (legend label, colour, (x, y, size value)) per model
    results = (
        ("CNN+LSTM", "red",   (0.292, 0.076, 5.062)),
        ("LSTM",     "green", (0.304, 0.098, 4.59)),
        ("CNN",      "blue",  (0.330, 0.078, 2.764)),
    )

    # Create plot
    fig = plt.figure( figsize= (10,10) )
    ax = fig.add_subplot(1, 1, 1 )
    for group, color, (x, y, z) in results:
        ax.scatter(x, y, alpha=0.8, c=color, s=z*300, edgecolors='none', label=group)

    plt.title('Average results')
    plt.xlabel('Average profit(%)')
    plt.ylabel('RMSE')

    # Hand-built legend: one colour patch per model plus a note on marker size
    patchList = [mpatches.Patch(color=color, label=group) for group, color, _ in results]
    patchList.append( mpatches.Patch( color ='white', label = "Size is Total Profit (%)"))
    plt.legend(handles=patchList)
    plt.savefig('legend.png', bbox_inches='tight')
    plt.show()