In [7]:
from datetime import datetime
from os import listdir
from os.path import isfile, join
import pandas as pd
from collections import deque
import random
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint, ModelCheckpoint
import time
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [8]:
#data format specified for data gotten from https://www.dukascopy.com/swiss/english/marketwatch/historical/
class Dukascopy_Historical_Data:
    
    def __init__(self,folder_path):
        self.folder_path = folder_path                                                                                 #name of the folder containing forex timeseries data     
        self.filenames = [name for name in listdir(folder_path)]                                                       #extract all the file names in the folder
        self.tickers = [filename.split('_')[0] for filename in self.filenames]                                         #extract ticker name from the file names
        self.candlestick = self.filenames[0].split('_')[2] + ' ' + self.filenames[0].split('_')[3]                     #extract candlestick size from the filenames (assumption: candlestick size is the same for all the files
        self.bid_ask = self.filenames[0].split('_')[4]                                                                 #are these bid prices or ask prices contained in the data?
        self.time_frame = self.filenames[0].split('_')[5].split('.')[0]                                                #start time and end time                                                                                           
        
    def pd(self):
        dict_ = {}                                                                                                     #create a dictionary such that {ticker: pd.DataFrame(OCHL data)} for each ticker in self.tickers
        for ticker,filename in zip(self.tickers,self.filenames):
            dict_[ticker] = pd.read_csv(self.folder_path+f'\\{filename}')
            dict_[ticker].set_index(['Gmt time'],inplace=True)
            dict_[ticker].set_index(pd.to_datetime(dict_[ticker].index,
                                                   infer_datetime_format=True),inplace=True)
            dict_[ticker].columns = [ticker+' '+str(col) for col in dict_[ticker].columns]
        return dict_

In [9]:
class Currency_Pair:
    
    def __init__(self,
                 ticker,
                 candlestick,
                 dataset):
                                                                                                                        #dataset assumed to be Dukascopy_Historical_Data object
        self.ticker = ticker                                                                        
        self.candlestick = candlestick                                                                                  #candlestick as fraction of 1 hour. eg 15min candlestick is 1/4
        self.data = dataset.pd()[self.ticker]                                                                           #data in pd.DataFrame()
        
    #short_long = -1 {short trade}, short_long = 1 {long trade}    
    def return_in_t(self,trade_type,start_date,end_date,):                                                              #trade simulation. given long or short opened at "start_date" compute return at "close_date"
        short_long = {'Short':-1,'Long':1}                                                                              #trade type coefficient
        output = ((self.data[end_date]['Close'][0]-self.data[start_date]['Close'][0])*short_long[trade_type],           #trades are entered end exited at "Close" price. return tuple s.t (net profit (commision excluded),
                  (self.data[end_date]['Close'][0]/self.data[start_date]['Close'][0] - 1)*short_long[trade_type])       #                                                                  gain percentage))
        return output        

In [10]:
class Data_Matrix:
    
    def __init__(self,
                 dataset,
                 ochl='Close'):
        
        self.data = dataset                                                                                             #assumed to be Dukascopy_Historical_Data object
        self.ochl = ochl                                                                                                #Open,High,Close,Low to be the price taken into account in computations
        
    def pearson_corr_matrix(self):                                                                                      #for each ticker give tuple s.t (cross-currency correlation coefficient, 
        output = pd.DataFrame(columns=self.data.tickers,                                                                #                           p-value for h0 -> data follows normal distribution)
                              index=self.data.tickers)                                                                  #put tuples of each ticker pair in n*n matrix 
        for base in self.data.tickers:
            for quote in self.data.tickers:
                if base==quote:
                    output[base][quote] = 1
                else:
                    output[base][quote] = pearsonr(self.data.pd()[base][self.ochl],
                                                   self.data.pd()[quote][self.ochl])
        return output
    
    def normality_test(self,log=True):                                                                                  #test if data follows normal distribution return outcome and p value s.t h0 >> data is normal
        output = pd.DataFrame(columns=self.data.tickers,
                              index=self.data.tickers)
    
    def close(self,):                                                                                                                                                           
        tickers = [ticker for ticker in self.data.tickers]                                                             #ticker names. ie access the dataset dictionary
        close_vals = [self.data.pd()[ticker][self.ochl] for ticker in tickers]                                         #data taken form specified columns (ochl) for each ticker
        output = pd.DataFrame(dict(zip(tickers,close_vals)))                                                           #combined dataframe
        return output                                                                                                  #return pd.DataFrame 
    
    def volume_close(self):                                                                                            
        tickers = [ticker for ticker in self.data.tickers]                                                             #tickers
        volumes = [self.data.pd()[ticker]['Volume'] for ticker in tickers]                                             #volumes                   
        closes = [self.data.pd()[ticker]['Close'] for ticker in tickers]                                               #close prices
        output = pd.DataFrame()                                                                                        #output dataframe
        for i in range(len(tickers)):                                                                                  #for i smaller than size of ticker set...
            output[tickers[i] + ' Close'] = closes[i]                                                                  #assign close values for ticker number i
            output[tickers[i] + ' Volume'] = volumes[i]                                                                #assign volume values for ticker number i
        return output
    
    def volume_close_ma(self,ma_len):                                                                                  #same as above but with volume,close and moving average of len ma_len
        tickers = [ticker for ticker in self.data.tickers]
        volumes = [self.data.pd()[ticker][ticker+' Volume'] for ticker in tickers]
        closes = [self.data.pd()[ticker][ticker+' Close'] for ticker in tickers]
        MAs = [self.data.pd()[ticker][ticker+' Close'].rolling(ma_len).mean() for ticker in tickers]
        output = pd.DataFrame()
        for i in range(len(tickers)):
            output[tickers[i] + ' Close'] = closes[i]
            output[tickers[i] + ' Volume'] = volumes[i]
            output[tickers[i] + ' MA'] = MAs[i]
        return output
        
    #+def strength_rating(self):

In [11]:
class Model:                                                                                                         #collective class for variety of models                                                                                         

    class RNN:
        
        def __init__(self,                                                     
                     data,                                                                                           #pd.DataFram() as data
                     candlestick,                                                                                    #candlestick type (1m,15m,1h,...)  
                     y_ratio,                                                                                        #price growth/drop of specific ticker model will predict
                     period_to_predict,                                                                              #how many candles ahead is the model predicting growth/drop
                     sequence_length,                                                                                #on how many data points is the predictoin based
                     validation_size,                                                                                #% of the data set used as the validation set
                     scale_range,                                                                                    #all data scaled within this range
                     epochs,                                                                                         #how many epochs per model
                     batch_size):                                                                                    #batch size ie data input size for each epoch
            
            self.candlestick = candlestick
            self.data = data
            self.y_ratio = y_ratio
            self.period_to_predict = period_to_predict
            self.sequence_length = sequence_length
            self.validation_size = validation_size
            self.scale_range = scale_range
            self.epochs = epochs
            self.batch_size = batch_size
            self.name = f'{self.sequence_length} {self.candlestick}-SEQ-{self.period_to_predict}-PRED-{int(time.time())}'       #name used to save model and weights (for later comparison)
            
        def classify(self,current,future):                                                                          #classify if the y_ticker data was greater(1) or smaller (0) period_to_predict steps ahead of current val
            return (future>current).astype(int)                    
        
        def format_data(self):                                                                                      #format the data
            X = self.data                                                                                           #predictors
            y = self.data[self.y_ratio].shift(-self.period_to_predict)                                              #value to be predicted (y_ticker value shifted periods_to_predict steps "from the future")
            main_df = X                                                                                             #create pd.DataFram()
            main_df[f'future {self.y_ratio}'] = y                                                                   #create 'future price' column  of values of y                                     
            main_df['target'] = self.classify(main_df[self.y_ratio],                                                #compare present value with future value using classify function. Add result as a calumn
                                              main_df[f'future {self.y_ratio}'])
            validation_df = main_df[int((1-self.validation_size)*len(main_df)):]                                    #split the formated data to training set and validation set
            train_df = main_df[~main_df.isin(validation_df)].dropna()
            return (train_df,validation_df)                                                                         #return tuple s.t (train set, validation set)
        
        def preprocess_df(self,df):
            df = df[df[f'{self.y_ratio.split()[0]} Volume']!=0]                                                                        #every trading day (exclude weekends ie days with 0 volume)
            df = df.drop(f'future {self.y_ratio}',axis=1)                                                                              #get rid of the future data to avoid look-ahead bias
            df = df.dropna()                                                                                                           #drop NaN values
            for col in df.columns:                                                                                                     #for each column...                                                                                    
                if col != 'target':                                                                                                    #except for "target"...
                    df[col]=df[col].pct_change()                                                                                       #change values from absolute values to relative % change                                                                                                 
                    df.dropna(inplace=True)                                                                                            #drop na values caused by the above transformation
                    df[col] = MinMaxScaler(feature_range=self.scale_range).fit_transform(df[col].values.reshape(-1,1))                 #scale all the values within the column to be in range self.scale_range
                    #df[col] = preprocessing.scale(df[col].values)
            df.dropna(inplace=True)                                                                                                    #drop the na values again
              
            sequential_data = []                                                                                    #a set of price sequences of length self.sequence_length 
            prev_candles = deque(maxlen=self.sequence_length)                                                       #creates each of the sequences 
            for row in df.values:                                                                                   #for each row of data in df...
                prev_candles.append([value for value in row[:-1]])                                                  #add the row to deque (except for target)
                if len(prev_candles)==self.sequence_length:                                                         #if the size of the current sequence is of length defined by self.sequence length...
                    sequential_data.append([np.array(prev_candles),row[-1]])                                        #add the entire sequence and the target to the sequantial_data set ie [data,target]
            random.shuffle(sequential_data)                                                                         #mix the order of the sequences inside the set
            
            buys = []                                                                                               #number of optimal long trades
            sells = []                                                                                              #number of optimal short trades
            for seq,target in sequential_data:                                                                      #for sequence and target (1 or 0) in sequantial data...
                if target==0:                                                                                       #if target = 0...
                    sells.append([seq,target])                                                                      #add the sequence,target to sells as sequence that should 'predict' to short
                else: 
                    buys.append([seq,target])                                                                       #otherwise add the sequence to buys as sequence that 'predict' to buy
            random.shuffle(buys)                                                                                    #shuffle the order of the sequences in buys 
            random.shuffle(sells)                                                                                   #shuffle the order of the sequences in sells
            lower = min(len(buys),len(sells))                                                                       #the size of smaller set ie either size of buys or size of sells
            buys = buys[:lower]                                                                                     #make the buy and sell sets sizes equal to lower (do this to prevent model from setting target to the value
            sells = sells[:lower]                                                                                   #more often occuring in the dataset ie "60% of the sequences in the data has target 1 ---> always predict long")                                                                        
                                                                                                                      
            sequential_data = buys+sells                                                                            #join the buys and sells sets
            random.shuffle(sequential_data)                                                                         #shuffle
            X = []                                                                                                  #predictors (data)
            y = []                                                                                                  #target
            for seq, target in sequential_data:                                                                     #for sequence and target in sequential_data...
                X.append(seq)                                                                                       #add sequence to X(predictors)                                                                
                y.append(target)                                                                                    #add target to y(target)
                
            return np.array(X),y                                                                                    #return tuple of  numpy array of X and y
        
        def train_set(self):
            return self.preprocess_df(self.format_data()[0])                                                        #create a training set
            
        def validation_set(self):
            return self.preprocess_df(self.format_data()[1])                                                        #create a validation set
        
        def run(self):
            model = Sequential()                                                                                    #initialize sequential model
            model.add(LSTM(128,input_shape=(self.train_set()[0].shape[1:]),return_sequences=True))                  #add first layer for the train_set 
            model.add(Dropout(0.2))                                                                                 #dropout to prevent overfitting 
            model.add(BatchNormalization())
                      
            model.add(LSTM(128,input_shape=(self.train_set()[0].shape[1:]),return_sequences=True))                 #add second layer
            model.add(Dropout(0.2))
            model.add(BatchNormalization())
                      
            model.add(LSTM(128,input_shape=(self.train_set()[0].shape[1:])))                                       #add third layer
            model.add(Dropout(0.2))
            model.add(BatchNormalization())
                      
            model.add(Dense(32,activation='tanh'))                                                                 #add activation layer 
            model.add(Dropout(0.2))
            
            model.add(Dense(2,activation='softmax'))                                                               #add activation layer
                      
            opt = tf.keras.optimizers.Adam(lr=0.001,decay=1e-6)                                                    #model optimezer
            
            model.compile(loss='sparse_categorical_crossentropy',                                                  #compile the model given the above parameters
                          optimizer=opt,
                          metrics=['accuracy'])
            
            history = model.fit(self.train_set()[0],self.train_set()[1],
                                batch_size=self.batch_size,
                                epochs=self.epochs,
                                validation_data=(self.validation_set()[0],self.validation_set()[1]))               #fit the train_data, validation_data into the compiled model
            
            return history                                                                                         #return fitted model 

In [13]:
if __name__ =='__main__':
    
    data = Dukascopy_Historical_Data('1h_majors')                    #create data objet
    df_multi = Data_Matrix(data).volume_close_ma(20)                 #create dataframe for multiple tickers
    gbpusd = data.pd()['GBPUSD']                                     #create dataframe for one given ticker
    
    multi_df_model = Model.RNN(data=df_multi,                       #model fitting multiple currency pairs as the predictors
                 candlestick='1h',
                 y_ratio='GBPUSD Close',
                 period_to_predict=1,
                 sequence_length=72,
                 validation_size=0.2,
                 scale_range=(0,1),
                 epochs=10,
                 batch_size=512)
    
    single_df_model = Model.RNN(data=gbpusd,                      #model fitting just single currency pair as the predictor
                 candlestick='1h',
                 y_ratio='GBPUSD Close',
                 period_to_predict=1,
                 sequence_length=72,
                 validation_size=0.2,
                 scale_range=(0,1),
                 epochs=10,
                 batch_size=512)
    
    single_df_model.run()                                     #run both the models
    multi_df_model.run()

W1014 15:33:18.542689 17296 deprecation.py:506] From C:\Users\mateu\Anaconda3\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 630 samples, validate on 94 samples


W1014 15:33:19.920172 17296 deprecation.py:323] From C:\Users\mateu\Anaconda3\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Train on 596 samples, validate on 76 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Train on 630 samples, validate on 94 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1d7ec0f6d30>