In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime as dt
import oandapy as opy
import logging
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.layers import Dense, Activation, Dropout, Conv2D, Flatten
from tensorflow.keras.layers import LSTM, GRU
from tensorflow.keras.models import Sequential

from tensorflow.keras.callbacks import TensorBoard, EarlyStopping

from tqdm import tqdm_notebook as tqdm

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import statsmodels.api as sm
import matplotlib.pyplot as plt
from tpot import TPOTClassifier, TPOTRegressor
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import math

%matplotlib notebook

  from pandas.core import datetools


### Oanda

In [2]:
def get_forex(instrument, instruments, granularity, start, end, candleformat, freq, trading=False):
    """
    Obtiene datos de FX de Oanda para los instrumentos que elijamos
    
    Args:
        instrument (str): Instrumento a predecir
        instruments (list): Divisas
        granularity (str): Time Window
        start (str): Primer día
        end (str): último día
        candleformat (str): 'bidask' o 'midpoint'
        freq (str): Timeframe
        trading (bool): Si estamos en producción
    Returns:
        df (DataFrame)
    
    """
    oanda = opy.API(environment='live')
    divs = {}

    for j in instruments:
        print(j)
        # Extraemos datos cada 2 días (por simplicidad)
        d1 = start
        d2 = end
        dates = pd.date_range(start=d1, end=d2, freq=freq)
        df = pd.DataFrame()
        print('Descargando:')
        pbar = tqdm(total=len(dates) - 1)
        
        if trading:
            data = oanda.get_history(instrument=j, 
                                         candleFormat=candleformat,
                                         since=d1, 
                                         granularity=granularity)
            df = pd.DataFrame(data['candles'])
        else:
            
            for i in range(0, len(dates) - 1):
                # Oanda toma las fechas en este formato
                d1 = str(dates[i]).replace(' ', 'T')
                d2 = str(dates[i+1]).replace(' ', 'T')
                try:
                    # Concatenamos cada día en el dataframe

                    data = oanda.get_history(instrument=j, 
                                             candleFormat=candleformat,
                                             start=d1, 
                                             end=d2, 
                                             granularity=granularity)
              
                    df = df.append(pd.DataFrame(data['candles']))
                    pbar.update(1)
                except:
                    pass
                
        pbar.close()
        date = pd.DatetimeIndex(df['time'], tz='UTC')
        df['date'] = date
        cols = [j + '_' + k for k in df.columns]
        df.columns = cols
        divs[j] = df
        
    dat = divs[instruments[0]]
    for i in instruments[1:]:
        join_id = [k for k in divs[i].columns if 'date' in k][0]
        dat = pd.merge(dat, 
                       divs[i],
                       left_on=instrument + '_date', 
                       right_on=join_id, how='left')
            
    return dat

### Ajustes de datos

In [3]:
##### Checar logaritmos de volumenes

def adjust_lags(dat, min_window=None, instrument='USD_JPY', pricediff=True, candleformat='midpoint', log=True, trading=False):
    """
    Ajusta intervalos de tiempo en rangos de una hora
    
    Args:
        dat (DataFrame): Datos
        instrument (str): Divisa
        min_window (int): De cuántos minutos es cada intervalo
        pricediff (bool): Si queremos diferencias en precios
        candleformat (str): ['bidask', 'midpoint']
        log (bool): Si queremos transformación logarítmica
        trading (bool): Si estamos en producción
    Returns:
        df (DataFrame): Datos transformados y ajustados
    """
    
    df = dat.copy()
    date = '{}_date'.format(instrument)
    drops = [k for k in df.columns if date not in k and ('date' in k or 'complete' in k or 'time' in k)]
    df = df.drop(drops, axis=1)
    if trading == False:
        df = df[100:] # Falla en API
    df = df.reset_index(drop=True)
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    drops = []
    if min_window:
        step = int(60/min_window)
    if candleformat == 'bidask':
        if pricediff:
            if log:
                for i in df.columns:
                    try:
                        df['Diff ' + i] = np.log(df[i]).diff(1)
                        drops.append(i)
                    except Exception as e:
                        print(e)
            else:

                for i in df.columns:
                    try:
                        df['Diff ' + i] = df[i] - df[i].shift(1)
                        drops.append(i)
                    except Exception as e:
                        print(e)
                        
            if min_window:
                
                open_bid = ['Diff openBid' + str(min_window*(i+1)) for i in range(step)]
                open_ask = ['Diff openAsk' + str(min_window*(i+1)) for i in range(step)]
                close_bid = ['Diff closeBid' + str(min_window*(i+1)) for i in range(step)]
                close_ask = ['Diff closeAsk' + str(min_window*(i+1)) for i in range(step)]
                low_bid = ['Diff lowBid' + str(min_window*(i+1)) for i in range(step)]
                low_ask = ['Diff lowAsk' + str(min_window*(i+1)) for i in range(step)]
                high_bid = ['Diff highBid' + str(min_window*(i+1)) for i in range(step)]
                high_ask = ['Diff highAsk' + str(min_window*(i+1)) for i in range(step)]
                volume = ['volume' + str(min_window*(i+1)) for i in range(step)]

                shifts = list(range(1,step+1))

                for v, ob, oa, cb, ca, lb, la, hb, ha, s in zip(volume,
                                                             open_bid, 
                                                             open_ask, 
                                                             close_bid, 
                                                             close_ask, 
                                                             low_bid, 
                                                             low_ask, 
                                                             high_bid, 
                                                             high_ask, 
                                                             shifts):
                    df[v] = df['volume'].shift(s)
                    df[ob] = df['Diff openBid'].shift(s)
                    df[oa] = df['Diff openAsk'].shift(s)
                    df[cb] = df['Diff closeBid'].shift(s)
                    df[ca] = df['Diff closeAsk'].shift(s)
                    df[lb] = df['Diff lowBid'].shift(s)
                    df[la] = df['Diff lowAsk'].shift(s)
                    df[hb] = df['Diff highBid'].shift(s)
                    df[ha] = df['Diff highAsk'].shift(s)

        else:

            if log:
                for i in df.columns:
                    try:
                        df[i] = np.log(df[i])
                    except Exception as e:
                        print(e)
            if min_window:
                open_bid = ['openBid' + str(min_window*(i+1)) for i in range(step)]
                open_ask = ['openAsk' + str(min_window*(i+1)) for i in range(step)]
                close_bid = ['closeBid' + str(min_window*(i+1)) for i in range(step)]
                close_ask = ['closeAsk' + str(min_window*(i+1)) for i in range(step)]
                low_bid = ['lowBid' + str(min_window*(i+1)) for i in range(step)]
                low_ask = ['lowAsk' + str(min_window*(i+1)) for i in range(step)]
                high_bid = ['highBid' + str(min_window*(i+1)) for i in range(step)]
                high_ask = ['highAsk' + str(min_window*(i+1)) for i in range(step)]
                volume = ['volume' + str(min_window*(i+1)) for i in range(step)]

                shifts = list(range(1,step+1))

                for v, ob, oa, cb, ca, lb, la, hb, ha, s in zip(volume,
                                                             open_bid, 
                                                             open_ask, 
                                                             close_bid, 
                                                             close_ask, 
                                                             low_bid, 
                                                             low_ask, 
                                                             high_bid, 
                                                             high_ask, 
                                                             shifts):
                    df[v] = df['volume'].shift(s)
                    df[ob] = df['openBid'].shift(s)
                    df[oa] = df['openAsk'].shift(s)
                    df[cb] = df['closeBid'].shift(s)
                    df[ca] = df['closeAsk'].shift(s)
                    df[lb] = df['lowBid'].shift(s)
                    df[la] = df['lowAsk'].shift(s)
                    df[hb] = df['highBid'].shift(s)
                    df[ha] = df['highAsk'].shift(s)
    else:  

        if pricediff:
            if log:
                for i in df.columns:
                    try:
                        df['Diff ' + i] = np.log(df[i]).diff(1)
                        drops.append(i)
                    except Exception as e:
                        print(e)
            else:
                for i in df.columns:
                    try:
                        df['Diff ' + i] = df[i] - df[i].shift(1)
                        drops.append(i)
                    except Exception as e:
                        print(e)
            if min_window:
                open_ = ['Diff openMid' + str(min_window*(i+1)) for i in range(step)]
                close = ['Diff closeMid' + str(min_window*(i+1)) for i in range(step)]
                low = ['Diff lowMid' + str(min_window*(i+1)) for i in range(step)]
                high = ['Diff highMid' + str(min_window*(i+1)) for i in range(step)]
                volume = ['volume' + str(min_window*(i+1)) for i in range(step)]

                shifts = list(range(1,step+1))

                for v, o, c, l, h, s in zip(volume,
                                             open_, 
                                             close, 
                                             low, 
                                             high, 
                                             shifts):
                    df[v] = df['volume'].shift(s)
                    df[o] = df['Diff openMid'].shift(s)
                    df[c] = df['Diff closeMid'].shift(s)
                    df[l] = df['Diff lowMid'].shift(s)
                    df[h] = df['Diff highMid'].shift(s)

        else:

            if log:
                for i in df.columns:
                    try:
                        df[i] = np.log(df[i])
                    except Exception as e:
                        print(e)
                        
            if min_window:
                
                open_ = ['openMid' + str(min_window*(i+1)) for i in range(step)]
                close = ['closeMid' + str(min_window*(i+1)) for i in range(step)]
                low = ['lowMid' + str(min_window*(i+1)) for i in range(step)]
                high = ['highMid' + str(min_window*(i+1)) for i in range(step)]
                volume = ['volume' + str(min_window*(i+1)) for i in range(step)]

                shifts = list(range(1,step+1))

                for v, o, c, l, h, s in zip(volume,
                                             open_, 
                                             close, 
                                             low, 
                                             high, 
                                             shifts):
                    df[v] = df['volume'].shift(s)
                    df[o] = df['openMid'].shift(s)
                    df[c] = df['closeMid'].shift(s)
                    df[l] = df['lowMid'].shift(s)
                    df[h] = df['highMid'].shift(s)
    high = instrument + '_highMid'
    low = instrument + '_lowMid'
    close = instrument + '_closeMid'
    hcdiff = 'Diff High-Close'
    cldiff = 'Diff Close-Low'
    hldiff = 'Diff High-Low'
    df[hcdiff] = df[high] - df[close]
    df[cldiff] = df[close] - df[low]
    df[hldiff] = df[high] - df[low]
    drops = [i for i in drops if i not in [date, hcdiff, cldiff, hldiff] and 'volume' not in i]
    df = df.drop(drops, axis=1)
    df = df[1:]
    if min_window:
        fake_drop = [i for i in df.columns if 'volume' in i or 'date' in i]
        df['High'] = df.drop(fake_drop, 1).max(axis=1)
        df['Low'] = df.drop(fake_drop, 1).min(axis=1)
        df['vol'] = df[volume].sum(axis=1)
        df = df[step+1:]
    df[date] = df[date].astype(str)
    if min_window:
        df['d2'] = df[date].str[14:]
        df = df[df['d2'] == '00:00+00:00']
        df = df.reset_index(drop=True)
        df = df.drop('d2', axis=1)
    df[date] = df[date].str[:13]
    df = df.fillna(method='ffill')
    df = df.fillna(method='bfill')
    
    return df

### VIF (Variance inflation factor)

In [4]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def get_vif(X):
    """
    Nos da el factor de inflación de la varianza de cada variable independiente

    Args:
        X (DataFrame): DataFrame con datos de nuestras variables independientes
    Returns:
        vif (DataFrame): DataFrame con el factor de inflación de la varianza
                         de cada variable
    """
    vif = pd.DataFrame()
    X['intercept'] = 1
    x = X.values
    vif['vif'] = [variance_inflation_factor(x, i) for i in range(x.shape[1])]
    vif['feature'] = X.columns

    return vif

In [5]:
def reduce_multicol_randomly(data, instrument, pricediff):
    """
    Reduce multicolinealidad aleatoriamente
    
    Args:
        data (DataFrame): Datos
        instrument (str): Divisa
        pricediff (bool): Diferencia de precios
    
    Returns:
        df (DataFrame): Datos con multicolinealidad reducida
    """
    
    df = data.copy()
    if pricediff:
        dontdrop = ['Diff {}_closeMid'.format(instrument), 
                    'Diff {}_highMid'.format(instrument), 
                    'Diff {}_lowMid'.format(instrument)]
    else:
        dontdrop = ['{}_closeMid'.format(instrument), 
                    '{}_highMid'.format(instrument), 
                    '{}_lowMid'.format(instrument)]

    dropping = [1, 2]

    dat = df.drop(dontdrop, axis=1)

    while len(dropping) >= 2:

        vif = get_vif(dat)
        svif = vif.sort_values('vif').reset_index(drop=True)
        display(svif)
        dropping = svif[svif['vif'] >= 100]
        try:
            vif_drops = list(dropping.sample(n=int(len(dropping)/2))['feature'].values)
            dat = dat.drop(vif_drops, axis=1)   
        except:
            print(dropping)
            display(svif)
            
    df['intercept'] = 1
    dfcols = list(dat.columns) + dontdrop
    df = df[dfcols]
    
    return df

In [6]:
def train_test(df, response, train_size=0.75, time_series=False, scaling=None):
    """
    Regresa train y test sets

    Args:
        df (DataFrame): Datos listos para el modelo
        response (str): Variable respuesta
        train_size (float): % Train Size
        time_series (boolean): Si es serie de tiempo o no
        scaling (str): ['standard', 'minmax', 'maxabs', 'robust', 'quantile']
    Returns:
        X_train (Array): conjunto de datos de entrenamiento (indep)
        X_test (Array): conjunto de datos de prueba (indep)
        y_train (Array): conjunto de datos de entrenamiento (dep)
        y_test (Array): conjunto de datos de prueba (dep)
    """

    data = df.copy()
    X = data.drop(response, 1)
    y = data[response]

    logging.info('X columns')
    logging.info(list(X.columns))
    logging.info('Response')
    logging.info(response)

    if time_series:
        trainsize = int(train_size*len(X))
        X_train = X[:trainsize].values
        X_test = X[trainsize:].values
        y_train = y[:trainsize].values
        y_test = y[trainsize:].values

    else:
        X_train, X_test, y_train, y_test = train_test_split(X.values,
                                                            y.values,
                                                            random_state=0,
                                                            train_size=train_size)
    if scaling == 'standard':
        scaler = preprocessing.StandardScaler()
    if scaling == 'minmax':
        scaler = preprocessing.MinMaxScaler()
    if scaling == 'maxabs':
        scaler = preprocessing.MaxAbsScaler()
    if scaling == 'robust':
        scaler = preprocessing.RobustScaler()
    if scaling == 'quantile':
        scaler = preprocessing.QuantileTransformer()

    if scaling != None:
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test

## Data batching

In [7]:
class DataGeneratorSubSeq(object):
    """
    Clase que prepara los datos en batches para entrenar series de tiempo
    """
    def __init__(self,X,Y,window_size):
        """
        Constructor para la clase
        """
        self._X = X
        self._Y = Y
        self._window_size = window_size
        self._num_windows = int(X.shape[0]//window_size)
        print(f'window_size: {window_size}')
    def get_batches(self):
        """
        Obtiene los datos en (num_unroll) batches con tamaño (batch_size)
        """
        batch_data = []
        batch_labels = [] 
        print(self._window_size)
        for i in range(0, len(self._X)-self._window_size):
            batch_data.append(self._X[i:i+self._window_size])
            batch_labels.append(self._Y[i+self._window_size])
        
        return np.asarray(batch_data), np.asarray(batch_labels)

## Stateful LSTM

In [8]:
def make_stateful_lstm(layers,neurons, num_features, batch_size, look_back, drop_out):
    print('Build STATEFUL model...')
    model = Sequential()
    for i in range(layers):
        model.add(LSTM(round(neurons/(i+1)), batch_input_shape=(batch_size, look_back, num_features), stateful=True, return_sequences=True))
        model.add(Dropout(drop_out))
    model.add(LSTM(10, batch_input_shape=(batch_size, look_back, num_features), return_sequences=False, stateful=True))
    model.add(Dropout(drop_out))
    model.add(Dense(1, activation='relu'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    
    return model

# Modelo y entrenamiento

In [9]:
def train_lstm_stateful(model, X_train, y_train, X_test, y_test, epochs):
    num_windows_train = X_train.shape[0]
    num_windows_test = X_test.shape[0]
    look_back = X_train.shape[1]
    
    print('Train...')
    pbar_epochs = tqdm(range(epochs))
    for epoch in range(epochs):
        print(f'epoch: {epoch+1}')
        mean_tr_acc = []
        mean_tr_loss = []
        pbar_train = tqdm(range(num_windows_train))
        for i in range(num_windows_train):
            X_model_train = np.reshape(X_train[i], (1,X_train.shape[1],X_train.shape[2]))
            y_model_train = np.reshape(y_train[i], (1,1))
            
            tr_loss, tr_acc = model.train_on_batch(X_model_train,y_model_train)
            
            mean_tr_acc.append(tr_acc)
            mean_tr_loss.append(tr_loss)
            model.reset_states()
            pbar_train.update(1)
        pbar_train.close()
    
        print(f'accuracy training = {np.mean(mean_tr_acc)}')
        print(f'loss training = {np.mean(mean_tr_loss)}')

        mean_te_acc = []
        mean_te_loss = []
        pbar_test = tqdm(range(num_windows_test))
        for i in range(num_windows_test):
            X_model_test = np.reshape(X_test[i], (1,X_test.shape[1],X_test.shape[2]))
            y_model_test = np.reshape(y_test[i], (1,1))
            
            te_loss, te_acc = model.test_on_batch(X_model_test,y_model_test)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
            model.reset_states()

        #for i in range(num_windows_test-1):
            #y_pred = model.predict_on_batch(np.reshape(X_model_test[i], (1, X_model_test.shape[1],X_model_test.shape[2])))
            #model.reset_states()
    
            pbar_test.update(1)
        pbar_test.close()
        print(f'accuracy testing = {np.mean(mean_te_acc)}')
        print(f'loss testing = {np.mean(mean_te_loss)}')
        print('___________________________________')
        pbar_epochs.update(1)
    pbar_epochs.close()

In [10]:
def model_creation(dat, instrument, pricediff, prints, scaling):
    """
    Crea modelos para pronosticar
    
    Args:
        dat (DataFrame): Datos para modelo
        instrument (str): Divisa
        pricediff (bool): Si queremos diferencia en precios
        prints (int): Cuántos datos imprimir en el plot
        scaling (str): Estandarización
    
    Returns:
        models (dict): Diccionario con modelos
        variables (dict): Diccionario con variables para cada modelo
    """
    window_size = 12
    df = dat.copy()
    DF = df.copy()

    if pricediff:
        Actuals = ['Diff {}_closeMid'.format(instrument),
                   'Diff {}_highMid'.format(instrument),
                   'Diff {}_lowMid'.format(instrument)]

        Responses = ['future diff close', 
                     'future diff high',
                     'future diff low']
    else:
        Actuals = ['{}_closeMid'.format(instrument), 
                   '{}_highMid'.format(instrument),
                   '{}_lowMid'.format(instrument)]
        Responses = ['future close',
                     'future high',
                     'future low']

    models = {}
    variables = {}

    for actual,response in zip(Actuals, Responses):
        df = DF.copy()
        df[response] = df[actual].shift(-1).apply(lambda x: 1 if x>0.0 else 0)
        df = df.drop(Actuals, axis=1)
        #df = get_bestvars(df, response, 0.05, dontdrop=None, fecha=None)
        df = df.dropna()
        display(df.head())
        display(df.corr()[[response]].sort_values(response))
        X_train, X_test, y_train, y_test = train_test(df, 
                                                      response,
                                                      train_size=0.75,
                                                      time_series=True,
                                                      scaling=scaling)

        
        #batching
        print(f'window_size: {window_size}')
        dg_train_sub = DataGeneratorSubSeq(X_train, y_train, window_size)
        dg_test_sub = DataGeneratorSubSeq(X_test, y_test,  window_size)

        X_seq_train_sub, y_seq_train_sub = dg_train_sub.get_batches()
        X_seq_test_sub, y_seq_test_sub = dg_test_sub.get_batches()

        print(f'\nTamaño de los datos de entrenamiento: X {len(X_train)}, Y {len(y_train)}')
        print(f'Número de batches: X {len(X_seq_train_sub)}, Y {len(y_seq_train_sub)}')
        print(f'Número de ejemplos por batch: X {len(X_seq_train_sub[0])}, Y {len(y_seq_train_sub)}')
        print(f'Número de features por ejemplo: X {X_seq_train_sub[0].shape[1]}, Y {y_seq_train_sub.shape[0]}')


        print(f'\nTamaño  de los datos de test: X {len(X_test)}, Y {len(y_test)}')
        print(f'Número de batches: X {len(X_seq_test_sub)}, Y {len(y_seq_test_sub)}')
        print(f'Número de ejemplos por batch: X {len(X_seq_test_sub[0])}, Y {len(y_seq_test_sub)}')
        print(f'Número de features por ejemplo: X {y_seq_test_sub[0].shape}, Y {y_seq_test_sub[0].shape}')
        
        X_train = sm.add_constant(X_train, prepend=True, has_constant='skip')
        X_test = sm.add_constant(X_test, prepend=True, has_constant='skip')
        
        added_layers = 1
        number_neurons = 10
        batch_size = 1
        look_back = X_seq_train_sub.shape[1]
        num_features = X_seq_train_sub.shape[2]
        drop_out = 0.3

        print(f'num_features: {num_features}')
        rnn = make_stateful_lstm(added_layers, number_neurons, num_features, batch_size, look_back, drop_out)
        
        print('RNN LSTM')
        train_lstm_stateful(rnn, X_seq_train_sub, y_seq_train_sub, X_seq_test_sub, y_seq_test_sub, 10)
        
        model_precision(y_test, rnn.predict(X_test), 0.5)
        display(bucket_scores(y_test, rnn.predict(X_test)))
     
    
    return models, variables

def model_precision(y_test, predictions, lim):
    """
    Args:
        y_test (array): Instancias de la variable dependiente
        predictions (array): Predicciones
        lim (float): Entre 0 y 1 que marca el límite de clasificación (arriba de lim se considera cierre)
    
    Returns:
        Accuracy (float): (tp+tn)/(tp+tn+fp+fn)
        Precision (float): tp/(tp+fp)
        Recall (float): tp/(tp+fn)
        F1_score (float): 2/(1/Precision+1/Recall) Media armónica entre Precision y Recall
        MCC (float): Matthiews Correlation Coefficient (tp*tn-fp*fn)/(math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
    
    """
          
    y_test.shape = [y_test.shape[0],1]
    predictions.shape = [predictions.shape[0],1]
    
    test = np.concatenate((y_test, predictions),axis=1)

    tp = ((test[:,0] == 1) & (test[:,1] >= lim)).sum()
    fp = ((test[:,0] == 0) & (test[:,1] >= lim)).sum()
    tn = ((test[:,0] == 0) & (test[:,1] < lim)).sum()
    fn = ((test[:,0] == 1) & (test[:,1] < lim)).sum()

    print(tp)
    print(fp)
    Accuracy = (tp+tn)/(tp+tn+fp+fn)
    Precision = tp/(tp+fp)
    Recall = tp/(tp+fn)
    F1_score = 2/(1/Precision+1/Recall)
    MCC = (tp*tn-fp*fn)/(math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))
    

    print('Accuracy')
    print(100*Accuracy)
    print('\nPrecision')
    print(100*Precision)
    print('\nRecall')
    print(100*Recall)
    print('\nF1 Score')
    print(100*F1_score)
    print('\nMCC')
    print(100*MCC)
    
    return Accuracy, Precision, Recall, F1_score, MCC

def bucket_scores(y_test, predictions):
    """
    Precision por cubeta de 10 en 10
    
    Args:
        y_test (array): Instancias de la variable dependiente
        predictions (array): Predicciones
    
    Returns:
        res (DataFrame): Positive rate por scores de 1 a 100 en cubetas de 10
    
    """
    scoresindex = ['0-10','10-20','20-30','30-40','40-50','50-60','60-70','70-80','80-90','90-100']
    scorescolumns = ['Total','Positives']
    res = pd.DataFrame(0, index=scoresindex, columns=scorescolumns)
    
    y_test.shape = [y_test.shape[0],1]
    predictions.shape = [predictions.shape[0],1]
    
    test = np.concatenate((y_test,predictions),axis=1)
    
    low = 0
    up = 0.1
    for i in scoresindex:
        res.loc[i]['Total'] = ((test[:,1] >= low) & (test[:,1] < up)).sum()
        res.loc[i]['Positives'] = ((test[:,1] >= low) & (test[:,1] < up) & (test[:,0] == 1)).sum()
        low += 0.1
        up += 0.1
    res['Positive Rate'] = res['Positives']/res['Total']*100  
    
    return res


# -----------------------------------------------------------------

In [11]:
candleformat = 'midpoint' # ['midpoint', 'bidask']
instrument = 'USD_JPY'
instruments = ['USD_JPY', 
               'USB02Y_USD',
               'USB05Y_USD',
               'USB10Y_USD', 
               'USB30Y_USD',
               'UK100_GBP',
               'UK10YB_GBP',
               'JP225_USD',
               'HK33_HKD',
               'EU50_EUR',
               'DE30_EUR',
               'DE10YB_EUR',
               'WTICO_USD',
               'US30_USD',
               'SPX500_USD']
               
granularity = 'H1'
start = '2010-01-01'
end = str(dt.now())[:10]
freq = '15D'


if not os.path.exists('get_forex.csv'): # si no existe el archivo lo hace 
    gf = get_forex(instrument, instruments, granularity, start, end, candleformat, freq)
    gf.to_csv('get_forex.csv')
    del gf
gf = pd.read_csv('get_forex.csv', index_col=0)

In [12]:
pricediff = True
log = True
min_window = None
candleformat = 'midpoint' # ['midpoint', 'bidask']

In [13]:
al = adjust_lags(gf, 
                 min_window=min_window,
                 instrument=instrument,
                 pricediff=pricediff, 
                 candleformat=candleformat,
                 log=log)

'str' object has no attribute 'log'


In [14]:
DF = al.copy()
date = 'USD_JPY_date'
DF = DF.drop(date, axis=1)

In [15]:
df = reduce_multicol_randomly(DF, instrument, pricediff)

  vif = 1. / (1. - r_squared_i)


Unnamed: 0,vif,feature
0,1.349205,HK33_HKD_volume
1,1.382239,Diff USD_JPY_volume
2,1.383716,Diff USB02Y_USD_volume
3,1.438337,Diff HK33_HKD_volume
4,1.572028,Diff USD_JPY_openMid
5,1.682841,USB02Y_USD_volume
6,1.952760,Diff UK100_GBP_volume
7,1.981457,Diff JP225_USD_volume
8,1.987083,Diff DE30_EUR_volume
9,2.102304,Diff WTICO_USD_volume


Unnamed: 0,vif,feature
0,1.349205,HK33_HKD_volume
1,1.382239,Diff USD_JPY_volume
2,1.383716,Diff USB02Y_USD_volume
3,1.438337,Diff HK33_HKD_volume
4,1.572028,Diff USD_JPY_openMid
5,1.682841,USB02Y_USD_volume
6,1.952760,Diff UK100_GBP_volume
7,1.981457,Diff JP225_USD_volume
8,1.987083,Diff DE30_EUR_volume
9,2.102304,Diff WTICO_USD_volume


Empty DataFrame
Columns: [vif, feature]
Index: []


Unnamed: 0,vif,feature
0,1.349205,HK33_HKD_volume
1,1.382239,Diff USD_JPY_volume
2,1.383716,Diff USB02Y_USD_volume
3,1.438337,Diff HK33_HKD_volume
4,1.572028,Diff USD_JPY_openMid
5,1.682841,USB02Y_USD_volume
6,1.952760,Diff UK100_GBP_volume
7,1.981457,Diff JP225_USD_volume
8,1.987083,Diff DE30_EUR_volume
9,2.102304,Diff WTICO_USD_volume


In [16]:
df.head()

Unnamed: 0,USD_JPY_volume,USB02Y_USD_volume,USB05Y_USD_volume,USB10Y_USD_volume,USB30Y_USD_volume,UK100_GBP_volume,UK10YB_GBP_volume,JP225_USD_volume,HK33_HKD_volume,EU50_EUR_volume,...,Diff SPX500_USD_highMid,Diff SPX500_USD_lowMid,Diff SPX500_USD_openMid,Diff SPX500_USD_volume,Diff Close-Low,Diff High-Low,intercept,Diff USD_JPY_closeMid,Diff USD_JPY_highMid,Diff USD_JPY_lowMid
1,1054,6.0,13.0,11.0,9.0,696.0,152.0,20.0,713.0,1.0,...,0.0,-0.000175,0.0,1.001449,0.0535,0.202,1,-0.001545,-0.001825,0.001103
2,1939,6.0,34.0,40.0,46.0,696.0,152.0,20.0,713.0,1.0,...,0.000438,0.000438,0.0,0.068993,0.0705,0.338,1,-0.000546,0.000726,-0.000728
3,1905,12.0,63.0,45.0,45.0,696.0,152.0,20.0,713.0,1.0,...,0.000263,-0.000263,0.000613,-0.323227,0.1185,0.242,1,-0.000857,-0.002394,-0.001372
4,1438,7.0,22.0,39.0,48.0,696.0,152.0,20.0,2666.0,1.0,...,-0.000438,0.0,-0.000438,0.111918,0.1575,0.232,1,-0.000311,-0.000835,-0.00073
5,526,6.0,14.0,29.0,18.0,696.0,152.0,20.0,2295.0,1.0,...,0.000876,0.000438,0.000263,-0.125163,0.0625,0.074,1,-9.6e-05,-0.000771,0.000923


In [None]:
prints = 50
scaling = None
models, variables = model_creation(df, instrument, pricediff, prints, scaling)

Unnamed: 0,USD_JPY_volume,USB02Y_USD_volume,USB05Y_USD_volume,USB10Y_USD_volume,USB30Y_USD_volume,UK100_GBP_volume,UK10YB_GBP_volume,JP225_USD_volume,HK33_HKD_volume,EU50_EUR_volume,...,Diff US30_USD_volume,Diff SPX500_USD_closeMid,Diff SPX500_USD_highMid,Diff SPX500_USD_lowMid,Diff SPX500_USD_openMid,Diff SPX500_USD_volume,Diff Close-Low,Diff High-Low,intercept,future diff close
1,1054,6.0,13.0,11.0,9.0,696.0,152.0,20.0,713.0,1.0,...,0.0,0.000438,0.0,-0.000175,0.0,1.001449,0.0535,0.202,1,0
2,1939,6.0,34.0,40.0,46.0,696.0,152.0,20.0,713.0,1.0,...,0.805625,0.000263,0.000438,0.000438,0.0,0.068993,0.0705,0.338,1,0
3,1905,12.0,63.0,45.0,45.0,696.0,152.0,20.0,713.0,1.0,...,-0.384412,-0.000438,0.000263,-0.000263,0.000613,-0.323227,0.1185,0.242,1,0
4,1438,7.0,22.0,39.0,48.0,696.0,152.0,20.0,2666.0,1.0,...,-0.421213,0.000175,-0.000438,0.0,-0.000438,0.111918,0.1575,0.232,1,0
5,526,6.0,14.0,29.0,18.0,696.0,152.0,20.0,2295.0,1.0,...,0.287682,0.000701,0.000876,0.000438,0.000263,-0.125163,0.0625,0.074,1,0


Unnamed: 0,future diff close
Diff USD_JPY_openMid,-0.025477
Diff WTICO_USD_volume,-0.015448
Diff Close-Low,-0.015401
Diff JP225_USD_openMid,-0.010724
Diff JP225_USD_highMid,-0.010386
Diff US30_USD_openMid,-0.009287
Diff US30_USD_volume,-0.008128
Diff JP225_USD_lowMid,-0.008049
Diff US30_USD_highMid,-0.007903
Diff HK33_HKD_openMid,-0.007424


window_size: 12
window_size: 12
window_size: 12
12
12

Tamaño de los datos de entrenamiento: X 42464, Y 42464
Número de batches: X 42452, Y 42452
Número de ejemplos por batch: X 12, Y 42452
Número de features por ejemplo: X 90, Y 42452

Tamaño  de los datos de test: X 14155, Y 14155
Número de batches: X 14143, Y 14143
Número de ejemplos por batch: X 12, Y 14143
Número de features por ejemplo: X (), Y ()
num_features: 90
Build STATEFUL model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (1, 12, 10)               4040      
_________________________________________________________________
dropout (Dropout)            (1, 12, 10)               0         
_________________________________________________________________
lstm_1 (LSTM)                (1, 10)                   840       
_________________________________________________________________
dropout_1 (Dropout)     

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

epoch: 1


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

accuracy training = 0.5104353427886963
loss training = 0.7376407980918884


HBox(children=(IntProgress(value=0, max=14143), HTML(value='')))

accuracy testing = 0.4858940839767456
loss testing = 0.6937065124511719
___________________________________
epoch: 2


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

accuracy training = 0.5144398212432861
loss training = 0.691989004611969


HBox(children=(IntProgress(value=0, max=14143), HTML(value='')))

accuracy testing = 0.4859647750854492
loss testing = 0.693393886089325
___________________________________
epoch: 3


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

accuracy training = 0.515334963798523
loss training = 0.6918091177940369


HBox(children=(IntProgress(value=0, max=14143), HTML(value='')))

accuracy testing = 0.4859647750854492
loss testing = 0.6932982206344604
___________________________________
epoch: 4


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

accuracy training = 0.5173372030258179
loss training = 0.6918323040008545


HBox(children=(IntProgress(value=0, max=14143), HTML(value='')))

accuracy testing = 0.4978434443473816
loss testing = 0.693545401096344
___________________________________
epoch: 5


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

accuracy training = 0.5151229500770569
loss training = 0.6917411684989929


HBox(children=(IntProgress(value=0, max=14143), HTML(value='')))

accuracy testing = 0.4859647750854492
loss testing = 0.6932247281074524
___________________________________
epoch: 6


HBox(children=(IntProgress(value=0, max=42452), HTML(value='')))

In [None]:
def train_lstm_stateful(model, X_train, y_train, X_test, y_test, epochs):
    num_windows_train = X_train.shape[0]
    num_windows_test = X_test.shape[0]
    look_back = X_train.shape[1]
    
    print('Train...')
    pbar = tqdm(range(epochs))
    for epoch in range(epochs):
        print(f'epoch: {epoch+1}')
        mean_tr_acc = []
        mean_tr_loss = []
        for i in range(num_windows_train-1):
            X_model_train = np.reshape(X_train[i], (1,X_train.shape[1],X_train.shape[2]))
            y_model_train = np.reshape(y_train[i], (1,1))
            
            tr_loss, tr_acc = model.train_on_batch(X_model_train,y_model_train)
            
            mean_tr_acc.append(tr_acc)
            mean_tr_loss.append(tr_loss)
            model.reset_states()

        print(f'accuracy training = {np.mean(mean_tr_acc)}')
        print(f'loss training = {np.mean(mean_tr_loss)}')

        mean_te_acc = []
        mean_te_loss = []
        for i in range(num_windows_test - 1):
            X_model_test = np.reshape(X_test[i], (1,X_test.shape[1],X_test.shape[2]))
            y_model_test = np.reshape(y_test[i], (1,1))
            
            te_loss, te_acc = model.test_on_batch(X_model_test,y_model_test)
            mean_te_acc.append(te_acc)
            mean_te_loss.append(te_loss)
            model.reset_states()

        #for i in range(num_windows_test-1):
            #y_pred = model.predict_on_batch(np.reshape(X_model_test[i], (1, X_model_test.shape[1],X_model_test.shape[2])))
            #model.reset_states()
    
    
        print(f'accuracy testing = {np.mean(mean_te_acc)}')
        print(f'loss testing = {np.mean(mean_te_loss)}')
        print('___________________________________')
        pbar.update(1)
    pbar.close()
    
train_lstm_stateful(X_train, y_train, X_test, y_test, 10)

In [None]:
model = Sequential()
for i in range(4):
    model.add(LSTM(round(10/(i+1)), batch_input_shape=(batch_size, look_back, num_features), stateful=True, return_sequences=True))
    model.add(Dropout(drop_out))
model.add(LSTM(10, batch_input_shape=(batch_size, look_back, num_features), return_sequences=False, stateful=True))
model.add(Dropout(drop_out))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())