# Importing Required Libraries

In [None]:
# Loading required libraries
import torch
import numpy as np
import os

# Triple Barrier Labeling Method

For more details on this labeling method, please refer to: De Prado, M. L. (2018). Advances in Financial Machine Learning. John Wiley & Sons.

In [None]:
def set_label(mid_prices, horizon, threshold, sequence_end):
    """Label a sequence by the first return barrier hit after it ends.

    The reference price is the mid-price at ``sequence_end - 1``. Relative
    returns against that price are computed for the window
    ``mid_prices[sequence_end:sequence_end + horizon]``; slicing truncates
    the window when fewer than ``horizon`` prices remain.

    Args:
        mid_prices: Indexable, sliceable series of mid-prices that supports
            element-wise arithmetic (e.g. a NumPy array or a tensor).
        horizon: Number of steps after the sequence to inspect.
        threshold: Barrier width expressed as a relative return.
        sequence_end: Index one past the last step of the input sequence.

    Returns:
        0 (STATIONARY) when no return leaves ``[-threshold, threshold]``,
        1 (UP) when a return above ``threshold`` occurs strictly before any
        return below ``-threshold``, and 2 (DOWN) otherwise.
    """
    reference_price = mid_prices[sequence_end - 1]
    future_prices = mid_prices[sequence_end:sequence_end + horizon]

    # Relative move of every future price with respect to the reference.
    relative_moves = np.array((future_prices - reference_price) / reference_price)

    # Positions (within the window) where each barrier is breached.
    up_hits = np.where(relative_moves > threshold)[0]
    down_hits = np.where(relative_moves < -threshold)[0]

    touched_up = up_hits.size != 0
    touched_down = down_hits.size != 0

    if not touched_up and not touched_down:
        return 0  # STATIONARY: vertical barrier reached first
    if not touched_down:
        return 1  # UP: only the upper barrier was hit
    if not touched_up:
        return 2  # DOWN: only the lower barrier was hit

    # Both barriers were hit; the earliest one decides the label.
    return 1 if up_hits[0] < down_hits[0] else 2

# Generating the sequences 

The following code generates non-overlapping sequences of 100 time steps each, with the prediction horizon set to 50. The labeling method is the triple barrier method proposed by Marcos Lopez de Prado in his book "Advances in Financial Machine Learning".

In [None]:
# Parameters
mode = 'generate'  # 'generate' slices and saves the input sequences
labels_missing = False  # set to True to (re)compute and save the labels
dataset_split = 'Training' # Options: ['Training', 'Validation', 'Test']
data_type = 'stationary'
seq_len = 100  # number of time steps per sequence
stride = 100  # step between sequence starts (== seq_len -> no overlap)
horizon = 50  # number of future steps inspected by the labeling method
threshold = 0.000001  # relative-return barrier passed to set_label

sequences = []
labels = []
index = 0  # running id of the next training example written to disk

normalized_dir = dataset_split + '/Normalized/'
unscaled_dir = dataset_split + '/Unscaled/'
dataset_dir = dataset_split + '/Dataset/'
labels_dir = dataset_split + '/Labels/'

# We used the normalized files to have the same order in the dataset generation.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# make that order reproducible. NOTE: sequences and labels must be generated
# with the same ordering to stay aligned.
normalized_files = sorted(
    name for name in os.listdir(normalized_dir)
    if os.path.splitext(name)[0] != '.ipynb_checkpoints'
)

for j, filename in enumerate(normalized_files):
    if mode == 'generate':
        # Loading the stationary data
        with open(normalized_dir + filename, 'rb') as f:
            data = torch.load(f)

    # Loading the mid prices: the unscaled file shares the base name of the
    # normalized file and uses the '.t' extension.
    unscaled_name = os.path.splitext(filename)[0] + '.t'
    with open(unscaled_dir + unscaled_name, 'rb') as f:
        # Reading the data and loading tensor
        unscaled = torch.load(f)
        # We extract the mid-prices from the unscaled data
        # (assumes columns 0 and 2 hold the best ask/bid prices - TODO confirm)
        mid_prices = (unscaled[:, 0] + unscaled[:, 2]) / 2

    start = 0
    end = seq_len + start

    if dataset_split == 'Training': # We restart the sequences container as to avoid accumulating all tensors
        sequences = []

    # A sequence is kept only if a full horizon of prices follows it.
    while end <= len(mid_prices) - horizon:

        if mode == 'generate':
            sequences.append(data[start:end].clone())

        if labels_missing:
            # Getting the label for the sequence; each label is wrapped in a
            # list so the saved tensor has shape [num_sequences, 1].
            labels.append([set_label(mid_prices=mid_prices,
                                     horizon=horizon,
                                     threshold=threshold,
                                     sequence_end=end)])

        start = start + stride
        end = seq_len + start

    if mode == 'generate' and dataset_split == 'Training':
        # Training examples are written one file per sequence, named by a
        # running index that continues across input files.
        for offset, example in enumerate(sequences):
            with open(dataset_dir + f'{index + offset}.t', 'wb') as f:
                torch.save(example, f)

        # Setting new temp len
        index = index + len(sequences)

    # Printing the number of filenames that are already processed
    print('Filename_' + str(j))


if labels_missing:
    # Converting labels to torch tensor
    labels = torch.tensor(labels)

    # Saving labels
    with open(labels_dir + dataset_split.lower() + '_labels.t', 'wb') as f:
        torch.save(labels, f)

if mode == 'generate' and dataset_split != 'Training':
    # Validation/Test sequences are accumulated across all files and saved
    # as a single stacked tensor.
    dataset = torch.stack(sequences) # shape [batch, sequence, features]

    # Saving sequences
    with open(dataset_dir + dataset_split.lower() + '_dataset.t', 'wb') as f:
        torch.save(dataset, f)